Generate dataset containing 

Vertices:
- Company
- Person

Edges:
- claims_dependent (Person -> Person)
- owned_by (Company -> Company) [maybe this could also be Company -> Person]
- employed_by (Person -> Company)


In [80]:
from faker import Faker
import numpy as np
import pandas as pd

NUM_COMPANIES = 20
NUM_PEOPLE = 100

DEPENDENCY_RATE = 0.47
# Roughly what fraction of Persons are in a claims_dependent relationship?
SUBSIDIARY_RATE = 0.3
# Roughly what fraction of Companies are in an owned_by relationship?
UNEMPLOYMENT_RATE = 0.02
# Roughly what fraction of Persons are not in an employed_by relationship?

NUM_DEPENDENTS = int(NUM_PEOPLE * DEPENDENCY_RATE)
NUM_OWNERSHIPS = int(NUM_COMPANIES * SUBSIDIARY_RATE)
NUM_EMPLOYMENTS = int(NUM_PEOPLE * (1 - UNEMPLOYMENT_RATE))

In [18]:
f = Faker()

In [19]:
companies = [{"type":"company", "name":f.company(), "address":f.address().replace("\n"," ")} for n in range(0, NUM_COMPANIES)]

In [20]:
company_df = pd.DataFrame(companies)[['type', 'name', 'address']]
company_df.head()

Unnamed: 0,type,name,address
0,company,Cooper-Green,"6647 Roger Walks Suite 088 Julieburgh, NY 53881"
1,company,Williams-Figueroa,"65694 Maureen Mountain Morganhaven, ND 01430"
2,company,Montes LLC,"2464 Mark Unions Suite 345 Johnview, MN 95700"
3,company,Gomez-Morgan,"565 Jason Park Thomasmouth, TX 84271"
4,company,Bowen-Payne,"6161 Lynn Summit Suite 881 South Darleneshire,..."


In [26]:
adults = [{"type":"person", "name":f.name(), "address":f.address().replace("\n"," ")} for n in range(0, NUM_ADULTS)]

In [27]:
adults_df = pd.DataFrame(adults)[['type', 'name', 'address']]

In [28]:
adults_df.head()

Unnamed: 0,type,name,address
0,person,Margaret Khan,"1533 Frederick Alley Jamesmouth, SD 62854"
1,person,Terry Herring,"7340 Simmons Square Apt. 770 Isabellaville, MS..."
2,person,Mariah Peterson,"829 Combs Expressway Jamesstad, VT 68867"
3,person,Jeffrey Baird,"49842 Christopher Ports Christianton, AR 61646"
4,person,Grant Smith,"5123 Buckley Harbor Apt. 998 Alexandertown, KS..."


In [33]:
vertices = company_df.append(adults_df, ignore_index=True).reset_index().rename(columns={"index":"id"})
vertices[NUM_COMPANIES - 5: NUM_COMPANIES + 5]

Unnamed: 0,id,type,name,address
15,15,company,"Smith, Reed and Flores","6892 Sanchez Union Suite 513 Stewarttown, NC 6..."
16,16,company,Harris Inc,"024 Good Overpass New Lesliehaven, NC 62986"
17,17,company,"Diaz, Sanchez and Williams",00807 Meadows Prairie Apt. 382 East Melaniebur...
18,18,company,Jones Inc,Unit 5674 Box 1556 DPO AA 42502
19,19,company,Holmes Ltd,"90746 Beasley Shoal Suite 136 New Joseph, WA 4..."
20,20,person,Margaret Khan,"1533 Frederick Alley Jamesmouth, SD 62854"
21,21,person,Terry Herring,"7340 Simmons Square Apt. 770 Isabellaville, MS..."
22,22,person,Mariah Peterson,"829 Combs Expressway Jamesstad, VT 68867"
23,23,person,Jeffrey Baird,"49842 Christopher Ports Christianton, AR 61646"
24,24,person,Grant Smith,"5123 Buckley Harbor Apt. 998 Alexandertown, KS..."


In [76]:
# claims_dependent()
random_company = lambda: np.random.randint(0,NUM_COMPANIES)
random_person = lambda: np.random.randint(NUM_COMPANIES, NUM_PEOPLE)

In [82]:
claims_dependent = [{"src": random_person(), "dst": random_person(), "relationship": "claims_dependent"} for x in range(0, NUM_DEPENDENTS)]
owned_by = [{"src": random_company(), "dst": random_company(), "relationship": "owned_by"} for x in range(0, NUM_OWNERSHIPS)]
employed_by = [{"src": random_person(), "dst": random_company(), "relationship": "employed_by"} for x in range(0, NUM_EMPLOYMENTS)]

In [93]:
edges = pd.DataFrame(claims_dependent).append(pd.DataFrame(owned_by)).append(pd.DataFrame(employed_by))[['src', 'dst', 'relationship']]

In [94]:
edges.relationship.value_counts()
# These values should match the parameters at the top of the notebook

employed_by         98
claims_dependent    47
owned_by             6
Name: relationship, dtype: int64

In [96]:
edges[NUM_DEPENDENTS - 2: NUM_DEPENDENTS + 2]

Unnamed: 0,src,dst,relationship
45,49,66,claims_dependent
46,63,76,claims_dependent
0,13,12,owned_by
1,7,13,owned_by


In [99]:
vertices.to_csv("./peopleAndCompanies_vertices.csv", index=False)
edges.to_csv("./peopleAndCompanies_edges.csv", index=False)