In [5]:
import pandas as pd
import numpy as np
import faker

# create some fake data
fake = faker.Faker()


def make_workers(num):

    # lists to randomly assign to workers
    status_list = ['Full Time', 'Part Time', 'Contractual']

    fake_workers = [{'Client ID': x+1000,
                     'Name': fake.name(),
                     'Age': np.random.randint(18, 65),
                     'Marital': np.random.choice(['Married', 'Single', 'Divorced']),
                     'Housing': np.random.choice(['Own', 'Rent', 'Other']),
                     'Experience': fake.date_between(start_date='-10y', end_date='today'),
                     'Status': np.random.choice(status_list, p=[0.50, 0.30, 0.20]),
                     'Salary':fake.pyint(min_value=10000, max_value=100000, step=1000),
                    'Bonus':fake.pyint(min_value=1000, max_value=10000, step=1000),
                     'Loan':np.random.choice(['Yes', 'No']),
                     } for x in range(num)]

    return fake_workers


worker_df = pd.DataFrame(make_workers(num=50000))
worker_df.head()


Unnamed: 0,Client ID,Name,Age,Marital,Housing,Experience,Status,Salary,Bonus,Loan
0,1000,Renee Mccoy,59,Married,Own,2019-03-03,Full Time,33000,10000,Yes
1,1001,Terry Thompson MD,47,Divorced,Own,2015-05-21,Contractual,93000,2000,Yes
2,1002,Nicole Baker,39,Single,Other,2020-04-16,Part Time,93000,8000,Yes
3,1003,Aaron Roberts,36,Married,Rent,2018-11-16,Part Time,44000,9000,No
4,1004,John Mayo MD,56,Divorced,Own,2015-06-25,Full Time,59000,8000,Yes


In [6]:
# function to create widget data

import random


def make_widget_data(num):
    
    fake_widgets = [{'Item Number':id(y),
                     'Step 1':np.random.gamma(shape=3, scale=1),
                     'Step 2':np.random.normal(5), 
                     'Step 3':np.random.exponential(4)} for y in range(num)]
    
    return fake_widgets

# empty list to store our widget dataframes in    
dfs_list = []

# now lets make some widget data for each worker
# iterate through the worker dataframe
for index, row in worker_df.iterrows():
    
    # not all workers work at the same rate - or the same number of hours
    # randomly select a number of widgets for them to create based on 'worker status'
    if row['Status'] == 'Full Time':
        num_widgets = random.randrange(500, 1000)
    elif row['Status'] == 'Part Time':
        num_widgets = random.randrange(100, 500)
    else:
        num_widgets = random.randrange(1, 1000)
    
    # make widgets for each worker
    tmp_widgets = pd.DataFrame(make_widget_data(num=num_widgets))
    
    # add worker id so we know who made the widget
    tmp_widgets['Client ID'] = row['Client ID']
    
    # make sure item number is unique by appending worker id
    tmp_widgets['Item Number'] = tmp_widgets['Item Number'].astype(
        'str') + '-' + tmp_widgets['Client ID'].astype('str')
    
    # append to df list
    dfs_list.append(tmp_widgets)
    
# concatenate all the dfs 
widget_df = pd.concat(dfs_list)
print(widget_df.shape)
widget_df.head()

(28187376, 5)


Unnamed: 0,Item Number,Step 1,Step 2,Step 3,Client ID
0,9801216-1000,2.02094,3.664023,2.813459,1000
1,9801248-1000,1.334786,6.324156,1.050447,1000
2,9801280-1000,3.985387,5.033007,4.968739,1000
3,9801312-1000,2.836651,5.824551,2.874545,1000
4,9801344-1000,2.325237,5.300906,3.205556,1000


In [26]:
worker_df.to_csv('clients.csv', index=False)