Generate dummy data for fuzzy matching

In [84]:
import pandas as pd
import random
from faker import Faker
from faker.providers import person
from faker.providers import address
from faker.providers import date_time
import datetime
import numpy as np
import math


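The functions below generate random dummy records (customers and their transactions). Note that dates of birth are drawn from 1971 onwards: as the code comment explains, Faker has a 1970 lower limit on dates, and earlier dates would need a custom formula.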

In [134]:
def generate_master(fake):
    """Build one synthetic customer ("master") record.

    Parameters
    ----------
    fake : Faker-like object
        Must provide ``prefix_male``/``prefix_female``,
        ``first_name_male``/``first_name_female``, ``first_name``,
        ``last_name``, ``suffix``, ``country``, ``city``,
        ``date_between_dates``, ``street_address`` and ``ean``.

    Returns
    -------
    dict
        Keys, in insertion order: gender, title, first_name, middle_name,
        last_name, suffix, country, city, dob, address, customer_id,
        full_address, full_name.
    """
    d = dict()
    d['gender'] = 'M' if random.randint(0, 1) == 0 else 'F'
    is_male = d['gender'] == 'M'
    d['title'] = fake.prefix_male() if is_male else fake.prefix_female()
    d['first_name'] = fake.first_name_male() if is_male else fake.first_name_female()
    # About one record in six gets a middle name ...
    d['middle_name'] = fake.first_name() if random.randint(0, 5) == 0 else ''
    # ... and about one in six of those is shortened to its initial.
    if d['middle_name'] != '' and random.randint(0, 5) == 0:
        d['middle_name'] = d['middle_name'][0]
    d['last_name'] = fake.last_name()
    # Suffixes are rare: one chance in 31.
    d['suffix'] = fake.suffix() if random.randint(0, 30) == 0 else ''
    d['country'] = fake.country()
    d['city'] = fake.city()
    #note that faker has 1970 limit, for earlier dates you need a custom formula and it is trickier
    d['dob'] = fake.date_between_dates(date_start=datetime.date(1971, 1, 1),
                                       date_end=datetime.date.today())
    d['address'] = fake.street_address()
    d['customer_id'] = str(fake.ean(length=13))
    d['full_address'] = d['address'] + ', ' + d['city'] + ', ' + d['country']
    # Join only the non-empty parts: concatenating with fixed ' ' separators
    # left a double space between first and last name whenever the middle
    # name was empty.
    name_parts = (d['first_name'], d['middle_name'], d['last_name'], d['suffix'])
    d['full_name'] = ' '.join(part for part in name_parts if part)
    return d

def generate_transaction(fake, customer_dict):
    """Build one synthetic transaction for the given customer.

    Parameters
    ----------
    fake : Faker-like object
        Must provide ``date_between(start_date=..., end_date=...)``.
    customer_dict : dict
        Customer record; only its ``'customer_id'`` entry is read.

    Returns
    -------
    dict
        Keys, in insertion order: date, customer_id, quantity, unit_price,
        total_value.
    """
    d = dict()
    d['date'] = fake.date_between(start_date='-3y', end_date='today')
    d['customer_id'] = customer_dict['customer_id']
    d['quantity'] = random.randint(1, 100)
    # Draw the price in whole cents (3.00 .. 500.00) and derive both amounts
    # from that integer. Multiplying quantity by the float unit price leaks
    # binary rounding noise (e.g. 12.299999999999999) into total_value.
    price_cents = random.randint(300, 50000)
    d['unit_price'] = price_cents / 100
    d['total_value'] = d['quantity'] * price_cents / 100
    return d

def seed_list_with_duplicates(list_to_seed, number_of_seeds=0):
    """Plant duplicate entries in ``list_to_seed`` (modified in place).

    ``number_of_seeds`` distinct positions are chosen at random and each one
    is overwritten with the entry just before it. Position 0 has no
    predecessor, so it takes the last entry (index ``-1``) instead.

    Parameters
    ----------
    list_to_seed : list
        Records to modify in place.
    number_of_seeds : int, default 0
        Number of positions to overwrite. Values ``<= 0`` leave the list
        untouched.

    Returns
    -------
    list
        The same ``list_to_seed`` object, for every value of
        ``number_of_seeds`` (the non-trivial path used to return ``None``).

    Raises
    ------
    ValueError
        If ``number_of_seeds`` exceeds ``len(list_to_seed)``; distinct
        positions cannot be drawn in that case.
    """
    if number_of_seeds <= 0:
        return list_to_seed
    if number_of_seeds > len(list_to_seed):
        raise ValueError(
            "number_of_seeds (%d) cannot exceed the list length (%d)"
            % (number_of_seeds, len(list_to_seed))
        )
    rng = np.random.default_rng()
    chosen = rng.choice(len(list_to_seed), size=number_of_seeds, replace=False)
    # Walk the positions in ascending order so every seeded slot i >= 1 still
    # equals its predecessor once the loop finishes. In random order, seeding
    # i + 1 before i copied a value that was then overwritten, losing a
    # duplicate.
    for i in sorted(int(pos) for pos in chosen):
        list_to_seed[i] = list_to_seed[i - 1]
    return list_to_seed
    


def generate_dataset(num_rec_master,num_rec_trans, num_seed=0):
    """Generate a customer master table and a transaction table.

    Parameters
    ----------
    num_rec_master : int
        Number of customer records to generate.
    num_rec_trans : int
        Number of transaction records to generate; each one is assigned to a
        randomly picked customer.
    num_seed : int, default 0
        Number of transaction positions handed to
        ``seed_list_with_duplicates`` to be overwritten with duplicates.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(master, transact)``, one row per generated record.
    """
    fake = Faker()
    fake.add_provider(person)
    fake.add_provider(address)
    fake.add_provider(date_time)

    print("Generating master file with ",num_rec_master," records")
    master_records = [generate_master(fake) for _ in range(num_rec_master)]
    # Build the frame in one go: DataFrame.append was removed in pandas 2.0.
    master = pd.DataFrame(master_records)
    master_len = len(master.index)
    print("Generated master file with ",master_len," records")

    print("Generating transaction file with",num_rec_trans," records")
    transaction_records = []
    for _ in range(num_rec_trans):
        # Pick the customer straight from the record list. The previous
        # master.iloc[...].to_dict('r') round trip was very slow per row and
        # the 'r' shorthand is no longer accepted by recent pandas.
        idx = int(np.random.uniform(low=0, high=master_len))
        transaction_records.append(generate_transaction(fake, master_records[idx]))
        if len(transaction_records) % 100000 == 0:
            print('Progress report...', len(transaction_records))

    print("Seeding the file with ", num_seed," duplicate instance")
    seed_list_with_duplicates(transaction_records, num_seed)
    transact = pd.DataFrame(transaction_records)
    return master, transact



In [135]:
# Smoke run: 10 customers, 1000 transactions, 1 seeded duplicate position.
master, transact = generate_dataset(num_rec_master=10, num_rec_trans=1000, num_seed=1)

Generating master file with  10  records
Generated master file with  10  records
Generating transaction file with 1000  records


In [136]:
# Preview the first rows of the generated customer table.
master.head()

Unnamed: 0,gender,title,first_name,middle_name,last_name,suffix,country,city,dob,address,customer_id,full_address,full_name
0,F,Ms.,Amy,,Hawkins,,Saint Helena,Port Claytonport,1982-10-01,711 Annette Stravenue,3843257330738,"711 Annette Stravenue, Port Claytonport, Saint...",Amy Hawkins
1,F,Mrs.,Katrina,Bryan,Reese,,Belgium,Christopherhaven,1978-03-25,726 Valerie Fort Suite 168,5551546605044,"726 Valerie Fort Suite 168, Christopherhaven, ...",Katrina Bryan Reese
2,F,Dr.,Victoria,,Clark,,Saint Pierre and Miquelon,Erinburgh,2000-04-03,99910 Burgess Station,5082514769081,"99910 Burgess Station, Erinburgh, Saint Pierre...",Victoria Clark
3,F,Dr.,Jennifer,,Ward,,United States Minor Outlying Islands,East Nicole,2004-06-06,34357 Mann Branch,9852507963880,"34357 Mann Branch, East Nicole, United States ...",Jennifer Ward
4,F,Mrs.,Brittany,,Pineda,,United Arab Emirates,East Thomas,1979-07-13,9594 Bruce Flat Suite 244,3104902308701,"9594 Bruce Flat Suite 244, East Thomas, United...",Brittany Pineda


In [137]:
# Preview the first rows of the generated transaction table.
transact.head()

Unnamed: 0,date,customer_id,quantity,unit_price,total_value
0,2018-10-03,7356014855949,90,481.95,43375.5
1,2018-06-08,3104902308701,3,184.96,554.88
2,2018-10-20,5572308599326,70,446.62,31263.4
3,2018-02-01,2618377851149,1,65.35,65.35
4,2018-11-14,5572308599326,32,491.76,15736.32


In [138]:
# Show the rows that are repeated by an identical later row: with keep='last'
# only the final occurrence of each duplicated row is left unflagged.
transact[transact.duplicated(keep='last')]

Unnamed: 0,date,customer_id,quantity,unit_price,total_value
420,2020-08-04,9932113692470,77,264.38,20357.26


In [139]:
# Small dataset (10 customers, 1000 transactions, no seeded duplicates),
# written to CSV under ./test_data.
master, transact = generate_dataset(num_rec_master=10, num_rec_trans=1000)
transact.to_csv("./test_data/transact_short.csv")
master.to_csv("./test_data/master_short.csv")

Generating master file with  10  records
Generated master file with  10  records
Generating transaction file with 1000  records


In [140]:
# Large dataset (2000 customers, 2,000,000 transactions, no seeded duplicates),
# written to CSV under ./test_data.
master, transact = generate_dataset(num_rec_master=2000, num_rec_trans=2000000)
transact.to_csv("./test_data/transact_long.csv")
master.to_csv("./test_data/master_long.csv")

Generating master file with  2000  records
Generated master file with  2000  records
Generating transaction file with 2000000  records
Progress report... 200000
Progress report... 400000
Progress report... 600000
Progress report... 800000
Progress report... 1000000
Progress report... 1200000
Progress report... 1400000
Progress report... 1600000
Progress report... 1800000
Progress report... 2000000
