Generate dummy data for fuzzy matching

In [1]:
import datetime
import math
import os
import random

import numpy as np
import pandas as pd
from faker import Faker
from faker.providers import address
from faker.providers import date_time
from faker.providers import person


This function generates random data with a DOB that doesn't have the limit of 1970.

In [23]:
def generate_master(fake):
    """Build one synthetic customer (master) record.

    Parameters
    ----------
    fake : Faker-like object
        Must provide ``prefix_male``, ``prefix_female``, ``first_name_male``,
        ``first_name_female``, ``first_name``, ``last_name``, ``suffix``,
        ``country``, ``city``, ``date_between_dates``, ``street_address`` and
        ``ean``.

    Returns
    -------
    dict
        Keys, in insertion order: ``gender``, ``title``, ``first_name``,
        ``middle_name``, ``last_name``, ``suffix``, ``country``, ``city``,
        ``dob``, ``address``, ``customer_id``, ``full_address``, ``full_name``.
        ``middle_name`` and ``suffix`` are empty strings when not generated.
    """
    d = dict()
    d['gender'] = 'M' if random.randint(0, 1) == 0 else 'F'
    is_male = d['gender'] == 'M'
    d['title'] = fake.prefix_male() if is_male else fake.prefix_female()
    d['first_name'] = fake.first_name_male() if is_male else fake.first_name_female()

    # One draw in six (randint(0, 5) == 0) produces a middle name ...
    d['middle_name'] = fake.first_name() if random.randint(0, 5) == 0 else ''
    # ... and, only when one exists, a second one-in-six draw shortens it to
    # its first letter.
    if d['middle_name'] != '' and random.randint(0, 5) == 0:
        d['middle_name'] = d['middle_name'][0]

    d['last_name'] = fake.last_name()
    # One draw in 31 (randint(0, 30) == 0) produces a suffix.
    d['suffix'] = fake.suffix() if random.randint(0, 30) == 0 else ''
    d['country'] = fake.country()
    d['city'] = fake.city()
    # Original author's note: faker has a 1970 limit; for earlier dates you
    # need a custom formula and it is trickier. Hence the 1971-01-01 start.
    d['dob'] = fake.date_between_dates(date_start=datetime.date(1971, 1, 1),
                                       date_end=datetime.date.today())
    d['address'] = fake.street_address()
    d['customer_id'] = str(fake.ean(length=13))
    d['full_address'] = d['address'] + ', ' + d['city'] + ', ' + d['country']

    # Join only the non-empty parts: concatenating with fixed ' ' separators
    # left a double space in the name whenever the middle name was missing.
    name_parts = (d['first_name'], d['middle_name'], d['last_name'], d['suffix'])
    d['full_name'] = ' '.join(part for part in name_parts if part)
    return d

def generate_transaction(fake, customer_dict):
    """Build one synthetic transaction for the given customer.

    Parameters
    ----------
    fake : Faker-like object
        Must provide ``date_between(start_date=..., end_date=...)``.
    customer_dict : dict
        Customer record; only its ``'customer_id'`` entry is read.

    Returns
    -------
    dict
        Keys, in insertion order: ``date``, ``customer_id``, ``quantity``
        (int in 1..100), ``unit_price`` (3.00..500.00) and ``total_value``
        (``quantity * unit_price``).
    """
    d = dict()
    d['date'] = fake.date_between(start_date='-3y', end_date='today')
    d['customer_id'] = customer_dict['customer_id']
    # Alternative quantity distribution left by the original author:
    #d['quantity']= math.ceil(np.random.normal(loc=1.5, scale=0.5))
    d['quantity'] = random.randint(1, 100)
    # Draw the price in whole cents and keep the integer around: multiplying
    # the already-rounded float price by the quantity can leave binary
    # floating-point noise in the total, whereas a single division of the
    # exact integer product yields the closest float to the true amount.
    price_cents = random.randint(300, 50000)
    d['unit_price'] = price_cents / 100
    d['total_value'] = d['quantity'] * price_cents / 100
    return d

def seed_list_with_duplicates(list_to_seed, number_of_seeds=0):
    """Overwrite randomly chosen entries with a copy of their predecessor.

    The list is modified in place (its length does not change) and is also
    returned. Each chosen position ``i`` (``i >= 1``) is replaced by the item
    at ``i - 1``, so the overwritten record is lost and the predecessor
    appears twice.

    Positions are drawn from ``1 .. len(list_to_seed) - 1`` and processed in
    ascending order. This guarantees that the source of every copy is never
    overwritten afterwards, so every seeded position ends up equal to its
    predecessor. Processing the positions in random order, or letting
    position 0 wrap around to the last item, could destroy a copy's source
    and yield fewer duplicates than requested.

    Parameters
    ----------
    list_to_seed : list
        Records to seed; mutated in place.
    number_of_seeds : int, default 0
        How many positions to overwrite. Values ``<= 0`` leave the list
        untouched.

    Returns
    -------
    list
        The same list object that was passed in.

    Raises
    ------
    ValueError
        If more duplicates are requested than ``len(list_to_seed) - 1``.
    """
    if number_of_seeds <= 0:
        return list_to_seed

    # Position 0 has no predecessor, so it can never be a target.
    max_seeds = len(list_to_seed) - 1
    if number_of_seeds > max_seeds:
        raise ValueError(
            "Cannot seed {} duplicates into a list of {} records "
            "(at most {} possible)".format(
                number_of_seeds, len(list_to_seed), max(max_seeds, 0)))

    # choice(max_seeds) samples from 0..max_seeds-1; +1 shifts to 1..max_seeds.
    numbers = np.sort(
        np.random.choice(max_seeds, size=number_of_seeds, replace=False) + 1)
    for i in numbers:
        list_to_seed[i] = list_to_seed[i - 1]
    print("Seeded :", len(numbers), " records")
    return list_to_seed


def generate_dataset(num_rec_master, num_rec_trans, num_seed=0, seed=None):
    """Generate a customer master table and a transaction table.

    Parameters
    ----------
    num_rec_master : int
        Number of customer records to create with ``generate_master``.
    num_rec_trans : int
        Number of transaction records to create with ``generate_transaction``;
        each one is assigned to a randomly chosen customer.
    num_seed : int, default 0
        Passed to ``seed_list_with_duplicates`` to plant duplicate
        transactions.
    seed : int, optional
        When given, seeds Python's ``random``, NumPy's global generator and
        the Faker instance before any data is drawn. The default ``None``
        leaves all generators unseeded, as before this parameter existed.
        NOTE(review): the helpers also use the current date, so output may
        still differ between days -- confirm if exact replay matters.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(master, transact)``.

    Raises
    ------
    ValueError
        If transactions are requested but no master records were generated.
    """
    fake = Faker()
    # Provider registration kept exactly as in the original notebook.
    fake.add_provider(person)
    fake.add_provider(address)
    fake.add_provider(date_time)

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        fake.seed_instance(seed)

    print("Generating master file with ",num_rec_master," records")
    master_records = [generate_master(fake) for _ in range(num_rec_master)]
    # Build the frame in one go; DataFrame.append was removed in pandas 2.0.
    master = pd.DataFrame(master_records)
    master_len = len(master.index)
    print("Generated master file with ",master_len," records")

    if num_rec_trans > 0 and master_len == 0:
        raise ValueError(
            "Cannot generate transactions without at least one master record")

    print("Generating transaction file with",num_rec_trans," records")
    record_list = []
    for _ in range(num_rec_trans):
        # Pick the customer straight from the list of generated dicts instead
        # of round-tripping every row through master.iloc[...].to_dict(),
        # which was slow and relied on the removed 'r' orient alias.
        customer_dict = master_records[np.random.randint(master_len)]
        record_list.append(generate_transaction(fake, customer_dict))
        if len(record_list) % 100000 == 0:
            print('Progress report...', len(record_list))

    print("Seeding the file with ", num_seed," duplicate instance")
    record_list = seed_list_with_duplicates(record_list, num_seed)
    transact = pd.DataFrame(record_list)
    return master, transact



In [24]:
# Small sample: 10 customers, 1000 transactions, 1 seeded duplicate.
master, transact = generate_dataset(num_rec_master=10, num_rec_trans=1000, num_seed=1)

Generating master file with  10  records
Generated master file with  10  records
Generating transaction file with 1000  records
Seeding the file with  1  duplicate instance
Seeded : 1  records


In [25]:
# Preview the first rows of the generated master (customer) table.
master.head()

Unnamed: 0,gender,title,first_name,middle_name,last_name,suffix,country,city,dob,address,customer_id,full_address,full_name
0,F,Mrs.,Susan,,Glenn,,Dominica,New Douglasmouth,1977-03-09,7711 Tracy Dam Suite 800,6017370469683,"7711 Tracy Dam Suite 800, New Douglasmouth, Do...",Susan Glenn
1,F,Dr.,Heather,,Hood,,Seychelles,Veleztown,2014-10-15,09598 Hernandez Freeway Apt. 669,6511948444015,"09598 Hernandez Freeway Apt. 669, Veleztown, S...",Heather Hood
2,M,Dr.,Matthew,,Neal,,Jamaica,West Leah,1994-11-21,1433 Christian Plaza Apt. 753,5841358133133,"1433 Christian Plaza Apt. 753, West Leah, Jamaica",Matthew Neal
3,M,Mr.,Carl,,Whitney,,Latvia,Port Olivia,2014-10-14,736 Michelle Road,3688002093649,"736 Michelle Road, Port Olivia, Latvia",Carl Whitney
4,M,Dr.,Ian,,Cruz,,Samoa,Smithbury,2005-02-17,851 Shelton Extension,7703645493659,"851 Shelton Extension, Smithbury, Samoa",Ian Cruz


In [26]:
# Preview the first rows of the generated transaction table.
transact.head()

Unnamed: 0,date,customer_id,quantity,unit_price,total_value
0,2021-02-03,3688002093649,25,482.66,12066.5
1,2021-02-03,3688002093649,7,131.96,923.72
2,2020-12-18,63608853799,99,445.15,44069.85
3,2019-05-17,5841358133133,27,8.97,242.19
4,2019-03-16,7703645493659,90,89.28,8035.2


In [29]:
# keep='last' flags every repeated row except its last occurrence, so the
# rows listed here are the earlier copies of duplicated transactions.
transact.loc[transact.duplicated(keep='last')]

Unnamed: 0,date,customer_id,quantity,unit_price,total_value
467,2020-06-27,3688002093649,70,123.1,8617.0


In [30]:

# Create the output folder first so this cell also works on a fresh checkout
# where ./test_data does not exist yet (to_csv does not create directories).
os.makedirs("./test_data", exist_ok=True)
transact.to_csv("./test_data/transact_short.csv")
master.to_csv("./test_data/master_short.csv")

In [140]:
# Uncomment the lines below to generate a large number of transactions
#master, transact=generate_dataset(2000,2000000)
#transact.to_csv("./test_data/transact_long.csv")
#master.to_csv("./test_data/master_long.csv")

Generating master file with  2000  records
Generated master file with  2000  records
Generating transaction file with 2000000  records
Progress report... 200000
Progress report... 400000
Progress report... 600000
Progress report... 800000
Progress report... 1000000
Progress report... 1200000
Progress report... 1400000
Progress report... 1600000
Progress report... 1800000
Progress report... 2000000
