In [10]:
import pandas as pd
import random
from faker import Faker
from faker.providers import person
from faker.providers import address
from faker.providers import date_time
import datetime
import numpy as np


In [1]:
import pathlib

# Make sure the working folders used by this notebook exist.
# parents=True creates missing ancestors; exist_ok=True makes re-running the cell harmless.
for folder in ('./input', './output'):
    pathlib.Path(folder).mkdir(parents=True, exist_ok=True)

This function generates a random date of birth that is not restricted to 1970 or later, so dates before 1970 can be produced as well.

In [4]:
def generate_dob(start, end, n=1):
    """Return one random date (time set to midnight) between ``start`` and ``end``.

    Parameters
    ----------
    start, end : pandas.Timestamp
        Bounds of the sampling window. Their ``.value`` attribute
        (nanoseconds) is floor-divided down to whole seconds, and a second is
        drawn uniformly from ``[start, end)``.
    n : int, default 1
        Number of random instants drawn. Only the FIRST one is returned, so
        values other than 1 do not change the shape of the result.

    Returns
    -------
    pandas.Timestamp
        The first drawn instant, normalized to 00:00:00.
    """
    ns_per_second = 10**9

    # Work in whole seconds: floor division drops the sub-second part.
    lower_s = start.value // ns_per_second
    upper_s = end.value // ns_per_second

    # Explicit int64 so the full range of second values fits regardless of
    # the platform's default integer width.
    drawn_seconds = np.random.randint(lower_s, upper_s, n, dtype=np.int64)

    # Scale back to nanoseconds and reinterpret the integers as datetime64[ns].
    as_datetimes = (ns_per_second * drawn_seconds).view('M8[ns]')
    midnight_index = pd.DatetimeIndex(as_datetimes).normalize()
    return midnight_index[0]


In [5]:
# Sampling window for the demo: 1 Jan 1940 up to the current moment.
start, end = pd.to_datetime('1940-1-1'), pd.to_datetime('today')

# Raw nanosecond value of `start`, the same value in whole seconds, and the
# raw nanosecond value of `end` (one per line).
print(start.value, start.value // 10**9, end.value, sep='\n')

# Round trip: whole seconds scaled back to nanoseconds gives the original date.
print(pd.to_datetime(10**9 * (start.value // 10**9)))

generate_dob(start, end)

-946771200000000000
-946771200
1602919175067064000
1940-01-01 00:00:00


Timestamp('2006-04-11 00:00:00')

In [12]:
def generate_record(fake):
    """Build one synthetic person record.

    Parameters
    ----------
    fake : object
        Data generator exposing ``prefix_male``, ``prefix_female``,
        ``first_name_male``, ``first_name_female``, ``first_name``,
        ``last_name``, ``suffix``, ``country``, ``city``, ``street_address``
        and ``ean`` (a ``Faker`` instance in this notebook).

    Returns
    -------
    dict
        Keys, in insertion order: ``gender``, ``title``, ``first_name``,
        ``middle_name``, ``last_name``, ``suffix``, ``country``, ``city``,
        ``dob``, ``address``, ``member_id``, ``full_address``, ``full_name``.

    Notes
    -----
    ``full_name`` is written as "First Middle Last Suffix" in 9 of 10 cases
    and as "Last, First Middle, Suffix" otherwise. Empty parts are skipped,
    so the surname-first form never contains a stray " ," when the middle
    name is missing.
    """
    d = dict()
    d['gender'] = 'M' if random.randint(0, 1) == 0 else 'F'
    is_male = d['gender'] == 'M'
    d['title'] = fake.prefix_male() if is_male else fake.prefix_female()
    d['first_name'] = fake.first_name_male() if is_male else fake.first_name_female()

    # About 1 record in 6 gets a middle name ...
    d['middle_name'] = fake.first_name() if random.randint(0, 5) == 0 else ''
    # ... and about 1 in 6 of those is reduced to its initial.
    if d['middle_name'] != '' and random.randint(0, 5) == 0:
        d['middle_name'] = d['middle_name'][0]

    d['last_name'] = fake.last_name()
    # Suffixes are rare: 1 record in 31.
    d['suffix'] = fake.suffix() if random.randint(0, 30) == 0 else ''
    d['country'] = fake.country()
    d['city'] = fake.city()

    # Original author's note: the Faker date helpers have a 1970 lower limit,
    # so dates of birth come from the notebook's own generate_dob instead.
    start = pd.to_datetime('1940-1-1')
    end = pd.to_datetime('today')
    d['dob'] = generate_dob(start, end)

    d['address'] = fake.street_address()
    d['member_id'] = str(fake.ean(length=13))
    d['full_address'] = d['address'] + ', ' + d['city'] + ', ' + d['country']

    if random.randint(0, 9) > 0:
        # Natural order; surplus blanks from empty parts are collapsed below.
        full_name = d['first_name'] + ' ' + d['middle_name'] + ' ' + d['last_name'] + ' ' + d['suffix']
    else:
        # Surname first. Join only the non-empty given names so that an empty
        # middle name cannot leave a space in front of the suffix comma.
        given_names = ' '.join(part for part in (d['first_name'], d['middle_name']) if part)
        full_name = d['last_name'] + ', ' + given_names
        if d['suffix'] != '':
            full_name += ', ' + d['suffix']

    # Collapse any run of whitespace into a single blank and trim the ends.
    d['full_name'] = " ".join(full_name.split())
    return d


def generate_dataset(number_of_records):
    """Generate a DataFrame of synthetic person records.

    Parameters
    ----------
    number_of_records : int
        How many rows to generate. Zero yields an empty DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by ``generate_record``, with a default
        0..n-1 integer index and columns in the records' key order.
    """
    fake = Faker()
    fake.add_provider(person)
    fake.add_provider(address)
    fake.add_provider(date_time)

    record_list = [generate_record(fake) for _ in range(number_of_records)]

    # Build the frame in one step from the list of dicts. DataFrame.append was
    # removed in pandas 2.0, so the previous df.append(...) call fails there.
    return pd.DataFrame(record_list)



In [13]:
# Small sample: 500 synthetic records, previewing the first five rows.
df = generate_dataset(number_of_records=500)
df.head(5)

Unnamed: 0,gender,title,first_name,middle_name,last_name,suffix,country,city,dob,address,member_id,full_address,full_name
0,F,Mrs.,Nichole,,Howell,MD,Niue,West Baileyhaven,1995-05-30,20671 Sims Tunnel Apt. 988,9521648372256,"20671 Sims Tunnel Apt. 988, West Baileyhaven, ...",Nichole Howell MD
1,F,Dr.,Suzanne,,Braun,,Lesotho,North Henrybury,2002-07-30,796 Willie Throughway Apt. 137,6834313056423,"796 Willie Throughway Apt. 137, North Henrybur...",Suzanne Braun
2,M,Dr.,Eduardo,,Wang,,Singapore,East Roger,1943-01-18,18777 Jacqueline Flat,4631084451325,"18777 Jacqueline Flat, East Roger, Singapore",Eduardo Wang
3,F,Mrs.,Stephanie,,Ruiz,,Slovakia (Slovak Republic),North Angela,1971-07-25,546 Joseph Dale,5327759230393,"546 Joseph Dale, North Angela, Slovakia (Slova...",Stephanie Ruiz
4,F,Mrs.,Katherine,,Cisneros,,Comoros,Julianmouth,1996-03-13,64233 Diane Passage,8319126373779,"64233 Diane Passage, Julianmouth, Comoros","Cisneros, Katherine"


In [37]:
# Save the current frame to the input folder as a one-sheet workbook.
df.to_excel(excel_writer="./input/data_short.xlsx", sheet_name="Data")

In [38]:
# Larger sample: rebinds `df` to a fresh frame of 2000 synthetic records.
df = generate_dataset(number_of_records=2000)

In [39]:
# Save the current frame to the input folder as a one-sheet workbook.
df.to_excel(excel_writer="./input/data_long.xlsx", sheet_name="Data")