## Create simulated organisation and transactions

In [7]:
from decimal import Decimal, ROUND_HALF_UP, ROUND_HALF_DOWN # use in rounding floating numbers 
import datetime 
from datetime import date
import time
import random
import pandas as pd
import numpy as np
#!conda install faker
#!pip install faker
from faker import Faker

In [8]:
import pathlib

# Create the working folders up front; nothing happens if they already exist.
for _folder in ('./test_data', './output'):
    pathlib.Path(_folder).mkdir(parents=True, exist_ok=True)




In short, we create a simulated organization of employees and managers. Names are generated with the Faker library, and job titles are picked at random from a predefined list for each band.

Then we pick random people, dates and amounts to simulate expense claims. The claim date is the expense date plus a random number of days (note that this is not how claims would behave in real life).

TODO: Seed the generated data with a configurable number of issues.


In [9]:
# Update these variables to configure how large the generated datasets will be
number_of_transactions = 1000    # number of expense claims to simulate
number_of_individuals = 200000   # upper bound: add_member stops adding people once this many exist

# Job titles available per band. titles[band-1] holds the titles of a band,
# i.e. band 1 is the top band and band 6 is staff.
titles = [['Executive', 'Senior Director', 'Chairman'],
          ['Director'],
          ['Department Head', "Senior Manager"],
          ['Manager'],
          ['Team lead', 'Supervisor'],
          ['Senior staff', 'Staff', 'Junior Staff', 'Assistant', 'Intern', 'Temp']]

# The generators draw from three independent random sources (random, numpy and
# Faker). Seed all of them so that Restart & Run All reproduces the same
# organisation and the same expense claims. Change random_seed for a new dataset.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
Faker.seed(random_seed)

fake = Faker()

In [10]:
def add_member(id,fullname,manager_id, manager_fullname,titles,list,avgtsize,maxtsize,osize,band,staff_band, ccentre,manager_borg_id):
    """Append one person to the organisation and recursively build the team below them.

    Each person is stored in ``list`` as a row
    ``[name, id, manager_id, manager_name, band, cost_centre, borg_id, title, is_pa]``.

    Parameters:
        id: identifier given to the new person.
        fullname: name of the person; an empty string requests a Faker-generated name.
        manager_id: identifier of the person's manager.
        manager_fullname: name of the manager; an empty string makes the person
            their own manager name (used for the root of the organisation).
        titles: list of title lists, indexed by ``band-1``.
        list: rows created so far; it is extended in place.
        avgtsize: mean of the normal distribution used to draw a team size.
        maxtsize: cap on the team size for every band except band 1.
        osize: maximum number of rows; nothing is added once it is reached.
        band: band of the new person (1 is the top band).
        staff_band: band from which people no longer have a team.
        ccentre: cost centre inherited from the manager.
        manager_borg_id: borg_id of the manager, used as prefix of the new borg_id.

    Returns:
        The same ``list`` object, including every row that was added.
    """
    # Organisation is already full: leave the rows untouched.
    if len(list) >= osize:
        return list

    if fullname == '':
        fullname = fake.first_name() + ' ' + fake.last_name()
    if manager_fullname == '':
        manager_fullname = fullname
    title = np.random.choice(titles[band - 1])

    has_team = band < staff_band
    if has_team:
        # People above the staff band open their own cost centre below the inherited one.
        ccentre = ".".join((str(ccentre), str(id)))
    borg_id = ".".join((str(manager_borg_id), str(id)))
    list.append([fullname, id, manager_id, manager_fullname, band, ccentre, borg_id, title, False])

    if not has_team:
        return list

    # Draw the number of direct reports.
    if band == 1:
        # big boss: we force a minimum executive team of 7
        tsize = max(7, int(random.normalvariate(avgtsize, 2)))
    else:
        tsize = min(maxtsize, max(0, int(random.normalvariate(avgtsize, 3))))

    # People in bands 1 to 3 get a personal assistant in roughly one case out of five.
    # The assistant row reuses the boss's cost centre and borg_id and is flagged is_pa=True.
    if band <= 3 and random.random() > .8 and len(list) < osize:
        assistant_name = fake.first_name() + ' ' + fake.last_name()
        assistant_title = np.random.choice(["PA", "Secretary", "Assistant"])
        assistant_band = np.random.choice([4, 5, 6])
        list.append([assistant_name, len(list) + 1, id, fullname, assistant_band, ccentre, borg_id, assistant_title, True])

    # Recurse once per direct report; each new id is the next free row number.
    boss_id = id
    for _ in range(tsize):
        if len(list) >= osize:
            break
        list = add_member(len(list) + 1, '', boss_id, fullname, titles, list, avgtsize, maxtsize, osize, band + 1, staff_band, ccentre, borg_id)
    return list



In [11]:
# Grow the whole organisation from its root (id 1, band 1). Staff level is band 6,
# team sizes are drawn around 6 with a cap of 12, and the head count is limited
# by number_of_individuals.
list = add_member(
    1, 'Steve Jobs', 1, '', titles, [],
    6, 12, number_of_individuals,
    1, 6, 1, 1,
)
df = pd.DataFrame(
    list,
    columns=[
        'employee_name', 'employee_id', 'manager_id', 'manager_name', 'band',
        'cost_centre', 'borg_id', 'employee_title', 'is_pa',
    ],
)
print(df.shape)
# Show everybody whose manager is the root of the organisation
df.loc[df['manager_name'] == "Steve Jobs"]

(9553, 9)


Unnamed: 0,employee_name,employee_id,manager_id,manager_name,band,cost_centre,borg_id,employee_title,is_pa
0,Steve Jobs,1,1,Steve Jobs,1,1.1,1.1,Senior Director,False
1,Sarah Morgan,2,1,Steve Jobs,2,1.1.2,1.1.2,Director,False
1813,Derek Smith,1814,1,Steve Jobs,2,1.1.1814,1.1.1814,Director,False
2333,Nathan Sanchez,2334,1,Steve Jobs,2,1.1.2334,1.1.2334,Director,False
4408,Brian Miller,4409,1,Steve Jobs,2,1.1.4409,1.1.4409,Director,False
5448,Aaron Odom,5449,1,Steve Jobs,2,1.1.5449,1.1.5449,Director,False
6089,Tina Smith,6090,1,Steve Jobs,2,1.1.6090,1.1.6090,Director,False
7512,Beth Ward,7513,1,Steve Jobs,2,1.1.7513,1.1.7513,Director,False


In [12]:
# Spot check: name of the employee stored at row position 44
df['employee_name'].iloc[44]

'Samuel Singh'

In [13]:
#We generate the simulated expense claims
def generate_line(df,yyyy):
    """Build one simulated expense claim for a randomly chosen employee.

    Parameters:
        df: organisation table; the columns employee_name, employee_id,
            manager_id, cost_centre and employee_title are read from it.
        yyyy: year in which the expense date is drawn.

    Returns:
        A list ``[employee_id, employee_name, expense date, amount, claim date,
        cost_centre, employee_title, manager_id, description]``. The dates and
        the amount are returned as strings.
    """
    # Pick the claimant: one uniformly random row of the organisation table.
    claimant = df.iloc[random.randint(0, len(df) - 1)]

    year_start = date(yyyy, 1, 1)
    year_end = date(yyyy, 12, 31)
    description = fake.text(max_nb_chars=200)

    # Expense date (formatted as day only) somewhere inside the requested year.
    expense_dt = fake.date_time_between(start_date=year_start, end_date=year_end)
    expense_str = expense_dt.strftime("%d-%m-%Y")

    # The claim follows the expense after a normally distributed delay
    # (mean 30 days, sd 8), never before the expense itself.
    delay_days = max(0, random.normalvariate(30, 8))
    claim_dt = expense_dt + datetime.timedelta(days=delay_days)
    claim_str = claim_dt.strftime("%d-%m-%Y %H:%M:%S")

    # Amount drawn around 60 (sd 20) and rounded half-up to two decimals; only the
    # exponent of Decimal('1.11') matters to quantize.
    # NOTE: the draw is not clamped, so a negative amount is possible.
    raw_amount = random.normalvariate(60, 20)
    amount_str = str(Decimal(str(raw_amount)).quantize(Decimal('1.11'), rounding=ROUND_HALF_UP))

    return [
        claimant['employee_id'],
        claimant['employee_name'],
        expense_str,
        amount_str,
        claim_str,
        claimant['cost_centre'],
        claimant['employee_title'],
        claimant['manager_id'],
        description,
    ]

def generate_expenses(number_of_transactions, df_org, year=2020):
    """Create a table of simulated expense claims.

    Parameters:
        number_of_transactions: how many claims to generate; zero or a negative
            value yields an empty table.
        df_org: organisation table passed on to generate_line for every claim.
        year: year in which the expense dates are drawn (defaults to 2020, the
            value that used to be hard-coded).

    Returns:
        A DataFrame with one row per claim and the columns employee_id,
        employee_name, date_expense, amount, date_claim, cost_centre,
        employee_title, approver_id and description.
    """
    simulated_expenses_header=['employee_id','employee_name','date_expense','amount','date_claim','cost_centre','employee_title','approver_id','description']
    # int() keeps the former counter-loop behaviour for non-integer counts (2.5 -> 2 lines)
    simulated_expenses=[generate_line(df_org,year) for _ in range(int(number_of_transactions))]
    print ("Created a simulated run of " + str(len(simulated_expenses)) + " expenses entries")
    return pd.DataFrame(data=simulated_expenses,columns=simulated_expenses_header)

In [14]:
# Use the size chosen in the configuration cell instead of a hard-coded count
df1 = generate_expenses(number_of_transactions, df)
print(df1.shape)
df1.head()

Created a simulated run of 1000 expenses entries
(1000, 9)


Unnamed: 0,employee_id,employee_name,date_expense,amount,date_claim,cost_centre,employee_title,approver_id,description
0,6195,Theresa Nicholson,19-06-2020,19.86,26-07-2020 14:21:02,1.1.6090.6091.6134.6190,Senior staff,6190,Character however spend treat worker friend to...
1,3110,Devin Larsen,09-06-2020,50.33,09-07-2020 01:13:04,1.1.2334.3057.3058.3101,Assistant,3101,Rise suggest major tonight center. Process mom...
2,1041,Karen Wade,08-10-2020,51.38,27-10-2020 22:19:53,1.1.2.988.1040.1041,Supervisor,1040,House doctor room. He road second push because...
3,1040,Kimberly Le,14-12-2020,18.84,24-01-2021 14:30:22,1.1.2.988.1040,Manager,988,Democratic control nor put special yes house m...
4,4677,Michelle Gregory,10-11-2020,68.4,30-11-2020 18:43:47,1.1.4409.4515.4642.4672,Staff,4672,Could hundred few drop. Individual force dark ...


In [15]:
# Add the claimant's borg_id, band and is_pa flag to every claim.
# The left join keeps every row of df1.
df2 = df1.merge(
    df[['employee_id', 'borg_id', 'band', 'is_pa']],
    how='left',
    on='employee_id',
    suffixes=('', ''),
)
print(df2.iloc[0])

employee_id                                                    6195
employee_name                                     Theresa Nicholson
date_expense                                             19-06-2020
amount                                                        19.86
date_claim                                      26-07-2020 14:21:02
cost_centre                                 1.1.6090.6091.6134.6190
employee_title                                         Senior staff
approver_id                                                    6190
description       Character however spend treat worker friend to...
borg_id                                1.1.6090.6091.6134.6190.6195
band                                                              6
is_pa                                                         False
Name: 0, dtype: object


In [16]:
# Look up the approver (approver_id -> employee_id) and add their details;
# overlapping column names coming from the approver get the suffix '_approver'.
df3 = df2.merge(
    df[['employee_id', 'employee_name', 'employee_title', 'borg_id', 'band']],
    how='left',
    left_on='approver_id',
    right_on='employee_id',
    suffixes=('', '_approver'),
)
print(df3.iloc[0])

employee_id                                                             6195
employee_name                                              Theresa Nicholson
date_expense                                                      19-06-2020
amount                                                                 19.86
date_claim                                               26-07-2020 14:21:02
cost_centre                                          1.1.6090.6091.6134.6190
employee_title                                                  Senior staff
approver_id                                                             6190
description                Character however spend treat worker friend to...
borg_id                                         1.1.6090.6091.6134.6190.6195
band                                                                       6
is_pa                                                                  False
employee_id_approver                                                    6190

In [17]:
# Save the organisation table as an Excel workbook in ./test_data
df.to_excel(excel_writer='./test_data/org.xlsx')
