# Tutorial: Create a simulated organization and expenses to run a load test

This notebook performs a load test of how the Pandas/NumPy analytics libraries behave in a notebook environment, and teaches a few things along the way.


In [1]:
# Dataset sizes: edit these two values to control how large the generated datasets will be.

number_of_individuals = 100      # intended size of the simulated organization
number_of_transactions = 1000    # intended number of simulated expense entries

## Generate simulated data

You can skip this section if you are not interested (yet) in how we create the data set. 

The short story is that we create a simulated organization with employees and managers, and then pick random people, dates and amounts to simulate expenses. The claim date is the expense date plus a random number of days (this is not how claims behave in real life, but a future version of this tutorial may model that more realistically).
The output is two in-memory "lists" whose rows are themselves lists, forming a two-dimensional table that can be referenced as list[row][column].

In [2]:
# Faker supplies the fake names, free text and random timestamps used by the
# generator functions below. It is not readily available in some hosted
# environments (e.g. Azure); if the import below fails, uncomment ONE of the
# install commands, run the cell, then comment it out again.
#!conda install faker
#!pip install faker
from faker import Faker
# Shared module-level Faker instance; the generator functions refer to it as `fake`.
fake = Faker()

The following section imports the rest of the libraries needed; apart from NumPy, they are all part of the Python standard library.
Then we declare the functions we need for generating the fake data.

In [32]:
#!Python3

from decimal import Decimal, ROUND_HALF_UP, ROUND_HALF_DOWN # use in rounding floating numbers 
import datetime 
from datetime import date
import time
import random
import numpy as np




def generate_org(n):
    """Create a simulated organization of at most ``n`` people.

    The organization is built depth-first starting from a fixed root person
    ('Steve Jobs', employee id 1, band 1). Anyone whose band is above the
    staff band (band 6) is a manager and gets a randomly sized team one band
    below them; generation stops as soon as ``n`` people exist.

    Parameters
    ----------
    n : int
        Maximum number of people to create. ``n <= 0`` yields an empty list.

    Returns
    -------
    tuple
        ``(members, namelist_header)``. ``members`` is a list of rows, each
        ``[EmployeeName, EmployeeID, ManagerID, ManagerFullname, Band,
        CostCentre, BorgID, EmployeeTitle]``; ``namelist_header`` is the list
        of those column names in the same order.

    Notes
    -----
    Uses the module-level ``fake`` (Faker) for names and ``np.random`` for
    titles and team sizes, so results differ between runs unless seeded.
    """
    # Job titles available for each band; index 0 is band 1 (most senior).
    titles = [
        ['Executive', 'Senior Director', 'Non Executive Dr', 'Chairman'],
        ['Director'],
        ['Department Head'],
        ['Manager'],
        ['Team lead', 'Supervisor', 'PA'],
        ['Senior staff', 'Staff', 'Junior Staff', 'Assistant', 'Intern', 'Temp'],
    ]
    namelist_header = ['EmployeeName', 'EmployeeID', 'ManagerID', 'ManagerFullname',
                       'Band', 'CostCentre', 'BorgID', 'EmployeeTitle']

    def create_team(emp_id, fullname, manager_id, manager_name, members,
                    max_team_size, org_size, band, staff_band, ccentre, borg_id):
        """Recursively add one person and, if they are a manager, their team.

        ``members`` is mutated in place and also returned.
        """
        if len(members) >= org_size:
            return members

        if fullname == '':
            fullname = fake.first_name() + ' ' + fake.last_name()
        if manager_name == '':
            # The root has no manager: they are recorded as their own manager.
            manager_name = fullname

        title = np.random.choice(titles[band - 1])

        is_manager = band < staff_band
        if is_manager:
            # Managers get their own cost centre; staff inherit their manager's.
            ccentre = str(ccentre) + "." + str(emp_id)
        # BorgID is the dotted path of employee ids from the root to this person.
        borg_id = str(borg_id) + "." + str(emp_id)
        members.append([fullname, emp_id, manager_id, manager_name, band, ccentre, borg_id, title])

        if is_manager:
            if band == 1:
                team_size = max(4, int(np.random.normal(max_team_size, 1, 1)[0]))
            else:
                team_size = max(0, int(np.random.normal(max_team_size, 1.5, 1)[0]))
            # NOTE(review): range(1, team_size) creates team_size - 1 direct
            # reports; kept as in the original to preserve the generated shape.
            for _ in range(1, team_size):
                if len(members) >= org_size:
                    break
                create_team(len(members) + 1, '', emp_id, fullname, members,
                            max_team_size, org_size, band + 1, staff_band, ccentre, borg_id)
        return members

    members = create_team(1, 'Steve Jobs', 1, '', [], 10, n, 1, 6, 1, 1)

    # The original printed len(namelist), an undefined name, and raised NameError.
    print("Created an organization of " + str(len(members)) + " individuals")

    return members, namelist_header
    

def generate_desc():
    """Return a random text snippet from the shared ``fake`` instance.

    The text is requested with ``max_nb_chars=200`` and is used as the
    free-text description of a simulated expense.
    """
    description = fake.text(max_nb_chars=200)
    return description

#The sanitise function below is not used yet. It will be handy if the name or description fields contain unicode or non-printable characters,
#for example if we want to export a selection of expenses to a file whose name includes the manager's name
def sanitise(s):
    """Return ``s`` reduced to characters that are safe in a file name.

    Accented letters are first transliterated to their ASCII base letter
    (e.g. 'é' -> 'e'); every remaining character that is not an ASCII letter,
    a digit, a space or one of ``-_.()`` is then dropped.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    import unicodedata
    import string
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    cleaned = s
    # The original test `type(s)=='unicode'` compared a type with a string and
    # was always False, so this normalisation never ran.
    if isinstance(s, str):
        # NFKD splits an accented letter into base letter + combining mark;
        # the ASCII encode drops the marks, and decode gives us a str again
        # (iterating over bytes would yield ints and break the filter below).
        cleaned = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')

    return ''.join(c for c in cleaned if c in valid_chars)

def generate_date(start,end):
    """Return a random date between ``start`` and ``end``, both inclusive.

    Parameters
    ----------
    start, end : datetime.date
        Bounds of the range; ``end`` must not be earlier than ``start``.

    Returns
    -------
    datetime.date
        ``start`` plus a whole number of days drawn uniformly from
        ``0 .. (end - start).days``.

    Raises
    ------
    ValueError
        From ``random.randint`` when ``end`` is earlier than ``start``.
    """
    nbdays = (end - start).days
    offset = random.randint(0, nbdays)
    # The original used `dt.timedelta`, but the module is imported as `datetime`.
    return start + datetime.timedelta(days=offset)

    
def generate_line(namelist,yyyy):
    """Build one simulated expense row for a randomly chosen person.

    Parameters
    ----------
    namelist : list
        Organization rows; columns 0, 1, 2, 5 and 7 are read as name,
        employee id, manager id, cost centre and title.
    yyyy : int
        Year passed to Faker as the 1 January - 31 December bounds for the
        expense timestamp.

    Returns
    -------
    list
        ``[employee id, name, expense date 'dd-mm-YYYY', amount (str),
        claim timestamp 'dd-mm-YYYY HH:MM:SS', cost centre, title,
        approver id, description]``.
    """
    employee = namelist[random.randint(0, len(namelist) - 1)]
    name, emp_id, manager_id = employee[0], employee[1], employee[2]
    cost_centre, job_title = employee[5], employee[7]

    year_start, year_end = date(yyyy, 1, 1), date(yyyy, 12, 31)
    description = generate_desc()

    # Expense timestamp, then a claim made N(30, 8) days later (never before the expense).
    expense_dt = fake.date_time_between(start_date=year_start, end_date=year_end)
    claim_delay_days = max(0, np.random.normal(30, 8, 1)[0])
    claim_dt = expense_dt + datetime.timedelta(days=claim_delay_days)

    # Amount: |N(20, 8)| rounded half-up to two decimals
    # (quantize only uses the exponent of Decimal('1.11')).
    raw_amount = abs(np.random.normal(20, 8, 1)[0])
    amount = Decimal(str(raw_amount)).quantize(Decimal('1.11'), rounding=ROUND_HALF_UP)

    return [
        emp_id,
        name,
        expense_dt.strftime("%d-%m-%Y"),
        str(amount),
        claim_dt.strftime("%d-%m-%Y %H:%M:%S"),
        cost_centre,
        job_title,
        manager_id,
        description,
    ]

def export_csv(filename,list, header):
    """Write ``header`` followed by every row of ``list`` to a tab-separated file.

    Parameters
    ----------
    filename : str
        Path of the file to create; an existing file is overwritten.
    list : iterable of sequences
        Data rows. (The parameter name shadows the built-in ``list``; it is
        kept so that existing keyword callers keep working.)
    header : sequence
        Column names, written as the first row.

    Notes
    -----
    Uses the ``excel-tab`` dialect with ``QUOTE_NONNUMERIC``: strings are
    wrapped in double quotes, numbers are written bare.
    """
    import csv
    # newline='' is required by the csv module; without it the writer's own
    # '\r\n' terminator is translated again on Windows, producing '\r\r\n'.
    with open(filename, 'w', newline='') as csvfile:
        outputwriter = csv.writer(csvfile, quotechar='"', quoting=csv.QUOTE_NONNUMERIC, dialect='excel-tab')
        outputwriter.writerow(header)
        outputwriter.writerows(list)


In [33]:

#We generate the simulated expense claims
def generate_simulated_expenses(number_of_transactions, namelist):
    """Generate simulated expense claims for 2016 as a Pandas DataFrame.

    Parameters
    ----------
    number_of_transactions : int
        How many expense rows to generate.
    namelist : list
        Organization rows handed to ``generate_line`` to pick a person from.

    Returns
    -------
    pandas.DataFrame
        One row per expense with the columns listed in
        ``simulated_expenses_header``.

    Notes
    -----
    Alternatives to returning a DataFrame: return only the list of rows,
    return the ``(rows, header)`` pair, or return one combined list with the
    header as its first entry.
    """
    simulated_expenses_header = [
        'EmployeeID', 'EmployeeName', 'DateExpense', 'Amount', 'DateClaim',
        'CostCentre', 'EmployeeTitle', 'ApproverID', 'Description',
    ]

    simulated_expenses = []
    remaining = number_of_transactions
    # Count down instead of up; one row is produced per whole unit remaining.
    while remaining >= 1:
        simulated_expenses.append(generate_line(namelist, 2016))
        remaining -= 1

    print("Created a simulated run of {0} expenses entries".format(str(len(simulated_expenses))))
    return pd.DataFrame(data=simulated_expenses, columns=simulated_expenses_header)



## --- END SKIP---

# We create a Pandas dataframe and perform some merges
This section tests the speed of Pandas. Pandas sits on top of NumPy and is slightly slower, but it is easier to use.

In [30]:
import pandas as pd

In [None]:

# Build the simulated organization first: it provides the employee rows and
# their column names. (The original cell used `namelist`/`namelist_header`
# without ever creating them, and called an undefined `generate_expenses()`.)
namelist, namelist_header = generate_org(number_of_individuals)

# df1: one row per simulated expense claim; df2: one row per person in the organization.
df1 = generate_simulated_expenses(number_of_transactions, namelist)
df2 = pd.DataFrame(data=namelist, columns=namelist_header)

In [None]:
#Working call to export the org chart, commented out by default
#export_csv('dummy_org.csv',namelist,namelist_header)
#!ls -lh

In [13]:
# Name on the first expense row, then that whole row as a Series.
print(df1['EmployeeName'][0])
print(df1.iloc[0])
# Boolean-mask filters: expenses booked to cost centre '1.1.3' (may be empty),
# and the organization record of employee 6.
print(df1.loc[df1['CostCentre'] == '1.1.3'])
print(df2.loc[df2['EmployeeID'] == 6])


James Leach
EmployeeID                                                      94
EmployeeName                                           James Leach
DateExpense                                             09-11-2016
Amount                                                        9.02
DateClaim                                      09-12-2016 17:00:56
CostCentre                                           1.1.2.3.78.89
EmployeeTitle                                                 Temp
ApproverID                                                      89
Description      Step place which. Win buy manage. Whose a much...
Name: 0, dtype: object
Empty DataFrame
Columns: [EmployeeID, EmployeeName, DateExpense, Amount, DateClaim, CostCentre, EmployeeTitle, ApproverID, Description]
Index: []
  EmployeeName  EmployeeID  ManagerID ManagerFullname  Band   CostCentre  \
5   John Dixon           6          5     Robert Kemp     6  1.1.2.3.4.5   

          BorgID EmployeeTitle  
5  1.1.2.3.4.5.6        Intern

In [14]:
# Second organization row (position 1) as a Series.
print(df2.iloc[1])
# Index labels 0 through 5 (label slices are inclusive), restricted to the identifying columns.
print(df2.loc[
    0:5,
    ['EmployeeID', 'EmployeeName', 'EmployeeTitle', 'BorgID', 'Band'],
])

EmployeeName        Ruben Lee
EmployeeID                  2
ManagerID                   1
ManagerFullname    Steve Jobs
Band                        2
CostCentre              1.1.2
BorgID                  1.1.2
EmployeeTitle        Director
Name: 1, dtype: object
   EmployeeID  EmployeeName    EmployeeTitle         BorgID  Band
0           1    Steve Jobs         Chairman            1.1     1
1           2     Ruben Lee         Director          1.1.2     2
2           3  Kelsey Evans  Department Head        1.1.2.3     3
3           4   Carly Perez          Manager      1.1.2.3.4     4
4           5   Robert Kemp        Team lead    1.1.2.3.4.5     5
5           6    John Dixon           Intern  1.1.2.3.4.5.6     6


In [15]:
# Left-join every expense to the claimant's organization record to pick up
# BorgID and Band. Only the join key is shared between the two frames, so the
# empty suffixes never have to disambiguate a column.
df3 = df1.merge(
    df2[['EmployeeID', 'BorgID', 'Band']],
    how='left',
    on='EmployeeID',
    suffixes=('', ''),
)
print(df3.iloc[0])

EmployeeID                                                      94
EmployeeName                                           James Leach
DateExpense                                             09-11-2016
Amount                                                        9.02
DateClaim                                      09-12-2016 17:00:56
CostCentre                                           1.1.2.3.78.89
EmployeeTitle                                                 Temp
ApproverID                                                      89
Description      Step place which. Win buy manage. Whose a much...
BorgID                                            1.1.2.3.78.89.94
Band                                                             6
Name: 0, dtype: object


In [21]:
# Left-join again, this time matching the expense's ApproverID to the
# organization's EmployeeID; the approver's columns that clash with existing
# ones get the '_approver' suffix.
df4 = df3.merge(
    df2[['EmployeeID', 'EmployeeName', 'EmployeeTitle', 'BorgID', 'Band']],
    how='left',
    left_on='ApproverID',
    right_on='EmployeeID',
    suffixes=('', '_approver'),
)
print(df4.iloc[0])

EmployeeID                                                               65
EmployeeName                                                  Amanda Peters
DateExpense                                                      01-10-2016
Amount                                                                27.44
DateClaim                                               01-11-2016 05:18:07
CostCentre                                                     1.1.2.3.4.57
EmployeeTitle                                                  Junior Staff
ApproverID                                                               57
Description               Somebody purpose item economic beyond board. C...
BorgID                                                      1.1.2.3.4.57.65
Band                                                                      6
EmployeeID_approver                                                      57
EmployeeName_approver                                       Heather Wiggins
EmployeeTitl

In [22]:
# Look up the organization records of employees 58 and 55.
print(df2.loc[df2['EmployeeID'] == 58])
print(df2.loc[df2['EmployeeID'] == 55])


   EmployeeName  EmployeeID  ManagerID  ManagerFullname  Band    CostCentre  \
57  Debbie Chan          58         57  Heather Wiggins     6  1.1.2.3.4.57   

             BorgID EmployeeTitle  
57  1.1.2.3.4.57.58          Temp  
   EmployeeName  EmployeeID  ManagerID ManagerFullname  Band    CostCentre  \
54   Tammy Hall          55         50   Taylor Molina     6  1.1.2.3.4.50   

             BorgID EmployeeTitle  
54  1.1.2.3.4.50.55  Junior Staff  
