# Generate a Fake bunch of "Workers"

In [1]:
from faker import Faker
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
from dateutil.relativedelta import relativedelta
import random

___

### Connect to the Database

In [2]:
dbconn = sqlite3.connect('FakeHR.db') # permanent database
print(dbconn) # Check the connection to the database.

<sqlite3.Connection object at 0x7fe1358fa040>


In [3]:
cursor = dbconn.cursor()
cursor

<sqlite3.Cursor at 0x7fe1359a1d40>

___

### Working with the FAKER

In [4]:
# initialize a generator
fake = Faker()

___

### Wanted realistic On Board (OBD) & Date of Birth (DOB).

(Source for Below)[https://www.tonic.ai/blog/how-to-generate-simple-test-data-with-faker]

In [52]:
def birth_and_start_date(x):
   sd = fake.date_between(start_date="-30y", end_date="now")
   delta = timedelta(days=365*random.randint(22,52))
   bd = sd-delta

   return {'DOB':bd, 'OBD': sd}

In [53]:
d = dict()
d['birth_and_start_date'] = birth_and_start_date

In [None]:
# will error if def birth_and_start_date(x): has the 'x' in the parens
# [d[k]() for k in d.keys()] 

In [None]:
# df = pd.DataFrame(np.random.randint(100,size=(1000, 3)),columns=['A','B','C'])
# df[['DOB', 'OBD']] = df.apply(birth_and_start_date, axis=1, result_type ='expand')

In [None]:
# d['first_name'] = lambda: {'first_name':fake.first_name_male()}
# d['middle_name'] = lambda: {'middle_name':fake.first_name_male()}
# d['last_name'] = lambda: {'last_name':fake.last_name()}
# d['ssn'] = lambda: {'ssn':fake.ssn()}
# d['birth_and_start_date'] = birth_and_start_date

In [None]:
# for _ in range(5):
#    r=[d[k]() for k in d.keys()]
#    print(r)

In [None]:
# for _ in range(5):
#    deep_list = [list(d[k]().values()) for k in d.keys()]
#    row = [item for sublist in deep_list for item in sublist]
#    print(row)

___

### Creating the list of MALE "workers"

In [54]:
fake_gents = [
{'MALE_FN': fake.first_name_male(), 'MALE_MN': fake.first_name_male(), 'MALE_LN': fake.last_name_male(),
# 'DOB':fake.date_between(start_date='-65y', end_date='-18y'),
# 'OBD':fake.date_between(start_date='-25y', end_date='-1M'),
'SSN':fake.ssn()}
for _ in range(500)]

## CONVERT A LIST TO A DATAFRAME

In [55]:
df = pd.DataFrame(fake_gents)

In [None]:
df.dtypes

(Multiple column help)[https://www.pauldesalvo.com/how-to-return-multiple-columns-from-pandas-using-the-apply-function/]

In [56]:
df[['DOB', 'OBD']] = df.apply(birth_and_start_date, axis=1, result_type ='expand')

### Join (concat) the names and stip the middle name to just the middle initial

In [57]:
df['FULL_NAME'] = df['MALE_LN'] + (', ' + df['MALE_FN'])  + (' '+ df['MALE_MN'].str[0])


In [58]:
df['EMAIL'] = df['MALE_FN'].str[0] + (df['MALE_LN'] + ('@FEDERAL.GOV'))
df['EMAIL'] = df['EMAIL'].str.lower()

In [59]:
df['SEX'] = ('M')

In [60]:
df.sort_values(by=['OBD'], inplace=True)

In [61]:
df['OBD']= pd.to_datetime(df['OBD']) # This line converts the string 'date' to a real datetime
df['OBD']= df['OBD'].dt.date # This line strips the time data off and leave just a date
df['DOB']= pd.to_datetime(df['DOB']) # This line converts the string 'date' to a real datetime
df['DOB']= df['DOB'].dt.date # This line strips the time data off and leave just a date
df.head(5)

Unnamed: 0,MALE_FN,MALE_MN,MALE_LN,SSN,DOB,OBD,FULL_NAME,EMAIL,SEX
410,Juan,Joseph,Neal,796-48-6407,1953-04-15,1992-04-05,"Neal, Juan J",jneal@federal.gov,M
497,Dean,David,Cox,776-12-6237,1970-05-17,1992-05-11,"Cox, Dean D",dcox@federal.gov,M
443,James,Anthony,Rivas,203-89-3993,1965-06-05,1992-05-29,"Rivas, James A",jrivas@federal.gov,M
261,Christopher,Timothy,Clark,067-31-4135,1960-06-14,1992-06-06,"Clark, Christopher T",cclark@federal.gov,M
106,Scott,Rodney,Wilson,319-25-3000,1950-06-22,1992-06-11,"Wilson, Scott R",swilson@federal.gov,M


In [62]:
df['EMAIL'].is_unique 

False

In [63]:
df['SSN'].is_unique 

True

___

### Creating the Female "workers"

In [64]:
fake_ladies = [
{'FEMALE_FN': fake.first_name_female(), 'FEMALE_MN': fake.first_name_female(), 'FEMALE_LN': fake.last_name_female(),
# 'DOB':fake.date_between(start_date='-65y', end_date='-18y'),
# 'OBD':fake.date_between(start_date='-25y', end_date='-1M'),
'SSN':fake.ssn()}
for _ in range(500)]

In [65]:
df2 = pd.DataFrame(fake_ladies)

In [66]:
df2[['DOB', 'OBD']] = df2.apply(birth_and_start_date, axis=1, result_type ='expand')

In [67]:
df2['FULL_NAME'] = df2['FEMALE_LN'] + (', ' + df2['FEMALE_FN'])  + (' '+ df2['FEMALE_MN'].str[0])
df2['EMAIL'] = df2['FEMALE_FN'].str[0] + (df2['FEMALE_LN'] + ('@FEDERAL.GOV'))
df2['EMAIL'] = df2['EMAIL'].str.lower()
df2['SEX'] = ('F')


In [68]:
df2.sort_values(by=['OBD'], inplace=True)

In [69]:
df2['OBD']= pd.to_datetime(df2['OBD']) # This line converts the string 'date' to a real datetime
df2['OBD']= df2['OBD'].dt.date # This line strips the time data off and leave just a date
df2['DOB']= pd.to_datetime(df2['DOB']) # This line converts the string 'date' to a real datetime
df2['DOB']= df2['DOB'].dt.date # This line strips the time data off and leave just a date
df2.head(5)

Unnamed: 0,FEMALE_FN,FEMALE_MN,FEMALE_LN,SSN,DOB,OBD,FULL_NAME,EMAIL,SEX
253,Christina,Amy,Smith,846-97-0112,1944-04-24,1992-04-12,"Smith, Christina A",csmith@federal.gov,F
435,Mallory,Samantha,Gibson,830-42-4529,1951-05-10,1992-04-29,"Gibson, Mallory S",mgibson@federal.gov,F
190,Veronica,Nicole,Wells,544-03-8760,1945-05-18,1992-05-06,"Wells, Veronica N",vwells@federal.gov,F
140,Sarah,Carol,Cole,016-08-8924,1957-05-22,1992-05-13,"Cole, Sarah C",scole@federal.gov,F
373,Carly,Sara,Stewart,158-45-7999,1964-05-20,1992-05-13,"Stewart, Carly S",cstewart@federal.gov,F


Renaming the columns so the injest does not fail.

In [70]:
df = df.rename(columns={"MALE_FN":"FNAME", "MALE_MN":"MNAME","MALE_LN":"LNAME"})
df2 = df2.rename(columns={"FEMALE_FN":"FNAME", "FEMALE_MN":"MNAME","FEMALE_LN":"LNAME"})

___

In [72]:
df.to_sql('tmp_FAKER', dbconn, index=0, if_exists='replace') # injesting the first (Male) dataframe

500

In [73]:
df2.to_sql('tmp_FAKER', dbconn, index=0, if_exists='append') # injesting the sencond df (female)

500

___

Housekeeping:

In [71]:
cursor = dbconn.cursor()
cursor.execute('''DROP TABLE tmp_FAKER''')
# cursor.execute('''DROP TABLE tmp_FAKERM''')
# cursor.execute('''DROP TABLE tmp_FAKERF''')
dbconn.commit()

___

Read all the 'workers' back into a dataframe.

In [74]:
df3 = pd.read_sql_query('select * FROM tmp_FAKER', dbconn)
df3.head(5)

Unnamed: 0,FNAME,MNAME,LNAME,SSN,DOB,OBD,FULL_NAME,EMAIL,SEX
0,Juan,Joseph,Neal,796-48-6407,1953-04-15,1992-04-05,"Neal, Juan J",jneal@federal.gov,M
1,Dean,David,Cox,776-12-6237,1970-05-17,1992-05-11,"Cox, Dean D",dcox@federal.gov,M
2,James,Anthony,Rivas,203-89-3993,1965-06-05,1992-05-29,"Rivas, James A",jrivas@federal.gov,M
3,Christopher,Timothy,Clark,067-31-4135,1960-06-14,1992-06-06,"Clark, Christopher T",cclark@federal.gov,M
4,Scott,Rodney,Wilson,319-25-3000,1950-06-22,1992-06-11,"Wilson, Scott R",swilson@federal.gov,M


Getting ready to sort all by their OBD so that the ECI will makes sense, initially.

In [75]:
df3['OBD']= pd.to_datetime(df3['OBD']) # This line converts the string 'date' to a real datetime
df3['OBD']= df3['OBD'].dt.date # This line strips the time data off and leave just a date

In [76]:
df3.dtypes

FNAME        object
MNAME        object
LNAME        object
SSN          object
DOB          object
OBD          object
FULL_NAME    object
EMAIL        object
SEX          object
dtype: object

### Sorting everything after mixing MALES with FEMALES

In [77]:
df3.sort_values(by=['OBD'], inplace=True)
df3.head(12)

Unnamed: 0,FNAME,MNAME,LNAME,SSN,DOB,OBD,FULL_NAME,EMAIL,SEX
0,Juan,Joseph,Neal,796-48-6407,1953-04-15,1992-04-05,"Neal, Juan J",jneal@federal.gov,M
500,Christina,Amy,Smith,846-97-0112,1944-04-24,1992-04-12,"Smith, Christina A",csmith@federal.gov,F
501,Mallory,Samantha,Gibson,830-42-4529,1951-05-10,1992-04-29,"Gibson, Mallory S",mgibson@federal.gov,F
502,Veronica,Nicole,Wells,544-03-8760,1945-05-18,1992-05-06,"Wells, Veronica N",vwells@federal.gov,F
1,Dean,David,Cox,776-12-6237,1970-05-17,1992-05-11,"Cox, Dean D",dcox@federal.gov,M
504,Carly,Sara,Stewart,158-45-7999,1964-05-20,1992-05-13,"Stewart, Carly S",cstewart@federal.gov,F
503,Sarah,Carol,Cole,016-08-8924,1957-05-22,1992-05-13,"Cole, Sarah C",scole@federal.gov,F
505,Nicole,Susan,Campbell,169-20-0469,1953-05-27,1992-05-17,"Campbell, Nicole S",ncampbell@federal.gov,F
506,Ashley,Barbara,Hernandez,597-04-7377,1948-06-04,1992-05-24,"Hernandez, Ashley B",ahernandez@federal.gov,F
2,James,Anthony,Rivas,203-89-3993,1965-06-05,1992-05-29,"Rivas, James A",jrivas@federal.gov,M


### Checking for duplicate emails

In [None]:
df_dupes = pd.read_sql_query('SELECT *, COUNT(EMAIL) FROM tmp_FAKER GROUP BY EMAIL HAVING COUNT(EMAIL)>1;', dbconn)
df_dupes.head(35)

### Dropping the duplicate emails.

In [78]:
df3 = df3.drop_duplicates(subset='EMAIL', keep="first")
df3.shape

(889, 9)

### Dropping and recreating the tables with the mixed and deduped data.

In [101]:
cursor = dbconn.cursor()
cursor.execute('''DROP TABLE tmp_FAKER''')
# cursor.execute('''DROP TABLE tmp_FAKERM''')
# cursor.execute('''DROP TABLE tmp_FAKERF''')
dbconn.commit()

In [102]:
df3.to_sql('tmp_FAKER', dbconn, index=0, if_exists='replace')

889

### Wanted to do some time calculations:

Seeing who has been will 'the Agency' the longest, etc.

Situation where 'worker' is 20 years old but has 24 years of service.

This is a function to calculate the age of the 'Worker'
But I had to convert it back to a datetime first

In [81]:
df3['DOB']= pd.to_datetime(df3['DOB'])
df3['OBD']= pd.to_datetime(df3['OBD'])

In [91]:
current_datetime = datetime.now()
dt = current_datetime.date()
dt_tomorrow = dt + timedelta(days=1)
dt

datetime.date(2022, 4, 1)

In [92]:
dt_tomorrow

datetime.date(2022, 4, 2)

In [93]:
# def f(end):
#     r = relativedelta(pd.to_datetime('now'), end) 
#     return '{} years {} days'.format(r.years, r.days)

def f(end):
    # r = relativedelta(pd.to_datetime('now'), end) 
    r = relativedelta(pd.to_datetime(dt_tomorrow), end) 
    # return '{} years' .format(r.years)  # This line adds the word "years" into the field.
    return '{}' .format(r.years)

In [94]:
df3['AGE'] = df3["DOB"].apply(f)

In [95]:
df3['SERVICE_TIME'] = df3["OBD"].apply(f)

Converting the df back to just a Date for injest into db.

In [100]:
df3['OBD']= pd.to_datetime(df3['OBD']) # This line converts the string 'date' to a real datetime
df3['OBD']= df3['OBD'].dt.date # This line strips the time data off and leave just a date
df3['DOB']= pd.to_datetime(df3['DOB']) # This line converts the string 'date' to a real datetime
df3['DOB']= df3['DOB'].dt.date # This line strips the time data off and leave just a date