# Generate a Fake bunch of "Workers"

In [None]:
from faker import Faker
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
from dateutil.relativedelta import relativedelta
import random

___

### Connect to the Database

In [None]:
# Open the SQLite file 'FakeHR.db' (path is relative to the working directory);
# sqlite3 creates the file if it does not exist yet.
dbconn = sqlite3.connect('FakeHR.db') # permanent database
print(dbconn) # Check the connection to the database.

In [None]:
# Cursor for issuing raw SQL statements on dbconn.
cursor = dbconn.cursor()
cursor  # bare expression: shows the cursor object as the cell output

___

### Working with the FAKER

In [None]:
# initialize a generator
# Seed Faker and the stdlib `random` module (birth_and_start_date draws from
# `random`) so that Restart & Run All regenerates the same workers every time.
# Change SEED to obtain a different, but still repeatable, population.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)
fake = Faker()

___

### Wanted realistic On Board (OBD) & Date of Birth (DOB).

[Source for Below](https://www.tonic.ai/blog/how-to-generate-simple-test-data-with-faker)

In [None]:
def birth_and_start_date(x=None):
    """Generate a consistent (date of birth, on-board date) pair for one worker.

    The on-board date is drawn by Faker from the last 30 years. The date of
    birth is that same calendar day a whole number of years earlier, so the
    worker is exactly 22 to 52 years old (inclusive) on the day they start.

    Parameters
    ----------
    x : any, optional
        Ignored. It exists only so the function can be handed to
        ``DataFrame.apply(..., axis=1)``, which passes each row; the default
        lets the function also be called with no arguments.

    Returns
    -------
    dict
        ``{'DOB': <birth date>, 'OBD': <on-board date>}``.
    """
    sd = fake.date_between(start_date="-30y", end_date="now")
    age_at_hire = random.randint(22, 52)
    # Subtract calendar years rather than 365-day blocks: 365 * years ignores
    # leap days and would leave the worker a few days short of the drawn age.
    try:
        bd = sd.replace(year=sd.year - age_at_hire)
    except ValueError:
        # sd is Feb 29 and the birth year is not a leap year: fall back to Feb 28.
        bd = sd.replace(year=sd.year - age_at_hire, day=28)

    return {'DOB': bd, 'OBD': sd}

In [None]:
# Registry of generator callables keyed by name; only the date generator is registered.
d = {'birth_and_start_date': birth_and_start_date}

In [None]:
# will error if def birth_and_start_date(x): has the 'x' in the parens
# [d[k]() for k in d.keys()] 

In [None]:
# df = pd.DataFrame(np.random.randint(100,size=(1000, 3)),columns=['A','B','C'])
# df[['DOB', 'OBD']] = df.apply(birth_and_start_date, axis=1, result_type ='expand')

In [None]:
# d['first_name'] = lambda: {'first_name':fake.first_name_male()}
# d['middle_name'] = lambda: {'middle_name':fake.first_name_male()}
# d['last_name'] = lambda: {'last_name':fake.last_name()}
# d['ssn'] = lambda: {'ssn':fake.ssn()}
# d['birth_and_start_date'] = birth_and_start_date

In [None]:
# for _ in range(5):
#    r=[d[k]() for k in d.keys()]
#    print(r)

In [None]:
# for _ in range(5):
#    deep_list = [list(d[k]().values()) for k in d.keys()]
#    row = [item for sublist in deep_list for item in sublist]
#    print(row)

___

### Creating the list of MALE "workers"

In [None]:
def _fake_gent():
    """Return one male worker record: first, middle and last name plus an SSN."""
    # DOB/OBD are not generated here; they are attached to the DataFrame
    # afterwards with birth_and_start_date.
    return {
        'MALE_FN': fake.first_name_male(),
        'MALE_MN': fake.first_name_male(),
        'MALE_LN': fake.last_name_male(),
        'SSN': fake.ssn(),
    }


fake_gents = [_fake_gent() for _ in range(500)]

## CONVERT A LIST TO A DATAFRAME

In [None]:
# One row per generated record; the dict keys become the column names.
df = pd.DataFrame(fake_gents)

In [None]:
# Inspect the column types of the freshly built frame.
df.dtypes

[Multiple column help](https://www.pauldesalvo.com/how-to-return-multiple-columns-from-pandas-using-the-apply-function/)

In [None]:
# Call birth_and_start_date once per row; result_type='expand' spreads the
# returned dict's 'DOB' and 'OBD' entries into the two new columns.
df[['DOB', 'OBD']] = df.apply(birth_and_start_date, axis=1, result_type ='expand')

### Join (concat) the names and strip the middle name down to just the middle initial

In [None]:
# "Last, First M" - only the first letter of the middle name is kept.
df['FULL_NAME'] = (
    df['MALE_LN']
    + ', ' + df['MALE_FN']
    + ' ' + df['MALE_MN'].str[0]
)


In [None]:
# First initial + last name + domain, lower-cased in a single expression.
df['EMAIL'] = (df['MALE_FN'].str[0] + df['MALE_LN'] + '@FEDERAL.GOV').str.lower()

In [None]:
# Constant marker column: every row in this frame is a male worker.
df['SEX'] = 'M'

In [None]:
# Order the workers by on-board date, earliest first.
df = df.sort_values(by='OBD')

In [None]:
# Normalise both date columns: parse with pandas, then keep only the calendar
# date (no time-of-day component).
for _col in ('OBD', 'DOB'):
    df[_col] = pd.to_datetime(df[_col]).dt.date
df.head(5)

In [None]:
# True only when no two rows share the same EMAIL value.
df['EMAIL'].is_unique 

In [None]:
# True only when no two rows share the same SSN value.
df['SSN'].is_unique 

___

### Creating the Female "workers"

In [None]:
def _fake_lady():
    """Return one female worker record: first, middle and last name plus an SSN."""
    # DOB/OBD are not generated here; they are attached to the DataFrame
    # afterwards with birth_and_start_date.
    return {
        'FEMALE_FN': fake.first_name_female(),
        'FEMALE_MN': fake.first_name_female(),
        'FEMALE_LN': fake.last_name_female(),
        'SSN': fake.ssn(),
    }


fake_ladies = [_fake_lady() for _ in range(500)]

In [None]:
# One row per generated record; the dict keys become the column names.
df2 = pd.DataFrame(fake_ladies)

In [None]:
# Call birth_and_start_date once per row; result_type='expand' spreads the
# returned dict's 'DOB' and 'OBD' entries into the two new columns.
df2[['DOB', 'OBD']] = df2.apply(birth_and_start_date, axis=1, result_type ='expand')

In [None]:
# "Last, First M" - only the first letter of the middle name is kept.
df2['FULL_NAME'] = (
    df2['FEMALE_LN']
    + ', ' + df2['FEMALE_FN']
    + ' ' + df2['FEMALE_MN'].str[0]
)
# First initial + last name + domain, lower-cased in a single expression.
df2['EMAIL'] = (df2['FEMALE_FN'].str[0] + df2['FEMALE_LN'] + '@FEDERAL.GOV').str.lower()
# Constant marker column: every row in this frame is a female worker.
df2['SEX'] = 'F'


In [None]:
# Order the workers by on-board date, earliest first.
df2 = df2.sort_values(by='OBD')

In [None]:
# Normalise both date columns: parse with pandas, then keep only the calendar
# date (no time-of-day component).
for _col in ('OBD', 'DOB'):
    df2[_col] = pd.to_datetime(df2[_col]).dt.date
df2.head(5)

Renaming the columns so the ingest does not fail.

In [None]:
# Give both frames the same name columns so they can be written to one table.
df = df.rename(
    columns={
        "MALE_FN": "FNAME",
        "MALE_MN": "MNAME",
        "MALE_LN": "LNAME",
    }
)
df2 = df2.rename(
    columns={
        "FEMALE_FN": "FNAME",
        "FEMALE_MN": "MNAME",
        "FEMALE_LN": "LNAME",
    }
)

___

In [None]:
# Ingest the first (male) frame; 'replace' recreates tmp_FAKER from scratch
# and the DataFrame index is not written as a column.
df.to_sql(name='tmp_FAKER', con=dbconn, if_exists='replace', index=False)

In [None]:
# Ingest the second (female) frame; 'append' adds its rows to the existing
# tmp_FAKER table and the DataFrame index is not written as a column.
df2.to_sql(name='tmp_FAKER', con=dbconn, if_exists='append', index=False)

___

Housekeeping:

In [None]:
# Housekeeping is opt-in. Run top to bottom, this cell sits between the ingest
# of tmp_FAKER and the SELECT that reads it back, so an unconditional DROP
# would delete the freshly written rows and make the following read fail with
# "no such table". Set RESET_TMP_FAKER = True only to wipe the staging table
# on purpose.
RESET_TMP_FAKER = False

cursor = dbconn.cursor()
if RESET_TMP_FAKER:
    # IF EXISTS keeps the cleanup from raising when the table is already gone.
    cursor.execute('''DROP TABLE IF EXISTS tmp_FAKER''')
    # cursor.execute('''DROP TABLE tmp_FAKERM''')
    # cursor.execute('''DROP TABLE tmp_FAKERF''')
    dbconn.commit()

___

Read all the 'workers' back into a dataframe.

In [None]:
# Load every row of tmp_FAKER (male and female workers together) into one frame.
df3 = pd.read_sql_query('select * FROM tmp_FAKER', dbconn)
df3.head(5)

Getting ready to sort everyone by their OBD so that the ECI will make sense, initially.

In [None]:
# Parse OBD with pandas, then keep only the calendar date (no time-of-day component).
df3['OBD'] = pd.to_datetime(df3['OBD']).dt.date

In [None]:
# Inspect the column types after the round trip through SQLite.
df3.dtypes

### Sorting everything after mixing MALES with FEMALES

In [None]:
# Order the combined workers by on-board date, earliest first, and preview the top rows.
df3 = df3.sort_values(by='OBD')
df3.head(12)

### Checking for duplicate emails

In [None]:
# Group tmp_FAKER by EMAIL and keep only the groups with more than one row,
# i.e. one result row per duplicated address together with its occurrence count.
df_dupes = pd.read_sql_query('SELECT *, COUNT(EMAIL) FROM tmp_FAKER GROUP BY EMAIL HAVING COUNT(EMAIL)>1;', dbconn)
df_dupes.head(35)

### Dropping the duplicate emails.

In [None]:
# For each EMAIL keep only the first row in df3's current order and drop the rest.
df3 = df3.drop_duplicates(subset='EMAIL', keep="first")
df3.shape  # (rows, columns) remaining after de-duplication

### Dropping and recreating the tables with the mixed and deduped data.

In [None]:
# Remove the staging table before it is rewritten with the de-duplicated data.
cursor = dbconn.cursor()
# IF EXISTS makes this cell safe to re-run: a plain DROP TABLE raises
# "no such table" when tmp_FAKER has already been removed.
cursor.execute('''DROP TABLE IF EXISTS tmp_FAKER''')
# cursor.execute('''DROP TABLE tmp_FAKERM''')
# cursor.execute('''DROP TABLE tmp_FAKERF''')
dbconn.commit()

In [None]:
# Write the sorted, de-duplicated workers back as tmp_FAKER; the DataFrame
# index is not written as a column.
df3.to_sql(name='tmp_FAKER', con=dbconn, if_exists='replace', index=False)

### Wanted to do some time calculations:

Seeing who has been with 'the Agency' the longest, etc.

Situation where 'worker' is 20 years old but has 24 years of service.

This is a function to calculate the age of the 'Worker'
But I had to convert it back to a datetime first

In [None]:
# relativedelta arithmetic below needs real datetimes, so convert both date columns.
for _col in ('DOB', 'OBD'):
    df3[_col] = pd.to_datetime(df3[_col])

In [None]:
# Reference dates for the age / service-time calculations.
current_datetime = datetime.now()      # local wall-clock time at execution
dt = current_datetime.date()           # today's calendar date
dt_tomorrow = dt + timedelta(days=1)   # the day after today; f() measures up to this date
dt

In [None]:
# Display the reference date used by f().
dt_tomorrow

In [None]:
# def f(end):
#     r = relativedelta(pd.to_datetime('now'), end) 
#     return '{} years {} days'.format(r.years, r.days)

def f(end):
    """Return the number of whole years from `end` up to `dt_tomorrow`, as a string.

    `dt_tomorrow` is read from the notebook's global namespace. Only the
    `years` component of the relativedelta is used; months and days are dropped.
    """
    elapsed = relativedelta(pd.to_datetime(dt_tomorrow), end)
    return str(elapsed.years)

In [None]:
# Whole years since birth, per worker; f returns the count as a string.
df3['AGE'] = df3["DOB"].apply(f)

In [None]:
# Whole years since the on-board date, per worker; f returns the count as a string.
df3['SERVICE_TIME'] = df3["OBD"].apply(f)

Converting the date columns back to plain dates for ingest into the database.

In [None]:
# Strip the time-of-day component again so both columns hold plain calendar dates.
for _col in ('OBD', 'DOB'):
    df3[_col] = pd.to_datetime(df3[_col]).dt.date