In [67]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset
import pandas as pd
from datetime import date, datetime, timedelta
import numpy as np
from azure.storage.blob import BlobServiceClient

subscription_id = '96599b9b-0d9b-4577-9163-418d2c6cd411'
resource_group = 'rg-datasci-ml-dev-001'
workspace_name = 'ml-moss-dev-001'

workspace = Workspace(subscription_id, resource_group, workspace_name)

dataset_hourly_compare_cmic = Dataset.get_by_name(workspace, name='Hourly_Employee_Time_prod')
cmic_pr_df_dev = dataset_hourly_compare_cmic.to_pandas_dataframe()

dataset_hourly_compare_wd = Dataset.get_by_name(workspace, name='WD_Hourly_Employee_Time_prod')
wd_pr_df_dev = dataset_hourly_compare_wd.to_pandas_dataframe()

tcp_dataset = Dataset.get_by_name(workspace, name='WorkSegments_Timesheet_prod')
tcp_df_dev = tcp_dataset.to_pandas_dataframe()

jcjob_dataset = Dataset.get_by_name(workspace, name='JCJOB_TABLE_TN_prod')
jcjob_df = jcjob_dataset.to_pandas_dataframe()

Message: rslex failed, falling back to clex.
Payload: {"pid": 5434, "source": "azureml.dataprep", "version": "4.11.3", "trace": "azureml|data|tabular_dataset.py, line 169 in function <lambda>.\nazureml|data|dataset_error_handling.py, line 107 in function _try_execute.\nazureml|data|tabular_dataset.py, line 169 in function to_pandas_dataframe.", "subscription": "", "run_id": "", "resource_group": "", "workspace_name": "", "experiment_id": "", "location": "", "rslex_version": "2.18.3"}


In [68]:
##########################

week_ending = '2023-10-14'

##########################

date = datetime.strptime(week_ending, "%Y-%m-%d")
start_date = date - timedelta(days = 6)
end_date = date

In [69]:
tcp_df = tcp_df_dev.copy()

In [70]:
###TCP Data Manipulation
#Filter: Solar Jobs only 
tcp_df = tcp_df.loc[tcp_df['Job_Code'].str.startswith('710')]
#Filter: Only Keeping Labor Hourly
tcp_df = tcp_df.loc[tcp_df['Task'].isin(['LH','LHA'])]
#Filter: Only keeping Moss Labor Hourly - excluding subs
tcp_df = tcp_df.loc[tcp_df['Department']=='710']
#Creating a total hours column
tcp_df['total_hours'] = tcp_df['Regular_Hours'] + tcp_df['Overtime_Hours']
#Renaming Relevant Columns
tcp_df = tcp_df.rename({'Employee_Number':'EEID','Job_Code':'Job','Activity':'Sub_Job','total_hours':'Hours'},axis='columns')
tcp_cols_keep = ['EEID' , 'Date' , 'Job' , 'Sub_Job' , 'Hours']
tcp_df_comp = tcp_df.loc[:,tcp_cols_keep]

In [92]:
wd_pr_df = wd_pr_df_dev.copy()

In [93]:
#Workday Data Manipulation
#Drop Null values if the employee does not have a job/sub job assignment
wd_pr_df = wd_pr_df.loc[~wd_pr_df['cf_INTLRVCalculatedProjectFromTimeBlock.descriptor'].isna()]
#Filter: Solar Jobs Only
wd_pr_df = wd_pr_df.loc[wd_pr_df['cf_INTLRVCalculatedProjectFromTimeBlock.descriptor'].str.startswith('710')]
#Extracting Employee ID (EEID) from 'Workers' column
wd_pr_df['EEID'] = wd_pr_df['Workers'].str.split('(').str[1]
wd_pr_df['EEID'] = wd_pr_df['EEID'].str.split(')').str[0]
#This isolates the 2 digit sub job, also accomodates the sub jobs that have a letter in them like Claims (C) and Warranty (W)
wd_pr_df['cf_INTLRVCalculatedProjectFromTimeBlock.descriptor'] = wd_pr_df['cf_INTLRVCalculatedProjectFromTimeBlock.descriptor'].str.replace('C' , '.C')
wd_pr_df['cf_INTLRVCalculatedProjectFromTimeBlock.descriptor'] = wd_pr_df['cf_INTLRVCalculatedProjectFromTimeBlock.descriptor'].str.replace('W' , '.W')
wd_pr_df['sub_job'] = wd_pr_df['cf_INTLRVCalculatedProjectFromTimeBlock.descriptor'].str.split('.').str[1]
wd_pr_df['sub_job'] = wd_pr_df['sub_job'].str.split(' ').str[0]
#Leaves behind Job ONLY
wd_pr_df['job'] = wd_pr_df.loc[:,'cf_INTLRVCalculatedProjectFromTimeBlock.descriptor'].str[0:7]
#Renaming Relevant Columns
wd_pr_df = wd_pr_df.rename({'calculatedDate':'Date','job':'Job','sub_job':'Sub_Job','amount':'Hours'},axis='columns')
wd_cols_keep = ['EEID' , 'Date' , 'Job' , 'Sub_Job' , 'Hours']
wd_pr_df_comp = wd_pr_df.loc[:,wd_cols_keep]
#excluding these employees from Workday because they will not be present in the TCP data
wd_eeid_exclude = ['6423','6525','9633','9647','9686','11780','13401','13757','14452','14523','14587','14594','15312','15358','15681','15912']
wd_pr_df_comp = wd_pr_df_comp.loc[~wd_pr_df_comp['EEID'].isin(wd_eeid_exclude)]

In [18]:
cmic_pr_df = cmic_pr_df_dev.copy()

In [19]:
#CMiC Data Manipulation
#Re-casting this column as a Date data type
cmic_pr_df["Date"] = pd.to_datetime(cmic_pr_df["Date"])
#Filter: Only Keeping Labor Hourly
cmic_pr_df = cmic_pr_df.loc[cmic_pr_df['Task'].isin(['LH','LHA'])]
#Renaming Relevant Columns
cmic_pr_df = cmic_pr_df.rename({'Employee Number':'EEID','Job Code':'Job','Sub Job Code':'Sub_Job','Regular Hours':'Hours'},axis='columns')
cmic_cols_keep = ['EEID' , 'Date' , 'Job' , 'Sub_Job' , 'Hours']
cmic_pr_df_comp = cmic_pr_df.loc[:,cmic_cols_keep]

In [36]:
#Date Filters
tcp_filter = (tcp_df_comp['Date']>=start_date)&(tcp_df_comp['Date']<=end_date)
cmic_filter = (cmic_pr_df_comp['Date']>=start_date)&(cmic_pr_df_comp['Date']<=end_date)
wd_filter = (wd_pr_df_comp['Date']>=start_date)&(wd_pr_df_comp['Date']<=end_date)

#### Grouping the Data by Different Levels

In [183]:
#Grouping by Job
tcp_grouped_job = tcp_df_comp.loc[tcp_filter].groupby(['Job']).agg(TotalHours_TCP = pd.NamedAgg(column='Hours',aggfunc='sum'))
tcp_grouped_job = tcp_grouped_job.reset_index()
cmic_grouped_job = cmic_pr_df_comp.loc[cmic_filter].groupby(['Job']).agg(TotalHours_CMiC = pd.NamedAgg(column='Hours',aggfunc='sum'))
cmic_grouped_job = cmic_grouped_job.reset_index()
wd_grouped_job = wd_pr_df_comp.loc[wd_filter].groupby(['Job']).agg(TotalHours_WD = pd.NamedAgg(column='Hours',aggfunc='sum'))
wd_grouped_job = wd_grouped_job.reset_index()

In [184]:
#Grouping by Job, Sub Job
tcp_grouped_subjob = tcp_df_comp.loc[tcp_filter].groupby(['Job','Sub_Job']).agg(TotalHours_TCP = pd.NamedAgg(column='Hours',aggfunc='sum'))
tcp_grouped_subjob = tcp_grouped_subjob.reset_index()
cmic_grouped_subjob = cmic_pr_df_comp.loc[cmic_filter].groupby(['Job','Sub_Job']).agg(TotalHours_CMiC = pd.NamedAgg(column='Hours',aggfunc='sum'))
cmic_grouped_subjob = cmic_grouped_subjob.reset_index()
wd_grouped_subjob = wd_pr_df_comp.loc[wd_filter].groupby(['Job','Sub_Job']).agg(TotalHours_WD = pd.NamedAgg(column='Hours',aggfunc='sum'))
wd_grouped_subjob = wd_grouped_subjob.reset_index()

In [185]:
#Grouping by Job, Sub Job, EEID
tcp_grouped_ee = tcp_df_comp.loc[tcp_filter].groupby(['EEID']).agg(TotalHours_TCP = pd.NamedAgg(column='Hours',aggfunc='sum'))
tcp_grouped_ee = tcp_grouped_ee.reset_index()
cmic_grouped_ee = cmic_pr_df_comp.loc[cmic_filter].groupby(['EEID']).agg(TotalHours_CMiC = pd.NamedAgg(column='Hours',aggfunc='sum'))
cmic_grouped_ee = cmic_grouped_ee.reset_index()
wd_grouped_ee = wd_pr_df_comp.loc[wd_filter].groupby(['EEID']).agg(TotalHours_WD = pd.NamedAgg(column='Hours',aggfunc='sum'))
wd_grouped_ee = wd_grouped_ee.reset_index()

#### Unique Column Creation for the Joins

In [186]:
#Unique Columns to Join On for Sub Job Comparison
tcp_grouped_subjob['unique'] = tcp_grouped_subjob[['Job','Sub_Job']].apply(lambda x: ' - '.join(str(value) for value in x), axis=1)
cmic_grouped_subjob['unique'] = cmic_grouped_subjob[['Job','Sub_Job']].apply(lambda x: ' - '.join(str(value) for value in x), axis=1)
wd_grouped_subjob['unique'] = wd_grouped_subjob[['Job','Sub_Job']].apply(lambda x: ' - '.join(str(value) for value in x), axis=1)

#### TCP vs. Workday Comparison

In [187]:
#TCP v. WD Job
tcp_v_wd_job = tcp_grouped_job.merge(wd_grouped_job , how = 'outer' , on = 'Job' , suffixes = ['_TCP','_WD'])
#NaN Hours will be filled with 0
tcp_v_wd_job['TotalHours_WD'] = tcp_v_wd_job['TotalHours_WD'].fillna(0)
tcp_v_wd_job['TotalHours_TCP'] = tcp_v_wd_job['TotalHours_TCP'].fillna(0)
tcp_v_wd_job['Hours_Variance (TCP - WD)'] = tcp_v_wd_job['TotalHours_TCP'] - tcp_v_wd_job['TotalHours_WD']
tcp_v_wd_job = tcp_v_wd_job.sort_values(['Job'  , 'Hours_Variance (TCP - WD)'] , ascending=[False,False])

In [188]:
#TCP v. WD Job, Sub Job
tcp_v_wd_subjob = tcp_grouped_subjob.merge(wd_grouped_subjob , how = 'outer' , on = 'unique' , suffixes = ['_TCP','_WD']).drop(columns =['unique'])
#NaN Hours will be filled with 0
tcp_v_wd_subjob['TotalHours_WD'] = tcp_v_wd_subjob['TotalHours_WD'].fillna(0)
tcp_v_wd_subjob['TotalHours_TCP'] = tcp_v_wd_subjob['TotalHours_TCP'].fillna(0)
tcp_v_wd_subjob['Hours_Variance (TCP - WD)'] = tcp_v_wd_subjob['TotalHours_TCP'] - tcp_v_wd_subjob['TotalHours_WD']
tcp_v_wd_subjob = tcp_v_wd_subjob.sort_values(['Job_TCP' , 'Sub_Job_TCP' , 'Hours_Variance (TCP - WD)'] , ascending=[False,True,False])

In [189]:
#TCP v. WD Job, Sub Job, EEID
### Commented out for EEID Comparison Only - uncomment out to compare EEID by Job and Sub Job
#tcp_v_wd_ee = tcp_grouped_ee.merge(wd_grouped_ee , how = 'outer' , on = 'unique' , suffixes = ['_TCP','_WD']).drop(columns =['unique'])
tcp_v_wd_ee = tcp_grouped_ee.merge(wd_grouped_ee , how = 'outer' , on = 'EEID' , suffixes = ['_TCP','_WD'])
#NaN Hours will be filled with 0
tcp_v_wd_ee['TotalHours_WD'] = tcp_v_wd_ee['TotalHours_WD'].fillna(0)
tcp_v_wd_ee['TotalHours_TCP'] = tcp_v_wd_ee['TotalHours_TCP'].fillna(0)
tcp_v_wd_ee['Hours_Variance (TCP - WD)'] = tcp_v_wd_ee['TotalHours_TCP'] - tcp_v_wd_ee['TotalHours_WD']
tcp_v_wd_ee = tcp_v_wd_ee.sort_values('Hours_Variance (TCP - WD)' , ascending=False, key=abs)
#Removing Variances that are 0 - excluding EEIDs that match
tcp_v_wd_ee = tcp_v_wd_ee.loc[tcp_v_wd_ee['Hours_Variance (TCP - WD)'] != 0.0]

#### TCP vs. CMiC Comparison

In [190]:
#TCP v CMiC Job
tcp_v_cmic_job = tcp_grouped_job.merge(cmic_grouped_job , how = 'outer' , on = 'Job' , suffixes = ['','_CMiC'])
#NaN Hours will be filled with 0
tcp_v_cmic_job['TotalHours_CMiC'] = tcp_v_cmic_job['TotalHours_CMiC'].fillna(0)
tcp_v_cmic_job['TotalHours_TCP'] = tcp_v_cmic_job['TotalHours_TCP'].fillna(0)
tcp_v_cmic_job['Hours_Variance (TCP - CMiC)'] = tcp_v_cmic_job['TotalHours_TCP'] - tcp_v_cmic_job['TotalHours_CMiC']
tcp_v_cmic_job = tcp_v_cmic_job.sort_values(['Job' , 'Hours_Variance (TCP - CMiC)'], ascending=[False, False])

In [191]:
#TCP v CMiC Job, Sub Job
tcp_v_cmic_subjob = tcp_grouped_subjob.merge(cmic_grouped_subjob , how = 'outer' , on = 'unique' , suffixes = ['','_CMiC']).drop(columns =['unique'])
#NaN Hours will be filled with 0
tcp_v_cmic_subjob['TotalHours_CMiC'] = tcp_v_cmic_subjob['TotalHours_CMiC'].fillna(0)
tcp_v_cmic_subjob['TotalHours_TCP'] = tcp_v_cmic_subjob['TotalHours_TCP'].fillna(0)
tcp_v_cmic_subjob['Hours_Variance (TCP - CMiC)'] = tcp_v_cmic_subjob['TotalHours_TCP'] - tcp_v_cmic_subjob['TotalHours_CMiC']
tcp_v_cmic_subjob = tcp_v_cmic_subjob.sort_values(['Job' , 'Sub_Job', 'Hours_Variance (TCP - CMiC)'], ascending=[False, True, False])

In [192]:
#TCP v CMiC Job, Sub Job, EEID
### Commented out for EEID Comparison Only - uncomment out to compare EEID by Job and Sub Job
#tcp_v_cmic_ee = tcp_grouped_ee.merge(cmic_grouped_ee , how = 'outer' , on = 'unique' , suffixes = ['','_CMiC']).drop(columns =['unique'])
tcp_v_cmic_ee = tcp_grouped_ee.merge(cmic_grouped_ee , how = 'outer' , on = 'EEID' , suffixes = ['','_CMiC'])
#NaN Hours will be filled with 0
tcp_v_cmic_ee['TotalHours_CMiC'] = tcp_v_cmic_ee['TotalHours_CMiC'].fillna(0)
tcp_v_cmic_ee['TotalHours_TCP'] = tcp_v_cmic_ee['TotalHours_TCP'].fillna(0)
tcp_v_cmic_ee['Hours_Variance (TCP - CMiC)'] = tcp_v_cmic_ee['TotalHours_TCP'] - tcp_v_cmic_ee['TotalHours_CMiC']
tcp_v_cmic_ee = tcp_v_cmic_ee.sort_values('Hours_Variance (TCP - CMiC)', ascending=False, key=abs)
#Filter down to relevant columns only

#### Workday vs. CMiC Comparison

In [193]:
#WD v. CMiC EEID
wd_v_cmic_job = wd_grouped_job.merge(cmic_grouped_job , how = 'outer' , on = 'Job' , suffixes = ['_WD','_CMiC'])
#NaN Hours will be filled with 0
wd_v_cmic_job['TotalHours_CMiC'] = wd_v_cmic_job['TotalHours_CMiC'].fillna(0)
wd_v_cmic_job['TotalHours_WD'] = wd_v_cmic_job['TotalHours_WD'].fillna(0)
wd_v_cmic_job['Hours_Variance (WD - CMiC)'] = wd_v_cmic_job['TotalHours_WD'] - wd_v_cmic_job['TotalHours_CMiC']
wd_v_cmic_job = wd_v_cmic_job.sort_values(['Job' , 'Hours_Variance (WD - CMiC)'], ascending=[False,False])

In [194]:
#WD v. CMiC Job, Sub Job
wd_v_cmic_subjob = wd_grouped_subjob.merge(cmic_grouped_subjob , how = 'outer' , on = 'unique' , suffixes = ['_WD','_CMiC']).drop(columns =['unique'])
#NaN Hours will be filled with 0
wd_v_cmic_subjob['TotalHours_CMiC'] = wd_v_cmic_subjob['TotalHours_CMiC'].fillna(0)
wd_v_cmic_subjob['TotalHours_WD'] = wd_v_cmic_subjob['TotalHours_WD'].fillna(0)
wd_v_cmic_subjob['Hours_Variance (WD - CMiC)'] = wd_v_cmic_subjob['TotalHours_WD'] - wd_v_cmic_subjob['TotalHours_CMiC']
wd_v_cmic_subjob = wd_v_cmic_subjob.sort_values(['Job_WD' , 'Sub_Job_WD' , 'Hours_Variance (WD - CMiC)'], ascending=[False,True,False])

In [195]:
#WD v. CMiC Job, Sub Job, EEID
### Commented out for EEID Comparison Only - uncomment out to compare EEID by Job and Sub Job
#wd_v_cmic_ee = wd_grouped_ee.merge(cmic_grouped_ee , how = 'outer' , on = 'unique' , suffixes = ['_WD','_CMiC']).drop(columns =['unique'])
wd_v_cmic_ee = wd_grouped_ee.merge(cmic_grouped_ee , how = 'outer' , on = 'EEID' , suffixes = ['_WD','_CMiC'])
#NaN Hours will be filled with 0
wd_v_cmic_ee['TotalHours_CMiC'] = wd_v_cmic_ee['TotalHours_CMiC'].fillna(0)
wd_v_cmic_ee['TotalHours_WD'] = wd_v_cmic_ee['TotalHours_WD'].fillna(0)
wd_v_cmic_ee['Hours_Variance (WD - CMiC)'] = wd_v_cmic_ee['TotalHours_WD'] - wd_v_cmic_ee['TotalHours_CMiC']
wd_v_cmic_ee = wd_v_cmic_ee.sort_values('Hours_Variance (WD - CMiC)', ascending=False , key=abs)
#Filter down to relevant columns only

In [196]:
file_path = os.getcwd() + '/excel_output'
file_path

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/jbeachy2/code/Users/JBeachy/Project_DataDomain/Payroll/git_LaborHours_Compare/excel_output'

In [197]:
#File Path in AMLS
file_name = f'Hours_Comparison_WeekEnding_{week_ending}'
#Writing to AMLS File Directory
with pd.ExcelWriter(f'{file_path}/{file_name}.xlsx') as writer:
    tcp_v_wd_job.to_excel(writer , sheet_name = 'TCPvWD_Hours_byJob' , index = False)
    tcp_v_wd_subjob.to_excel(writer , sheet_name = 'TCPvWD_Hours_bySubJob' , index = False)
    tcp_v_wd_ee.to_excel(writer , sheet_name = 'TCPvWD_Hours_byEE' , index = False)
    tcp_v_cmic_job.to_excel(writer , sheet_name = 'TCPvCMiC_Hours_byJob' , index = False)
    tcp_v_cmic_subjob.to_excel(writer , sheet_name = 'TCPvCMiC_Hours_bySubJob' , index = False)
    tcp_v_cmic_ee.to_excel(writer , sheet_name = 'TCPvCMiC_Hours_byEE' , index = False)
    wd_v_cmic_job.to_excel(writer , sheet_name = 'WDvCMiC_Hours_byJob' , index = False)
    wd_v_cmic_subjob.to_excel(writer , sheet_name = 'WDvCMiC_Hours_bySubJob' , index = False)
    wd_v_cmic_ee.to_excel(writer , sheet_name = 'WDvCMiC_Hours_byEE' , index = False)

In [198]:
#Blob Storage Info
storage_account_key = 'u46QXBaayH/rWljcqPTWZTNgFdGHo9zH4I0OLWgDi4oa2inkpjVwTbp74C+ISDC2oWtNlrcr69Ec+ASt5wV1PA=='
storage_account_name = 'mossdatalakesource'
connection_string = 'DefaultEndpointsProtocol=https;AccountName=mossdatalakesource;AccountKey=u46QXBaayH/rWljcqPTWZTNgFdGHo9zH4I0OLWgDi4oa2inkpjVwTbp74C+ISDC2oWtNlrcr69Ec+ASt5wV1PA==;EndpointSuffix=core.windows.net'
container_name = 'cmic'
#Defininng Blob Storage Uplpoad#
def uploadtoblobstorage(file_path,file_name):
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    blob_client = blob_service_client.get_blob_client(container = container_name , blob = file_name)

    with open(file_path , 'rb') as data:
        blob_client.upload_blob(data)
    print(f'Uploaded {file_name}.')

In [199]:
file_name_exp = f'{file_name}.xlsx'
uploadtoblobstorage(f'{file_path}/{file_name}.xlsx', file_name_exp)

Uploaded Hours_Comparison_WeekEnding_2023-10-14.xlsx.


#### Run Statistics 

In [200]:
#### TCP vs. Workday Stats
today = date.today()
today_timechange = today - timedelta(hours=4)
today_for_text = today_timechange.strftime("%m-%d-%Y %H:%M:%S")
print("TCP vs. Workday")
print("Count of EEIDs that are showing Zero Hours Worked in Workday: " , len(tcp_v_wd_ee.loc[tcp_v_wd_ee['TotalHours_WD'] == 0]))
print("Count of EEIDs that are showing Zero Hours Worked in TCP: " , len(tcp_v_wd_ee.loc[tcp_v_wd_ee['TotalHours_TCP'] == 0]))
print("Count of EEIDs with a variance less than 10 hours(ABS), not including 0: ", len(tcp_v_wd_ee.loc[(tcp_v_wd_ee['Hours_Variance (TCP - WD)']< 10)&(tcp_v_wd_ee['Hours_Variance (TCP - WD)'] >-10)&(tcp_v_wd_ee['Hours_Variance (TCP - WD)'] != 0)]))
print("Count of EEIDs with no variance in Hours Worked: " , len(tcp_v_wd_ee.loc[tcp_v_wd_ee['Hours_Variance (TCP - WD)'] == 0]))
print("Count of EEIDs with variance greater than or equal to 10(ABS): ",len(tcp_v_wd_ee.loc[(tcp_v_wd_ee['Hours_Variance (TCP - WD)']>=10)|(tcp_v_wd_ee['Hours_Variance (TCP - WD)']<=-10)]))
print(f"Ran on {today_for_text} for Week Ending {week_ending}")

TCP vs. Workday
Count of EEIDs that are showing Zero Hours Worked in Workday:  76
Count of EEIDs that are showing Zero Hours Worked in TCP:  0
Count of EEIDs with a variance less than 10 hours(ABS), not including 0:  5
Count of EEIDs with no variance in Hours Worked:  0
Count of EEIDs with variance greater than or equal to 10(ABS):  72
Ran on 10-17-2023 12:50:08 for Week Ending 2023-10-14


In [209]:
tcp_df_dev.loc[(tcp_df_dev['Employee_Number']=='20763')&tcp_filter].sort_values('Date',ascending = True)

Unnamed: 0,WorkSegments_Timesheet_ID,Employee_Number,First_Name,Last_Name,Date,Job_Code,Activity,Phase_Code,Task,Regular_Hours,Overtime_Hours,Pay_Rate,Employee_Classification,Department,FileName,Created_Date


In [96]:
#Date Filters
tcp_filter = (tcp_df_comp['Date']>=start_date)&(tcp_df_comp['Date']<=end_date)
cmic_filter = (cmic_pr_df_comp['Date']>=start_date)&(cmic_pr_df_comp['Date']<=end_date)
wd_filter = (wd_pr_df_comp['Date']>=start_date)&(wd_pr_df_comp['Date']<=end_date)

In [97]:
#['EEID','Date','Job','Sub_Job','Hours']
wd_eeid = list(wd_pr_df_comp.loc[wd_filter,'EEID'].unique())
tcp_eeid = list(tcp_df.loc[tcp_filter,'EEID'].unique())

In [98]:
eeid_intcp_not_wd = list(set(tcp_eeid) - set(wd_eeid))
len(eeid_intcp_not_wd)

31

In [99]:
for x in eeid_intcp_not_wd:
    print(x)

6937
16200
14705
15564
15033
15962
5243000043
14987
7811
16187
11143
15454
11089
15990
16138
16324
13279
15762
16030
16165
16180
12968
15369
14829
5105
20500
13732
20619
5243000015
16162
20306


In [86]:
wd_eeid_explore = wd_pr_df_dev.copy()

In [87]:
wd_eeid_explore['EEID'] = wd_eeid_explore['Workers'].str.split('(').str[1]
wd_eeid_explore['EEID'] = wd_eeid_explore['EEID'].str.split(')').str[0]

In [100]:
wd_pr_df.loc[wd_pr_df['EEID'] == '4082']

Unnamed: 0,workdayID,EmployeeID,Employee_FullName,Workers,Employee_BillTransaction_Descriptor,billableTransaction.descriptor,Employee_BillTransactionDate,Hours,dayOfTheWeek,FrequencySalary,...,transactionSource.descriptor,transactionSource.id,unit.descriptor,unit.id,unit1,unitOfTimeIsHours,unroundedDuration,EEID,Sub_Job,Job
43021,7c8f1553b88990011947d733e8c20000,4082,Aisoli Vitale,Aisoli Vitale (4082),10 Hours,10 Hours on 10/02/2023,10/02/2023,10.0,Monday,Weekly Unions,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,0.0,4082,15,7102023
43022,7c8f1553b88990011947d86850e10000,4082,Aisoli Vitale,Aisoli Vitale (4082),10 Hours,10 Hours on 10/03/2023,10/03/2023,10.0,Tuesday,Weekly Unions,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,0.0,4082,15,7102023
43023,7c8f1553b88990011947d9029b6b0000,4082,Aisoli Vitale,Aisoli Vitale (4082),10 Hours,10 Hours on 10/04/2023,10/04/2023,10.0,Wednesday,Weekly Unions,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,0.0,4082,15,7102023
43024,7c8f1553b88990011947d99d041c0000,4082,Aisoli Vitale,Aisoli Vitale (4082),10 Hours,10 Hours on 10/05/2023,10/05/2023,10.0,Thursday,Weekly Unions,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,0.0,4082,15,7102023
50602,7c8f1553b8899002172e24b276f40000,4082,Aisoli Vitale,Aisoli Vitale (4082),10 Hours,10 Hours on 10/09/2023,10/09/2023,10.0,Monday,Weekly Unions,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,0.0,4082,20,7102023
50603,7c8f1553b8899002172e254d60670000,4082,Aisoli Vitale,Aisoli Vitale (4082),10 Hours,10 Hours on 10/10/2023,10/10/2023,10.0,Tuesday,Weekly Unions,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,0.0,4082,15,7102023
50604,7c8f1553b8899002172e25e719c70000,4082,Aisoli Vitale,Aisoli Vitale (4082),10 Hours,10 Hours on 10/11/2023,10/11/2023,10.0,Wednesday,Weekly Unions,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,0.0,4082,20,7102023
50605,7c8f1553b8899002172e268098770000,4082,Aisoli Vitale,Aisoli Vitale (4082),10 Hours,10 Hours on 10/12/2023,10/12/2023,10.0,Thursday,Weekly Unions,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,0.0,4082,15,7102023
51896,dcd88da845b59002168f57a38a360000,4082,Aisoli Vitale,Aisoli Vitale (4082),10 Hours,10 Hours on 10/09/2023,10/09/2023,10.0,Monday,Weekly Unions,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,0.0,4082,20,7102023
51897,dcd88da845b59002168f583e79ab0000,4082,Aisoli Vitale,Aisoli Vitale (4082),10 Hours,10 Hours on 10/10/2023,10/10/2023,10.0,Tuesday,Weekly Unions,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,0.0,4082,15,7102023


In [102]:
wd_pr_df.FrequencySalary.unique()
wd_pr_df.loc[wd_pr_df['FrequencySalary'] != 'Bi-Weekly Salary']

Unnamed: 0,workdayID,EmployeeID,Employee_FullName,Workers,Employee_BillTransaction_Descriptor,billableTransaction.descriptor,Employee_BillTransactionDate,Hours,dayOfTheWeek,FrequencySalary,...,transactionSource.descriptor,transactionSource.id,unit.descriptor,unit.id,unit1,unitOfTimeIsHours,unroundedDuration,EEID,Sub_Job,Job
24,07f132c6a9069001f5d7fe0a8b9b0000,3797,Robert Jean,Robert Jean (3797),4 Hours,4 Hours on 10/05/2023,10/05/2023,4.00,Thursday,Weekly Hourly,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,4.00,3797,00,D302201
25,07f132c6a9069001f5d7feb236e60000,3797,Robert Jean,Robert Jean (3797),4.5 Hours,4.5 Hours on 10/06/2023,n 10/06/2023,4.50,Friday,Weekly Hourly,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,4.50,3797,00,D302201
26,07f132c6a9069001f5d7feb236e60001,3797,Robert Jean,Robert Jean (3797),4 Hours,4 Hours on 10/02/2023,10/02/2023,4.00,Monday,Weekly Hourly,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,4.00,3797,00,D302201
27,07f132c6a9069001f5d7ff518df60000,3797,Robert Jean,Robert Jean (3797),4 Hours,4 Hours on 10/03/2023,10/03/2023,4.00,Tuesday,Weekly Hourly,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,4.00,3797,00,D302201
28,07f132c6a9069001f5d7ff518df60001,3797,Robert Jean,Robert Jean (3797),4.5 Hours,4.5 Hours on 10/03/2023,n 10/03/2023,4.50,Tuesday,Weekly Hourly,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,4.50,3797,00,D302201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76555,07f132c6a9069000cd4715622eb60002,6585,Wayne Stevenson,Wayne Stevenson (6585),6.5 Hours,6.5 Hours on 09/27/2023,n 09/27/2023,6.50,Wednesday,Weekly Hourly,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,6.50,6585,50,1412301
76556,07f132c6a9069000cd4715fc58590000,6585,Wayne Stevenson,Wayne Stevenson (6585),6.5 Hours,6.5 Hours on 09/26/2023,n 09/26/2023,6.50,Tuesday,Weekly Hourly,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,6.50,6585,50,1412301
76557,07f132c6a9069000cd4715fc58590001,6585,Wayne Stevenson,Wayne Stevenson (6585),5.5 Hours,5.5 Hours on 09/27/2023,n 09/27/2023,5.50,Wednesday,Weekly Hourly,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,5.50,6585,50,1412301
76558,07f132c6a9069000cd471695eeb60000,6585,Wayne Stevenson,Wayne Stevenson (6585),5.75 Hours,5.75 Hours on 09/26/2023,on 09/26/2023,5.75,Tuesday,Weekly Hourly,...,Time,7a55a775b63b4b9b8a5b3722fc1e4556,Hours,c4dacbd56bca4a9a8950e8d3ed21bbdb,Hours,True,5.75,6585,50,1412301
