This is code to import, clean, and combine the time sheet datasets. The code is designed to harvest files from a subfolder titled orig. It processes all files in the folder with a CSV or an XLSX file extension.

Each file is imported individually. A number of columns are removed that don't appear to be statistically significant.
   'username', 'payroll_id', 'fname', 'lname', 'number', 'group','local_day', 'local_start_time', 'local_end_time', 'tz', 'location', 'notes', 'approved_status'.
   
Each cleaned file is stored individually. All clean timesheets are then combined into a single dataframe, which is saved separately as combined.dat.

The employee names are replaced with a unique identifier (EMP_1, EMP_2, etc.). The employee details are stored in a separate file for later reuse. Based on additional information, the salaried employees have been identified separately, and a salary variable is engineered (1 or 0) based on an employee's salaried status.

The final output is keys.json which is a dictionary of employee attributes and combined.dat which represents a cleaned dataframe of all significant variables.

In [10]:
# //*** Initial Data Prep and light obfuscate
# //*** Removed Unneeded Columns
# //*** Removed Individual Identification
# //*** Added EmployeeID Proxy to each Dataframe
# //*** Added Salaried field as additional information from data source
# //*** Combined (concatenated) All Data frames
# //*** Saved Combined and Individual data frames separately so original files can be deleted
import xlrd # pandas dependency
import numpy as np
import pandas as pd
import os
import json
###

# Managers - Salaried, Everyone Else is Hourly
# Ari Cynthia  Gina Marina Michelle Rosemary
# Hourly
# Dominique Fiona Francis Jim  Megan Micaela

# //*** Shared state for the whole notebook
g = {
    # //*** Columns dropped from every timesheet before it is saved
    'remove_cols': ['username', 'payroll_id', 'fname', 'lname', 'number', 'group','local_day', 'local_start_time', 'local_end_time', 'tz', 'location', 'notes', 'approved_status'],
    # //*** Generic employee id (EMP_<n>) -> original employee details
    'obfuscate': {},
    # //*** One cleaned dataframe per processed file
    'df' : [],
    # //*** First names of salaried employees, matched against the file names
    'salary' : [ 'Ari', 'Cynthia', 'Gina', 'Marina', 'Michelle', 'Rosemary' ]
}
curDir = os.getcwd()

# //*** Build paths with os.path.join instead of hard-coded "\\" separators so
# //*** the notebook is not tied to Windows
origDir = os.path.join(curDir, "orig")

# //*** os.listdir returns names in arbitrary order; sorting keeps the EMP_<n>
# //*** numbering (and the salary matching below) stable between runs
fileList = sorted(os.listdir(origDir))

employee_counter = 0

for fname in fileList:
    # //*** Decide by the real file extension. A substring test would treat a
    # //*** name containing both ".xlsx" and ".csv" as two files, and would
    # //*** also pick up names such as "report.csv.bak".
    extension = os.path.splitext(fname)[1].lower()
    file_path = os.path.join(origDir, fname)

    if extension == ".xlsx":
        # //*** Reads the worksheet at index 1 (the second sheet)
        loop_df = pd.read_excel(file_path, sheet_name=1)
    elif extension == ".csv":
        loop_df = pd.read_csv(file_path)
    else:
        # //*** Anything else in the folder is ignored
        continue

    # //*** Each employee is EMP_ with a number
    employee_counter = employee_counter + 1
    generic_name = f"EMP_{employee_counter}"

    # //*** Classify Employee as Salaried: the first salaried name that occurs
    # //*** in the file name (case-insensitive) wins.
    lowered_fname = fname.lower()
    matched_name = next(
        (name for name in g['salary'] if name.lower() in lowered_fname),
        None,
    )
    salary = 0
    if matched_name is not None:
        # //*** Drop the name from the list so that a later file cannot match
        # //*** the same salaried employee again.
        g['salary'].remove(matched_name)
        salary = 1

    # //*** Keep the identifying details (taken from the first row) so the
    # //*** generic name can be mapped back to the person later.
    loop_user_details = {
        'generic_name': generic_name,
        'fname': loop_df.loc[0, 'fname'],
        'lname': loop_df.loc[0, 'lname'],
        'group': loop_df.loc[0, 'group'],
        'file': generic_name + ".dat",
        'salary': salary
    }

    # //*** Remove unneeded columns
    loop_df = loop_df.drop(columns=g['remove_cols'])

    # //*** Tag every row with the generic employee name and salary flag.
    # //*** Assigning a scalar broadcasts it to all rows, so no per-row list
    # //*** has to be built.
    loop_df['emp_name'] = generic_name
    loop_df['salary'] = salary

    # //*** Save the cleaned timesheet on its own, then keep it for combining
    loop_df.to_csv(generic_name+".dat")

    g['obfuscate'][generic_name] = loop_user_details

    g['df'].append(loop_df)
        
#print(g['obfuscate'])


In [11]:
# //*** Persist the generic-name -> employee-details mapping as keys.json
with open('keys.json', 'w') as key_file:
    key_payload = json.dumps(g['obfuscate'])
    key_file.write(key_payload)

In [12]:
# //*** Stack every employee's cleaned timesheet into one data frame,
# //*** discard the two flag columns, and save the result as combined.dat
flag_columns = ['has_flags', 'flag_types']
combined_df = pd.concat(g['df']).drop(flag_columns, axis=1)

combined_df.to_csv("combined.dat")

In [13]:
# //*** Show the combined data frame as a quick visual check of the result
print(combined_df)

               local_date  hours                    jobcode_1  \
0     2019-01-01 00:00:00   8.00                      Holiday   
1     2019-01-02 00:00:00   1.75                 Gabel Energy   
2     2019-01-02 00:00:00   1.08  ALTUS Architecture + Design   
3     2019-01-02 00:00:00   1.58                 Gabel Energy   
4     2019-01-02 00:00:00   1.25        BDE Architecture Inc.   
...                   ...    ...                          ...   
2796  2019-12-25 00:00:00   8.00                      Holiday   
2797  2019-12-26 00:00:00   7.20                     Vacation   
2798  2019-12-27 00:00:00   7.20                     Vacation   
2799  2019-12-30 00:00:00   7.20                     Vacation   
2800  2019-12-31 00:00:00   7.20                     Vacation   

                                     jobcode_2  \
0                                          NaN   
1                                          NaN   
2     19009 - Munson Residence and Guest House   
3                  