### Import Libraries

In [None]:
#import libraries
import os
import pandas as pd
import numpy as np
from datetime import datetime, date
import warnings
warnings.simplefilter("ignore")

***

### Data Acquisition

In [None]:
# Root folder of each CDM report type; every .xlsx file found under a folder is loaded later.
team_report_fold   = "../CDM Reports/TeamReport1/" #folder of the team reports
user_report_fold   = "../CDM Reports/UserReport1/" #folder of the user reports
team_task_fold     = "../CDM Reports/TeamTasks1/" #folder of the team task report
task_history_fold  = "../CDM Reports/TaskHistory1/" #folder of the task history report
jasper_report_fold = "../CDM Reports/JasperReport1/" #folder of the jasper report

In [None]:
# Order matters: the loaded frames are later picked out of final_dfs by position (0..4) in this same order.
folder_list = [team_report_fold, user_report_fold, team_task_fold, task_history_fold, jasper_report_fold]

In [None]:
# read data
# For each report folder, walk through all directories and subdirectories, read every
# Excel workbook found and stack the workbooks into a single data frame.
# final_dfs ends up with one data frame per folder, in the same order as folder_list.
final_dfs = []
for folder in folder_list:
    temp_dfs = []
    for root, dirs, files in os.walk(folder):
        # os.walk does not guarantee an order; sorting in place fixes the traversal order
        # so the row order of the concatenated frame is the same on every run
        dirs.sort()
        for file in sorted(files):
            # "~$<name>.xlsx" is the lock file Excel leaves beside an open workbook;
            # it has the .xlsx extension but is not a readable workbook
            if file.endswith('.xlsx') and not file.startswith('~$'):
                # read the file into a data frame and append it to the list of data frames
                df = pd.read_excel(os.path.join(root, file))
                temp_dfs.append(df)

    # pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
    # so name the folder that produced nothing instead
    if not temp_dfs:
        raise ValueError("No .xlsx files found under %r" % folder)

    # concat all data frames into a single data frame
    df = pd.concat(temp_dfs, ignore_index=True)
    final_dfs.append(df)

In [None]:
# final_dfs holds one frame per entry of folder_list, in the same order
(
    team_report_df,
    user_report_df,
    team_tasks_df,
    task_history_df,
    jasper_report_df,
) = final_dfs

In [None]:
# Load the three configuration sheets of Data.xlsx; passing a list of sheet names
# makes read_excel return a dict keyed by sheet name
config_sheets = pd.read_excel("Data.xlsx", sheet_name=["Base", "Bulk Creation", "Generic Volume"])
dashboard_df = config_sheets["Base"]
bc_list_df = config_sheets["Bulk Creation"]
gv_list_df = config_sheets["Generic Volume"]

***

### Data Cleaning

In [None]:
#drop every row that is missing at least one of: task ID, case number, process type
required_columns = ["Task ID", "Case No", "Process Type"]
team_report_df = team_report_df.dropna(subset=required_columns)
user_report_df = user_report_df.dropna(subset=required_columns)

In [None]:
#keep only the rows whose process type contains the "SG - " marker
sg_marker = "SG - "
is_sg_team_row = team_report_df["Process Type"].str.contains(sg_marker)
is_sg_user_row = user_report_df["Process Type"].str.contains(sg_marker)
team_report_df = team_report_df[is_sg_team_row]
user_report_df = user_report_df[is_sg_user_row]

In [None]:
#strip whitespace from all text columns -- DISABLED:
#the statements below sit inside a triple-quoted string, so none of them are executed.
#If re-enabled, str.strip() would remove leading as well as trailing whitespace from every object-dtype column.
'''team_report_df = team_report_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
user_report_df = user_report_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
team_tasks_df = team_tasks_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
dashboard_df = dashboard_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
bc_list_df = bc_list_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
gv_list_df = gv_list_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)'''

In [None]:
#a case that still shows up in the Team Task Report is still open: leave it out of the team report
open_case_numbers = team_tasks_df["Case No"].tolist()
is_still_open = team_report_df["Case No"].isin(open_case_numbers)
team_report_df = team_report_df[~is_still_open]

In [None]:
#parse the date columns of both reports into datetime values
for frame, date_columns in (
    (team_report_df, ["Task Closure Date", "Task Assigned to Date"]),
    (user_report_df, ["Task Assigned to Date"]),
):
    for column in date_columns:
        frame[column] = pd.to_datetime(frame[column])

In [None]:
#processes and teams shown on the dashboard, taken from the "Base" sheet
process_list = dashboard_df["Process Type"].to_list()
team_list = dashboard_df["Team"].to_list()

In [None]:
#exception rules for bulk creation and generic volume: one inner list per sheet row
bc_list = bc_list_df.values.tolist()
gv_list = gv_list_df.values.tolist()

***

### Data Cleaning: Dates - Part 1 (Remove Weekends)

In [None]:
#order both reports by case, task and assignment date (ascending); the last key breaks ties on time taken
team_sort_keys = ["Case No", "Task ID", "Task Assigned to Date", "Time taken (in days) aggregated"]
user_sort_keys = ["Case No", "Task ID", "Task Assigned to Date", "Actual Time Taken"]
team_report_df = team_report_df.sort_values(by=team_sort_keys)
user_report_df = user_report_df.sort_values(by=user_sort_keys)

In [None]:
#organize into a list of cases, and a list of tasks ordered by the respective cases
case_list = pd.unique(team_report_df["Case No"]).tolist()
#one groupby pass instead of filtering the whole report once per case;
#sort=False keeps the cases in order of first appearance, and unique() keeps each
#case's task IDs in order of first appearance
tasks_per_case = team_report_df.groupby("Case No", sort=False)["Task ID"].unique()
task_list_temp = [task_ids.tolist() for task_ids in tasks_per_case]
#flatten: task IDs of the first case, then of the second case, and so on
task_list = [item for sublist in task_list_temp for item in sublist]

In [None]:
#recalculating the number of days taken - excluding weekends. to be in line with CDM logic, public holidays will not be excluded
#For every task, builds the list [assignment date 1, ..., assignment date k, closure date] and
#stores the k gaps between neighbouring dates, once in business days and once in calendar days.
new_time_taken_temp = []
calendar_days_temp = []
for task in task_list:
    #all rows of ONE task; their order is the current row order of team_report_df
    temp_df = team_report_df[team_report_df["Task ID"]==task]
    #dates on which the task was assigned to a team
    assigned_dates = pd.to_datetime(temp_df["Task Assigned to Date"]).tolist()
    #add the task closure date to the end of the list.
    #Taking the latest value is deterministic; picking element 0 of a set would return an
    #arbitrary date whenever the rows of a task disagree on the closure date.
    closure_date = pd.to_datetime(temp_df["Task Closure Date"]).max()
    assigned_dates.append(closure_date)

    #find difference between adjacent dates in the list (1. diff by business days, 2. diff by calendar days)
    date_pairs = list(zip(assigned_dates[:-1], assigned_dates[1:]))
    #np.busday_count uses its default weekmask here (Monday to Friday) and no holiday list
    new_time_taken_temp.append([np.busday_count(start.date(), end.date()) for start, end in date_pairs])
    calendar_days_temp.append([(end.date() - start.date()).days for start, end in date_pairs])

In [None]:
#store the recalculated durations as new columns: business days (weekends excluded) and calendar days
#the per-task lists are flattened first; values are assigned by position, one per row of team_report_df
new_time_taken = []
for per_task_values in new_time_taken_temp:
    new_time_taken.extend(per_task_values)
calendar_days = []
for per_task_values in calendar_days_temp:
    calendar_days.extend(per_task_values)
team_report_df["New Time Taken"] = new_time_taken
team_report_df["Calendar Days"] = calendar_days

***

### Data Cleaning: Dates - Part 2 (Tasks Pending With)

In [None]:
#this is where we cross-reference with the user report
#flag the rows where calendar days != time taken aggregated (taken to mean the task was reassigned out at some point)
is_assigned_out = team_report_df["Time taken (in days) aggregated"] != team_report_df["Calendar Days"]
#.copy() makes the flagged rows an independent frame, so a later column assignment on it
#is not a write into a slice of team_report_df
assigned_out_df = team_report_df[is_assigned_out].copy()

In [None]:
#compare to user report to get the accurate time taken
#get list of all the flagged tasks
assigned_out_task_list = pd.unique(assigned_out_df["Task ID"]).tolist()
#one inner list of business-day gaps per flagged task, in assigned_out_task_list order
new_user_time_taken_temp = []

#loop through tasks in the flagged task list
for task in assigned_out_task_list:
    temp_df = team_report_df[team_report_df["Task ID"]==task]
    temp_user_df = user_report_df[user_report_df["Task ID"]==task]

    #all dates on which the task was (re)assigned according to the user report, followed by the closure date
    assigned_user_dates = pd.to_datetime(temp_user_df["Task Assigned to Date"]).tolist()
    #the closure date comes from the team report. Taking the latest value is deterministic;
    #picking element 0 of a set would return an arbitrary date whenever the rows disagree.
    closure_date = pd.to_datetime(temp_df["Task Closure Date"]).max()
    assigned_user_dates.append(closure_date)

    #find difference between adjacent dates in the list, in business days
    date_pairs = zip(assigned_user_dates[:-1], assigned_user_dates[1:])
    new_user_time_taken_temp.append([np.busday_count(start.date(), end.date()) for start, end in date_pairs])

In [None]:
#flatten the per-task lists into one flat list of durations
#NOTE(review): the original author observed that this list can come out longer than expected;
#pandas raises ValueError on the column assignment below if the lengths differ
new_user_time_taken = [item for sublist in new_user_time_taken_temp for item in sublist]
#match back to user report since calculations were made with user report data
#.copy() makes this an independent frame, so adding a column is not a write into a slice of user_report_df
user_report_df1 = user_report_df[user_report_df["Task ID"].isin(assigned_out_task_list)].copy()
#values are assigned by position, one per row of user_report_df1
user_report_df1["Time Taken (User Report)"] = new_user_time_taken

In [None]:
#get a list of the total time (.sum()) taken per task per team, based on the user report. store in time_list
#sum once per (task, team) pair instead of filtering the whole user report for every flagged row
time_per_task_team = (
    user_report_df1
    .groupby(["Task ID", "Team"])["Time Taken (User Report)"]
    .sum()
    .to_dict()
)
#one entry per row of assigned_out_df, in row order;
#a (task, team) pair with no user-report rows contributes 0, like .sum() over an empty selection
time_list = [
    time_per_task_team.get((task, team), 0)
    for task, team in zip(assigned_out_df["Task ID"], assigned_out_df["Team"])
]

In [None]:
#"New Time Taken" for flagged tasks are immediately replaced with the Time Taken based on the User Report
assigned_out_df = assigned_out_df.assign(**{"New Time Taken": time_list})  #values are matched to rows by position

In [None]:
#put the corrected rows of assigned_out_df back into team_report_df
#1. drop the rows of team_report_df whose index labels appear in assigned_out_df
#2. append the full assigned_out_df to what is left

corrected_labels = assigned_out_df.index
untouched_rows = team_report_df.drop(index=corrected_labels)
team_report_df = pd.concat([untouched_rows, assigned_out_df])

In [None]:
#per row, keep the smaller of the recalculated time and the aggregated time, to account for tasks that were assigned out to Front/IT/etc.
duration_columns = ["New Time Taken", "Time taken (in days) aggregated"]
team_report_df["Min Time Taken"] = team_report_df[duration_columns].min(axis="columns")

***

### Team Report

#### Bulk Creation

In [None]:
#bulk creation: creating dataframe
#each rule in bc_list is read as (team, process type, task type); collect the team report rows matching each rule
bulk_creation_frames = []
for rule in bc_list:
    rule_mask = ((team_report_df["Team"]==rule[0]) &
                 (team_report_df["Process Type"]==rule[1]) &
                 (team_report_df["Task Type"]==rule[2]))
    bulk_creation_frames.append(team_report_df[rule_mask])

#concatenate once instead of growing the frame inside the loop;
#with no rules at all, fall back to an empty frame that still has the team report columns
if bulk_creation_frames:
    bulk_creation_df = pd.concat(bulk_creation_frames)
else:
    bulk_creation_df = team_report_df.iloc[0:0].copy()

In [None]:
# days of SLA left when the task was completed: SLA minus time taken (a negative value means the SLA was exceeded)
bulk_creation_df['days_overdue'] = bulk_creation_df["Team Defined SLA (in days)"] - bulk_creation_df["Min Time Taken"]

# bulk creation volume of a case = number of distinct task IDs in that case.
# One groupby pass replaces a loop that recomputed the count once per row;
# rows without a group result keep the default volume of 1
bulk_creation_df['total_volume'] = (
    bulk_creation_df.groupby('Case No')['Task ID']
    .transform('nunique')
    .fillna(1)
    .astype("int64")
)

#### Generic Volume

In [None]:
#generic volume: creating dataframe
#each rule in gv_list is read as (team, process type, task type); collect the team report rows matching each rule
generic_volume_frames = []
for rule in gv_list:
    rule_mask = ((team_report_df["Team"]==rule[0]) &
                 (team_report_df["Process Type"]==rule[1]) &
                 (team_report_df["Task Type"]==rule[2]))
    generic_volume_frames.append(team_report_df[rule_mask])

#concatenate once instead of growing the frame inside the loop;
#with no rules at all, fall back to an empty frame that still has the team report columns
if generic_volume_frames:
    generic_volume_df = pd.concat(generic_volume_frames)
else:
    generic_volume_df = team_report_df.iloc[0:0].copy()

In [None]:
#generic volume: preprocessing. blanks become '1'; of a comma-separated entry only the last element is kept
#astype(str) is needed in case every generic volume entry was read as a number
volume_text = generic_volume_df["Generic Volume"].fillna('1').astype(str)
#last entry of generic volume is the latest input in CDM task
latest_volume_text = volume_text.apply(lambda text: text.split(",")[-1])
#via float first so that text such as "3.0" converts cleanly to an integer
generic_volume_df["Generic Volume"] = latest_volume_text.astype(float).astype(int)

In [None]:
# days of SLA left when the task was completed: SLA minus time taken (a negative value means the SLA was exceeded)
generic_volume_df['days_overdue'] = generic_volume_df["Team Defined SLA (in days)"] - generic_volume_df["Min Time Taken"]

# generic volume of a case = largest "Generic Volume" value among the rows of that case.
# One groupby pass replaces a loop that recomputed the maximum once per row.
largest_volume_per_case = generic_volume_df.groupby('Case No')['Generic Volume'].transform('max')
# a volume of 0 is counted as 1; rows without a group result keep the default volume of 1
generic_volume_df['total_volume'] = largest_volume_per_case.replace(0, 1).fillna(1).astype("int64")

#### Case Volume

In [None]:
#case volume: whatever is left of the team report once the generic volume and bulk creation rows are removed
#errors='ignore' skips index labels that are not present in the frame being dropped from
case_volume_df = (
    team_report_df
    .drop(index=generic_volume_df.index, errors='ignore')
    .drop(index=bulk_creation_df.index, errors='ignore')
)

In [None]:
# SLA minus time taken: a negative value means the task went past its SLA
sla_days = case_volume_df["Team Defined SLA (in days)"]
time_taken_days = case_volume_df["Min Time Taken"]
case_volume_df['days_overdue'] = sla_days - time_taken_days

# every row in this frame is given a volume of 1
case_volume_df['total_volume'] = 1

#### Others

In [None]:
# stack the bulk creation, generic volume and case volume frames into one
volume_frames = [bulk_creation_df, generic_volume_df, case_volume_df]
team_report_final_df = pd.concat(volume_frames)

In [None]:
# create column for final volume: the largest total_volume found among the rows of the same case.
# One groupby pass replaces a loop that recomputed the maximum once per row;
# rows without a group result keep the default volume of 1
team_report_final_df['final_volume'] = (
    team_report_final_df.groupby('Case No')['total_volume']
    .transform('max')
    .fillna(1)
    .astype("int64")
)

In [None]:
# 'Yes' when days_overdue is below zero, 'No' otherwise (a missing value also gives 'No')
is_overdue = team_report_final_df['days_overdue'].lt(0)
team_report_final_df['overdue'] = is_overdue.map({True: 'Yes', False: 'No'})

In [None]:
# convert portfolios to list: blanks become '0', comma-separated entries become a list of portfolio numbers
portfolio_text = team_report_final_df["Portfolio Number"].fillna('0')
# a cell holding a single portfolio number can be read from Excel as a number, and numbers
# have no .split(); converting to text first keeps such rows from raising AttributeError
# NOTE(review): a float cell would become text like "123.0" - confirm the column is never read as float
portfolio_text = portfolio_text.astype(str)
team_report_final_df["Portfolio Number"] = portfolio_text.str.split(",")

In [None]:
# one row per element of each "Portfolio Number" list; the other column values are repeated on every resulting row
team_report_final_df = team_report_final_df.explode("Portfolio Number")

***

### User Report

In [None]:
# group by case number -- DISABLED: every line below is commented out, so this cell does nothing.
# If re-enabled, it would add a 'time_taken' column holding the largest 'Actual Time Taken' of each case.
#user_report_df['time_taken'] = 0
#for case_no in user_report_df['Case No'].tolist():
#    time = user_report_df[user_report_df['Case No']==case_no]['Actual Time Taken'].max()
#    user_report_df.loc[user_report_df['Case No']==case_no,'time_taken'] = time

In [None]:
# separate time taken into bins: each bin column is 1 when the row falls in that bin, else 0
# (a row with a missing time gets 0 in every bin).
# The intervals are half-open, [0,4) [4,7) [7,9) [9,inf), so a fractional value such as 3.5
# lands in exactly one bin; whole-number values are binned as before.
time_taken = user_report_df['Actual Time Taken']
user_report_df['Bin_0-3'] = (time_taken < 4).astype("int64")
user_report_df['Bin_4-6'] = ((time_taken >= 4) & (time_taken < 7)).astype("int64")
user_report_df['Bin_7-8'] = ((time_taken >= 7) & (time_taken < 9)).astype("int64")
user_report_df['Bin_9+'] = (time_taken >= 9).astype("int64")

In [None]:
# display the user report, including the bin columns
user_report_df

In [None]:
# (number of rows, number of columns) of the user report
user_report_df.shape

***

### Task History

In [None]:
# convert portfolios to list (task history - for rejection reason), then give every portfolio its own row
portfolio_text = task_history_df["Portfolio"].fillna('0')
# a cell holding a single portfolio number can be read from Excel as a number, and numbers
# have no .split(); converting to text first keeps such rows from raising AttributeError
# NOTE(review): a float cell would become text like "123.0" - confirm the column is never read as float
portfolio_text = portfolio_text.astype(str)
task_history_df["Portfolio"] = portfolio_text.str.split(",")
task_history_df = task_history_df.explode("Portfolio")

In [None]:
# convert assigned teams to list (task history - for rejection reason), then give every team its own row
# blanks become '0'; several teams in one cell are separated by " ; "
team_column = "Assigned To Team"
team_text = task_history_df[team_column].fillna('0')
task_history_df[team_column] = team_text.apply(lambda text: text.split(" ; "))
task_history_df = task_history_df.explode(team_column)

In [None]:
# convert rejection reasons to list (task history - for rejection reason), then give every reason its own row
# blanks become '0'; several reasons in one cell are separated by ","
reason_column = "Rejection Reason"
reason_text = task_history_df[reason_column].fillna('0')
task_history_df[reason_column] = reason_text.apply(lambda text: text.split(","))
task_history_df = task_history_df.explode(reason_column)

***

### Merge Reports

In [None]:
# merge all the dataframes
# team report x jasper report: inner join on the portfolio number,
# so only rows whose portfolio number appears in both reports are kept
merged_df1 = pd.merge(
    team_report_final_df,
    jasper_report_df,
    how='inner',
    on='Portfolio Number',
    suffixes=('_tr', '_jr'),
)
# merge with user report (disabled)
#merged_df1 = merged_df1.merge(user_report_df, how='inner', on='')
# merge with task history report (disabled)
#merged_df1 = merged_df1.merge(task_history_df, how='inner', on='Task ID')

In [None]:
# task history x jasper report: outer join, so rows without a match on either side are kept
merged_df2 = pd.merge(
    task_history_df,
    jasper_report_df,
    how='outer',
    left_on='Portfolio',
    right_on='Portfolio Number',
    suffixes=('_th', '_jp'),
)

***

### Output

In [None]:
# Write the three result tables as Excel workbooks, without the index column.
output_dir = "../Output"
# to_excel does not create missing folders, so make sure the target folder exists first
os.makedirs(output_dir, exist_ok=True)
merged_df1.to_excel(os.path.join(output_dir, "tr-jr_merged.xlsx"), index=False)
merged_df2.to_excel(os.path.join(output_dir, "th-jr_merged.xlsx"), index=False)
user_report_df.to_excel(os.path.join(output_dir, "user_report.xlsx"), index=False)
print("Completed.")