# Vendor Work Orders: Analyzing Process Flow in Maximo

In [None]:
import numpy as np
import pandas as pd
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import regex as re
import multiprocessing as mp
import gc
from floweaver import *
import datetime as dt
import os

pd.set_option('display.max_columns', None)

### Load Work Order Status Changes

In [None]:
# Load the exported work-order status history, then convert the CHANGEDATE
# column to pandas datetimes so it can be used in date arithmetic.
status_df = pd.read_csv('Data/Vendor_WO_Statuses_12222022.csv')
status_df['CHANGEDATE'] = pd.to_datetime(status_df['CHANGEDATE'])

In [None]:
# `grouped` is the (WONUM, rows) grouper handed to status_by_prev / status_by_step.
# NOTE(review): `wos` is not referenced again in the cells shown here.
wos = list(status_df['WONUM'].unique()) #List distinct WO numbers
grouped = status_df.groupby('WONUM') #Group status changes by work order number

#### Define routines to group statuses by previous status and step number since work order was created

In [None]:
def status_by_prev(grouper, status, outdir=None):
    """Collect the status rows that immediately follow `status` and save them as CSV.

    For every work order in `grouper`, the rows are ordered by WOSTATUSID and
    each row that comes directly after an occurrence of `status` is kept. The
    combined rows are written to ``<outdir>/<status>.csv``.

    Parameters
    ----------
    grouper : iterable of (WONUM, DataFrame) pairs
        For example ``status_df.groupby('WONUM')``. Each frame must contain
        the WOSTATUSID and STATUS columns.
    status : str
        Status code whose follow-up rows are collected.
    outdir : str, optional
        Directory for the output file; the current directory when omitted.

    Returns
    -------
    str
        A completion message naming the processed status.
    """
    def get_status_followups(groupby_group, status_of_interest):
        """Return the rows that follow `status_of_interest` in one work order.

        Returns None when the status never occurs in the work order, and an
        empty frame when it only occurs on the final row.
        """
        wonum, df = groupby_group

        group = df.sort_values('WOSTATUSID').reset_index()
        # A boolean mask (rather than DataFrame.query on an f-string) keeps a
        # status code containing a quote from breaking the expression.
        status_indices = list(group.index[group['STATUS'] == status_of_interest])
        if not status_indices:
            return None

        # An occurrence on the last row has nothing after it, so it is skipped.
        next_positions = [i + 1 for i in status_indices if i + 1 < len(group)]
        try:
            return group.iloc[next_positions]
        except IndexError as err:
            print(f"Error on Work Order #{wonum}: {err}")
            return None

    print("Running status %s" % status)

    # Collect the per-work-order frames and concatenate once: growing a
    # DataFrame inside the loop re-copies every accumulated row per iteration.
    frames = []
    for group in tqdm.tqdm(grouper):
        followups = get_status_followups(group, status)
        if followups is not None:
            frames.append(followups)
    follows_df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    # The helper's reset_index() already added an 'index' column (original row
    # labels); this second reset_index() stores the within-work-order position
    # as 'level_0', and the drop removes the original-label column. Kept as-is
    # so the CSV layout matches files produced by earlier runs.
    out_path = os.path.join(outdir or '', f'{status}.csv')
    follows_df.reset_index().drop(columns=['index']).to_csv(out_path)

    return "Completed status %s" % status

In [None]:
def status_by_step(grouper, step_number, outdir=None):
    """Save the `step_number`-th status row of every work order to an Excel file.

    Each work order's rows are ordered by WOSTATUSID; the row at 1-based
    position `step_number` is kept, and work orders with fewer rows are
    skipped. The result is written to
    ``<outdir>/_step_<step_number>_no_statuscount_restriction.xlsx``.

    Parameters
    ----------
    grouper : iterable of (WONUM, DataFrame) pairs
        For example ``status_df.groupby('WONUM')``. Each frame must contain
        the WOSTATUSID column.
    step_number : int
        1-based position of the status change to extract.
    outdir : str, optional
        Directory for the output file; the current directory when omitted.

    Returns
    -------
    str
        A completion message naming the processed step.
    """
    print(f"Step number {step_number}")

    def get_status_by_step(groupby_group, step_number):
        """Return a one-row frame for the requested step, or None if absent."""
        group = groupby_group[1].sort_values('WOSTATUSID')
        # Explicit bounds check: a step of 0 or below would otherwise wrap
        # around and silently pick a row counted from the end.
        if not 1 <= step_number <= len(group):
            return None
        # Double brackets keep the row as a DataFrame and preserve column dtypes.
        return group.iloc[[step_number - 1]]

    # Collect the rows and concatenate once instead of growing a DataFrame
    # inside the loop, which re-copies the accumulated rows on every iteration.
    rows = []
    for group in tqdm.tqdm(grouper):
        step_df = get_status_by_step(group, step_number)
        if step_df is not None:
            rows.append(step_df)
    outdf = pd.concat(rows, axis=0) if rows else pd.DataFrame()

    out_path = os.path.join(outdir or '', f'_step_{step_number}_no_statuscount_restriction.xlsx')
    outdf.reset_index(drop=True).to_excel(out_path)

    return f"Completed step {step_number}"
        

#### Create output directories

In [None]:
# Create every directory the notebook writes to. 'Outputs/_tables' is included
# because the next-status breakdown workbook is exported there.
# exist_ok=True makes the cell safe to re-run.
for path in ['Outputs', 'Outputs/_STATUS_BY_PREV_STATUS', 'Outputs/_STATUS_BY_STEP', 'Outputs/_STATUS_BY_NEXT_STATUS', 'Outputs/_tables']:
    os.makedirs(path, exist_ok=True)

#### Determine which steps, statuses have not yet been processed

In [None]:
def searchif(pattern, item):
    """Return the text of the first match of `pattern` in `item`, or None if there is no match."""
    found = re.search(pattern, item)
    return None if found is None else found[0]
    
# Look for earlier results under Outputs/, where the processing cells write them.
prev_status_files = glob.glob('Outputs/_STATUS_BY_PREV_STATUS/*')
# Output files are named '<STATUS>.csv'; taking the stem with os.path works with
# either path separator.
prev_statuses = [os.path.splitext(os.path.basename(item))[0] for item in prev_status_files]
unrun_status_list = [item for item in list(status_df['STATUS'].unique()) if item not in prev_statuses]


step_files = glob.glob('Outputs/_STATUS_BY_STEP/*')
# Step workbooks are named '_step_<n>_no_statuscount_restriction.xlsx'. Extract
# <n> as an int so it can be compared with the integer step numbers later on.
steps = [
    int(found)
    for found in (searchif(r'(?<=_step_)\d+', os.path.basename(item)) for item in step_files)
    if found is not None
]


##### Create tasks for multiprocessing, and process statuses by previous status and by step

In [None]:
# One (grouper, status, output directory) argument tuple per unprocessed status.
# The directory must be the 'Outputs/...' path created by the directory-setup cell.
TASKS = [(grouped, status, 'Outputs/_STATUS_BY_PREV_STATUS/') for status in unrun_status_list]

**WARNING TO USERS: These processing steps will take a WHILE (upwards of an hour), even with multiprocessing. Consider running them in a separate notebook so that other analytical tasks can proceed.**

In [None]:
#Get DFs by preceding status

# Run the per-status tasks on a pool of 10 worker processes. starmap unpacks
# each (grouper, status, outdir) tuple from TASKS into status_by_prev's
# arguments and returns the list of completion messages.
with mp.Pool(10) as pool:
    #Trying with map this time -- actually, starmap, to permit multiple args
    results = pool.starmap(status_by_prev, TASKS)
    
    # starmap has already returned at this point; the `with` block also
    # cleans up the pool on exit.
    pool.close()
    

In [None]:
#Get DFs by Step

# `steps` comes from parsing output file names, so its entries may be strings
# or None. Normalise to ints before comparing with the integer step numbers;
# otherwise the membership test never matches and every step is re-run.
_completed_steps = {int(step) for step in steps if step is not None}
unrun_step_list = [step for step in range(1, 10) if step not in _completed_steps]

# One (grouper, step number, output directory) argument tuple per unprocessed step.
TASKS = [(grouped, i, 'Outputs/_STATUS_BY_STEP/') for i in unrun_step_list]

with mp.Pool(10) as pool:
    # starmap unpacks each tuple into status_by_step's arguments and returns
    # once all tasks have finished. (pool.apply_async per task was the
    # alternative considered earlier.)
    results = pool.starmap(status_by_step, TASKS)
    


### Examining Results with Sankey Plot

Also known as a Flow Diagram, a Sankey Plot visualizes state transitions among discrete objects or groups. In this case, we are visualizing the flow of work orders between statuses -- where they began, where they went next, etc.

Although we are capable of diagramming up to nine steps with our outputs from above, the vast majority of work orders are closed or canceled after four steps at most. As such, we have restricted our visualization for clarity.


In [None]:
# Pair each step workbook (in sorted file-name order) with its step number 1..9.
ziplist = list(zip(sorted(glob.glob('Outputs/_STATUS_BY_STEP/*')), range(1,10)))

In [None]:
# Start from the first step's workbook, keeping the descriptive columns and
# labelling its status column STATUS_1.
steps_df = pd.read_excel(ziplist[0][0])[['WONUM','FAILURECODE','PROBLEMCODE','LOCATION','LATEST_STATUS','STATUS']].rename(
    columns={'STATUS': 'STATUS_1'}
)

# Left-join each later step's status onto the table as STATUS_<step>, keyed by WONUM.
for path, index in tqdm.tqdm(ziplist[1:]):
    this_df = pd.read_excel(path)[['WONUM','STATUS']].rename(columns={'STATUS': f'STATUS_{index}'})
    steps_df = pd.merge(steps_df, this_df, on='WONUM', how='left')
    

#### Group statuses by category

In [None]:
# Set of status codes (including the literal string 'nan').
# NOTE(review): `max_status` is not referenced again in the cells shown here.
max_status = {'APPR',
 'CAN',
 'CAPWORK',
 'CLOSE',
 'COMP',
 'DISP',
 'FAILREVIEW',
 'FAILSCH',
 'HISTEDIT',
 'INPRG',
 'INSREV',
 'PENDREVIEW',
 'PLANDOC',
 'RESUB',
 'REVIEW',
 'SBMTAGAPPR',
 'SBMTMGAPPR',
 'SCHED',
 'VIFAILRV',
 'WAPPR',
 'WMATL',
 'WTSCH',
 'nan'}

# Display category -> list of status codes that belong to it.
cat_status = {'Closed or Canceled':['CLOSE','nan', 'CAN'],
 'Awaiting Scheduling':['WTSCH'],
 'Approved': ['APPR'],
 'Scheduled':['SCHED'],
 'In Progress':['INPRG'],
 'Inspection Pending':['INSREV','PENDREVIEW','REVIEW', 'VIFAILRV','FAILREVIEW'],
 'Other' : ['CAPWORK','COMP','DISP','HISTEDIT','PLANDOC','RESUB','SBMTAGAPPR','SBMTMGAPPR','WAPPR','WMATL', 'FAILSCH']}


def reverse_dict(this_dict):
    """Invert a mapping so that each value points back to its key.

    A list value is expanded: every element of the list becomes a key of the
    result. Any other value is used as a single key. When the same value
    appears under several keys, the key seen last wins.
    """
    inverted = {}
    for key, value in this_dict.items():
        members = value if isinstance(value, list) else [value]
        inverted.update(dict.fromkeys(members, key))
    return inverted

# Lookup from status code to its display category (the inverse of cat_status).
status_cats = reverse_dict(cat_status)

In [None]:
# Keep the columns from position 4 onward (LATEST_STATUS followed by the
# STATUS_<n> columns, given how steps_df is assembled above) and fill missing
# values with 'CLOSE'.
t = steps_df[steps_df.columns[4:]].fillna('CLOSE')

# Replace each step's status code with its display category; the first column
# (LATEST_STATUS) is left untouched.
# NOTE(review): codes absent from status_cats become NaN here and such rows
# would then be excluded by the groupby below — confirm that is intended.
for col in t.columns[1:]:
    t[col] = t[col].map(status_cats)
    
# Count rows per distinct sequence of step categories; after this the
# LATEST_STATUS column holds that count rather than a status.
t = t.groupby(list(t.columns[1:])).count().reset_index()

In [None]:
#To visualize transitions between steps: use Sankey diagram (the one with flows)
#Join step tables by wonum, and track status across steps

import plotly.graph_objs as go
%matplotlib inline

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
# One node per (step, category) pair, labelled '<step>_<category>'. The last
# column of `t` is skipped because it holds counts, not a step.
node_labels = [
    f"{col.split('_')[1]}_{item}"
    for col in t.columns[0:-1]
    for item in t[col].unique()
]

# Node label -> its position in node_labels (the index used by the Sankey trace).
node_dict = {label: position for position, label in enumerate(node_labels)}

In [None]:
steps_for_diagram = t.copy()

# Turn every step column into node indices: prefix each category with its step
# number to form the node label, then look the label up in node_dict.
for col in steps_for_diagram.columns[0:-1]:
    step_number = col.split('_')[1]
    steps_for_diagram[col] = (
        steps_for_diagram[col]
        .map(lambda x: f'{step_number}_{x}')
        .map(node_dict)
    )

In [None]:
# Flat source/target node lists built by stacking step columns: sources come
# from all but the last six columns, targets from the same window shifted one
# column to the right.
# NOTE(review): in the cells shown here these three names are only referenced
# by the string-quoted (inactive) figure; the active figure uses `links`.
source_nodes = []
target_nodes = []

for col in steps_for_diagram.columns[0:-6]:
    source_nodes += steps_for_diagram[col].to_list()
    
for col in steps_for_diagram.columns[1:-5]:
    target_nodes += steps_for_diagram[col].to_list()
    
# NOTE(review): `values` has one entry per row, whereas source_nodes and
# target_nodes hold one entry per row for every stacked column, so the lengths
# differ whenever more than one column is stacked — confirm before reusing.
values = steps_for_diagram['LATEST_STATUS']


In [None]:
# Total count flowing from each step-1 node to each step-2 node.
link1 = (
    steps_for_diagram[['STATUS_1','STATUS_2','LATEST_STATUS']]
    .groupby(['STATUS_1','STATUS_2'])
    .sum()
    .reset_index()
    .rename(columns={'STATUS_1': 'source', 'STATUS_2': 'target', 'LATEST_STATUS': 'value'})
)

In [None]:
# Total count flowing from each step-2 node to each step-3 node.
link2 = (
    steps_for_diagram[['STATUS_2','STATUS_3','LATEST_STATUS']]
    .groupby(['STATUS_2','STATUS_3'])
    .sum()
    .reset_index()
    .rename(columns={'STATUS_2': 'source', 'STATUS_3': 'target', 'LATEST_STATUS': 'value'})
)

In [None]:
# Total count flowing from each step-3 node to each step-4 node.
link3 = (
    steps_for_diagram[['STATUS_3','STATUS_4','LATEST_STATUS']]
    .groupby(['STATUS_3','STATUS_4'])
    .sum()
    .reset_index()
    .rename(columns={'STATUS_3': 'source', 'STATUS_4': 'target', 'LATEST_STATUS': 'value'})
)

In [None]:
# Total count flowing from each step-4 node to each step-5 node.
link4 = (
    steps_for_diagram[['STATUS_4','STATUS_5','LATEST_STATUS']]
    .groupby(['STATUS_4','STATUS_5'])
    .sum()
    .reset_index()
    .rename(columns={'STATUS_4': 'source', 'STATUS_5': 'target', 'LATEST_STATUS': 'value'})
)

In [None]:
# Total count flowing from each step-5 node to each step-6 node.
link5 = (
    steps_for_diagram[['STATUS_5','STATUS_6','LATEST_STATUS']]
    .groupby(['STATUS_5','STATUS_6'])
    .sum()
    .reset_index()
    .rename(columns={'STATUS_5': 'source', 'STATUS_6': 'target', 'LATEST_STATUS': 'value'})
)

In [None]:
# Total count flowing from each step-6 node to each step-7 node.
link6 = (
    steps_for_diagram[['STATUS_6','STATUS_7','LATEST_STATUS']]
    .groupby(['STATUS_6','STATUS_7'])
    .sum()
    .reset_index()
    .rename(columns={'STATUS_6': 'source', 'STATUS_7': 'target', 'LATEST_STATUS': 'value'})
)

In [None]:
# Only the first three transitions (steps 1->2, 2->3, 3->4) are charted;
# link4 through link6 are not included here.
links = pd.concat([link1, link2, link3])

In [None]:
# Imported explicitly so this cell does not depend on a wildcard import
# happening to provide hex_to_rgb.
from plotly.colors import hex_to_rgb

# Display colour (hex) for each status category.
cat_color = {'Closed or Canceled':'#280955',
 'Awaiting Scheduling':'#57A4B1',
 'Approved': '#FADE89',
 'Scheduled':'#B0D894',
 'In Progress':'#548531',
 'Inspection Pending':'#b16457',
 'Other' : '#808080'}

# node_labels look like '<step>_<category>'; the chart shows only the category.
chart_labels = [label.split('_')[1] for label in node_labels]
node_colors = [cat_color[item] for item in chart_labels]


node_label_color = {x:y for x, y in zip(node_labels, node_colors)}
# links.source holds integer node positions (the values of node_dict), not
# label strings, so the colour is looked up by position in node_colors.
# Indexing node_label_color with those integers raises KeyError.
link_color = [node_colors[x] for x in links.source.to_list()]

# Same colour as the source node, at 40% opacity.
link_color = ['rgba({},{},{}, 0.4)'.format(*hex_to_rgb(x)) for x in link_color]
link_color


In [None]:
# The triple-quoted text below is an earlier version of the figure kept as an
# inert string expression; it is not executed.
'''fig = go.Figure(
    data=[go.Sankey(
        node = dict(label=node_labels),
        arrangement="perpendicular",
        link = dict(
            source = source_nodes,
            target = target_nodes,
            value = values
        ))])'''

# Sankey of the aggregated transitions in `links`: nodes are labelled and
# coloured by status category, and each link's width is its summed count.
# NOTE(review): the `link_color` list computed earlier is not passed to the
# link dict here — confirm whether link colouring was intended.
fig = go.Figure(
    data=[go.Sankey(
        node = dict(label = chart_labels,
                   color = node_colors),
        arrangement="perpendicular",
        link = dict(source = links.source.to_list(),
                   target = links.target.to_list(),
                   value = links.value.to_list()))])

#fig.write_image("my_fig.png")

# Write the chart to an HTML file without opening it in a browser.
plot(fig,
     filename = "Outputs/three_link_diagram.html",
     image_filename='sankey_plot_1', 
     image='png', 
     image_width=1000, 
     image_height=600, 
     auto_open = False
)
#fig.show()

### Analysis of Time between Steps

In [None]:
# Pair each step workbook (in sorted file-name order) with its step number 1..9.
ziplist = list(zip(sorted(glob.glob('Outputs/_STATUS_BY_STEP/*')), range(1,10)))

In [None]:
# Start from the first step's workbook, keeping the descriptive columns and
# labelling its status and change date as STATUS_1 / STATUS_DATE_1.
steps_df = pd.read_excel(ziplist[0][0])[['WONUM','FAILURECODE','PROBLEMCODE','LOCATION','LATEST_STATUS','STATUS','CHANGEDATE']].rename(
    columns={'STATUS': 'STATUS_1', 'CHANGEDATE': 'STATUS_DATE_1'}
)

# Left-join each later step's status and change date onto the table, keyed by WONUM.
for path, index in tqdm.tqdm(ziplist[1:]):
    this_df = pd.read_excel(path)[['WONUM','STATUS', 'CHANGEDATE']].rename(
        columns={'STATUS': f'STATUS_{index}', 'CHANGEDATE': f'STATUS_DATE_{index}'}
    )
    steps_df = pd.merge(steps_df, this_df, on='WONUM', how='left')
    

In [None]:
# Convert every column whose name contains 'DATE' to pandas datetimes.
for col in steps_df.columns:
    if 'DATE' not in col:
        continue
    steps_df[col] = pd.to_datetime(steps_df[col])

In [None]:
# LAG_<i> is the time elapsed between step i and step i+1 for each work order.
for i in range(1, 9):
    steps_df[f'LAG_{i}'] = steps_df[f'STATUS_DATE_{i+1}'].sub(steps_df[f'STATUS_DATE_{i}'])

In [None]:
# Median LAG_1 and number of work orders for each (FAILURECODE, STATUS_1)
# combination, with the most frequent combinations first.
steps_df.groupby(['FAILURECODE','STATUS_1']).agg({'LAG_1':'median', 'WONUM':'count'}).sort_values('WONUM', ascending=False)
#.to_csv('_DEL_TEST_202212071256.csv')

### Drilling down on statuses

In [None]:
# Status codes to drill into. Each has a '<STATUS>.csv' file holding the rows
# that followed that status. The labels match the status codes exactly
# ('PENDREVIEW', not the abbreviated 'PENDREV').
statuses = ['WTSCH','WAPPR','APPR','SCHED','INPRG','FAILSCH','PENDREVIEW','REVIEW','FAILREVIEW']

# One frame per status, in the same order as `statuses`.
dfs = [pd.read_csv(f'Outputs/_STATUS_BY_PREV_STATUS/{status_code}.csv') for status_code in statuses]

# Individual names kept for the cells that refer to the frames directly.
wtsch, wappr, appr, sched, inprg, failsh, pendrev, review, failrev = dfs

In [None]:
# Table of next-status counts: one row per next status, one column per
# preceding status. Start with the WTSCH column...
base_df = wtsch['STATUS'].value_counts(normalize=False).to_frame('WTSCH')

# ...then outer-join a column of counts for each remaining status.
for item in list(zip(statuses, dfs))[1:]:
    thisdf = item[1]['STATUS'].value_counts(normalize=False).to_frame(item[0])
    base_df = pd.merge(base_df, thisdf, how='outer', left_index=True, right_index=True)

In [None]:
# The table holds absolute counts (value_counts with normalize=False), so
# format them as whole numbers with thousands separators. A percent format
# would render a count of 1234 as '123,400.0%'. A missing combination means
# the transition was never observed, so it is shown as 0.
for col in base_df.columns:
    base_df[col] = base_df[col].apply(lambda x: '{:,.0f}'.format(0 if pd.isna(x) else x))

In [None]:
# Make sure the target directory exists before writing the workbook.
os.makedirs('Outputs/_tables', exist_ok=True)
# NOTE(review): if the columns were converted to formatted strings by the
# previous cell, this sort is lexicographic rather than numeric — confirm the
# desired ordering.
base_df.sort_values(list(base_df.columns), ascending=False).to_excel('Outputs/_tables/next_status_breakdown_absolute.xlsx')


In [None]:
# Pair each step workbook (in sorted file-name order) with its step number 1..9.
# The workbooks live under Outputs/, where status_by_step is told to write them.
ziplist = list(zip(sorted(glob.glob('Outputs/_STATUS_BY_STEP/*')), range(1,10)))

In [None]:
# Start from the first step's workbook, keeping the descriptive columns and
# labelling its status column STATUS_1.
steps_df = pd.read_excel(ziplist[0][0])[['WONUM','FAILURECODE','PROBLEMCODE','LOCATION','LATEST_STATUS','STATUS']].rename(
    columns={'STATUS': 'STATUS_1'}
)

# Left-join each later step's status onto the table as STATUS_<step>, keyed by WONUM.
for path, index in tqdm.tqdm(ziplist[1:]):
    this_df = pd.read_excel(path)[['WONUM','STATUS']].rename(columns={'STATUS': f'STATUS_{index}'})
    steps_df = pd.merge(steps_df, this_df, on='WONUM', how='left')

In [None]:
#To visualize transitions between steps: use Sankey diagram (the one with flows)
#Join step tables by wonum, and track status across steps

import plotly.graph_objs as go
%matplotlib inline

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
# One node per (step, raw status) pair, labelled '<step>_<status>', taken from
# the columns of steps_df from position 5 onward.
node_labels = [
    f"{col.split('_')[1]}_{item}"
    for col in steps_df.columns[5:]
    for item in steps_df[col].unique()
]

# Node label -> its position in node_labels (the index used by the Sankey trace).
node_dict = {label: position for position, label in enumerate(node_labels)}

In [None]:
# Work on a copy holding WONUM plus the step status columns.
steps_for_diagram = steps_df[['WONUM']+list(steps_df.columns[5:])].copy()

# Turn every step column into node indices: prefix each status with its step
# number to form the node label, then look the label up in node_dict.
for col in steps_for_diagram.columns[1:]:
    step_number = col.split('_')[1]
    steps_for_diagram[col] = (
        steps_for_diagram[col]
        .map(lambda x: f'{step_number}_{x}')
        .map(node_dict)
    )

In [None]:
# One link per work order per transition: sources are stacked from the columns
# at positions 1-5, targets from the columns at positions 2-6.
source_nodes = [
    node
    for col in steps_for_diagram.columns[1:6]
    for node in steps_for_diagram[col].to_list()
]
target_nodes = [
    node
    for col in steps_for_diagram.columns[2:7]
    for node in steps_for_diagram[col].to_list()
]

# Every link carries a weight of one.
values = [1] * len(source_nodes)

In [None]:
# Sankey built from the per-work-order links above; no node labels or colours
# are supplied in this version.
fig = go.Figure(
    data=[go.Sankey(
        link = dict(
            source = source_nodes,
            target = target_nodes,
            value = values
        ))])

# NOTE(review): static image export presumably needs an image-export backend
# (e.g. kaleido) to be installed — confirm in the target environment.
fig.write_image("Outputs/my_fig.png")

# The triple-quoted text below is an inert string expression kept as an
# alternative export path; it is not executed.
'''plot(fig,
     image_filename='sankey_plot_1', 
     image='png', 
     image_width=1000, 
     image_height=600, 
     auto_open = False
)
fig.show()'''

In [None]:
# Rows that directly followed a WTSCH status. Note that this rebinds `t`,
# which earlier held the grouped step table.
t = pd.read_csv('Outputs/_STATUS_BY_PREV_STATUS/WTSCH.csv')

In [None]:
# Share of each status among the rows in `t`.
t['STATUS'].value_counts(normalize = True)