### imports

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
import re
import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')
pio.renderers.default = "notebook"
import panel as pn
pn.extension('plotly')

## Summary
- Three log files were given, and a templateList.csv
- Each log file was about 1 Million Rows totalling 3.7 Million rows of logs
- The data viz were chosen based on the business questions posed


# Table of contents
1. [How many of the 'TEMPLATE NAMES' occur in the log file?](#introduction)

2. [How many times does each TEMPLATE NAME occur in each log file?](#paragraph1)

    1. [How many of the TEMPLATE NAMES do not occur even once in each log file?](#subparagraph1)
    

3. [How many of the TEMPLATE NAME occurrences are 'Invoice'?](#paragraph2)


4. [How many are 'PackSlip'?](#paragraph3)


5. [How many of the TEMPLATE NAME occurrences are 'Invoice'?](#paragraph4)

In [2]:
# preprocessing

def getTemplate(tf):
    '''returns a pandas dataframe 
    containing the desired template file
    from the root directory'''
    temp = pd.read_csv(tf)
    temp.columns = ['Names']
    mylist = []
    for name in temp.Names: mylist.append(name.rstrip('pp7')) # strip out file extensions
    temp.Names = mylist
    return temp

def outputStats(temp,i):
    '''prints some basic stuff'''
    count, mean, std, min,_,_,_,_ = temp[i].describe()
    print('Mean: '+ str(mean) + '\nStandard Deviation: '+ str(std))

def getLog(filename):
    '''returns a "cleanish" dataframe
    from the desired logfile'''
    s  = '-'*72
    with open(filename) as f:
        atext = f.read().split(s)
    df = pd.DataFrame(data = atext)
    df.columns = ['logs']
    return df

def getLogMatches(df, temp):
    '''returns a list of matches of each element of temp in df'''
    lis = []
    
    for col,row in temp.iterrows():
        filtereddf = df[df['logs'].str.contains(row['Names'])]
        lis.append(filtereddf.shape[0])
#     sortedtemp = temp.groupby(['log_matches'])
    return (lis)

def matchBar(df, col):
    '''returns a plotly object'''
    fig = go.Figure(
        data=[go.Bar(y=df[col], x= df.Names)],
        layout_title_text= 'Number of occurances by TEMPLATE NAMES in the log file: ' + col
    )
    plotly_pane = pn.pane.Plotly(fig)
    return plotly_pane

In [3]:
filenames = ['ppw20210125.log', 'ppw20210126.log', 'ppw20210127.log']
temp = getTemplate('PP Templates (Initial) Listing.csv')
df1 = getLog(filenames[0])
df2 = getLog(filenames[1])
df3 = getLog(filenames[2])

In [4]:
temp[filenames[0]] = getLogMatches(df1, temp)
temp[filenames[1]] = getLogMatches(df2, temp)
temp[filenames[2]] = getLogMatches(df3, temp)

In [5]:
temp['total'] = temp[filenames].sum(axis=1)

In [6]:
temp = temp.sort_values(by='total')
# temp

In [12]:
temp['Invoice'] = temp.Names.str.contains('Invoice')
temp['Packslip'] = temp.Names.str.contains('PackSlip')
temp['Pickslip'] = temp.Names.str.contains('PickSlip')
temp['Other'] = ~temp['Names'].str.contains('Invoice|Pack|Pick', flags = re.I, regex = True)
temp['Invoice'] = temp.Invoice.map({True: 'Invoice', False: ''})
temp['Packslip'] = temp.Packslip.map({True: 'Packslip', False: ''})
temp['Pickslip'] = temp.Pickslip.map({True: 'Pickslip', False: ''})
temp['Other'] = temp.Other.map({True: 'Other', False: ''})
temp['type'] = (temp['Invoice'] + temp['Packslip']+ temp['Pickslip']+ temp['Other']).astype(str)
temp = temp.drop(['Invoice','Packslip','Pickslip','Other'], axis = 1)
temp['used']=temp['total'].map(lambda x: 'Used' if x>0 else 'Not Used')
temp

Unnamed: 0,Names,ppw20210125.log,ppw20210126.log,ppw20210127.log,total,type,used
209,CompanyU_WO_GMP_Powder.,0,0,0,0,Other,Not Used
288,CompanyZ_Invoice_A4_032119.,0,0,0,0,Invoice,Not Used
286,CompanyZBranch5_Statement_A4.,0,0,0,0,Other,Not Used
285,CompanyZBranch5_Statement_03282019.,0,0,0,0,Other,Not Used
284,CompanyZBranch5_Statement.,0,0,0,0,Other,Not Used
...,...,...,...,...,...,...,...
210,CompanyU_XML_Invoice.,329,376,263,968,Invoice,Used
249,CompanyW_PickSlip_XML_HP_P4515x.,440,429,407,1276,Pickslip,Used
247,CompanyW_PickSlip_XML.,440,429,407,1276,Pickslip,Used
90,CompanyD_Invoice_HP_P4515.,530,452,331,1313,Invoice,Used


<a name="introduction"></a>
## How many of the 'TEMPLATE NAMES' occur in the log file? 

In [13]:
for i in range(0,3):
    outputStats(temp,filenames[i])
    display(matchBar(temp.sort_values(by=filenames[i]).tail(30), filenames[i]))

Mean: 11.334128878281623
Standard Deviation: 71.80654536207007


Mean: 11.062052505966587
Standard Deviation: 59.30987449021523


Mean: 9.073985680190932
Standard Deviation: 46.361712529001814


<a name="paragraph1"></a>
## How many times does each TEMPLATE NAME occur in each log file?

In [37]:
def stackedBar(temp,filenames):
    temp = temp.tail(20)
    x = temp['Names']

    fig = go.Figure(data=[go.Bar(
        name = filenames[0],
        x = x,
        y = temp[filenames[0]]
       ),
                go.Bar(
        name = filenames[1],
        x = x,
        y = temp[filenames[1]]
       ),
                 go.Bar(
        name = filenames[2],
        x = x,
        y = temp[filenames[2]]
       )
    ])
    
#     fig.update_layout(barmode = 'stack')
    plotly_pane = pn.pane.Plotly(fig)
    return plotly_pane

display(stackedBar(temp,filenames))

<a name="subparagraph1"></a>
### How many of the TEMPLATE NAMES do not occur even once in each log file?

In [15]:
# temp['total'].apply(lambda x: True x==0 else False})

In [52]:
fig = px.pie(temp, names='used', title = 'Proportion of used templates')
plotly_pane = pn.pane.Plotly(fig)
display(plotly_pane)

fig = px.sunburst(temp, path=['type', 'used'], title = 'Proportion of used templates by Type')
plotly_pane = pn.pane.Plotly(fig)
display(plotly_pane)

print('These Template Names do not occur even once in ANY of the the log files: ')
temp[temp['total']==0]['Names'].sort_values().values

These Template Names do not occur even once in ANY of the the log files: 


array(['AND_DBA_Invoice_C5051.', 'AND_DBA_Invoice_HPM605.',
       'AND_DBA_Invoice_HP_P3015.', 'AND_DBA_Invoice_New.',
       'AND_DBA_Invoice_Orig.', 'AND_DBA_PO.', 'AND_DBA_PackSlip_HPM605.',
       'AND_DBA_PickSlipApproved_HP_P4515.', 'AND_DBA_PickSlip_HPM605.',
       'AND_DBA_Statement.', 'AND_DBA_Statement_080219.',
       'AND_DBA_Statement_HP_P3015.',
       'AND_DBA_Statement_MANUAL_NOTICE_02.', 'AND_DBA_Statement_Orig.',
       'AND_DBA_XML_Invoice.', 'AND_DBA_XML_Invoice_HP.',
       'AND_DBA_XML_Invoice_HPM605.', 'AND_DBA_XML_Invoice_Mail.',
       'AND_DBA_XML_PackSlip.', 'AND_DBA_XML_PackSlip_HP.',
       'AND_DBA_XML_PackSlip_HPM605.', 'AND_Invoice.',
       'AND_Invoice_4235.', 'AND_Invoice_C5051.', 'AND_Invoice_Faxing.',
       'AND_PO.', 'AND_PackSlip.', 'AND_PackSlip_4235.', 'AND_PickSlip.',
       'AND_PickSlip_4235.', 'AON_PO.', 'CaptureTest.',
       'Company0_Statement.', 'Company2M_Invoice.', 'Company2M_PackSlip.',
       'Company2M_PickSlip.', 'Company2M_Stat

<a name="paragraph2"></a>
## How many of the TEMPLATE NAME occurrences are 'Invoice'?

<a name="paragraph3"></a>
## How many are 'PackSlip'?

<a name="paragraph4"></a>
## How many are PickSlip'?

In [40]:
fig = px.sunburst(temp, path=['type'], values='total', title = 'Total number of occurences by type')
plotly_pane = pn.pane.Plotly(fig)
display(plotly_pane)

temp[temp['total']==0]['Names'].count()

337

In [49]:
#find the percentage for this graph
# temp['total']

fig = px.pie(temp, values='total', names='type', title = 'Total number of TemplateName occurences by type')
plotly_pane = pn.pane.Plotly(fig)
display(plotly_pane)
