In [None]:
import pandas as pd
import numpy as np
import gc
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [None]:
# Read the invoice and JTD extracts. low_memory=False makes pandas parse each
# file in one pass so every column gets a single, consistently inferred dtype.
df, jtd_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('../data/Final_invoice.csv', '../data/JTD.csv')
)

In [None]:
# Number of invoice rows whose 'Job Card No' has no matching 'DBM Order' in JTD
len(df.loc[~df['Job Card No'].isin(jtd_df['DBM Order'])])

In [None]:
# Inner join: invoice rows whose job card is absent from the JTD file are
# dropped, and so are JTD rows with no matching invoice.
invoiced_jtd_df = df.merge(
    jtd_df,
    how='inner',
    left_on='Job Card No',
    right_on='DBM Order',
)

In [None]:
# Keep only the columns used downstream, in the same order as before
invoiced_jtd_df1 = invoiced_jtd_df.loc[:, [
    'Invoice No', 'Job Card No',
    'Area / Locality', 'District', 'CITY',
    'Customer No.', 'Make', 'Model',
    'Order Type', 'Cust Type', 'Cash /Cashless Type',
    'Invoice Date', 'JobCard Date',
    'KMs Reading', 'Total Amt Wtd Tax.',
    'DBM Order', 'Material', 'Description', 'Item Category',
    'Order Quantity', 'Target quantity UoM', 'Net value',
]]

In [None]:
# Independent working copy so later cells can reshape it without touching
# invoiced_jtd_df1
invoiced_jtd_df2 = invoiced_jtd_df1.copy(deep=True)

In [None]:
# Market Basket analysis using apriori algorithm implemented in mlxtend
# Default support threshold is 7% (0.07), confidence threshold is 80% (0.8)
# and lift threshold is 2. Lift and confidence thresholds filter the final rule set
# These values are typically set by discussing with business users
def createMarketBasket(inputData, supThrshld=0.07, confThrshld=0.8, liftThrshld=2):
    """Mine association rules from a basket matrix.

    Parameters
    ----------
    inputData : pandas.DataFrame
        One row per basket and one column per item. Cell values are
        quantities; any value greater than 0 is treated as "item present".
    supThrshld : float
        Minimum support passed to ``apriori`` (fraction of baskets).
    confThrshld : float
        Minimum confidence a rule must reach.
    liftThrshld : float
        Minimum lift a rule must reach.

    Returns
    -------
    pandas.DataFrame
        The rules whose lift is >= ``liftThrshld`` and whose confidence is
        >= ``confThrshld``. Empty when no itemset reaches ``supThrshld``.
    """
    # apriori works on a one-hot (True/False) matrix, so summed quantities are
    # reduced to presence flags. This also replaces DataFrame.to_sparse(),
    # which no longer exists in pandas >= 1.0.
    basket = inputData > 0
    frequentItemsets = apriori(basket, min_support=supThrshld, use_colnames=True)
    if frequentItemsets.empty:
        # NOTE(review): recent mlxtend releases appear to raise on an empty
        # itemset frame; returning an empty rule set keeps per-group loops
        # running. Confirm against the installed mlxtend version.
        return pd.DataFrame(columns=['antecedents', 'consequents',
                                     'support', 'confidence', 'lift'])
    # Pre-filter on the metric the threshold actually belongs to. The original
    # passed the confidence threshold as a minimum *lift*, which would wrongly
    # discard rules whenever liftThrshld < confThrshld.
    rules = association_rules(frequentItemsets, metric="confidence",
                              min_threshold=confThrshld)
    keep = (rules['lift'] >= liftThrshld) & (rules['confidence'] >= confThrshld)
    return rules[keep]

# append all the rules as per district and city in one dataframe
def appendRules(existingData, rules, district, city):
    """Label a rule set with its district/city and add it to the running total.

    Parameters
    ----------
    existingData : pandas.DataFrame or None
        Rules collected so far; ``None`` on the first call.
    rules : pandas.DataFrame
        Rules mined for one district/city group. Not modified.
    district, city :
        Values written into the ``district`` and ``city`` columns of every
        row of ``rules``.

    Returns
    -------
    pandas.DataFrame
        ``rules`` with the two label columns when ``existingData`` is None,
        otherwise ``existingData`` followed by the labelled rules, with a
        fresh 0..n-1 index.
    """
    # Use the caller's labels (previously hard-coded to 'dist-1' / 'city-1');
    # assign() returns a new frame, so the caller's `rules` stays untouched.
    labelled = rules.reset_index(drop=True).assign(district=district, city=city)
    if existingData is None:
        return labelled
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([existingData, labelled], ignore_index=True)

In [None]:
# Filtering based on District and City
# Keep only the columns the basket analysis needs. 'Invoice No' keeps its
# original dtype: int8 only holds -128..127, so casting invoice ids to it can
# overflow and collapse unrelated invoices into one basket. The text columns
# are not cast to str either, because that turns missing values into the
# literal string 'nan' and would create bogus groups/items.
basket_cols = ['Invoice No', 'District', 'CITY', 'Description', 'Order Quantity']
invoiced_jtd_df2 = invoiced_jtd_df2[basket_cols].astype({'Order Quantity': 'float32'})

# Temporary cap so the loop only runs for the first few district/city groups
MAX_GROUPS = 4
# collects all the rules in single dataframe
dfRules = None
# Iterate the groupby directly instead of re-filtering the frame per key, and
# read from the slimmed frame prepared above rather than the wide one.
district_city_groups = invoiced_jtd_df2.groupby(by=['District', 'CITY'])
for group_no, ((district, city), inv_jtd_gdf) in enumerate(district_city_groups, start=1):
    if group_no > MAX_GROUPS:
        break
    # Invoice x item matrix of summed quantities; items absent from an
    # invoice are filled with 0.
    inputData = inv_jtd_gdf.groupby(by=['Invoice No', 'Description'])['Order Quantity']\
        .sum().unstack(fill_value=0)
    # Preview only: the full matrix can be very wide and long
    display(inputData.head())
    rules = createMarketBasket(inputData)
    print('\nAssociation rules for District: {} and City: {}'.format(district,city))
    display(rules)
    dfRules = appendRules(dfRules, rules, district, city)
# Combined rule set for all processed groups (None if no group was processed)
dfRules

In [None]:
# Drop the two raw input frames; only the merged/derived frames are needed now.
# Re-running this cell without re-loading the data raises NameError.
del df, jtd_df

In [None]:
# Force a garbage-collection pass so memory held by the deleted frames can be
# reclaimed; the cell output is the number of unreachable objects found.
gc.collect()

In [None]:
# Final result: all association rules collected across the processed
# district/city groups
display(dfRules)