# Mine relevant rules using a model

In [1]:
import pandas as pd
import numpy as np
import glob
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

## Load all the csv files downloaded from the Github repository

List all the files in the directory.

In [2]:
glob.glob('./Assignment_1/*.csv')

['./Assignment_1/SCR.csv',
 './Assignment_1/PSAJ.csv',
 './Assignment_1/ATC.csv',
 './Assignment_1/JADAMB - Sheet1.csv',
 './Assignment_1/b_s.csv']

Read all the files in the directory and put them in a list. Next, concatenate the files together into one DataFrame. Next, list the first five rows.

In [3]:
l = [pd.read_csv(filename, header = None) for filename in glob.glob('./Assignment_1/*.csv')]
df = pd.concat(l, axis=0).reset_index(drop=True)
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
70,cogollo,caldo,flan,salad,yogurt,pear,apple,paper bag,,,...,,,,,,,,,,
71,toothpaste,ham,chicken breast,cashew nuts,almond,paper bag,,,,,...,,,,,,,,,,
72,beer,pineapple,napkins,plastic cups,plastic plates,plastic knife,plastic fork,,,,...,,,,,,,,,,
73,coke,orange juice,pineapple,apple,grape,limon,cheese,peach,eggs,yogurt,...,,,,,,,,,,
74,pineapple,fruit juice,olives,nutella,tomato,milk,banana,orange,cucumber,,...,,,,,,,,,,


Transpose the dataframe to get one transcation per column. 
Drop the duplicates in each column after transposing to make sure each item occurs only once per transaction. 

In [4]:
df_t = df.T
df_t = df_t.apply(pd.Series.drop_duplicates)
df_t.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
17,,,,,,,,,,,...,,,,caneloni,,,,,,
18,,,,,,,,,,,...,,,,,,,,,,
19,,,,,,,,,,,...,,,,,,,,,,
20,,,,,,,,,,,...,,,,,,,,,,
21,,,,,,,,,,,...,,,,,,,,,,


Apply to each columns (transaction) the following operation:
* drop the NaN values
* convert it to a list ==> one list per transaction

This is the format required by the mlxtend package to calculate the rules.

In [5]:
def convert_to_list(s):
    return s.dropna().tolist()

In [10]:
transactions = df_t.apply(convert_to_list).tolist()
transactions[30:32]

[['bread', 'snacks', 'chicken', 'cheese', 'nuts', 'yogurt', 'fruit'],
 ['vegetables', 'sauce']]

Caculate the support for each item to use later in association rules, set the min_support to 0.03 so that not every rule is calculated later. Display the first five item frequencies.

In [7]:
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(df, min_support=0.03, use_colnames=True)

frequent_itemsets.head()

Unnamed: 0,support,itemsets
0,0.04,(avocado)
1,0.04,(banana)
2,0.066667,(beer)
3,0.106667,(bread)
4,0.053333,(cereal)


Calculate the association rules with a minimun threshold of parameter metric (in this case `support`) set to 0.001. Next sort the values according to the lift value and drop the unnecessary columns.

In [8]:
res = association_rules(frequent_itemsets, min_threshold=0.001, metric = 'support')
res.sort_values(by='lift', ascending=False).drop(columns=['conviction', 'leverage'])

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
22,(cereal),"(potato, milk)",0.053333,0.04,0.04,0.75,18.75
19,"(potato, milk)",(cereal),0.04,0.053333,0.04,1.0,18.75
2,(cereal),(milk),0.053333,0.066667,0.04,0.75,11.25
18,"(potato, cereal)",(milk),0.053333,0.066667,0.04,0.75,11.25
23,(milk),"(potato, cereal)",0.066667,0.053333,0.04,0.6,11.25
3,(milk),(cereal),0.066667,0.053333,0.04,0.6,11.25
20,"(milk, cereal)",(potato),0.04,0.093333,0.04,1.0,10.714286
21,(potato),"(milk, cereal)",0.093333,0.04,0.04,0.428571,10.714286
5,(cereal),(potato),0.053333,0.093333,0.053333,1.0,10.714286
4,(potato),(cereal),0.093333,0.053333,0.053333,0.571429,10.714286


If you buy `cereal` (which is common) you are very likely to buy `milk and potatoes` as well. This is just an example of the interpretation.

If you buy `milk and potatoes` (which is less common than the previous example) you are going to buy `cereal` as well (confidence in this case is 1). This is just another example of the interpretation.
