# Association mining using Apriori

In [186]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations

%matplotlib inline

In [187]:
# Load the transaction table from dataset.csv in the current working directory.
df = pd.read_csv("./dataset.csv")

In [188]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,TID,List
0,T100,"I1, I2, I5"
1,T200,"I2, I4"
2,T300,"I2, I3"
3,T400,"I1, I2, I4"
4,T500,"I1, I3"


In [189]:
def convertToOnehot(df, col, sep=", "):
    """Replace a delimited-items column with one indicator column per item.

    Args:
        df: DataFrame containing the column to encode.
        col: Name of the string column whose values are item lists joined
            by `sep` (e.g. "I1, I2, I5").
        sep: Delimiter between items inside each cell. Defaults to ", ".

    Returns:
        A new DataFrame with every original column except `col`, plus one
        0/1 column per distinct item found in `col`.
    """
    indicators = df[col].str.get_dummies(sep)
    # Drop the column that was just encoded (not a hard-coded name), so the
    # function works for any column passed in.
    return df.join(indicators).drop([col], axis=1)

In [200]:
def getCount(df, col):
    """Return the number of rows of `df` whose value in column `col` equals 1."""
    is_one = df[col].eq(1)
    return int(is_one.sum())

In [191]:
def combineCols(df, col1, col2):
    """Return the element-wise AND (`&`) of columns `col1` and `col2` of `df`."""
    left = df[col1]
    right = df[col2]
    return left & right

In [192]:
def checkForCombinations(combList, combTuple, r):
    """Tell whether every size-`r` combination of `combTuple` is in `combList`.

    Args:
        combList: Collection of tuples to test membership against.
        combTuple: Tuple of items whose sub-combinations are checked.
        r: Length of the sub-combinations to generate.

    Returns:
        True when all length-`r` combinations of `combTuple` appear in
        `combList` (including when there are none to check), else False.
    """
    return all(subset in combList for subset in combinations(combTuple, r))

In [209]:
def apriori(df, col, s_count):
    """Count the frequent itemsets of a transaction column, level by level.

    The column is one-hot encoded, single items below the support count are
    discarded, and then itemsets of size 2, 3, ... are built from the
    surviving items. A candidate is only counted when all of its
    (size - 1)-subsets were frequent at the previous level.

    Args:
        df: DataFrame holding the transactions.
        col: Name of the column with the delimited item list per transaction.
        s_count: Minimum number of transactions an itemset must appear in.

    Returns:
        dict mapping each frequent itemset to its transaction count. Keys
        are the item names joined with "_" in the order of the one-hot
        columns (e.g. "I1_I2_I3").
    """
    df = pd.DataFrame(df[col])
    df = convertToOnehot(df, col)
    countDf = df.sum()
    countDict = countDf.loc[countDf >= s_count].to_dict()
    items = list(countDict.keys())
    combPrev = list(combinations(items, 1))
    # Itemsets can be as large as the full set of frequent items, so the
    # upper bound must include len(items) itself.
    for i in range(2, len(items) + 1):
        # Start each level with a fresh list so that combPrev holds exactly
        # the frequent itemsets of the previous size.
        combNext = []
        for comb in combinations(items, i):
            if not checkForCombinations(combPrev, comb, i - 1):
                continue
            combProp = "_".join(str(c) for c in comb)
            # The prefix column already exists: it is one of the
            # (i - 1)-subsets that was found frequent at the previous level.
            col1 = "_".join(str(c) for c in comb[:-1])
            col2 = str(comb[-1])
            df[combProp] = combineCols(df, col1, col2)
            countTemp = getCount(df, combProp)
            if countTemp >= s_count:
                combNext.append(comb)
                countDict[combProp] = countTemp
        if not combNext:
            # No frequent itemset of this size means none of a larger size.
            break
        combPrev = combNext
    return countDict

In [224]:
def getConfidence(items_given, support_item, countDict):
    """Return the confidence of the rule `items_given` -> `support_item`.

    Confidence is count(items_given + support_item) / count(items_given),
    with both counts read from `countDict`.

    Args:
        items_given: Sequence of item names forming the rule's antecedent.
        support_item: Item name forming the rule's consequent.
        countDict: Mapping from "_"-joined itemset keys to their counts.

    Returns:
        The confidence as a float, or 0 when either itemset is missing
        from `countDict` or the antecedent count is zero.
    """
    def lookup(itemset):
        # Try the caller's ordering first, then the sorted ordering, so a
        # rule such as {I1, I3} -> I2 still finds the key "I1_I2_I3".
        for key in ("_".join(itemset), "_".join(sorted(itemset))):
            if key in countDict:
                return countDict[key]
        return None

    given = list(items_given)
    given_count = lookup(given)
    joint_count = lookup(given + [support_item])
    # A missing itemset or a zero antecedent count gives confidence 0.
    if not given_count or joint_count is None:
        return 0
    return joint_count / given_count

In [225]:
# Mine itemset counts from the 'List' column with a minimum support count of 2.
countDict = apriori(df, 'List', 2)
# Confidence of the rule {I1, I2} -> I3: count(I1_I2_I3) / count(I1_I2).
getConfidence(["I1", "I2"], "I3", countDict)

0.5

In [217]:
# Display the itemset -> count mapping returned by apriori.
countDict

{'I1': 6,
 'I1_I2': 4,
 'I1_I2_I3': 2,
 'I1_I2_I5': 2,
 'I1_I3': 4,
 'I1_I5': 2,
 'I2': 7,
 'I2_I3': 4,
 'I2_I4': 2,
 'I2_I5': 2,
 'I3': 6,
 'I4': 2,
 'I5': 2}