# Association rule mining using the Apriori algorithm

In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations

%matplotlib inline

In [66]:
# Load the transaction table from the CSV file next to the notebook; the
# preview below shows its columns: "Unnamed: 0", "TID" and "List".
df = pd.read_csv("./dataset.csv")

In [67]:
# Preview the first rows of the loaded transactions.
df.head()

Unnamed: 0,TID,List
0,T100,"I1, I2, I5"
1,T200,"I2, I4"
2,T300,"I2, I3"
3,T400,"I1, I2, I4"
4,T500,"I1, I3"


In [68]:
def convertToOnehot(df, col):
    """Replace a comma-separated item column with one 0/1 column per item.

    Parameters:
        df: DataFrame holding the transactions.
        col: Name of the column whose string values list items separated
            by ", " (comma followed by a space).

    Returns:
        A new DataFrame containing the remaining columns of ``df`` plus one
        indicator column per distinct item; ``col`` itself is removed.
    """
    indicators = df[col].str.get_dummies(", ")
    # Drop the column that was actually encoded, not a hard-coded "List",
    # so the helper works for any column name.
    return df.join(indicators).drop([col], axis=1)

In [69]:
def getCount(df, col):
    """Return how many rows of ``df`` hold the value 1 in column ``col``."""
    is_set = df[col].eq(1)
    return int(is_set.sum())

In [70]:
def combineCols(df, col1, col2):
    """Return the element-wise AND of columns ``col1`` and ``col2`` of ``df``."""
    left = df[col1]
    right = df[col2]
    return left & right

In [71]:
def checkForCombinations(combList, combTuple, r):
    """Tell whether every ``r``-sized subset of ``combTuple`` is in ``combList``.

    Subsets are generated with ``itertools.combinations``, so they are tuples
    in the same element order as ``combTuple``. Returns True when there are
    no subsets to check.
    """
    return all(subset in combList for subset in combinations(combTuple, r))

In [72]:
def apriori(df, col, s_count):
    """Count every itemset that occurs in at least ``s_count`` transactions.

    Parameters:
        df: DataFrame with one transaction per row.
        col: Name of the column whose string values list the items of a
            transaction separated by ", ".
        s_count: Minimum number of transactions an itemset must appear in
            to be kept.

    Returns:
        A dict mapping each kept itemset to its transaction count. Keys are
        the item names joined with "_" in one-hot column order (for example
        "I1_I2_I5"); single items are keyed by their own name.

    Side effects:
        Prints the dict of single-item counts before larger itemsets are
        explored.
    """
    # One 0/1 indicator column per distinct item.
    onehot = df[col].str.get_dummies(", ")
    totals = onehot.sum()
    countDict = {
        item: int(total) for item, total in totals.items() if total >= s_count
    }
    print(countDict)
    items = list(countDict)

    # Row masks of the itemsets kept at the previous level, keyed by tuple.
    prevMasks = {(item,): onehot[item] for item in items}

    # Itemsets can be as large as the number of frequent items, so the upper
    # bound has to be inclusive of len(items).
    for size in range(2, len(items) + 1):
        nextMasks = {}
        for comb in combinations(items, size):
            # Prune: every (size - 1)-subset must itself have been kept.
            if not all(sub in prevMasks for sub in combinations(comb, size - 1)):
                continue
            # Extend the mask of the leading (size - 1) items by the last one.
            mask = prevMasks[comb[:-1]] & onehot[comb[-1]]
            count = int(mask.sum())
            if count >= s_count:
                nextMasks[comb] = mask
                countDict["_".join(str(c) for c in comb)] = count
        if not nextMasks:
            # Nothing survived at this size, so no larger itemset can either.
            break
        prevMasks = nextMasks
    return countDict

In [73]:
def getConfidence(items_given, support_items, countDict):
    """Return the confidence of the rule ``items_given -> support_items``.

    Parameters:
        items_given: Item names forming the antecedent of the rule.
        support_items: Item names forming the consequent of the rule.
        countDict: Dict mapping "_"-joined, sorted item names to counts.

    Returns:
        ``countDict[given + support] / countDict[given]``, or 0 when either
        itemset key is missing from ``countDict``.
    """
    # sorted() builds new lists, so the caller's arguments are not reordered.
    given_key = "_".join(sorted(items_given))
    combined_key = "_".join(sorted([*support_items, *items_given]))
    if given_key not in countDict or combined_key not in countDict:
        return 0
    return countDict[combined_key] / countDict[given_key]

In [74]:
# Mine itemset counts from the "List" column with a minimum count of 2.
countDict = apriori(df, 'List', 2)
# Confidence of the rule {I1, I2} -> {I5}.
getConfidence(["I2", "I1"], ["I5"], countDict)

{'I1': 6, 'I2': 7, 'I3': 6, 'I4': 2, 'I5': 2}


0.5

In [75]:
# Turn every "_"-joined itemset key back into its list of item names.
elements = [key.split("_") for key in countDict]
# Separate the 3-item sets from all other itemsets.
elements3 = list(filter(lambda parts: len(parts) == 3, elements))
elementsRest = list(filter(lambda parts: len(parts) != 3, elements))

In [76]:
def removeElements(list_, elements):
    """Return the entries of ``list_`` that do not occur in ``elements``.

    Order and duplicates of the surviving entries are preserved; neither
    argument is modified.
    """
    kept = []
    for candidate in list_:
        if candidate in elements:
            continue
        kept.append(candidate)
    return kept

In [77]:
# Sanity check: removing I2 and I3 from [I1, I2, I3] should leave only I1.
removeElements(["I1", "I2", "I3"], ["I2", "I3"])

['I1']

In [78]:
# Rules are reported only when their confidence exceeds this threshold.
min_confidence = 0.7

# Pair every non-3-item itemset (antecedent) with every 3-item set and print
# the pairs whose confidence is high enough.
for element in elementsRest:
    for ele2 in elements3:
        # Compute the confidence once and reuse it for both the test and the
        # printed value.
        confidence = getConfidence(element, removeElements(ele2, element), countDict)
        if confidence > min_confidence:
            print(element, " -> ", ele2, " ", confidence)

['I5']  ->  ['I1', 'I2', 'I5']   1.0
['I1', 'I5']  ->  ['I1', 'I2', 'I5']   1.0
['I2', 'I5']  ->  ['I1', 'I2', 'I5']   1.0
