In [None]:
# Item-Based Collaborative Filtering

In [1]:
import pandas as pd
from scipy.spatial.distance import cosine  # Calculate the distance between two vectors

In [2]:
import warnings

# Hide DeprecationWarning messages for the rest of the session.
# NOTE(review): presumably these come from the libraries used below — confirm
# they are safe to ignore before relying on this filter.
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [3]:
# Load the raw transaction log (relative path, resolved against the working directory).
txn_data = pd.read_excel(io='sample_txn.xlsx')

In [4]:
# Preview the first ten rows of the transaction log.
txn_data.head(n=10)

Unnamed: 0,txn,itm
0,1,Bread
1,1,Milk
2,2,Bread
3,2,Diaper
4,2,Beer
5,2,Eggs
6,3,Milk
7,3,Diaper
8,3,Beer
9,3,Coke


In [5]:
# Collapse the long-format log into one basket (list of items) per transaction id.
list1 = (
    txn_data
    .groupby('txn')['itm']
    .apply(list)
)
list1

txn
1                  [Bread, Milk]
2    [Bread, Diaper, Beer, Eggs]
3     [Milk, Diaper, Beer, Coke]
4    [Bread, Milk, Diaper, Beer]
5    [Bread, Milk, Diaper, Coke]
6           [Milk, Diaper, Beer]
Name: itm, dtype: object

In [6]:
# One-hot encode the transactions with the mlxtend package's TransactionEncoder.

from mlxtend.preprocessing import TransactionEncoder
oht = TransactionEncoder()  # encoder instance; fitted on the baskets in the next cell

In [7]:
# Learn the item vocabulary and encode every basket in a single step, then
# wrap the encoded array in a DataFrame with one column per item.
oht_ary = oht.fit_transform(list1)
txn_data_items = pd.DataFrame(data=oht_ary, columns=oht.columns_)
txn_data_items

Unnamed: 0,Beer,Bread,Coke,Diaper,Eggs,Milk
0,False,True,False,False,False,True
1,True,True,False,True,True,False
2,True,False,True,True,False,True
3,True,True,False,True,False,True
4,False,True,True,True,False,True
5,True,False,False,True,False,True


In [8]:
# Prepare the item-vs-item matrix that will hold how close each pair of items is.
#
# Cosine similarity of two vectors x and y:
#     cos_sim(x, y) = dot(x, y) / sqrt(dot(x, x) * dot(y, y))
#
# The values are computed with cosine() from scipy.spatial.distance, which
# returns the cosine *distance*, i.e. 1 - cos_sim(x, y).

# txn_items_sim - item vs. item matrix, labelled by item name on both axes
txn_items_sim = pd.DataFrame(
    index=txn_data_items.columns,
    columns=txn_data_items.columns,
)

### scipy `cosine()` returns the cosine *distance* (1 − cosine similarity), not the similarity itself.
- 0 : Most similar
- 1 : Dissimilar

In [17]:
# Sanity check: the cosine distance between a vector and itself is 0.
# Unit vectors are maximally "similar" if they're parallel (distance 0) and
# maximally "dissimilar" if they're orthogonal / perpendicular (distance 1).
first_item_flags = txn_data_items.iloc[:, 0]
cosine(first_item_flags, first_item_flags)

0.0

In [10]:
# Fill the item-vs-item matrix with pairwise cosine *distances*
# (scipy's cosine() returns 1 - cosine similarity, so 0 means most similar).
#
# The one-hot columns are boolean. Convert them to a float array once, up
# front, so that the result does not depend on how the installed SciPy/NumPy
# versions treat boolean arithmetic, and so that each column is extracted a
# single time instead of on every inner-loop iteration.
item_vectors = txn_data_items.to_numpy(dtype=float)
n_items = len(txn_items_sim.columns)
for i in range(n_items):
    for j in range(n_items):
        # Distance between the occurrence vectors of item i and item j.
        txn_items_sim.iloc[i, j] = cosine(item_vectors[:, i], item_vectors[:, j])

In [11]:
txn_items_sim

Unnamed: 0,Beer,Bread,Coke,Diaper,Eggs,Milk
Beer,0.0,0.5,0.646447,0.105573,0.5,0.32918
Bread,0.5,0.0,0.646447,0.32918,0.5,0.32918
Coke,0.646447,0.646447,0.0,0.367544,1.0,0.367544
Diaper,0.105573,0.32918,0.367544,0.0,0.552786,0.2
Eggs,0.5,0.5,1.0,0.552786,0.0,1.0
Milk,0.32918,0.32918,0.367544,0.2,1.0,0.0


In [12]:
# Empty ranking table: one row per item, columns 1..5 hold the five closest
# items in sorted order (smallest cosine distance first).
item_association = pd.DataFrame(
    index=txn_items_sim.columns,
    columns=range(1, 6),
)

In [13]:
# For every item, rank all items by ascending distance and keep the labels of
# the five closest ones.
for pos in range(len(txn_items_sim.columns)):
    ranked = txn_items_sim.iloc[:, pos].sort_values(ascending=True)
    item_association.iloc[pos, :5] = ranked.index[:5]

In [14]:
# Top 4 associated items: skip rank 1, which in the distance matrix shown
# above is the item itself (0.0 on the diagonal), and display ranks 2-5.
item_association.iloc[:,1:5]

Unnamed: 0,2,3,4,5
Beer,Diaper,Milk,Bread,Eggs
Bread,Diaper,Milk,Beer,Eggs
Coke,Diaper,Milk,Beer,Bread
Diaper,Beer,Milk,Bread,Coke
Eggs,Beer,Bread,Diaper,Coke
Milk,Diaper,Beer,Bread,Coke
