# Objective
The purpose is to create a demonstable prototype that mines purchase data and predicts categories similar to the input.
For elucidations sake we will divide the summary problem into sub problems.
*  Problem 1 -Given item A predict item B which is most associated through purchase patterns.


# Packages required
* Pandas for data frame
* Numpy for arrays

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

# Problem 1

# Import Data

In [2]:
data = pd.read_csv('../data/FMCGSales.csv', names = ['BillId','ItemId','ItemName','Level1','Level2','Level3','Level4','Level5','Level6'] )


In [3]:
#Dummy for pivot table
data['dummy'] = 1

In [4]:
data.head()

Unnamed: 0,BillId,ItemId,ItemName,Level1,Level2,Level3,Level4,Level5,Level6,dummy
0,9,135291,DETTOL ANTISEPTIC LIQUID 500ML,FMCG,FMCG NON FOOD,OTC,HEALTH,FIRST Aid,ANTISEPTIC,1
1,14,152333,GULABARI 59ML,FMCG,FMCG NON FOOD,PERSONAL CARE,SKIN CARE,LOTION,BEAUTY & NOURISHMENT,1
2,14,220862,VICKS VAPORUB 25GM,FMCG,FMCG NON FOOD,OTC,HEALTH,BALM,COLD,1
3,16,203917,SAVLON ANTISEPTIC LIQUID 200ML,FMCG,FMCG NON FOOD,OTC,HEALTH,SOLUTION,ANTISEPTIC,1
4,21,203289,SAFI SYP 100ML,FMCG,AYUSH,UNANI,LIQUID & GELS,SYRUP,,1


# Data Exploration

In [5]:
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BillId,10000.0,30108.5341,19185.501711,9.0,13684.0,27368.0,46706.75,65967.0
ItemId,10000.0,169527.2357,30982.008258,110810.0,144337.25,162672.0,200831.0,225105.0
dummy,10000.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [6]:
#is any row NULL ?
data.isnull().any().any(), data.shape

(True, (10000, 10))

In [7]:
#describe nullness
data.isnull().sum(axis=0)

BillId         0
ItemId         0
ItemName       0
Level1         0
Level2         0
Level3         0
Level4         0
Level5         5
Level6      1286
dummy          0
dtype: int64

Level 5 has 5 null values
Level 6 has 1286 null Values
Use data.dropna() to drop null rows if we are using that. Dont know if filling null values will solve the problem because the null values can have many values. Perhaps clustering can be employed to label these products first.

In [8]:
#data['pName'] = data['pName'].str.lower()

In [9]:
#data['pName'] = data['pName'].str.title()

In [10]:
#data['pName'] = data['pName'].str.replace(" ","")

In [11]:
#test for loc arbitrary item Id 220862
data.loc[data['ItemId'] == 220862]

Unnamed: 0,BillId,ItemId,ItemName,Level1,Level2,Level3,Level4,Level5,Level6,dummy
2,14,220862,VICKS VAPORUB 25GM,FMCG,FMCG NON FOOD,OTC,HEALTH,BALM,COLD,1
148,848,220862,VICKS VAPORUB 25GM,FMCG,FMCG NON FOOD,OTC,HEALTH,BALM,COLD,1
403,2307,220862,VICKS VAPORUB 25GM,FMCG,FMCG NON FOOD,OTC,HEALTH,BALM,COLD,1
630,3638,220862,VICKS VAPORUB 25GM,FMCG,FMCG NON FOOD,OTC,HEALTH,BALM,COLD,1
638,3682,220862,VICKS VAPORUB 25GM,FMCG,FMCG NON FOOD,OTC,HEALTH,BALM,COLD,1
651,3730,220862,VICKS VAPORUB 25GM,FMCG,FMCG NON FOOD,OTC,HEALTH,BALM,COLD,1
704,4069,220862,VICKS VAPORUB 25GM,FMCG,FMCG NON FOOD,OTC,HEALTH,BALM,COLD,1
720,4152,220862,VICKS VAPORUB 25GM,FMCG,FMCG NON FOOD,OTC,HEALTH,BALM,COLD,1
724,4166,220862,VICKS VAPORUB 25GM,FMCG,FMCG NON FOOD,OTC,HEALTH,BALM,COLD,1
851,4810,220862,VICKS VAPORUB 25GM,FMCG,FMCG NON FOOD,OTC,HEALTH,BALM,COLD,1


# Name Indexer

In [12]:
#new DF with itemName and itemID
names = data[['ItemName', 'ItemId']]
names.shape

(10000, 2)

In [13]:
names = names.drop_duplicates(subset = 'ItemName')
print(names.shape)
names.head()

(902, 2)


Unnamed: 0,ItemName,ItemId
0,DETTOL ANTISEPTIC LIQUID 500ML,135291
1,GULABARI 59ML,152333
2,VICKS VAPORUB 25GM,220862
3,SAVLON ANTISEPTIC LIQUID 200ML,203917
4,SAFI SYP 100ML,203289


In [14]:
# set index to pName
name_index = names.set_index('ItemName')
name_index.head()

Unnamed: 0_level_0,ItemId
ItemName,Unnamed: 1_level_1
DETTOL ANTISEPTIC LIQUID 500ML,135291
GULABARI 59ML,152333
VICKS VAPORUB 25GM,220862
SAVLON ANTISEPTIC LIQUID 200ML,203917
SAFI SYP 100ML,203289


In [15]:
#Sorting
name_index.sort_values('ItemName')

Unnamed: 0_level_0,ItemId
ItemName,Unnamed: 1_level_1
110PELLETS SUGAR FREE GOLD,209917
ADIDAS DEO DYNAMIC PULSE 150ML,110810
ADIDAS DEO FRUITY RHYTHM 150ML,110815
ADIDAS DEO ICE DIVE MEN 150ML,110819
ADIDAS DEO PURE GAME MEN 150ML,110824
ADIDAS DEO TEAM FORCE MEN 150ML,110828
ADIDAS DEO VICTORY LEG MEN 150ML,110829
AIR WICK AQUA MIST SPRAY 345ML,111340
ALL OUT BEDTIME REFILL 45ML,112012
ALL OUT COMBO ADJUSTABLE 45NIGHT,112019


In [16]:
name_index.loc['WOODWARDS GRIPE WATER 200ML']

ItemId    223749
Name: WOODWARDS GRIPE WATER 200ML, dtype: int64

# Items Index

In [17]:
normal_index = name_index.reset_index()
normal_index.head()

Unnamed: 0,ItemName,ItemId
0,DETTOL ANTISEPTIC LIQUID 500ML,135291
1,GULABARI 59ML,152333
2,VICKS VAPORUB 25GM,220862
3,SAVLON ANTISEPTIC LIQUID 200ML,203917
4,SAFI SYP 100ML,203289


In [18]:
items_index =  normal_index.set_index('ItemId')
items_index.head()

Unnamed: 0_level_0,ItemName
ItemId,Unnamed: 1_level_1
135291,DETTOL ANTISEPTIC LIQUID 500ML
152333,GULABARI 59ML
220862,VICKS VAPORUB 25GM
203917,SAVLON ANTISEPTIC LIQUID 200ML
203289,SAFI SYP 100ML


In [19]:
items_index.index.is_unique

True

In [20]:
#itemIndexTest.loc[88510]

In [21]:
#itemIndexTest.loc[88510]

In [22]:
name_index.index.is_unique

True

# Pivot Table

In [23]:
matrix = data.pivot_table(values='dummy',index ='ItemId', columns ='BillId')
matrix.head()

BillId,9,14,16,21,22,30,34,38,49,55,...,65798,65814,65815,65823,65850,65905,65941,65949,65957,65967
ItemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
110810,,,,,,,,,,,...,,,,,,,,,,
110815,,,,,,,,,,,...,,,,,,,,,,
110819,,,,,,,,,,,...,,,,,,,,,,
110824,,,,,,,,,,,...,,,,,,,,,,
110828,,,,,,,,,,,...,,,,,,,,,,


In [25]:
matrix.shape

(904, 7485)

In [26]:
matrix_dummy = matrix.copy().fillna(0)

In [27]:
matrix_dummy.head()

BillId,9,14,16,21,22,30,34,38,49,55,...,65798,65814,65815,65823,65850,65905,65941,65949,65957,65967
ItemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
110810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
matrix_dummy.loc[110828]

BillId
9        0.0
14       0.0
16       0.0
21       0.0
22       0.0
30       0.0
34       0.0
38       0.0
49       0.0
55       0.0
57       0.0
63       0.0
68       0.0
74       0.0
88       0.0
98       0.0
106      0.0
110      0.0
118      0.0
122      0.0
127      0.0
133      0.0
147      0.0
149      0.0
159      0.0
181      0.0
185      0.0
188      0.0
200      0.0
203      0.0
        ... 
65651    0.0
65654    0.0
65658    0.0
65660    0.0
65665    0.0
65706    0.0
65712    0.0
65713    0.0
65714    0.0
65716    0.0
65718    0.0
65741    0.0
65749    0.0
65751    0.0
65754    0.0
65761    0.0
65775    0.0
65787    0.0
65790    0.0
65794    0.0
65798    0.0
65814    0.0
65815    0.0
65823    0.0
65850    0.0
65905    0.0
65941    0.0
65949    0.0
65957    0.0
65967    0.0
Name: 110828, Length: 7485, dtype: float64

# Jaccardian
We define a jaccardian as intersection over union. 

In [28]:
#needs parameter in scipy.sparse.csc_matrix
type(matrix_dummy)

pandas.core.frame.DataFrame

In [29]:
from sklearn.metrics.pairwise import pairwise_distances
jac_sim = 1 - pairwise_distances(matrix_dummy, metric = "hamming")
# optionally convert it to a DataFrame
jac_sim = pd.DataFrame(jac_sim, index=matrix_dummy.index, columns=matrix_dummy.index)

In [30]:
jac_sim.head()

ItemId,110810,110815,110819,110824,110828,110829,111340,112012,112019,112031,...,223563,223584,223748,223749,224914,224915,224916,225093,225097,225105
ItemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
110810,1.0,0.998931,0.998798,0.998931,0.998664,0.998931,0.998798,0.995458,0.994923,0.998397,...,0.998931,0.999065,0.995992,0.993454,0.99853,0.998798,0.998263,0.997061,0.996794,0.99853
110815,0.998931,1.0,0.999332,0.999466,0.999198,0.999466,0.999332,0.995992,0.995458,0.998931,...,0.999466,0.999599,0.996526,0.993988,0.999065,0.999332,0.998798,0.997595,0.997328,0.999065
110819,0.998798,0.999332,1.0,0.999332,0.999065,0.999332,0.999198,0.995858,0.995324,0.998798,...,0.999332,0.999466,0.996393,0.993854,0.998931,0.999198,0.998664,0.997462,0.997194,0.998931
110824,0.998931,0.999466,0.999332,1.0,0.999198,0.999466,0.999332,0.995992,0.995458,0.998931,...,0.999466,0.999599,0.996526,0.993988,0.999065,0.999332,0.998798,0.997595,0.997328,0.999065
110828,0.998664,0.999198,0.999065,0.999198,1.0,0.999198,0.999065,0.995725,0.99519,0.998664,...,0.999198,0.999332,0.996259,0.993721,0.998798,0.999065,0.99853,0.997328,0.997061,0.998798


In [38]:
#Tester Jaccard
from sklearn.metrics import jaccard_similarity_score
print(jaccard_similarity_score(matrix_dummy.loc[110815], matrix_dummy.loc[110810]))

0.998931195724783


# Final Function

In [40]:
# Final Function
# Takes title, userId and returns to 25 sorted movie names on est
def reco(idx):

    similar_itemids = jac_sim.loc[idx].to_frame(name=None)
    topItemIds = similar_itemids.sort_values(by =[idx],ascending = False).head(26)
    topItemNames = topItemIds.merge(items_index, how ='left', on = 'itemId' )

    
    return topItemNames.head(10)



In [41]:
reco(110824)

KeyError: 'itemId'

In [None]:
reco(88640)

In [None]:
reco(88486)

In [None]:
reco(16288)

In [None]:
reco(16722)

In [None]:
reco(111852)

In [None]:
reco(86616)