# **Importing Libraries**

In [3]:
import numpy as np
import pandas as pd

In [4]:
df=pd.read_csv("data.csv",encoding = 'ISO-8859-1')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6.0,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6.0,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8.0,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6.0,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6.0,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [5]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12462 entries, 0 to 12461
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    12462 non-null  object 
 1   StockCode    12462 non-null  object 
 2   Description  12417 non-null  object 
 3   Quantity     12461 non-null  float64
 4   InvoiceDate  12461 non-null  object 
 5   UnitPrice    12461 non-null  float64
 6   CustomerID   8956 non-null   float64
 7   Country      12461 non-null  object 
dtypes: float64(3), object(5)
memory usage: 779.0+ KB


# **STATISTICAL SUMMARY**

In [6]:
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,12461.0,12461.0,8956.0
mean,7.477249,3.97328,15598.129299
std,93.885564,17.253932,1748.194062
min,-9360.0,0.0,12395.0
25%,1.0,1.25,14307.0
50%,2.0,2.51,15646.0
75%,6.0,4.21,17211.0
max,2880.0,940.87,18239.0


In [7]:
df.describe(include=object)

Unnamed: 0,InvoiceNo,StockCode,Description,InvoiceDate,Country
count,12462,12462,12417,12461,12461
unique,585,2168,2136,504,15
top,537237,22632,HAND WARMER SCOTTY DOG DESIGN,12/6/2010 9:58,United Kingdom
freq,597,75,72,597,11806


# MISSING VALUE TREATMENT

In [8]:
df.isnull().sum()


InvoiceNo         0
StockCode         0
Description      45
Quantity          1
InvoiceDate       1
UnitPrice         1
CustomerID     3506
Country           1
dtype: int64

In [9]:
df = df.loc[df['Quantity'] > 0]
df = df.loc[df['UnitPrice'] > 0]

In [10]:
df.isnull().sum()

InvoiceNo         0
StockCode         0
Description       0
Quantity          0
InvoiceDate       0
UnitPrice         0
CustomerID     3437
Country           0
dtype: int64

In [11]:
df.loc[df['CustomerID'].isna()].head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
1443,536544,21773,DECORATIVE ROSE BATHROOM BOTTLE,1.0,12/1/2010 14:32,2.51,,United Kingdom
1444,536544,21774,DECORATIVE CATS BATHROOM BOTTLE,2.0,12/1/2010 14:32,2.51,,United Kingdom
1445,536544,21786,POLKADOT RAIN HAT,4.0,12/1/2010 14:32,0.85,,United Kingdom
1446,536544,21787,RAIN PONCHO RETROSPOT,2.0,12/1/2010 14:32,1.66,,United Kingdom
1447,536544,21790,VINTAGE SNAP CARDS,9.0,12/1/2010 14:32,1.66,,United Kingdom


In [12]:
df.nunique()


InvoiceNo       486
StockCode      2142
Description    2121
Quantity         88
InvoiceDate     440
UnitPrice       160
CustomerID      348
Country          15
dtype: int64

In [13]:
df.shape

(12285, 8)

In [14]:
df = df.dropna(subset=['CustomerID'])

In [15]:
df.shape

(8848, 8)

In [16]:
df.isnull().sum()


InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

# **COLLABORATIVE FILTERING**

The models created by collaborative filtering techniques are based on the prior actions of a user (things previously chosen or purchased, and/or numerical ratings given to those items), as well as comparable choices made by other users. Then, this model is used to forecast the ratings for things or items themselves that the user could be interested in.

In [17]:
customer_item_matrix = df.pivot_table(index='CustomerID', columns='StockCode', values='Quantity',aggfunc='sum')
customer_item_matrix.head()

StockCode,10002,10120,10123C,10124G,10125,10133,10135,11001,15034,15036,...,90204,90210B,90214J,90214M,90214S,90214V,BANK CHARGES,C2,M,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12395.0,,,,,,,,,,,...,,,,,,,,,,2.0
12427.0,,,,,,,,,,,...,,,,,,,,,,3.0
12431.0,,,,,,,,,,,...,,,,,,,,,,
12433.0,,,,,,,,,,,...,,,,,,,,,,
12472.0,,,,,,,,,,,...,,,,,,,,,,6.0


In [18]:
customer_item_matrix = customer_item_matrix.applymap(lambda x: 1 if x > 0 else 0)
customer_item_matrix.head()

StockCode,10002,10120,10123C,10124G,10125,10133,10135,11001,15034,15036,...,90204,90210B,90214J,90214M,90214S,90214V,BANK CHARGES,C2,M,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12395.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12427.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12431.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12433.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12472.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [19]:
customer_item_matrix.shape


(348, 1780)

# (A) Creating User-to-User Similarity Matrix

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

user_user_sim_matrix = pd.DataFrame(cosine_similarity(customer_item_matrix))
user_user_sim_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,338,339,340,341,342,343,344,345,346,347
0,1.000000,0.091287,0.000000,0.168934,0.069007,0.0,0.072739,0.064550,0.408248,0.070014,...,0.000000,0.000000,0.0,0.080408,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.091287,1.000000,0.000000,0.000000,0.113389,0.0,0.119523,0.070711,0.223607,0.076696,...,0.000000,0.000000,0.0,0.117444,0.000000,0.044721,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,1.000000,0.000000,0.000000,0.0,0.067344,0.179284,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.052414,0.000000,0.101015,0.049629
3,0.168934,0.000000,0.000000,1.000000,0.083935,0.0,0.132712,0.052342,0.000000,0.170320,...,0.039014,0.031281,0.0,0.108670,0.000000,0.132417,0.045907,0.000000,0.000000,0.000000
4,0.069007,0.113389,0.000000,0.083935,1.000000,0.0,0.120468,0.106904,0.084515,0.115954,...,0.039841,0.000000,0.0,0.055487,0.000000,0.033806,0.000000,0.000000,0.000000,0.133169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,0.000000,0.044721,0.000000,0.132417,0.033806,0.0,0.017817,0.031623,0.000000,0.000000,...,0.282843,0.000000,0.0,0.091915,0.081650,1.000000,0.166410,0.000000,0.000000,0.000000
344,0.000000,0.000000,0.052414,0.045907,0.000000,0.0,0.049417,0.043853,0.000000,0.000000,...,0.000000,0.000000,0.0,0.072836,0.000000,0.166410,1.000000,0.000000,0.000000,0.000000
345,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.074125,0.0,0.025751,0.160128,0.000000,0.000000,1.000000,0.000000,0.103005
346,0.000000,0.000000,0.101015,0.000000,0.000000,0.0,0.190476,0.169031,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000


In [21]:
user_user_sim_matrix.shape

(348, 348)

In [22]:
user_user_sim_matrix.columns = customer_item_matrix.index

user_user_sim_matrix['CustomerID'] = customer_item_matrix.index

user_user_sim_matrix = user_user_sim_matrix.set_index('CustomerID')
user_user_sim_matrix.head()

CustomerID,12395.0,12427.0,12431.0,12433.0,12472.0,12557.0,12567.0,12583.0,12600.0,12647.0,...,18085.0,18109.0,18113.0,18118.0,18144.0,18156.0,18168.0,18219.0,18229.0,18239.0
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12395.0,1.0,0.091287,0.0,0.168934,0.069007,0.0,0.072739,0.06455,0.408248,0.070014,...,0.0,0.0,0.0,0.080408,0.0,0.0,0.0,0.0,0.0,0.0
12427.0,0.091287,1.0,0.0,0.0,0.113389,0.0,0.119523,0.070711,0.223607,0.076696,...,0.0,0.0,0.0,0.117444,0.0,0.044721,0.0,0.0,0.0,0.0
12431.0,0.0,0.0,1.0,0.0,0.0,0.0,0.067344,0.179284,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.052414,0.0,0.101015,0.049629
12433.0,0.168934,0.0,0.0,1.0,0.083935,0.0,0.132712,0.052342,0.0,0.17032,...,0.039014,0.031281,0.0,0.10867,0.0,0.132417,0.045907,0.0,0.0,0.0
12472.0,0.069007,0.113389,0.0,0.083935,1.0,0.0,0.120468,0.106904,0.084515,0.115954,...,0.039841,0.0,0.0,0.055487,0.0,0.033806,0.0,0.0,0.0,0.133169


In [24]:
user_user_sim_matrix.loc[12557].sort_values(ascending=False)

CustomerID
12557.0    1.000000
12793.0    0.338062
17967.0    0.279372
16218.0    0.253546
17961.0    0.253546
             ...   
14723.0    0.000000
14708.0    0.000000
14696.0    0.000000
14680.0    0.000000
18239.0    0.000000
Name: 12557.0, Length: 348, dtype: float64

In [25]:
items_bought_by_12557 = set(customer_item_matrix.loc[12557].iloc[customer_item_matrix.loc[12557].to_numpy().nonzero()].index)
items_bought_by_12557

{'20725', '20727', '20728', '22383', '22384'}

In [26]:
items_bought_by_12431 = set(customer_item_matrix.loc[12431.0].iloc[customer_item_matrix.loc[12431.0].to_numpy().nonzero()].index)
items_bought_by_12431

{'21622',
 '21791',
 '22191',
 '22192',
 '22193',
 '22195',
 '22196',
 '22726',
 '22727',
 '22941',
 '35004C',
 '35004G',
 '85014A',
 '85014B'}

In [27]:
items_to_recommend_to_12557 = items_bought_by_12557 - items_bought_by_12431
items_to_recommend_to_12557

{'20725', '20727', '20728', '22383', '22384'}

In [29]:
df.loc[df['StockCode'].isin(items_to_recommend_to_12557), ['StockCode', 'Description']].drop_duplicates().set_index('StockCode')

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
20725,LUNCH BAG RED RETROSPOT
22384,LUNCH BAG PINK POLKADOT
22383,LUNCH BAG SUKI DESIGN
20728,LUNCH BAG CARS BLUE
20727,LUNCH BAG BLACK SKULL.


In [30]:
most_similar_user = user_user_sim_matrix.loc[12557].sort_values(ascending=False).reset_index().iloc[1, 0]
most_similar_user

12793.0

In [31]:
def get_items_to_recommend_cust(cust_a): 
  most_similar_user = user_user_sim_matrix.loc[cust_a].sort_values(ascending=False).reset_index().iloc[1, 0]
  items_bought_by_cust_a = set(customer_item_matrix.loc[cust_a].iloc[customer_item_matrix.loc[cust_a].to_numpy().nonzero()].index)
  items_bought_by_cust_b = set(customer_item_matrix.loc[most_similar_user].iloc[customer_item_matrix.loc[most_similar_user].to_numpy().nonzero()].index)
  items_to_recommend_to_a = items_bought_by_cust_b - items_bought_by_cust_a
  items_description = df.loc[df['StockCode'].isin(items_to_recommend_to_a), ['StockCode', 'Description']].drop_duplicates().set_index('StockCode')
  return items_description

In [None]:
get_items_to_recommend_cust(12557.0)

In [32]:
get_items_to_recommend_cust(12431.0)


Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
22086,PAPER CHAIN KIT 50'S CHRISTMAS
20679,EDWARDIAN PARASOL RED
15056BL,EDWARDIAN PARASOL BLACK
15056N,EDWARDIAN PARASOL NATURAL
22910,PAPER CHAIN KIT VINTAGE CHRISTMAS
21506,"FANCY FONT BIRTHDAY CARD,"
22730,ALARM CLOCK BAKELIKE IVORY
22768,FAMILY PHOTO FRAME CORNICE
21519,GIN & TONIC DIET GREETING CARD
22819,"BIRTHDAY CARD, RETRO SPOT"


# **(B) Creating Ite to Item similarity matrix**

In [33]:
item_item_sim_matrix = pd.DataFrame(cosine_similarity(customer_item_matrix.T))
item_item_sim_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1770,1771,1772,1773,1774,1775,1776,1777,1778,1779
0,1.0,0.316228,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.102598
1,0.316228,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.447214,0.707107,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
item_item_sim_matrix.shape


(1780, 1780)

In [35]:
item_item_sim_matrix.columns = customer_item_matrix.T.index

item_item_sim_matrix['StockCode'] = customer_item_matrix.T.index
item_item_sim_matrix = item_item_sim_matrix.set_index('StockCode')
item_item_sim_matrix.head()

StockCode,10002,10120,10123C,10124G,10125,10133,10135,11001,15034,15036,...,90204,90210B,90214J,90214M,90214S,90214V,BANK CHARGES,C2,M,POST
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,1.0,0.316228,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.102598
10120,0.316228,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10123C,0.447214,0.707107,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10124G,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10125,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
item_item_sim_matrix.loc['10002'].sort_values(ascending=False)


StockCode
10002     1.000000
10123C    0.447214
90162D    0.447214
90162B    0.447214
90094     0.447214
            ...   
22089     0.000000
22088     0.000000
22087     0.000000
22084     0.000000
22428     0.000000
Name: 10002, Length: 1780, dtype: float64

In [40]:
top_10_similar_items = list(item_item_sim_matrix.loc['10002'].sort_values(ascending=False).iloc[:10].index)
top_10_similar_items

['10002',
 '10123C',
 '90162D',
 '90162B',
 '90094',
 '21884',
 '21883',
 '22383',
 '21832',
 '22079']

In [41]:
df.head()


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6.0,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6.0,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8.0,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6.0,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6.0,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [42]:
df.loc[df['StockCode'] == '90210A']


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [43]:
df.loc[df['StockCode'] == '90210A'][:1]


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [44]:
df.loc[df['StockCode'].isin(top_10_similar_items), ['StockCode', 'Description']].drop_duplicates().set_index('StockCode').loc[top_10_similar_items]

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
10002,INFLATABLE POLITICAL GLOBE
10123C,HEARTS WRAPPING TAPE
90162D,ANT SILVER PURPLE BOUDICCA RING
90162B,ANT SILVER LIME GREEN BOUDICCA RING
90094,NECKLACE+BRACELET SET FRUIT SALAD
21884,CAKES AND BOWS GIFT TAPE
21883,STARS GIFT TAPE
22383,LUNCH BAG SUKI DESIGN
21832,CHOCOLATE CALCULATOR
22079,RIBBON REEL HEARTS DESIGN


In [45]:
def get_top_similar_items(item):
  top_10_similar_items = list(item_item_sim_matrix.loc[item].sort_values(ascending=False).iloc[:10].index)
  top_10 = df.loc[df['StockCode'].isin(top_10_similar_items), ['StockCode', 'Description']].drop_duplicates().set_index('StockCode').loc[top_10_similar_items]
  return top_10

In [46]:
get_top_similar_items('84029E')


Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
84029E,RED WOOLLY HOTTIE WHITE HEART.
84029G,KNITTED UNION FLAG HOT WATER BOTTLE
21479,WHITE SKULL HOT WATER BOTTLE
22110,BIRD HOUSE HOT WATER BOTTLE
22111,SCOTTIE DOG HOT WATER BOTTLE
21481,FAWN BLUE HOT WATER BOTTLE
21485,RETROSPOT HEART HOT WATER BOTTLE
22114,HOT WATER BOTTLE TEA AND SYMPATHY
22633,HAND WARMER UNION JACK
22113,GREY HEART HOT WATER BOTTLE
