In [2]:
import numpy as np 
import pandas as pd 
import sklearn
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt


# load the data

In [3]:
df = pd.read_csv('ratings_Beauty.csv')

In [4]:
# top 10k records from the original dataset
df1 = df.head(10000)

In [5]:
df1_utility_matrix = df1.pivot_table(values='Rating', index='UserId', columns='ProductId', fill_value=0)

In [6]:
# utility matrix is a sparse matrix where most of the values are 0
df1_utility_matrix.head()

ProductId,0205616461,0558925278,0733001998,0737104473,0762451459,1304139212,1304139220,130414089X,130414643X,1304146537,...,B000052YPE,B000052YPF,B000052YPG,B000052YPH,B000052YPM,B000052YPU,B000052YPV,B000052YPY,B000052YQ0,B000052YQ2
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00205921JHJK5X9LNP42,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A024581134CV80ZBLIZTZ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A03056581JJIOL5FSKJY7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A03099101ZRK4K607JVHH,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0505229A7NSH3FRXRR4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df1_utility_matrix.shape

(9697, 886)

In [8]:
X = df1_utility_matrix.T
X.head()

UserId,A00205921JHJK5X9LNP42,A024581134CV80ZBLIZTZ,A03056581JJIOL5FSKJY7,A03099101ZRK4K607JVHH,A0505229A7NSH3FRXRR4,A05492663T95KW63BR75K,A059547920Q3LZVFHLPI3,A07410232KYRFR25CIUGJ,A082796624UNM47DSAI6K,A0864963DOAY7LXGS5I6,...,AZW1HXXYAC15B,AZWRTJPN7NXT,AZWTXHXZXFAYP,AZYQEFB9Y5N22,AZZHB6U54UDYW,AZZHJZP4GQPPZ,AZZNK89PXD006,AZZOFVMQC0BJG,AZZQXL8VDCFTV,AZZTJQ7CQZUD8
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
205616461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
558925278,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
733001998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
737104473,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
762451459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
X.shape

(886, 9697)

In [16]:
# Unique products in subset of data
X1=X

In [18]:
# Decomposing the matrix
svd = TruncatedSVD(n_components=10)
decomposed_matrix = svd.fit_transform(X)
decomposed_matrix.shape

(886, 10)

In [19]:
decomposed_matrix[:3,:3]

array([[ 1.77142652e-16,  1.08502431e-13,  7.44634257e-14],
       [-1.02054831e-14, -2.60624897e-13,  2.95565582e-13],
       [ 1.22010192e-15, -1.01698300e-14, -9.15313292e-15]])

In [20]:
# correlation matrix
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

(886, 886)

In [22]:
X.index[99]

'6117036094'

In [23]:
i = '6117036094'

product_name = list(X.index)
product_id = product_name.index(i)
product_id

99

In [24]:
correlation_product_id = correlation_matrix[product_id]
correlation_product_id.shape

(886,)

# Recommending top 10 highly correlated products in sequence

In [30]:
recommend = list(X.index[correlation_product_id>0.90])

# remove the product already brought by the customer
recommend.remove(i)

recommend[0:9]


['1304146537',
 '1304168522',
 '1304196070',
 '1304482596',
 '1304482634',
 '1304488608',
 '1304495396',
 '130451112X',
 '1304622428']

In [31]:
recommend2 = list(X.index[correlation_product_id>0.90])
recommend2[0:9]

['1304146537',
 '1304168522',
 '1304196070',
 '1304482596',
 '1304482634',
 '1304488608',
 '1304495396',
 '130451112X',
 '1304622428']