SVD matrix factorization

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD

preparing the data

In [4]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']
frame = pd.read_csv('u.data', sep='\t', names=columns)
frame.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [16]:
columns = ['item_id', 'movie title', 'release_date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
			'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
			'Mistery', 'Romance', 'Sci-fi', 'Thriller', 'War', 'Western']

movies = pd.read_csv('u.item', sep='|', names=columns, encoding='latin-1')
movie_names = movies.loc[:, ['item_id', 'movie title']]

In [17]:
combined_movies_data = pd.merge(frame, movie_names, on='item_id')
combined_movies_data

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)
...,...,...,...,...,...
99995,840,1674,4,891211682,Mamma Roma (1962)
99996,655,1640,3,888474646,"Eighth Day, The (1996)"
99997,655,1637,3,888984255,Girls Town (1996)
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1..."


In [19]:
combined_movies_data.groupby('item_id')['rating'].count().sort_values(ascending=False).head()

item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

In [22]:
Filter = combined_movies_data['item_id'] == 50
combined_movies_data[Filter]['movie title'].unique()

array(['Star Wars (1977)'], dtype=object)

building an utility matrix

In [24]:
ratings_crosstab = combined_movies_data.pivot_table(values='rating', index='user_id', columns='movie title', fill_value=0)
ratings_crosstab.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0


transposing the matrix

In [26]:
ratings_crosstab.shape

(943, 1664)

In [29]:
X = ratings_crosstab.values.T
X.shape

(1664, 943)

decomposing the matrix

In [33]:
SVD = TruncatedSVD(n_components=12, random_state=17)
resultant_matrix = SVD.fit_transform(X)
resultant_matrix.shape

(1664, 12)

generate a correlation matrix

In [50]:
corr_mat = np.corrcoef(resultant_matrix)
corr_mat.shape

array([[ 1.        , -0.10875097,  0.52265963, ...,  0.39286484,
         0.21820479,  0.51111162],
       [-0.10875097,  1.        ,  0.0653973 , ...,  0.15737062,
         0.51273503,  0.24591037],
       [ 0.52265963,  0.0653973 ,  1.        , ...,  0.76769554,
         0.44401258,  0.20018928],
       ...,
       [ 0.39286484,  0.15737062,  0.76769554, ...,  1.        ,
         0.18141816,  0.11120279],
       [ 0.21820479,  0.51273503,  0.44401258, ...,  0.18141816,
         1.        ,  0.1940996 ],
       [ 0.51111162,  0.24591037,  0.20018928, ...,  0.11120279,
         0.1940996 ,  1.        ]])

isolating star wars from the correlation matrix

In [39]:
movies_names = ratings_crosstab.columns
movies_list = list(movies_names)

star_wars = movies_list.index('Star Wars (1977)')
print(star_wars)

1398


In [40]:
corr_star_wars = corr_mat[star_wars]
corr_star_wars.shape

(1664,)

recommending a highly correlated movie

In [49]:
list(movies_names[(corr_star_wars < 1.0) & (corr_star_wars > 0.95)])

['Return of the Jedi (1983)', 'Star Wars (1977)']