# Model based Collaborative Filtering System using SVD Matrix Factorization

In [2]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.decomposition import TruncatedSVD

### build a model-based movie recommender from open-sourced dataset (http://grouplens.org/datasets/movielens/100k)

### MovieLens data sets were collected by the GroupLens Research Project at the University of Minnesota.
### This data set consists of:
### * 100,000 ratings (1-5) from 943 users on 1682 movies. 
### * Each user has rated at least 20 movies. 
### * Simple demographic info for the users (age, gender, occupation, zip)

In [7]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']
frame = pd.read_csv('u.data', sep='\t', names=columns)
frame

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [10]:
columns = ['item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDB_URL', 'unknown', 'Action',
          'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
          'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

movies = pd.read_csv('u.item', sep='|', names=columns, encoding='latin-1')
movie_names = movies[['item_id', 'movie_title']]
movie_names.head()

Unnamed: 0,item_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [11]:
combined_movies_data = pd.merge(frame, movie_names, on='item_id')
combined_movies_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


### from the above table, there are duplicated entries because more than movie goer reviewed the same movie

In [13]:
combined_movies_data.groupby('item_id')['rating'].count().sort_values(ascending=False).head()

item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

### the most popular movie, item_id '50' was reviewed 583

In [30]:
filter = combined_movies_data['item_id']==50
combined_movies_data[filter]['movie_title'].unique()

array(['Star Wars (1977)'], dtype=object)

### Build Utility Matrix

In [32]:
rating_crosstab = combined_movies_data.pivot_table(values='rating', index='user_id', columns='movie_title', fill_value=0)
rating_crosstab.head()

movie_title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0


### Transpose the Utility Matrix

In [33]:
rating_crosstab.shape

(943, 1664)

In [36]:
#transpose utility matrix
x = rating_crosstab.values.T
x.shape

(1664, 943)

### Decompose the matrix


In [37]:
#decompose transposed matrix
SVD = TruncatedSVD(n_components=12, random_state=17) #random_state=17 in orderto get the same repeatable result
resultant_matrix = SVD.fit_transform(x)
resultant_matrix.shape

(1664, 12)

### Generate Correlation matrix

In [38]:
#correlation matrix will return shape of (1664, 1664) 
#goal is to recommend movie that has highest correlation to movie of interest based on general interest

corr_matrix = np.corrcoef(resultant_matrix)
corr_matrix.shape

(1664, 1664)

### Isolate Movie 'Star Wars (1977)' from correlation matrix

In [58]:
movie_names = rating_crosstab.columns
movie_list = list(movie_names)

starwars = movie_list.index('Star Wars (1977)')
starwars

1398

In [65]:
#isolate the array 1398
corr_starwars = corr_matrix[1398]
corr_starwars.shape

(1664,)

### Recommend a highly correlated movie with 'Star Wars (1977)' based on user interest

In [66]:
#get a list of movie names that are highly correlated with 'Star Wars (1977)'
list(movie_names[(corr_starwars<1.0) & (corr_starwars>0.9)])

['Die Hard (1988)',
 'Empire Strikes Back, The (1980)',
 'Fugitive, The (1993)',
 'Raiders of the Lost Ark (1981)',
 'Return of the Jedi (1983)',
 'Star Wars (1977)',
 'Terminator 2: Judgment Day (1991)',
 'Terminator, The (1984)',
 'Toy Story (1995)']

In [73]:
#get a list of movies with a higher correlation
list(movie_names[(corr_starwars<1.0) & (corr_starwars > 0.94)])

['Return of the Jedi (1983)',
 'Star Wars (1977)',
 'Terminator 2: Judgment Day (1991)']