### CSC478 Final Project: MovieLens
### Jun Tae Son |  Yuchen Wu | Tzu Hao Peng

In [106]:
import os
import math
import pandas as pd
import numpy as np
from numpy import *
from numpy import linalg as la
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error, mean_absolute_error, jaccard_similarity_score
from sklearn import preprocessing
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.cross_validation import train_test_split


# change the working directory to the folder holding the MovieLens data files
# (u.data, u.item, u.user, u.genre are read with relative paths below).
# Set the MOVIELENS_DIR environment variable to use a different location
# without editing the notebook; the original path remains the default.
DATA_DIR = os.environ.get('MOVIELENS_DIR', 'C:\\Users\\rkfql\\Desktop\\Movielens-02')
os.chdir(DATA_DIR)

In [36]:
# load the rating records (tab-separated file, column names supplied here)
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', delimiter='\t', header=None, names=rating_cols)
print(ratings.shape)
ratings.head()

(100000, 4)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [37]:
# load the movie catalogue: descriptive fields first, then one indicator column per genre
item_cols = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv('u.item', delimiter='|', header=None, names=item_cols, encoding='iso-8859-1')
print(movies.shape)
movies.head()

(1682, 24)


Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [38]:
# load the user table (pipe-separated file, column names supplied here)
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('u.user', delimiter='|', header=None, names=user_cols)
print(users.shape)
users.head()

(943, 5)


Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


# Matrix Factorization

In [40]:
# build the user x movie rating matrix: one row per user_id, one column per movie_id
df = ratings.pivot(index='user_id', columns='movie_id', values='rating')
# (user, movie) pairs without a rating are represented as 0
df = df.fillna(0)
print(df.shape)
df.head()

(943, 1682)


movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# convert the (not yet normalized) user x movie rating DataFrame into a numpy array.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
# .values returns the same array and works on old and new pandas alike.
df_array = df.values

# normalize df by each user's mean rating.
# NOTE: unrated movies were filled with 0 when df was built, so this mean is taken
# over every movie column, not only over the movies the user actually rated.
user_ratings_mean = np.mean(df_array, axis = 1)
print(len(user_ratings_mean))
# reshape(-1, 1) turns the per-user means into a column so they broadcast across movies
df_norm = df_array - user_ratings_mean.reshape(-1, 1)
df_norm

943


array([[ 4.41617122,  2.41617122,  3.41617122, ..., -0.58382878,
        -0.58382878, -0.58382878],
       [ 3.86325803, -0.13674197, -0.13674197, ..., -0.13674197,
        -0.13674197, -0.13674197],
       [-0.08977408, -0.08977408, -0.08977408, ..., -0.08977408,
        -0.08977408, -0.08977408],
       ..., 
       [ 4.9470868 , -0.0529132 , -0.0529132 , ..., -0.0529132 ,
        -0.0529132 , -0.0529132 ],
       [-0.20035672, -0.20035672, -0.20035672, ..., -0.20035672,
        -0.20035672, -0.20035672],
       [-0.34066587,  4.65933413, -0.34066587, ..., -0.34066587,
        -0.34066587, -0.34066587]])

# Movie recommender based on SVD

In [47]:
# truncated Singular Value Decomposition of the mean-centred rating matrix (k = 50 factors)
U, singular_values, V = svds(df_norm, k = 50)
# put the singular values on the diagonal of a matrix for the matrix products that follow
sigma = np.diag(singular_values)

In [50]:
# rebuild the rating estimates from the SVD factors, then add each user's mean back
pred_ratings = U.dot(sigma).dot(V) + user_ratings_mean[:, np.newaxis]
# rows keep a default 0-based positional index; columns carry the movie ids from df
pred_df = pd.DataFrame(data=pred_ratings, columns=df.columns)
print(pred_df.shape)
pred_df.head()

(943, 1682)


movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,6.488436,2.959503,1.634987,3.024467,1.656526,1.659506,3.630469,0.240669,1.791518,3.347816,...,0.011976,-0.092017,-0.074553,-0.060985,0.009427,-0.035641,-0.039227,-0.037434,-0.025552,0.023513
1,2.347262,0.129689,-0.098917,0.328828,0.159517,0.481361,0.213002,0.097908,1.8921,0.671,...,0.003943,-0.026939,-0.03546,-0.029883,-0.027153,-0.015244,-0.008277,-0.01176,0.011639,-0.046924
2,0.291905,-0.26383,-0.151454,-0.179289,0.013462,-0.088309,-0.057624,0.568764,-0.018506,0.280742,...,-0.028964,-0.031622,0.045513,0.026089,-0.021705,0.002282,0.032363,0.017322,-0.006644,-0.00948
3,0.36641,-0.443535,0.041151,-0.007616,0.055373,-0.080352,0.299015,-0.010882,-0.160888,-0.118834,...,0.020069,0.015981,-0.000182,0.005593,0.026634,0.023562,0.036405,0.029984,0.015612,-0.008713
4,4.263488,1.937122,0.052529,1.04935,0.652765,0.002836,1.730461,0.870584,0.341027,0.569055,...,0.019973,-0.053521,-0.017242,-0.007137,-0.038987,0.010338,0.004869,0.007603,-0.020575,0.00333


In [65]:
# a function for recommender system
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    """Recommend the not-yet-rated movies with the highest predicted rating for one user.

    Parameters
    ----------
    predictions_df : DataFrame
        Predicted ratings, one row per user and one column per movie id. Rows are
        looked up by position: row ``userID - 1`` is taken to belong to ``userID``,
        whatever the frame's index labels are.
    userID : int
        1-based user id.
    movies_df : DataFrame
        Movie metadata; must contain a ``movie_id`` column.
    original_ratings_df : DataFrame
        Observed ratings with ``user_id``, ``movie_id`` and ``rating`` columns.
    num_recommendations : int, default 5
        Maximum number of movies to return.

    Returns
    -------
    (user_full, recommendations)
        ``user_full`` is the user's existing ratings merged with ``movies_df`` and
        sorted by rating, highest first. ``recommendations`` holds the rows of
        ``movies_df`` the user has not rated, ordered by predicted rating (highest
        first) and cut to ``num_recommendations`` rows; the prediction column
        itself is not included.

    Raises
    ------
    IndexError
        If ``userID`` does not correspond to a row of ``predictions_df``.
    """
    user_row_number = userID - 1 # UserID starts at 1, not 0
    # Guard explicitly: a userID <= 0 would otherwise wrap around through negative
    # positional indexing and silently return another user's predictions.
    if not 0 <= user_row_number < len(predictions_df):
        raise IndexError('userID {0} is out of range for predictions with {1} rows'
                         .format(userID, len(predictions_df)))

    # Put the user's predictions into a two-column frame with explicit column names,
    # so the merge below does not depend on the index/column labels of predictions_df.
    user_predictions = predictions_df.iloc[user_row_number]
    predictions = pd.DataFrame({'movie_id': user_predictions.index,
                                'predictions': user_predictions.values})

    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.user_id == userID]
    user_full = (user_data
                 .merge(movies_df, how='left', on='movie_id')
                 .sort_values(['rating'], ascending=False))

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending the highest {0} predicted ratings movies:'.format(num_recommendations))

    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    unseen_movies = movies_df[~movies_df['movie_id'].isin(user_full['movie_id'])]
    recommendations = (unseen_movies
                       .merge(predictions, how='left', on='movie_id')
                       .sort_values('predictions', ascending=False)
                       # 'predictions' is the last column after the merge; drop it
                       .iloc[:num_recommendations, :-1])

    return user_full, recommendations

In [68]:
# try the recommender for user_id = 837
orig_r, pred_r = recommend_movies(pred_df, 837, movies, ratings, num_recommendations=5)
pred_r

User 837 has already rated 46 movies.
Recommending the highest 5 predicted ratings movies.


Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
11,14,"Postino, Il (1994)",01-Jan-1994,,"http://us.imdb.com/M/title-exact?Postino,%20Il...",0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
107,116,Cold Comfort Farm (1995),23-Apr-1996,,http://us.imdb.com/M/title-exact?Cold%20Comfor...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
116,126,"Spitfire Grill, The (1996)",06-Sep-1996,,http://us.imdb.com/M/title-exact?Spitfire%20Gr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42,50,Star Wars (1977),01-Jan-1977,,http://us.imdb.com/M/title-exact?Star%20Wars%2...,0,1,1,0,0,...,0,0,0,0,0,1,1,0,1,0


In [147]:
def recommended_title(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    """Print the genres and the title of each movie recommended to a user.

    Calls ``recommend_movies`` with the same arguments and, for every returned
    recommendation, prints the movie's genre list followed by its title.

    Relies on the notebook-level ``movie_genre_dic`` (movie_id -> list of genre
    names), which must have been built before this function is called.

    Returns the ``(user_full, recommendations)`` pair from ``recommend_movies``.
    """
    # Forward the caller's num_recommendations instead of a hard-coded 5.
    user_full, recommendations = recommend_movies(
        predictions_df, userID, movies_df, original_ratings_df,
        num_recommendations=num_recommendations)

    # Walk the rows actually returned: there may be fewer than num_recommendations
    # when the user has few unrated movies left.
    for movie_id, title in zip(recommendations.movie_id, recommendations.movie_title):
        print(movie_genre_dic[movie_id])
        print(title)

    return user_full, recommendations

In [148]:
# print genres and titles of the top 5 recommendations for user_id = 1
orig_r, pred_r = recommended_title(pred_df, 1, movies, ratings, num_recommendations=5)

User 1 has already rated 272 movies.
Recommending the highest 5 predicted ratings movies.
['Action', 'Crime', 'Thriller']
Heat (1995)
['Drama']
An Unforgettable Summer (1994)
['Comedy']
Friday (1995)
['Thriller']
Assassins (1995)
['Romance']
Kissed (1996)


# Dictionary for analysis

In [118]:
# make genre dictionary: 0-based row position in u.genre -> genre name
genre = np.array(pd.read_csv('u.genre', sep='|', names=['genre', 'key']).iloc[:, 0])
genre_dic = dict(enumerate(genre))

genre_dic

{0: 'unknown',
 1: 'Action',
 2: 'Adventure',
 3: 'Animation',
 4: "Children's",
 5: 'Comedy',
 6: 'Crime',
 7: 'Documentary',
 8: 'Drama',
 9: 'Fantasy',
 10: 'Film-Noir',
 11: 'Horror',
 12: 'Musical',
 13: 'Mystery',
 14: 'Romance',
 15: 'Sci-Fi',
 16: 'Thriller',
 17: 'War',
 18: 'Western'}

In [119]:
# keep only movie_id plus the genre indicator columns, as a plain array
temp = np.array(
    movies.drop(['movie_title', 'release_date', 'video_release_date', 'IMDb_URL'], axis=1)
)

# movie_id -> that movie's row of genre indicators
movie_dic = {row[0]: row[1:] for row in temp}

# one list of genre names per movie, looked up by ids 1..len(movie_dic) in order
movie_genre = [
    [genre_dic[pos] for pos, flag in enumerate(movie_dic[movie_id]) if flag != 0]
    for movie_id in range(1, len(movie_dic) + 1)
]

In [120]:
# movie_id -> list of genre names; row i of temp is paired with movie_genre[i]
movie_genre_dic = {row[0]: movie_genre[pos] for pos, row in enumerate(temp)}