In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from surprise import SVD
from surprise import accuracy
from surprise import NormalPredictor
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
#from adjustText import adjust_text
%matplotlib inline


In [19]:
#Reading ratings file:
dataCols = ['user_id', 'movie_id', 'rating']
dataDf = pd.read_csv('../data/data.txt', sep='\t', names=dataCols, encoding='latin-1')
dataDf.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [20]:
#Reading items file:
movieCols = ['movie id', 'movie title', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
moviesDf = pd.read_csv('../data/movies.txt', sep='\t', names=movieCols, encoding='latin-1')
moviesDf.head()

Unnamed: 0,movie id,movie title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [21]:
#Reading test file:
testSetCols = ['user_id', 'movie_id', 'rating']
testSetDf = pd.read_csv('../data/test.txt', sep='\t', names=testSetCols, encoding='latin-1')
testSetDf.head()

Unnamed: 0,user_id,movie_id,rating
0,707,766,3
1,943,188,4
2,772,313,5
3,828,26,3
4,854,514,4


In [22]:
#Reading train file:
trainSetCols = ['user_id', 'movie_id', 'rating']
trainSetDf = pd.read_csv('../data/train.txt', sep='\t', names=trainSetCols, encoding='latin-1')
trainSetDf.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,166,346,1
4,298,474,4


In [23]:
reader = Reader(rating_scale=(1, 5))

dataSet = Dataset.load_from_df(dataDf, reader)
trainSet, testSet = train_test_split(dataSet, test_size = 0.1)
#= dataSet.build_full_trainset()

#trainSet = Dataset.load_from_df(trainSetDf, reader)
#trainSet = Dataset.load_from_file('data/data.txt', reader)



In [24]:
k = 20
algo = SVD(n_factors = k, biased = True)

algo.fit(trainSet)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10dbf6c50>

In [25]:
testPred = algo.test(testSet)

In [26]:
accuracy.rmse(testPred, verbose=True)

RMSE: 0.9271


0.92711257470944308

In [27]:
uMatrix = algo.pu
vMatrix = algo.qi
uMatrixBias = algo.bu
vMatrixBias = algo.bi
print "Matrix Shapes:"
print "U: " + str(uMatrix.shape)
print "V: " + str(vMatrix.shape)
print "U Bias: " + str(uMatrixBias.shape)
print "V Bias: " + str(vMatrixBias.shape)
print "Train Set Users: " + str(trainSet.n_users)
print "Train Set Items: " + str(trainSet.n_items)
print "Train Set Ratings: " + str(trainSet.n_ratings)
print "Test Set Len: " + str(len(testSet))

Matrix Shapes:
U: (943, 20)
V: (1668, 20)
U Bias: (943,)
V Bias: (1668,)
Train Set Users: 943
Train Set Items: 1668
Train Set Ratings: 90000
Test Set Len: 10000


In [28]:
movie_info = np.loadtxt('../data/movies.txt', dtype="str", delimiter="\t", usecols=(0, 1, 3, 7, 16))
print movie_info
data = np.loadtxt('../data/data.txt').astype(int)
#Y_train = np.loadtxt('../data/train.txt').astype(int)
#Y_test = np.loadtxt('../data/test.txt').astype(int)

[['1' 'Toy Story (1995)' '0' '1' '0']
 ['2' 'GoldenEye (1995)' '1' '0' '0']
 ['3' 'Four Rooms (1995)' '0' '0' '0']
 ..., 
 ['1680' 'Sliding Doors (1998)' '0' '0' '1']
 ['1681' 'You So Crazy (1994)' '0' '1' '0']
 ['1682' 'Scream of Stone (Schrei aus Stein) (1991)' '0' '0' '0']]


In [29]:
#model.score(Y_test)
a, sigma, b = np.linalg.svd(vMatrix.T)
print vMatrix.shape, a.shape
a_t =  a #np.transpose(a)

#movie ID starts at 1, but matrix starts at 0
v_proj = np.transpose(np.dot(a_t[:2], vMatrix.T))

x = []
y = []
for i in v_proj:
    x.append(i[0])
    y.append(i[1])

ratings = {}
for user, movie_id, rating in data:
    if movie_id in ratings:
        ratings[movie_id].append(rating)
    else:
        ratings[movie_id] = [rating]
#x = v_proj[0]
#y = v_proj[1]
#print(x)

print v_proj.shape


(1668, 20) (20, 20)
(1668, 2)


In [30]:
# Setup

ids = movie_info[:,0].astype(int)
movie_names = movie_info[:,1]

In [31]:
# 1. 10 movies of our choice from the MovieLens dataset 

plt.scatter(x[2:12], y[2:12])
texts = []
for j, txt in enumerate(movie_names[2:12]):
    #texts.append(plt.text(x[2:12][j], y[2:12][j], txt))
    plt.annotate(txt, (x[j], y[j]))
#adjust_text(texts)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.title('10 Movies of Our Choice')
plt.savefig('Shelf-choice.png')
plt.clf()

<matplotlib.figure.Figure at 0x10a7e9a10>

In [32]:
# 2. All ratings of the ten most popular movies 

max_10 = dict(sorted(ratings.items(), key=lambda r: len(r[1]), reverse=True)[:10])
x_pop = []
y_pop = []
top_ratings = []
top_ratings = max_10.keys()
movie_title = []
print(top_ratings)
counter = 0
for i in v_proj:
    counter += 1
    if counter in top_ratings:
        x_pop.append(i[0])
        y_pop.append(i[1])
        movie_title.append(movie_names[counter])
print(movie_title)

plt.scatter(x_pop, y_pop)
texts = []
for j, txt in enumerate(movie_title):
    #texts.append(plt.text(x_pop[j], y_pop[j], txt))
    plt.annotate(txt, (x_pop[j], y_pop[j]))
#adjust_text(texts)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.title('10 Most Popular Movies')
plt.savefig('Shelf-popular.png')
plt.clf()

[288, 1, 258, 100, 294, 300, 50, 181, 121, 286]
['GoldenEye (1995)', 'Legends of the Fall (1994)', 'Heavy Metal (1981)', '"Cable Guy, The (1996)"', 'GoodFellas (1990)', 'George of the Jungle (1997)', "Marvin's Room (1996)", 'Evita (1996)', 'Breakdown (1997)', 'In & Out (1997)']


<matplotlib.figure.Figure at 0x10f682a90>

In [33]:
# 3. All ratings of the ten best movies 

best_10 = dict(sorted(ratings.items(), key=lambda r: sum(r[1])/len(r[1]), reverse=True)[:10])
x_best = []
y_best = []
best = []
best = best_10.keys()
print(best)
count = 0
for i in v_proj:
    count += 1
    if count in best:
        x_best.append(i[0])
        y_best.append(i[1])

for j, txt in enumerate(movie_title):
    plt.annotate(txt, (x_best[j], y_best[j]))
plt.scatter(x_best, y_best)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.title('10 Best Movies')
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.savefig('Shelf-best.png')
plt.clf()

[1536, 1122, 1189, 1293, 814, 1201, 1653, 1467, 1500, 1599]


<matplotlib.figure.Figure at 0x10f6c3710>

In [34]:
# 4. 10 ratings of movies from three genres of your choice


ids = movie_info[:,0].astype(int)
movie_names = movie_info[:,1]


action = (movie_info[:,2].astype(int))
action_movies = dict((k, v) for k, v in zip(ids, action) if v == 1)
action_ratings_dict = dict((k, ratings[k]) for k in action_movies.keys())
x_action = []
y_action = []
action_ratings = []
action_ratings = action_ratings_dict.keys()

comedy = movie_info[:,3].astype(int)
comedy_movies = dict((k, v) for k, v in zip(ids, comedy) if v == 1)
comedy_ratings_dict = dict((k, ratings[k]) for k in comedy_movies.keys())
x_comedy = []
y_comedy = []
comedy_ratings = []
comedy_ratings = comedy_ratings_dict.keys()

romance = movie_info[:,4].astype(int)
romance_movies = dict((k, v) for k, v in zip(ids, romance) if v == 1)
romance_ratings_dict = dict((k, ratings[k]) for k in romance_movies.keys())
x_romance = []
y_romance = []
romance_ratings = []
romance_ratings = romance_ratings_dict.keys()

count = 0
for i in v_proj:
    count += 1
    if count in action_ratings:
        x_action.append(i[0])
        y_action.append(i[1])

    if count in comedy_ratings:
        x_comedy.append(i[0])
        y_comedy.append(i[1])

    if count in romance_ratings:
        x_romance.append(i[0])
        y_romance.append(i[1])

plt.scatter(x_action[2:12], y_action[2:12], label = "Action")
plt.scatter(x_comedy[2:12], y_comedy[2:12], color = 'orange', label = "Comedy")
plt.scatter(x_romance[2:12], y_romance[2:12], color = 'green', label = "Romance")
plt.legend()
plt.title("Three Genres")
plt.savefig('Shelf-genres.png')
plt.clf()

<matplotlib.figure.Figure at 0x10a7e90d0>