In [1]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from random import randint
import seaborn as sns
from surprise import Reader, Dataset, SVD, evaluate
%matplotlib inline
sns.set_style("darkgrid")

In [3]:
dataset = pd.read_csv('./ratings.csv', header = None, names = ['userId','movieId', 'rating'], usecols = [0,1,2],skiprows=1)

In [4]:
dataset['rating'] = dataset['rating'].astype('float')
print('Ratings Dataset : {}'.format(dataset.shape))
print(dataset.iloc[::5000000, :])

Ratings Dataset : (26024289, 3)
          userId  movieId  rating
0              1      110     1.0
5000000    51433     1376     4.0
10000000  103170      924     3.0
15000000  156028     1918     3.0
20000000  207965    89864     4.0
25000000  260079      585     3.0


In [5]:
p = dataset.astype(int).groupby('rating')['rating'].agg(['count'])

print(p)

          count
rating         
0        404897
1       1246917
2       3017798
3       8372935
4       9169243
5       3812499


In [6]:
userfreq = dataset.groupby('userId')['userId'].size()
print(userfreq[:10])

userId
1      27
2      22
3      10
4      62
5      26
6       4
7      53
8     113
9      84
10     13
Name: userId, dtype: int64


In [7]:
maxrating = max(list(userfreq))
useridmax = list(userfreq).index(maxrating)
print("Maximum rating given : " + str(maxrating) + " by user with id: " + str(useridmax))
minrating = min(list(userfreq))
useridmin = list(userfreq).index(minrating)
print("Maximum rating given : " + str(minrating) + " by user with id: " + str(useridmin))

Maximum rating given : 18276 by user with id: 45810
Maximum rating given : 1 by user with id: 44


In [8]:
datasetmovies = pd.read_csv('./movies.csv', header = None, names = ['movieId','title', 'genres'], usecols = [0,1,2])
datasetmovies.set_index('movieId', inplace = True)

In [9]:
datasetmovies.head(10)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action
10,GoldenEye (1995),Action|Adventure|Thriller


In [10]:
reader = Reader()

# get just top 100K rows for faster run time
data = Dataset.load_from_df(dataset[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=10)

algo = SVD()
evaluate(algo, data, measures=['RMSE', 'MAE'])



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1


KeyboardInterrupt: 

In [44]:
from surprise.prediction_algorithms import SlopeOne
algo1 = SlopeOne()
evaluate(algo1, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SlopeOne.

------------
Fold 1
RMSE: 0.8685
MAE:  0.6615
------------
Fold 2
RMSE: 0.8659
MAE:  0.6597
------------
Fold 3
RMSE: 0.8660
MAE:  0.6603
------------
------------
Mean RMSE: 0.8668
Mean MAE : 0.6605
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.66151840378977811,
                             0.65969220537182427,
                             0.66026198018314119],
                            'rmse': [0.86852120216184969,
                             0.86593817952121743,
                             0.86603639877684468]})

In [35]:
from surprise.prediction_algorithms import KNNWithMeans
algo1 = KNNWithMeans()
evaluate(algo1, data, measures=['RMSE', 'MAE'])


Evaluating RMSE, MAE of algorithm KNNWithMeans.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8803
MAE:  0.6742
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8818
MAE:  0.6752
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8811
MAE:  0.6751
------------
------------
Mean RMSE: 0.8811
Mean MAE : 0.6749
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.67424925376493527,
                             0.67524588529266583,
                             0.67510772895352855],
                            'rmse': [0.88028112397628977,
                             0.88182395917468892,
                             0.88114303000332339]})

In [10]:

data = Dataset.load_from_df(dataset[['userId', 'movieId', 'rating']], reader)

trainset = data.build_full_trainset()
algo.train(trainset)


In [11]:

data = Dataset.load_from_df(dataset[['userId', 'movieId', 'rating']], reader)

some = pd.DataFrame([[200000,300,5.0]],columns=['userId','movieId','rating'])
dataset.append(some,ignore_index=True)

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0
5,1,1968,4.0
6,1,2762,4.5
7,1,2918,5.0
8,1,2959,4.0
9,1,4226,4.0


In [12]:
def userresult(num,algorithm):
    user_x = datasetmovies.copy()
    user_x = user_x.reset_index()
    # getting full dataset
    user_x['Estimate_Score'] = user_x['movieId'].apply(lambda x: algorithm.predict(num, x).est)

    user_x = user_x.drop('movieId', axis = 1)

    user_x = user_x.sort_values('Estimate_Score', ascending=False)
    return user_x

In [13]:
userresult(4800,algo).head(10)

Unnamed: 0,title,genres,Estimate_Score
5854,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,4.666669
4898,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,4.653481
7042,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,4.627518
257,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.584853
2706,Airplane! (1980),Comedy,4.503335
1171,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,4.469293
1173,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,4.436751
843,"Godfather, The (1972)",Crime|Drama,4.428275
1194,"Blues Brothers, The (1980)",Action|Comedy|Musical,4.386037
1155,Paths of Glory (1957),Drama|War,4.379819


In [58]:
userresult(5000,algo).head(10)

Unnamed: 0,title,genres,Estimate_Score
257,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,5.0
1173,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,5.0
1471,Austin Powers: International Man of Mystery (1...,Action|Adventure|Comedy,4.979927
1171,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,4.909303
220,Clerks (1994),Comedy,4.829663
108,Braveheart (1995),Action|Drama|War,4.813576
315,"Shawshank Redemption, The (1994)",Crime|Drama,4.798143
1017,Die Hard (1988),Action|Crime|Thriller,4.790618
293,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.778193
1945,Saving Private Ryan (1998),Action|Drama|War,4.777121


In [76]:
userresult(247,algo).head(10)

Unnamed: 0,title,genres,Estimate_Score
625,Land and Freedom (Tierra y libertad) (1995),Drama|War,4.864094
3189,"Boondock Saints, The (2000)",Action|Crime|Drama|Thriller,4.763609
16808,Castaway on the Moon (Kimssi pyoryugi) (2009),Comedy|Drama|Romance,4.725522
1856,"Best Years of Our Lives, The (1946)",Drama|War,4.708511
4338,Sweet Smell of Success (1957),Drama|Film-Noir,4.698266
39432,Planet Earth (2006),Documentary,4.696256
6430,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy,4.683621
18766,"Civil War, The (1990)",Documentary|War,4.662194
888,It Happened One Night (1934),Comedy|Romance,4.658199
6502,Umberto D. (1952),Drama,4.647698


In [14]:
import socket      

In [20]:
s = socket.socket()         # Create a socket object
host = socket.gethostname() # Get local machine name
port = 8082                # Reserve a port for your service.
s.bind(("localhost", port))        # Bind to the port
s.listen(5)                 # Now wait for client connection.
while True:
    c, addr = s.accept()     # Establish connection with client.
    numer = randint(1,9000)
    print('Got connection from', addr)
    data = c.recv(1024).decode("ASCII")
    datalist = data.split("-")
    newlist = datalist[1].split("_")
    for i in newlist:
        ass = i.split("@")
        chan = []
        chan.append(datalist[0])
        chan.extend(ass)
        some = pd.DataFrame([chan],columns=['userId','movieId','rating'])
        dataset.append(some)   
    final = userresult(numer,algo).head(10)
    title = final["title"].values.tolist()
    genre = final["genres"].values.tolist()
    str = ''
    for i in range(0,10):
        str = str + title[i] + "@" + genre[i] + "-"
    print(str)    
    c.send(str.encode('ASCII'))
    c.close()
    break
s.close()

Got connection from ('127.0.0.1', 60504)
Citizen Kane (1941)@Drama|Mystery-Raising Arizona (1987)@Comedy-Being John Malkovich (1999)@Comedy|Drama|Fantasy-Sting, The (1973)@Comedy|Crime-Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)@Comedy|War-Monty Python and the Holy Grail (1975)@Adventure|Comedy|Fantasy-Up (2009)@Adventure|Animation|Children|Drama-Usual Suspects, The (1995)@Crime|Mystery|Thriller-Departed, The (2006)@Crime|Drama|Thriller-American Beauty (1999)@Drama|Romance-
