In [1]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

In [2]:
import pandas as pd
unames = ['userId', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('users.dat', sep='::', 
                      header=None, names=unames, engine='python')
rnames = ['userId', 'movieId', 'rating', 'timestamp']
ratings = pd.read_table('ratings.dat', sep='::', header=None, names=rnames, engine='python')

mnames = ['movieId', 'title', 'genres']
movies = pd.read_table('movies.dat', sep='::', header=None, names=mnames, engine='python')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [6]:
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,5.0,978824268.0
1,1,Toy Story (1995),Animation|Children's|Comedy,6.0,4.0,978237008.0
2,1,Toy Story (1995),Animation|Children's|Comedy,8.0,4.0,978233496.0
3,1,Toy Story (1995),Animation|Children's|Comedy,9.0,5.0,978225952.0
4,1,Toy Story (1995),Animation|Children's|Comedy,10.0,5.0,978226474.0


In [7]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [8]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),5.0
1,6.0,Toy Story (1995),4.0
2,8.0,Toy Story (1995),4.0
3,9.0,Toy Story (1995),5.0
4,10.0,Toy Story (1995),5.0


In [9]:
dataset.rating.min()

1.0

In [10]:
dataset.rating.max()

5.0

In [11]:
#dataset = dataset[0:1000]

In [12]:
reader = Reader(rating_scale=(1.0, 5.0))

In [13]:
data = Dataset.load_from_df(dataset, reader)

In [14]:
trainset, testset = train_test_split(data, test_size=.15)

In [15]:
algo = KNNWithMeans(k=5, sim_options={'name': 'msd', 'user_based': True})

In [16]:
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7d14b70>

In [17]:
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 1.0071


1.0070553245871927

In [18]:
new_pred = algo.predict(uid=2, iid='Fight Club (1999)')
new_pred

Prediction(uid=2, iid='Fight Club (1999)', r_ui=None, est=4.962070126990104, details={'actual_k': 5, 'was_impossible': False})

In [19]:
len(dataset)

1000209