In [1]:
import os
import numpy as np
import pandas as pd
import networkx as nx
import re

In [2]:
# Initialize empty dictionary of ratings
netflix_ratings = {'movieID':[], 'userID':[], 'rating':[]}

ratings_files = os.listdir('training_set/')[:2500]
print(len(ratings_files))
for x,rating_file in enumerate(ratings_files):
    if x%1000==0:
        print(x/len(ratings_files))
    with open('training_set/'+rating_file) as f:
        for i,line in enumerate(f):
            if i == 0:
                # Get the movie ID
                mID = re.findall('([0-9]*):',line)[0]
            # If not the first line, then record the rating.
            else:
                row = line.split(',')
                netflix_ratings['movieID'].append(mID)
                netflix_ratings['userID'].append(row[0])
                netflix_ratings['rating'].append(row[1])
                # Ignore the date information for now (may use later)

2500
0.0
0.4
0.8


In [3]:
df = pd.DataFrame(netflix_ratings)

In [4]:
# Remove the netflix_ratings dict from memory (this just makes it a very very small None object)
netflix_ratings = None

In [5]:
df.head()

Unnamed: 0,movieID,userID,rating
0,373,643460,4
1,373,349399,5
2,373,1315469,2
3,373,1022903,3
4,373,302715,3


### Building the baseline:
First, set up the dataset object. Then call the recommender in a cross-validate system.

In [6]:
# Build a very basic RS, using just a standard normal distribution about the 
# mean of the ratings in the dataset.
from surprise import NormalPredictor

# Baseline with global mean, user bias, and item bias only.
from surprise.prediction_algorithms.baseline_only import BaselineOnly

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# Create the surprise Dataset object:
data = Dataset.load_from_df(df[['userID', 'movieID', 'rating']], reader)

In [7]:
# Get rid of the dataframe.
df = None
# Pickle the dataset so we have it later. This takes too long and crashes the computer.
# import pickle
# pickle.dump(data, open( "dataset.p", "wb" ))

In [8]:
# Test the baseline validator
cross_validate(BaselineOnly(), data, cv=3, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9369  0.9375  0.9373  0.9372  0.0003  
MAE (testset)     0.7406  0.7408  0.7407  0.7407  0.0001  
Fit time          49.73   64.11   65.55   59.80   7.14    
Test time         215.51  177.91  235.81  209.74  23.99   


{'test_rmse': array([0.9368745 , 0.93747759, 0.93727878]),
 'test_mae': array([0.74057394, 0.74075342, 0.74073485]),
 'fit_time': (49.727607011795044, 64.11378788948059, 65.54763293266296),
 'test_time': (215.51144981384277, 177.90801525115967, 235.81061100959778)}