<a href="https://colab.research.google.com/github/jpgerber/Recommender-for-movie-snobs/blob/master/1_Moviesnob_ML_ends.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### ML approach - top and bottom on each indicator

In [None]:
import pandas as pd
import numpy as np
import random
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

df = pd.read_csv('/content/gdrive/My Drive/moviesnob_ml_df.zip')


In [2]:
! pip install surprise
from surprise import Dataset, Reader, accuracy, SVDpp, SVD
from surprise.model_selection import train_test_split, GridSearchCV

df.head()

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.5MB 5.0MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.0-cp36-cp36m-linux_x86_64.whl size=1675734 sha256=dfbcac883a02120dbd3bfd9e9629a343434f87d27e88481350b7937cf24f4001
  Stored in directory: /root/.cache/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.0 surprise-0.1


Unnamed: 0,userId,movieId,rating,canonical,newold_r,statler_waldorf,obscurist,contrariness
0,1,307,3.5,1,-0.43949,0.0,-0.019481,0.573882
1,1,481,3.5,0,-0.43949,0.0,-0.019481,0.573882
2,1,1091,1.5,0,-0.43949,0.0,-0.019481,0.573882
3,1,1257,4.5,0,-0.43949,0.0,-0.019481,0.573882
4,1,1449,4.5,0,-0.43949,0.0,-0.019481,0.573882


In [28]:
# First filter the dataset because extreme values reflect infrequent raters

# Filtering out the extremes here
new_old_filter = df.newold_r.isin([-1,1])
statler_waldorf_filter = df.statler_waldorf == 1
obscurist_filter = df.obscurist.isin([-1,1])
contrariness_filter = df.contrariness > 2.5

filter = new_old_filter | statler_waldorf_filter | obscurist_filter | contrariness_filter
df = df[filter == 0]

print(df.shape)
df.head()

(27681984, 8)


Unnamed: 0,userId,movieId,rating,canonical,newold_r,statler_waldorf,obscurist,contrariness
0,1,307,3.5,1,-0.43949,0.0,-0.019481,0.573882
1,1,481,3.5,0,-0.43949,0.0,-0.019481,0.573882
2,1,1091,1.5,0,-0.43949,0.0,-0.019481,0.573882
3,1,1257,4.5,0,-0.43949,0.0,-0.019481,0.573882
4,1,1449,4.5,0,-0.43949,0.0,-0.019481,0.573882


In [7]:
# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k')

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)

# We'll use the famous SVD algorithm.
algo = SVDpp()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
print(accuracy.rmse(predictions))

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k
RMSE: 0.9246
0.9245871858367066


### Doing the indicator tuning (first version)

In [21]:
results = pd.DataFrame(columns=['indicator','RMSE_uppersplit','RMSE_lowersplit'])
#results.set_index([pd.Index(['statler_waldorf','newold_r','obscurist','contrariness'])])
saved_params = pd.DataFrame(columns=['indicator','n_factors','n_epochs','lr_all','reg_all'])
fitted_rmse = []

In [25]:
# This cell is my machine learning part

sort_key = ['statler_waldorf','newold_r','obscurist','contrariness']
for item in sort_key:
    # Set up parameter grid
    param_grid = {'n_factors':[50,100,150],'n_epochs':[20,30],  'lr_all':[0.005,0.01],'reg_all':[0.02,0.1]}
    print('Making rating samples')
    # Set up the raw samples
    df = df.sort_values(by=item, ascending=False)
    ratings_upper = df.iloc[0:100000,:]
    ratings_lower = df.iloc[-100000:,:]
    reader = Reader(rating_scale=(1, 5)) # Make a reader
    # Now for the upper half (load reader)
    data = Dataset.load_from_df(ratings_upper[['userId', 'movieId', 'rating']], reader)
    raw_ratings = data.raw_ratings #get raw ratings
    random.shuffle(raw_ratings) # shuffle
    threshold = int(.75 * len(raw_ratings))
    A_raw_ratings = raw_ratings[:threshold]
    B_raw_ratings = raw_ratings[threshold:] #split into two samples
    data.raw_ratings = A_raw_ratings #keep sample A in reader
    print('Loaded data, now grid search...')
    gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5) # instantiate the model
    print('Training on {} upper'.format(item))
    gs.fit(data) #fit it!
    params = gs.best_params['rmse'] #get best params
    saved_params = saved_params.append(params, ignore_index=True) #append to df
    print(saved_params.head())
    print('Saved parameters for upper {}'.format(item))
    tuned_algo = gs.best_estimator['rmse'] #get tuned version
    trainset = data.build_full_trainset() #surprise requires the full trainset
    testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
    predictions = algo.test(testset) #make predictions
    rmse_uppersplit = accuracy.rmse(predictions, verbose=False) #get accuracy score
    print('Unbiased accuracy is {}'.format(rmse_uppersplit))
    fitted_rmse.append(rmse_uppersplit) # add that to my results list
    print(fitted_rmse)
    
    ## Start the other half
    data = Dataset.load_from_df(ratings_lower[['userId', 'movieId', 'rating']], reader)
    raw_ratings = data.raw_ratings
    random.shuffle(raw_ratings)
    threshold = int(.75 * len(raw_ratings))
    A_raw_ratings = raw_ratings[:threshold]
    B_raw_ratings = raw_ratings[threshold:]
    data.raw_ratings = A_raw_ratings
    print('Loaded data, now grid search...')
    gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
    print('Training on {} lower'.format(item))
    gs.fit(data)
    params = gs.best_params['rmse']
    saved_params = saved_params.append(params, ignore_index=True)
    print(saved_params.head())
    print('Saved parameters for lower {}'.format(item))
    tuned_algo = gs.best_estimator['rmse']
    trainset = data.build_full_trainset()
    testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
    predictions = algo.test(testset)
    rmse_lowersplit = accuracy.rmse(predictions, verbose=False)
    print('Unbiased accuracy is {}'.format(rmse_lowersplit))
    fitted_rmse.append(rmse_lowersplit)
    print(fitted_rmse)
    new_row = pd.DataFrame({"RMSE_uppersplit":rmse_uppersplit, "RMSE_lowersplit":rmse_lowersplit}, index=[item])
    results = results.append(new_row)



Making rating samples
Loaded data, now grid search...
Training on statler_waldorf upper
   indicator  n_factors  n_epochs  lr_all  reg_all
0        NaN       50.0      30.0    0.01      0.1
Saved parameters for upper statler_waldorf
Unbiased accuracy is 1.7094967016054754
[1.7094967016054754]
Loaded data, now grid search...
Training on statler_waldorf lower
   indicator  n_factors  n_epochs  lr_all  reg_all
0        NaN       50.0      30.0   0.010      0.1
1        NaN       50.0      30.0   0.005      0.1
Saved parameters for lower statler_waldorf
Unbiased accuracy is 0.711035355520385
[1.7094967016054754, 0.711035355520385]
Making rating samples
Loaded data, now grid search...
Training on newold_r upper
   indicator  n_factors  n_epochs  lr_all  reg_all
0        NaN       50.0      30.0   0.010      0.1
1        NaN       50.0      30.0   0.005      0.1
2        NaN       50.0      30.0   0.010      0.1
Saved parameters for upper newold_r
Unbiased accuracy is 1.2619136591700717
[1.7

### Now for turicreate

In [None]:
# Use the factorizer model and add the features one by one
