# Movie snobbery - III. Machine learning

We will fit a linear support vector classifier (SVC) and a logistic regression to predict the canonical status of each rating.

In [57]:
# Get the right modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

# Makes a pipeline class to handle column selection
'''Borrowed from https://www.kaggle.com/evanmiller/pipelines-gridsearch-awesome-ml-pipelines'''
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that narrows a DataFrame down to a fixed set of columns.

    Parameters
    ----------
    subset
        Column indexer stored as-is and applied in ``transform``
        (this notebook passes lists of column names).
    """

    def __init__(self, subset):
        self.subset = subset

    def fit(self, *_):
        """Nothing is learned; any arguments are ignored and ``self`` is returned."""
        return self

    def transform(self, X, *_):
        """Return the ``subset`` columns of ``X``, keeping every row."""
        chosen = X.loc[:, self.subset]
        return chosen

In [39]:
###################################################################
## Build the predictor frame X and the label vector y

# Rating-level data, minus the identifier/date columns that are not used as predictors.
# NOTE(review): read_pickle should only ever be pointed at trusted files.
X = pd.read_pickle('moviesnob_df.pkl').drop(
    columns=['movieId', 'tmdbId', 'rating_days_after', 'rating_date', 'release_date']
)

# Per-user table: reset_index() turns its index into an ordinary column
# (presumably userId, which the merge below keys on -- confirm), then one feature is dropped.
snob_features = (
    pd.read_pickle('moviesnob_by_user_df.pkl')
    .reset_index()
    .drop(columns=['canon_pref_meandiff'])
)

# Attach the per-user features to every rating, then discard the join key
X = X.merge(snob_features, on='userId', how='left').drop(columns=['userId'])
del snob_features  # the per-user frame is not needed any more

# Labels: the canonical indicator, pulled out as a NumPy array ...
y = X['canonical'].to_numpy()
# ... and removed from the predictors
X = X.drop(columns=['canonical'])

# Keep the predictor column names around for later
col_names = X.columns

############################################################################################
# Named groups of predictor columns, handed to ColumnSelector further down
rating_only = ['rating']
lens_only = [*rating_only, 'rating_year', 'release_year']
snob_only = ['newold_r', 'statler_waldorf', 'obscurist', 'contrariness']
# The combined set is the lens columns followed by the snobbery indices
lens_plus_snob = [*lens_only, *snob_only]


In [140]:
# Pairwise Pearson correlations between the predictor columns
X.corr(method='pearson')

Unnamed: 0,rating,rating_year,release_year,rating_count,rating_mean,canonical_sum,canon_prop,canonical_mean,canon_pref_stat,newold_r,statler_waldorf,obscurist,contrariness
rating,1.0,0.006754,-0.080317,-0.1375,0.451302,-0.109707,0.151882,0.436094,0.07849,-8.7e-05,0.130182,-0.047057,-0.126539
rating_year,0.006754,1.0,0.302511,0.149242,0.016581,0.109992,-0.047667,0.060732,0.139165,0.061888,-0.139533,-0.123882,-0.014293
release_year,-0.080317,0.302511,1.0,0.000555,-0.002078,-0.090398,-0.200688,0.037242,0.068048,0.11856,-0.060892,0.042219,-0.009427
rating_count,-0.1375,0.149242,0.000555,1.0,-0.304675,0.866272,-0.373503,-0.274419,-0.199122,-0.094522,-0.216034,0.039012,-0.074005
rating_mean,0.451302,0.016581,-0.002078,-0.304675,1.0,-0.243089,0.336541,0.966999,0.175299,-0.000194,0.288458,-0.104716,-0.280387
canonical_sum,-0.109707,0.109992,-0.090398,0.866272,-0.243089,1.0,-0.164755,-0.248506,-0.23305,-0.203757,-0.232782,-0.050606,-0.124325
canon_prop,0.151882,-0.047667,-0.200688,-0.373503,0.336541,-0.164755,1.0,0.220543,0.019417,-0.157532,0.184702,-0.159991,-0.091672
canonical_mean,0.436094,0.060732,0.037242,-0.274419,0.966999,-0.248506,0.220543,1.0,0.356478,0.11397,0.256458,-0.174149,-0.244576
canon_pref_stat,0.07849,0.139165,0.068048,-0.199122,0.175299,-0.23305,0.019417,0.356478,1.0,0.478864,0.168498,-0.513319,0.285859
newold_r,-8.7e-05,0.061888,0.11856,-0.094522,-0.000194,-0.203757,-0.157532,0.11397,0.478864,1.0,0.007271,-0.086195,0.132086


In [75]:
# 70/30 train/test split, stratified on the canonical label, with a fixed seed
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=2,
)

In [162]:
# Trial run on a thinned dataset: keep every 100th row to check that everything works.
# Note that X_test / y_test are this subsample (not the hold-out set), and that the
# split below overwrites Xtrain / Xtest / ytrain / ytest from the full-data split.
X_test = X.iloc[::100]
y_test = y[::100]
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_test,
    y_test,
    test_size=0.3,
    stratify=y_test,
    random_state=2,
)


In [188]:
# Grid-search a LinearSVC over several feature sets.
# Each pipeline selects columns, imputes missing data, scales, and then fits the SVM.

score_list_svc_unbalanced = []

# Imputer and scaler objects used as pipeline steps
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
scaler = StandardScaler()

# L1-penalised linear SVM solved in the primal (dual=False) because n_samples >> n_features.
# class_weight is left at its default, so the class imbalance is NOT corrected in this
# "unbalanced" run; pass class_weight='balanced' to reweight the classes.
# Alternative classifier: LogisticRegression(dual=False)
clf = LinearSVC(penalty='l1', dual=False)

# Cross-validation grid for the regularization parameter of the 'clf' step
param_grid = {'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Feature sets to try, one grid search each
variable_lists = [rating_only, lens_only, snob_only, lens_plus_snob]
for variable_list in variable_lists:
    columnselector = ColumnSelector(variable_list)
    # Use the objects configured above instead of silently building fresh duplicates,
    # and give the selection step a name that is valid for every feature set.
    pipe = Pipeline([('selector', columnselector),
                     ('imputer', imputer),
                     ('scaler', scaler),
                     ('clf', clf)])
    # Stratified 5-fold is the default CV here.
    # NOTE(review): for single-label classification 'f1_micro' is identical to plain
    # accuracy, so it does not compensate for class imbalance. It is kept so scores stay
    # comparable with earlier runs; consider 'f1', 'f1_macro' or 'balanced_accuracy'.
    model = GridSearchCV(pipe, param_grid, scoring='f1_micro', return_train_score=True)
    # Fit on the training split, then predict the held-out split
    model.fit(Xtrain, ytrain)
    ypred = model.predict(Xtest)
    # Rows are true labels, columns are predicted labels. crosstab only lists labels that
    # actually occur, so a model that never predicts a class yields no column for it.
    confusion_matrix = pd.crosstab(ytest, ypred)
    score_list_svc_unbalanced.append((variable_list, model.best_score_, confusion_matrix))





In [189]:
score_list_svc_unbalanced

[(['rating'], 0.7206400560512705, col_0    0.0
  row_0       
  0.0    59950
  1.0    23240),
 (['rating', 'rating_year', 'release_year'],
  0.7505821501432193,
  col_0    0.0   1.0
  row_0             
  0.0    57190  2760
  1.0    17883  5357),
 (['newold_r', 'statler_waldorf', 'obscurist', 'contrariness'],
  0.7211037154573743,
  col_0    0.0  1.0
  row_0            
  0.0    59873   77
  1.0    23156   84),
 (['rating',
   'rating_year',
   'release_year',
   'newold_r',
   'statler_waldorf',
   'obscurist',
   'contrariness'],
  0.7508861046427762,
  col_0    0.0   1.0
  row_0             
  0.0    57194  2756
  1.0    17846  5394)]

In [184]:
# Full model - including snobbery indices

# Plot a basic confusion matrix
# (disabled: needs `import matplotlib.pyplot as plt`, which is not among the imports above)
#plt.imshow(metrics.confusion_matrix(ypred, ytest), interpolation='nearest', cmap=plt.cm.binary)
#plt.colorbar()
#plt.xlabel("true label")
#plt.ylabel("predicted label")

NameError: name 'plt' is not defined