# Model Training Testing and Evaluation 
This notebook contains the code relating to training models and evaluating them 

In [1]:
import pandas as pd 
import numpy as np 

In [2]:
df = pd.read_csv('merged70k.csv').set_index('tconst')
df.dropna(inplace=True) 
df

Unnamed: 0_level_0,averageRating,numVotes,isAdult,startYear,runtimeMinutes,actor_nm0555550,actor_nm0245596,actor_nm0068501,actor_nm0001825,actor_nm0138287,...,writers_nm5238285,writers_nm3298048,writers_nm1018426,writers_nm1889381,writers_nm9555338,writers_nm1493942,writers_nm0470741,writers_nm2562555,writers_nm2010047,writers_non_100_category
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt2126403,7.2,2623.0,0,2013.0,97.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0252695,7.8,8943.0,0,1975.0,79.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt2425486,6.9,8839.0,0,2013.0,110.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0246266,6.9,2690.0,0,2000.0,88.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt5029208,6.3,754.0,0,2017.0,126.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt0427873,6.7,355.0,0,2005.0,95.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt6494358,7.5,958.0,0,2017.0,123.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0297753,6.0,4016.0,0,2003.0,85.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0019760,8.4,24481.0,0,1929.0,68.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
from sklearn.model_selection import train_test_split

x_train, x_test = train_test_split(df, test_size=0.25, shuffle=True)  

y_train = list(x_train['averageRating'])
y_test = list(x_test['averageRating'])

x_train.drop(columns=['averageRating','numVotes'], inplace=True) 
x_test.drop(columns=['averageRating','numVotes'], inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [4]:
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score 

def eval_preds(preds, actual):
    mse = mean_squared_error(actual, preds) 
    r2 = r2_score(actual, preds)
    
    print(f"MSE: {mse} R2: {r2}")
    

In [5]:
# Entire space TSNE plot -> Takes a while to run so keep that in mind 
from sklearn.manifold import TSNE

import plotly.express as px 

encodings = TSNE(n_components=2).fit_transform(x_train) 



In [6]:
df.keys()

Index(['averageRating', 'numVotes', 'isAdult', 'startYear', 'runtimeMinutes',
       'actor_nm0555550', 'actor_nm0245596', 'actor_nm0068501',
       'actor_nm0001825', 'actor_nm0138287',
       ...
       'writers_nm5238285', 'writers_nm3298048', 'writers_nm1018426',
       'writers_nm1889381', 'writers_nm9555338', 'writers_nm1493942',
       'writers_nm0470741', 'writers_nm2562555', 'writers_nm2010047',
       'writers_non_100_category'],
      dtype='object', length=337)

In [7]:
x = encodings[:,0]
y = encodings[:,1]

fig = px.scatter(x=x, y=y, color=y_train) 
fig.show() 

In [8]:
from sklearn.linear_model import LinearRegression

clf = LinearRegression().fit(x_train, y_train) 
# Pure Linear model is terrible 

In [9]:
preds = clf.predict(x_test)

# for idx in range(10):
#     print(f"Actual {y_test[idx]} - Pred: {preds[idx]}")
eval_preds(preds, y_test)

MSE: 356265409603779.4 R2: -229924302122832.53


In [28]:
from sklearn.linear_model import Lasso

clf = Lasso(random_state=0).fit(x_train, y_train)

eval_preds(clf.predict(x_test), y_test)

MSE: 1.4472226676299063 R2: 0.06600064754750834


In [29]:
from sklearn.linear_model import Ridge

clf = Ridge(random_state=0).fit(x_train, y_train) 

preds = clf.predict(x_test)
eval_preds(preds, y_test)


MSE: 1.1179732926626704 R2: 0.2784895132162617


In [22]:
from sklearn.ensemble import GradientBoostingRegressor
forest = GradientBoostingRegressor(random_state=0) 
forest.fit(x_train, y_train) 



GradientBoostingRegressor(random_state=0)

In [23]:
preds = forest.predict(x_test)
eval_preds(preds, y_test) 

MSE: 1.0172408807027955 R2: 0.34349955600088766


In [27]:
px.bar(x=list(x_train.keys()), y=forest.feature_importances_)