# Learning Models

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold
import numpy as np

In [2]:
movies = pd.read_csv("../data/movies.csv")

#getting the top 10K films of all time
movies = movies.sort_values(by=['score'], ascending=False).head(10000)

In [3]:
#getting the writers and directors from the top 250 films of all time. These are the top 1% of
#Hollywood and they are the only ones that matter
best_directors = movies.sort_values(by=['score', 'directors'], ascending=False).head(250)['directors'].unique().tolist()
best_writers = movies.sort_values(by=['score', 'writers'], ascending=False).head(250)['writers'].unique().tolist()

#the bottom 99% will do into these replaceable lists
replacable_directors = []
replacable_writers = []

for x in movies['directors']:
    if x not in best_directors:
        replacable_directors.append(x)
        
for x in movies['writers']:
    if x not in best_directors:
        replacable_writers.append(x)       

In [4]:
#replacing the bottom 99% 's unique ID number with 'unknown'
#this will help when getting dummies
movies.replace(replacable_directors, 'unknown', inplace=True)
movies.replace(replacable_writers, 'unknown', inplace=True)

In [5]:
#get dummies for: genres directors, writers
genre_dummies = pd.get_dummies(movies['genres'])
director_dummies = pd.get_dummies(movies['directors'])
writer_dummies = pd.get_dummies(movies['writers'])

frames = [movies['startYear'], genre_dummies, director_dummies, writer_dummies, movies['runtimeMinutes'], movies['averageRating']]

## Getting DataFrame Ready for ML

In [6]:
#creating a ready for model df that has only the relavant features and target
ready_for_model = pd.concat(frames, axis=1)
ready_for_model = ready_for_model.drop(['unknown'], axis=1)
ready_for_model

Unnamed: 0,startYear,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,nm1218281,nm1443502,nm1512910,nm1732981,nm2284484,nm2588606,nm3227090,nm3363032,runtimeMinutes,averageRating
50023,1994,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,142,9.3
80640,2008,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,152,9.0
95371,2010,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,148,8.8
49910,1994,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,154,8.9
49435,1994,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,142,8.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88033,2008,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,96,5.9
36021,1978,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,110,6.8
86116,2007,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,140,6.8
14149,1949,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,99,7.4


## Linear Regression

In [44]:
import sklearn.model_selection as sk
import sklearn.metrics as sk
import sklearn.ensemble as ske


from sklearn.neural_network import MLPClassifier

In [49]:
#features are: year, genre dummies, writer/director dummies, runtime
X = ready_for_model.drop(['averageRating'], axis=1)
#target is: average rating
y = ready_for_model['averageRating']

In [50]:
#function that does the following:
# train test split, fit, and prediction
#measures model with rmse
#prints model name and rmse
def regression_model(X, y, my_model):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    model = my_model
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    rmse = sk.mean_squared_error(y_test,predicted)
    print("Model: {}".format(model))
    print("RMSE: {}".format(rmse))     

In [51]:
regression_model(X, y, LinearRegression())

Model: LinearRegression()
RMSE: 1.7797380794673277e+18


In [52]:
regression_model(X, y, ske.GradientBoostingRegressor())

Model: GradientBoostingRegressor()
RMSE: 0.3794749679375236


In [56]:
set_mean = ready_for_model['averageRating'].mean()

In [54]:
dup = ready_for_model

In [57]:
dup['error'] = dup['averageRating'] - set_mean

In [58]:
dup

Unnamed: 0,startYear,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,nm1443502,nm1512910,nm1732981,nm2284484,nm2588606,nm3227090,nm3363032,runtimeMinutes,averageRating,error
50023,1994,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,142,9.3,2.28766
80640,2008,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,152,9.0,1.98766
95371,2010,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,148,8.8,1.78766
49910,1994,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,154,8.9,1.88766
49435,1994,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,142,8.8,1.78766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88033,2008,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,96,5.9,-1.11234
36021,1978,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,110,6.8,-0.21234
86116,2007,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,140,6.8,-0.21234
14149,1949,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,99,7.4,0.38766


In [59]:
summation = 0
for x in dup['error']:
    summation += x**2

answer = (summation/dup['error'].size)**(0.5)
    

In [60]:
answer

0.7268794428239123