# Learning Models

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold
import numpy as np

In [47]:
# Load the movie table and keep only the 10,000 rows with the highest `score`.
movies = (
    pd.read_csv("../data/movies.csv")
    .sort_values(by=['score'], ascending=False)
    .head(10000)
)

In [8]:
# Collect the distinct directors and writers credited on the 250
# highest-scoring films. The secondary sort key only decides how rows that
# tie on `score` are ordered before the cut-off.
best_directors = (
    movies.sort_values(by=['score', 'directors'], ascending=False)
    .head(250)['directors']
    .unique()
    .tolist()
)
best_writers = (
    movies.sort_values(by=['score', 'writers'], ascending=False)
    .head(250)['writers']
    .unique()
    .tolist()
)

# Everyone who is not in the lists above goes into the "replaceable" lists.
# One entry is produced per film row, so repeated IDs are kept.
# Sets are used purely for fast membership checks.
best_director_lookup = set(best_directors)
best_writer_lookup = set(best_writers)

replacable_directors = [
    director for director in movies['directors']
    if director not in best_director_lookup
]

# Writers must be checked against best_writers. Comparing them with
# best_directors would mark every top writer as replaceable unless that
# writer also happened to appear in best_directors.
replacable_writers = [
    writer for writer in movies['writers']
    if writer not in best_writer_lookup
]

In [10]:
# Collapse every director/writer outside the "best" lists into the single
# label 'unknown', so that get_dummies later produces one column per top
# director/writer instead of one column per person.
#
# Each replacement is scoped to its own column: a frame-wide
# movies.replace(...) would also overwrite matching values in every other
# column (for example a top writer whose ID is in the replaceable-directors
# list). .assign returns a new frame, so re-running this cell is idempotent.
movies = movies.assign(
    directors=movies['directors'].where(
        movies['directors'].isin(best_directors), 'unknown'
    ),
    writers=movies['writers'].where(
        movies['writers'].isin(best_writers), 'unknown'
    ),
)

In [12]:
# One-hot encode the three categorical columns: genres, directors, writers.
genre_dummies, director_dummies, writer_dummies = (
    pd.get_dummies(movies[column])
    for column in ('genres', 'directors', 'writers')
)

# Pieces of the modelling table, in column order; the target
# (averageRating) comes last.
frames = [
    movies['startYear'],
    genre_dummies,
    director_dummies,
    writer_dummies,
    movies['runtimeMinutes'],
    movies['averageRating'],
]

## Getting DataFrame Ready for ML

In [49]:
# Build the model-ready frame: the relevant features plus the target, side by
# side, with every column labelled 'unknown' removed.
# NOTE(review): presumably 'unknown' is dropped to serve as the baseline
# category for the director/writer dummies -- confirm.
ready_for_model = pd.concat(frames, axis=1).drop(columns=['unknown'])

## Linear Regression

In [50]:
# NOTE: both imports bind the same alias `sk`. The second statement rebinds
# it, so after this cell `sk` refers to sklearn.metrics only.
import sklearn.model_selection as sk
import sklearn.metrics as sk

In [51]:
# Target: the average rating column.
y = ready_for_model['averageRating']
# Features: every remaining column (year, genre dummies, writer/director
# dummies, runtime).
X = ready_for_model.drop(columns=['averageRating'])

In [52]:
# Hold out a test set, fit a linear regression on the remaining rows, and
# predict the target for the held-out rows.
# random_state pins the shuffle so the split, and every metric computed from
# it, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
predicted = model.predict(X_test)

In [53]:
# Measure the model with RMSE on the held-out set.
# mean_squared_error returns the MSE, so take the square root to get a value
# in the same units as the target. The predictions live in `predicted`.
rmse = np.sqrt(sk.mean_squared_error(y_test, predicted))
rmse