### Random Forest
We used year, duration, budget, genre, and director as features. The categorical features (genre and director) were one-hot encoded with `get_dummies`, and the rating class was encoded with `LabelEncoder`. Because a random forest splits on feature thresholds and is not sensitive to feature scale, we did not need to scale our data. The training score was 99% and the testing score was 71.1%. When looking at the feature importances, we saw that the most important features were budget, duration, and year. Then came the genres, and finally the directors.

The grid search for this model included n_estimators (number of trees), max_depth (maximum levels per tree), max_features (number of features to consider when looking for a split), and max_samples (number of samples to draw from X to train each tree). With the optimized hyperparameters, the model's test accuracy increased to 71.5%.

In [None]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from numpy.random import seed
seed(42)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import tensorflow
tensorflow.keras.__version__
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier



In [None]:
# Load the prepared IMDb dataset and preview its first rows
movies_csv_path = "./Resources/imdb_final.csv"
movies = pd.read_csv(movies_csv_path)
movies.head()

In [None]:
# Features: the five raw movie attributes.
# Target: the rating class, reshaped into an (n, 1) column vector.
feature_columns = ["year", "genre", "duration", "director", "budget"]
X = movies[feature_columns]
rating_class = movies["rating_class"]
y = rating_class.values.reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
# Independent copy of the feature frame; `data` is what gets dummy-encoded,
# while X stays exactly as selected from `movies`
data = X.copy()
data

In [None]:
# One-hot encode both categorical columns (genre and director);
# the remaining columns are passed through as they are
categorical_columns = ["genre", "director"]
data_binary_encoded = pd.get_dummies(data, columns=categorical_columns)
data_binary_encoded.head()

In [None]:
# Partition the encoded features and the target into train and test sets,
# with a fixed random_state so the split is repeatable
split_parts = train_test_split(data_binary_encoded, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Encode the rating-class labels as integers.
# y_train / y_test are (n, 1) column vectors, but LabelEncoder operates on
# 1-D label arrays, so flatten them explicitly rather than relying on an
# implicit conversion whose warning is hidden by the global warnings filter.
label_encoder = LabelEncoder()
# Fit on the training labels only, then reuse that mapping for the test labels
label_encoder.fit(np.ravel(y_train))
encoded_y_train = label_encoder.transform(np.ravel(y_train))
encoded_y_test = label_encoder.transform(np.ravel(y_test))
encoded_y_train

## Create Random Forest Model

In [None]:
# Baseline forest with 50 trees; report accuracy on both partitions
rf = RandomForestClassifier(n_estimators=50)
rf.fit(X_train, encoded_y_train)
train_score = rf.score(X_train, encoded_y_train)
test_score = rf.score(X_test, encoded_y_test)
print(f"Random Forest Training Score: {train_score}")
print(f"Random Forest Testing Score: {test_score}")

In [None]:
# Per-feature importance scores of the fitted baseline forest,
# one value per column of the encoded feature frame
importances = rf.feature_importances_
importances

In [None]:
# Pair every importance with its column name and order the pairs ascending,
# so the most important features appear at the end of the list
feature_ranking = list(zip(rf.feature_importances_, data_binary_encoded.columns))
feature_ranking.sort()
feature_ranking

In [None]:
# Predict encoded classes for the test set, then map them back to the
# original rating-class labels
encoded_predictions = rf.predict(X_test)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

# Show the first 15 predictions next to the true labels
first_rows = slice(15)
print(f"Predictions: {encoded_predictions[first_rows]}")
print(f"Predicted classes: {prediction_labels[first_rows]}")
print(f"Actual Labels: {y_test[first_rows]}")

In [None]:
# Classification report comparing the true test labels with the decoded predictions
from sklearn.metrics import classification_report
report_text = classification_report(y_test, prediction_labels)
print(report_text)

In [None]:
# Confusion matrix of the true test labels against the decoded predictions
from sklearn.metrics import confusion_matrix as cm
cm(y_true=y_test, y_pred=prediction_labels)

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter explored by the search
n_estimators = [50, 150, 200]   # number of trees
max_features = [1, 5, 10]       # features considered when looking for a split
max_depth = [10, 100, 200]      # maximum levels per tree
max_samples = [50, 200, 400]    # samples drawn from X to train each tree
param_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "max_samples": max_samples,
}
# Search over the grid using the baseline forest as the estimator, scored by accuracy
grid = GridSearchCV(rf, param_grid, verbose=2, scoring="accuracy")

In [None]:
# Run the grid search on the training partition (labels passed as a flat array)
grid.fit(X_train, encoded_y_train.ravel())

In [None]:
# Best hyperparameter combination and its cross-validated accuracy, one per line
print(grid.best_params_, grid.best_score_, sep="\n")

## Recreate model with optimized parameters

In [None]:
# Build a separate forest with the tuned hyperparameters.
rf2 = RandomForestClassifier(
    n_estimators=200,
    max_depth=200,
    max_features=5,
    max_samples=400,
    random_state=30,
)
# Fit and score rf2 itself. Calling rf.fit / rf.score here would retrain and
# report the baseline model, and leave rf2 pointing at that baseline model.
rf2.fit(X_train, encoded_y_train)
print(f"Random Forest Training Score: {rf2.score(X_train, encoded_y_train)}")
print(f"Random Forest Testing Score: {rf2.score(X_test, encoded_y_test)}")

## Save Model

In [None]:
# Persist rf2 to disk with joblib so it can be reloaded for prediction
import joblib
filename = 'rf.sav'
joblib.dump(value=rf2, filename=filename)


## Create Dataframe for Testing Input

In [None]:
# Copy the dummy-encoded feature frame so the input template built from it
# has exactly the columns the model was trained on
test_array = data_binary_encoded.copy()
test_array.head()

In [None]:
# Keep only the first row: a one-row frame with every encoded column,
# used as the starting point for a single-prediction input
test_array2 = test_array.head(1)
test_array2

In [None]:
# Replace every value in the one-row template with 0.
# Take an explicit copy first so zeroing the template can never write through
# to the frame it was sliced from, and assign whole columns instead of mutating
# `.values` in place (that array is read-only under pandas Copy-on-Write).
test_array2 = test_array2.copy()
for col in test_array2.columns:
    # A zero of the column's own dtype keeps the template's dtypes unchanged
    test_array2[col] = np.zeros(len(test_array2), dtype=test_array2[col].dtype)

# empty_test is the same all-zero frame under the name the later cells use
empty_test = test_array2
test_array2.to_csv("../empty_test.csv", index=False, header=True)


In [None]:
# Example test case from inputs on website.
# Positions: [0] year, [1] duration, [2] budget, [3] genre, [4] director
test_case = [2001, 118, 48000000, "Comedy", "James Mangold"]

In [None]:
# Assign the template's column labels to a variable and display them
c = empty_test.columns
c

In [None]:
# Preview which template columns end with the example genre and director;
# these are the columns that get switched to 1 for the test case
genre_matches = [name for name in empty_test if name.endswith(test_case[3])]
director_matches = [name for name in empty_test if name.endswith(test_case[4])]
display(genre_matches)
director_matches

In [None]:
# Fill the all-zero template with the example inputs.
empty_test['year'] = test_case[0]
empty_test['duration'] = test_case[1]
empty_test['budget'] = test_case[2]

# get_dummies names the one-hot columns "<column>_<value>", so address the
# genre and director columns by exact name. Suffix matching would also switch
# on any other column that merely ends with the same text.
for dummy_column in (f"genre_{test_case[3]}", f"director_{test_case[4]}"):
    # A value with no matching column is skipped and the row stays at 0 for it
    if dummy_column in empty_test.columns:
        empty_test[dummy_column] = 1
empty_test

## Predict Rating Using Loaded Model

In [None]:
# Reload the persisted model from disk and report its accuracy on the test partition
movie_model = joblib.load("rf.sav")
result = movie_model.score(X=X_test, y=encoded_y_test)
print(result)

In [None]:
# Predict the rating class for the example row and decode it back to its
# original label (Good, Bad, or Excellent)
test_prediction = movie_model.predict(empty_test)
prediction = label_encoder.inverse_transform(test_prediction)

# A bare expression is only rendered when it is the last statement of a cell,
# so print the encoded class and leave the decoded label as the cell's result
print(f"Encoded class: {test_prediction[0]}")
prediction[0]