In [1]:
import pandas as pd
from scipy.sparse import vstack, hstack
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn import metrics



# feel free to import other libraries/functions 

#Import Processed and Cleaned Data

##Topic modelling data chosen due to good performance in linear regression models

In [2]:
df_x_train = pd.read_csv('final_data/X_train_keybert.csv')
df_y_train = pd.read_csv('final_data/y_train.csv')
df_x_test = pd.read_csv('final_data/X_test_keybert.csv')
df_y_test = pd.read_csv('final_data/y_test.csv')


In [3]:
features = pd.read_csv('final_data/feature_scores_keybert.csv')

In [4]:
print(df_x_train.columns)

Index(['title', 'release_year', 'end_year', 'type', 'runtime', 'Action',
       'Adult', 'Adventure', 'Animation', 'Biography',
       ...
       'Keyword_town', 'Keyword_travel', 'Keyword_village', 'Keyword_war',
       'Keyword_wedding', 'Keyword_wife', 'Keyword_woman', 'Keyword_writer',
       'Keyword_york', 'Keyword_young'],
      dtype='object', length=1113)


In [5]:
features = features.transpose()

In [6]:
features.head()

Unnamed: 0,0
Horror,5368.489161
type,3766.108935
Drama,2628.458755
Thriller,1247.980408
Sci-Fi,1043.069004


In [7]:
features.columns = ['importance']

In [8]:
features.head()

Unnamed: 0,importance
Horror,5368.489161
type,3766.108935
Drama,2628.458755
Thriller,1247.980408
Sci-Fi,1043.069004


##Feature selection by creating list of top 100 features by importance

In [9]:
selected_features = features.sort_values(by='importance', ascending=False).head(100)

In [10]:
selected_features

Unnamed: 0,importance
Horror,5368.489161
type,3766.108935
Drama,2628.458755
Thriller,1247.980408
Sci-Fi,1043.069004
...,...
Cast_vernonwells,22.949852
Cast_stevenseagal,22.936663
Keyword_child,22.909459
Cast_michaelironside,21.992027


In [11]:
selected_features_lst = list(selected_features.index)

In [12]:
df_y_train.head()

Unnamed: 0,rating
0,5.4
1,7.6
2,3.2
3,7.1
4,5.3


##Create test and train dataframes with selected features

In [13]:
X_train = df_x_train[selected_features_lst]

In [14]:
Y_train = df_y_train

In [15]:
X_test = df_x_test[selected_features_lst]
Y_test = df_y_test

In [16]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(58181, 100)
(14546, 100)
(58181, 1)
(14546, 1)


In [38]:
X_train.head()

Unnamed: 0,Horror,type,Drama,Thriller,Sci-Fi,Animation,Documentary,History,release_year,Biography,...,Cast_briankrause,Keyword_journey,Cast_billyzane,Cast_cynthiarothrock,Keyword_character,Cast_vernonwells,Cast_stevenseagal,Keyword_child,Cast_michaelironside,Keyword_death
0,0.0,0,1.0,1.0,0.0,0.0,0.0,0.0,2015,0.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,1
1,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,1934,0.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0
2,1.0,0,0.0,0.0,1.0,0.0,0.0,0.0,1975,0.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0
3,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,2011,0.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0
4,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,1973,0.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0


#Initializing Random Forest Modelling

In [17]:
from sklearn.ensemble import RandomForestRegressor

In [40]:
# Create a random forest classifier
#clf = RandomForestRegressor(n_estimators=10000, random_state=0, n_jobs=-1)
clf = RandomForestRegressor(random_state=1)

# Train the classifier
clf.fit(X_train, Y_train)


  clf.fit(X_train, Y_train)


RandomForestRegressor(random_state=1)

In [42]:
# Use the forest's predict method on the test data
predictions = clf.predict(X_test)
predictions = predictions.reshape(14546,1)
# Calculate the absolute errors
errors = abs(predictions - Y_test)
# Print out the mean absolute error (mae), mean squared error (mse), root mean squared error (rmse)
print('Mean Absolute Error: {0:.4f}'.format(metrics.mean_absolute_error(Y_test, predictions)))  
print('Mean Squared Error: {0:.4f}'.format(metrics.mean_squared_error(Y_test, predictions))) 
print('Root Mean Squared Error: {0:.4f}'.format(np.sqrt(metrics.mean_squared_error(Y_test, predictions))))

Mean Absolute Error: 0.8653
Mean Squared Error: 1.2990
Root Mean Squared Error: 1.1397


In [43]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / Y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2)[0], '%.')

Accuracy: 83.49 %.


##Increasing number of estimators

In [44]:
# Create a random forest classifier=
clf_2 = RandomForestRegressor(n_estimators=1000, random_state=1)

# Train the classifier
clf_2.fit(X_train, Y_train)


  clf_2.fit(X_train, Y_train)


RandomForestRegressor(n_estimators=1000, random_state=1)

In [45]:
# Use the forest's predict method on the test data
predictions_2 = clf_2.predict(X_test)
predictions_2 = predictions_2.reshape(14546,1)
# Calculate the absolute errors
errors_2 = abs(predictions_2 - Y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error: {0:.4f}'.format(metrics.mean_absolute_error(Y_test, predictions_2)))  
print('Mean Squared Error: {0:.4f}'.format(metrics.mean_squared_error(Y_test, predictions_2))) 
print('Root Mean Squared Error: {0:.4f}'.format(np.sqrt(metrics.mean_squared_error(Y_test, predictions_2))))

Mean Absolute Error: 0.8614
Mean Squared Error: 1.2876
Root Mean Squared Error: 1.1347


In [46]:
# Calculate mean absolute percentage error (MAPE)
mape_2 = 100 * (errors_2 / Y_test)
# Calculate and display accuracy
accuracy_2 = 100 - np.mean(mape_2)
print('Accuracy:', round(accuracy_2, 2)[0], '%.')

Accuracy: 83.56 %.


#Hyperparameter tuning via Random Search

In [18]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [19]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor(random_state=1)
# Random search of parameters, using 3 fold cross validation, 
# search across 10 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, scoring='neg_mean_squared_error', n_iter = 10, cv = 3, verbose=2, random_state=1, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, Y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 51.6min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(random_state=1),
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=1, scoring='neg_mean_squared_error', verbose=2)

In [20]:
rf_random.best_params_

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': True}

In [21]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    predictions = predictions.reshape(14546,1)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Mean Absolute Error: {0:.4f}'.format(metrics.mean_absolute_error(test_labels, predictions)))  
    print('Mean Squared Error: {0:.4f}'.format(metrics.mean_squared_error(test_labels, predictions))) 
    print('Root Mean Squared Error: {0:.4f}'.format(np.sqrt(metrics.mean_squared_error(test_labels, predictions))))
    print('Accuracy = {0:.2f}%.'.format(accuracy[0]))
    
    return accuracy

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, Y_test)

Model Performance
Mean Absolute Error: 0.8104
Mean Squared Error: 1.1334
Root Mean Squared Error: 1.0646
Accuracy = 84.35%.


In [None]:
#Increase number of iterations

In [22]:
# First create the base model to tune
rf_2 = RandomForestRegressor(random_state=1)
# Random search of parameters, using 3 fold cross validation, 
# search across 10 different combinations, and use all available cores
rf_random_2 = RandomizedSearchCV(estimator = rf_2, scoring='neg_mean_squared_error', param_distributions = random_grid, n_iter = 60, cv = 3, verbose=2, random_state=1, n_jobs = -1)
# Fit the random search model
rf_random_2.fit(X_train, Y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 39.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 177.2min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 206.8min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(random_state=1),
                   n_iter=60, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=1, scoring='neg_mean_squared_error', verbose=2)

In [23]:
rf_random_2.best_params_

{'n_estimators': 1600,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': True}

In [24]:
best_random_2 = rf_random_2.best_estimator_
random_accuracy_2 = evaluate(best_random_2, X_test, Y_test)

Model Performance
Mean Absolute Error: 0.8107
Mean Squared Error: 1.1340
Root Mean Squared Error: 1.0649
Accuracy = 84.34%.
