In [118]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
import seaborn as sns
import matplotlib.pyplot as plt

In [119]:
#Load in Training and Test Data
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')

# Separate Training data into attributes and labels
X = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]

In [120]:
# Save id column for later csv creation
id_col = test_data['id']
# Remove nominal/categorical data from both sets
numeric_attributes = X.select_dtypes(include='number').columns

X = X[numeric_attributes]
test_data = test_data[numeric_attributes]

In [121]:
# Perform Feature Selection with MI and Chi-square
mi_scores = mutual_info_classif(X, y)
chi2_scores, _ = chi2(X, y)

mi_df = pd.DataFrame({'Feature': X.columns, 'MI Score': mi_scores})
print(mi_df.sort_values(by='MI Score', ascending=False))
print('----------------------')
top_attributes = mi_df.sort_values(by='MI Score', ascending=False)['Feature'].head(15)
print(top_attributes.tolist())
top_attributes = top_attributes.tolist()

                      Feature  MI Score
7             num_voted_users  0.176228
13       movie_facebook_likes  0.098781
10       num_user_for_reviews  0.087833
3     director_facebook_likes  0.080547
2                    duration  0.073377
1      num_critic_for_reviews  0.067657
5      actor_1_facebook_likes  0.052829
8   cast_total_facebook_likes  0.047044
4      actor_3_facebook_likes  0.036914
12     actor_2_facebook_likes  0.034295
14  average_degree_centrality  0.027317
9        facenumber_in_poster  0.025960
6                       gross  0.022481
11                 title_year  0.015656
0                          id  0.000000
----------------------
['num_voted_users', 'movie_facebook_likes', 'num_user_for_reviews', 'director_facebook_likes', 'duration', 'num_critic_for_reviews', 'actor_1_facebook_likes', 'cast_total_facebook_likes', 'actor_3_facebook_likes', 'actor_2_facebook_likes', 'average_degree_centrality', 'facenumber_in_poster', 'gross', 'title_year', 'id']


In [122]:
# Change data based on top MI attributes
# X = X[top_attributes]
# test_data = test_data[top_attributes]

In [123]:
# Instantiate Random Forest model
rf = RandomForestClassifier()
rf.fit(X, y)

In [124]:
# Evaluate performance of model using k-folds cross validation
k_folds = StratifiedKFold(n_splits=10, shuffle=True)
cv_scores = cross_val_score(rf, X, y, cv=k_folds)

print("K-Folds Accuracies:", cv_scores)
print("K-Folds Mean Accuracy:", cv_scores.mean())

K-Folds Accuracies: [0.68438538 0.70099668 0.73754153 0.71760797 0.71666667 0.72
 0.73666667 0.70666667 0.71       0.72      ]
K-Folds Mean Accuracy: 0.7150531561461794


In [127]:
# Train and make test set predictions for Kaggle
predictions = rf.predict(test_data)
predictions_df = pd.DataFrame({'id': id_col, 'imdb_score_binned': predictions})
predictions_df.to_csv('predictions.csv', index=False)