In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline



In [3]:
## TRAIN / TEST DATA PRE-PROCESSING
# Plan: remove categorical columns, include genres and country, remove NA rows,
# then compute MI and redundancy to remove further columns.

# Load the raw train and test tables.
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')

# Load the pre-computed Doc2Vec genre features.
# NOTE(review): presumably row i of each array belongs to row i of the matching
# CSV (they are concatenated column-wise further down) - confirm.
train_D2V_genres = np.load('train_doc2vec_features_genre.npy')
test_D2V_genres = np.load('test_doc2vec_features_genre.npy')

# Reduce the genre embedding to 10 principal components. The projection is
# fitted on the training features only and then applied unchanged to the test
# features. random_state is pinned (same seed as the other estimators in this
# notebook) so the components are reproducible even if PCA's automatic solver
# choice falls on a randomized SVD; it has no effect with the exact solver.
pca = PCA(n_components=10, random_state=42)
pca.fit(train_D2V_genres)
reduced_train_D2V_genres = pca.transform(train_D2V_genres)
reduced_test_D2V_genres = pca.transform(test_D2V_genres)

# Wrap the reduced arrays in DataFrames with columns doc2vec_genres_0..N-1.
reduced_train_D2V_genres_df = pd.DataFrame(
    reduced_train_D2V_genres,
    columns=[f"doc2vec_genres_{i}" for i in range(reduced_train_D2V_genres.shape[1])],
)
reduced_test_D2V_genres_df = pd.DataFrame(
    reduced_test_D2V_genres,
    columns=[f"doc2vec_genres_{i}" for i in range(reduced_test_D2V_genres.shape[1])],
)

# Keep the test ids for the Kaggle submission, then drop the id column from
# both tables so it is not used as a feature.
id_col = test_data['id']
train_data = train_data.drop(['id'], axis=1)
test_data = test_data.drop(['id'], axis=1)

# Check which countries have the most high rated appearances
# filtered_df = train_data[train_data['imdb_score_binned'] >= 4]
# print(filtered_df[['country', 'imdb_score_binned']])

# Count the occurrences of each country
# country_counts = filtered_df['country'].value_counts()

# Replace top 3 rated countries with ordered values 
# Countries that receive a non-zero ordinal score in map_country.
high_rated_countries = ['USA', 'UK', 'France']


def map_country(country):
    """Return an ordinal score for a country name.

    'USA' -> 3, 'UK' -> 2, any other entry of high_rated_countries -> 1,
    and everything else (including missing values) -> 0.
    """
    # Guard clause: anything outside the list scores zero.
    if country not in high_rated_countries:
        return 0
    if country == 'USA':
        return 3
    return 2 if country == 'UK' else 1
# Replace the country names with their ordinal score in both tables.
for frame in (train_data, test_data):
    frame['country'] = frame['country'].map(map_country)

# Drop redundant numeric data
#redundant_attributes = ['actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes', 'facenumber_in_poster', 'average_degree_centrality', 'title_embedding']
#train_data = train_data.drop(columns=redundant_attributes, axis=1)
#test_data = test_data.drop(columns=redundant_attributes, axis=1)

# Split into attributes and label: the label is taken to be the last column
# of train_data, and only numeric attributes are kept.
label = train_data.iloc[:, -1]
attributes = train_data.iloc[:, :-1]
numeric_attributes = attributes.select_dtypes(include='number')

# Training table = numeric attributes + reduced genre features + label,
# restricted to numeric columns and to rows without missing values.
combined_train = (
    pd.concat([numeric_attributes, reduced_train_D2V_genres_df, label], axis=1)
    .select_dtypes(include='number')
    .dropna(axis=0)
)

# The test table gets the same reduced genre features appended.
test_data = pd.concat([test_data, reduced_test_D2V_genres_df], axis=1)

# Positional split: the first 2700 remaining rows are used for training and
# everything after that is held out for validation (intended as about 10% of
# the data; the actual share depends on how many rows survive dropna).
fit_rows = combined_train.iloc[:2700]
holdout_rows = combined_train.iloc[2700:]
X, y = fit_rows.iloc[:, :-1], fit_rows.iloc[:, -1]
val_X, val_y = holdout_rows.iloc[:, :-1], holdout_rows.iloc[:, -1]

# Feature selection: keep only the columns whose absolute correlation with
# the label exceeds 0.1.
#
# The correlations are computed from the training rows (X, y) only. Using the
# full combined_train table here would let the labels of the held-out
# validation rows influence which features are selected, which leaks
# validation information into the model-building process.
label_name = 'imdb_score_binned'
corr_matrix = pd.concat([X, y], axis=1).corr()
corr_with_label = corr_matrix[label_name].sort_values(ascending=False)
corr_columns = corr_matrix.columns[corr_matrix[label_name].abs() > 0.1]
corr_columns = corr_columns.drop(label_name)
print(corr_columns)

# Apply the same column selection to every feature table so that the
# training, validation and test inputs all share one set of columns.
X = X[corr_columns]
val_X = val_X[corr_columns]
test_data = test_data[corr_columns]

print(X.head())
print(y.head())
print(test_data.head())

Index(['num_critic_for_reviews', 'duration', 'director_facebook_likes',
       'gross', 'num_voted_users', 'num_user_for_reviews', 'country',
       'title_year', 'movie_facebook_likes', 'average_degree_centrality',
       'doc2vec_genres_1', 'doc2vec_genres_2'],
      dtype='object')
   num_critic_for_reviews  duration  director_facebook_likes      gross  \
0                     186        73                       28  422783777   
1                     252        97                        0   20433940   
2                     232       117                      234     371897   
3                     297       109                        0   13782838   
4                     297       171                        0  313837577   

   num_voted_users  num_user_for_reviews  country  title_year  \
0           644348                   656        3        1994   
1            78883                   662        0        2005   
2            36494                   118        3        2013   
3  

In [4]:
# Settings shared by the random forest used as a base learner and the one
# used as the final (meta) estimator; each gets its own instance.
forest_params = {'n_estimators': 200, 'criterion': 'entropy', 'random_state': 42}

# Base learners of the stacking ensemble, as (name, estimator) pairs.
base_estimators = [
    ('gradient_boosting', GradientBoostingClassifier(random_state=42, n_estimators=200)),
    ('random_forest', RandomForestClassifier(**forest_params)),
    ('logistic_regression', LogisticRegression(max_iter=10000)),
]

# Shuffled, seeded, stratified 5-fold splitter, passed to the stacking
# classifier below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Stacking ensemble: the three base learners feed a random forest meta-learner.
chosen_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=RandomForestClassifier(**forest_params),
    cv=cv,
)
#chosen_model.fit(X, y)
#predictions = chosen_model.predict(test_data)
#predictions_df = pd.DataFrame({'id': id_col, 'imdb_score_binned': predictions})
#predictions_df.to_csv('predictions.csv', index=False)

In [60]:
# Submission pipeline: standardise the features, then fit the stacking
# ensemble (chosen_model) and write its test-set predictions to disk.
# Step names match the ones make_pipeline would generate.
pipeline = Pipeline([
    ('standardscaler', StandardScaler()),
    ('stackingclassifier', chosen_model),
])
predictions = pipeline.fit(X, y).predict(test_data)

# One row per test id, in the column layout of the Kaggle submission file.
predictions_df = pd.DataFrame({'id': id_col, 'imdb_score_binned': predictions})
predictions_df.to_csv('predictions.csv', index=False)
#kfolds = StratifiedKFold(n_splits = 5, shuffle=True, random_state=42)
#cv_scores = cross_val_score(pipeline, X, y, cv=kfolds, scoring='accuracy')

#print("CV Scores:", cv_scores)
#print("Mean CV Score:", cv_scores.mean())


In [5]:
# Baseline: standardised features fed to a single random forest.
# NOTE: this rebinds `pipeline`, replacing any pipeline stored under that name.
pipeline = Pipeline([
    ('standardscaler', StandardScaler()),
    ('randomforestclassifier',
     RandomForestClassifier(criterion='entropy', n_estimators=200, random_state=42)),
])

# Fit on all of X so `pipeline` itself is left trained; the cross-validation
# below is expected to refit per fold, so this fit should not affect the scores.
pipeline.fit(X, y)

# Accuracy per fold, using the `cv` splitter defined with the stacking model.
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)

print("Accuracy Scores:", scores)
print("Mean Accuracy:", scores.mean())

Accuracy Scores: [0.72712146 0.72712146 0.72212978 0.71214642 0.72166667]
Mean Accuracy: 0.7220371602884083
