In [11]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline

In [32]:
# Load the train and test tables.
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')

# Load the pre-computed Doc2Vec genre features.
# NOTE(review): assumes row i of each .npy array describes row i of the
# matching CSV — confirm against whatever produced these files.
train_D2V_genres = np.load('train_doc2vec_features_genre.npy')
test_D2V_genres = np.load('test_doc2vec_features_genre.npy')

# Reduce the embeddings to 10 principal components. The projection is fitted
# on the training embeddings only and then applied to both sets.
# random_state pins the result if scikit-learn selects a randomized SVD
# solver, and matches the seed (42) used by the estimators later in the file.
pca = PCA(n_components=10, random_state=42)
pca.fit(train_D2V_genres)
reduced_train_D2V_genres = pca.transform(train_D2V_genres)
reduced_test_D2V_genres = pca.transform(test_D2V_genres)

# Wrap the reduced arrays in DataFrames with named columns
# (doc2vec_genres_0 ... doc2vec_genres_9) so they can be joined to the tables.
reduced_train_D2V_genres_df = pd.DataFrame(
    reduced_train_D2V_genres,
    columns=[f"doc2vec_genres_{i}" for i in range(reduced_train_D2V_genres.shape[1])],
)
reduced_test_D2V_genres_df = pd.DataFrame(
    reduced_test_D2V_genres,
    columns=[f"doc2vec_genres_{i}" for i in range(reduced_test_D2V_genres.shape[1])],
)

# Save id column for later Kaggle submission, then remove it from both tables
# so it is not used as a feature.
id_col = test_data['id']
train_data = train_data.drop(['id'], axis=1)
test_data = test_data.drop(['id'], axis=1)

# Check which countries have the most high rated appearances
# filtered_df = train_data[train_data['imdb_score_binned'] >= 4]
# print(filtered_df[['country', 'imdb_score_binned']])

# Count the occurrences of each country
# country_counts = filtered_df['country'].value_counts()

# Replace top 3 rated countries with ordered values 
def map_country(country):
    """Encode a country name as an ordinal score.

    Looks the value up in the module-level ``high_rated_countries`` list
    (read at call time, so it must be defined before the first call).

    Returns:
        0 if ``country`` is not in ``high_rated_countries``;
        3 for 'USA', 2 for 'UK', and 1 for any other listed country.
    """
    # Anything outside the shortlist (including missing values) scores 0.
    if country not in high_rated_countries:
        return 0
    if country == 'USA':
        return 3
    return 2 if country == 'UK' else 1

# Countries that map_country scores above 0 (USA=3, UK=2, others listed=1).
high_rated_countries = ['USA', 'UK', 'France']
train_data['country'] = train_data['country'].map(map_country)
test_data['country'] = test_data['country'].map(map_country)

# Drop redundant numeric data
redundant_attributes = ['actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes', 'facenumber_in_poster', 'average_degree_centrality', 'title_embedding']
train_data = train_data.drop(columns=redundant_attributes)
test_data = test_data.drop(columns=redundant_attributes)

# Remove training rows with any missing value. The surviving rows keep their
# original index labels, which the join below relies on.
train_data = train_data.dropna(axis=0)

# Split into attributes and labels (the label is the last column).
attributes = train_data.iloc[:, :-1]
label = train_data.iloc[:, -1]

# Concatenate genre D2V to X and test data.
# join='inner' keeps only index labels present in every piece, i.e. the rows
# that survived dropna above. The default outer join would first re-insert the
# dropped rows as NaN, which upcasts integer columns (including the label) to
# float before they are dropped again.
combined_train = pd.concat(
    [attributes, reduced_train_D2V_genres_df, label],
    axis=1,
    join='inner',
)
# Safety net: drops any row that still has a missing value after the join.
combined_train = combined_train.dropna(axis=0)
# NOTE(review): test_data is never checked for missing values — confirm the
# test set has none in the numeric columns, or impute before predicting.
test_data = pd.concat([test_data, reduced_test_D2V_genres_df], axis=1)

# Split combined data into X and y
X = combined_train.iloc[:, :-1]
y = combined_train.iloc[:, -1]

# Drop categorical/nominal data: keep only numeric columns, and select the
# same columns in the same order for the test set.
numeric_attributes = X.select_dtypes(include='number').columns
print(numeric_attributes)
X = X[numeric_attributes]

test_data = test_data[numeric_attributes]

print(X.head())

#print(X.head())


Index(['num_critic_for_reviews', 'duration', 'director_facebook_likes',
       'gross', 'num_voted_users', 'cast_total_facebook_likes',
       'num_user_for_reviews', 'country', 'movie_facebook_likes',
       'doc2vec_genres_0', 'doc2vec_genres_1', 'doc2vec_genres_2',
       'doc2vec_genres_3', 'doc2vec_genres_4', 'doc2vec_genres_5',
       'doc2vec_genres_6', 'doc2vec_genres_7', 'doc2vec_genres_8',
       'doc2vec_genres_9'],
      dtype='object')
   num_critic_for_reviews  duration  director_facebook_likes        gross  \
0                   186.0      73.0                     28.0  422783777.0   
1                   252.0      97.0                      0.0   20433940.0   
2                   232.0     117.0                    234.0     371897.0   
3                   297.0     109.0                      0.0   13782838.0   
4                   297.0     171.0                      0.0  313837577.0   

   num_voted_users  cast_total_facebook_likes  num_user_for_reviews  country  \
0   

In [26]:
# Base learners for the stacking ensemble, as (name, estimator) pairs.
base_estimators = [
    ('gradient_boosting', GradientBoostingClassifier(n_estimators=200, random_state=42)),
    ('random_forest', RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)),
    ('logistic_regression', LogisticRegression(max_iter=10000)),
]

# Stacked classifier: a random forest meta-learner on top of the base
# learners, with 5-fold internal cross-validation (cv=5).
# The meta-learner is seeded like the base forests so repeated fits give the
# same model.
chosen_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42),
    cv=5,
)
#chosen_model.fit(X, y)
#predictions = chosen_model.predict(test_data)
#predictions_df = pd.DataFrame({'id': id_col, 'imdb_score_binned': predictions})
#predictions_df.to_csv('predictions.csv', index=False)

In [27]:
# Pipeline: standardise the features, then fit the stacking ensemble
# (`chosen_model`) on the full training set.
pipeline = make_pipeline(StandardScaler(), chosen_model)
pipeline.fit(X, y)

# Predict the test set and write the submission file (id + predicted bin).
predictions = pipeline.predict(test_data)
predictions_df = pd.DataFrame({'id': id_col}).assign(imdb_score_binned=predictions)
predictions_df.to_csv('predictions.csv', index=False)
#kfolds = StratifiedKFold(n_splits = 5, shuffle=True, random_state=42)
#cv_scores = cross_val_score(pipeline, X, y, cv=kfolds, scoring='accuracy')

#print("CV Scores:", cv_scores)
#print("Mean CV Score:", cv_scores.mean())


In [33]:
# Baseline for comparison: standardise the features, then a single random
# forest. The forest is seeded so the reported accuracies are reproducible
# (the CV splitter below was already seeded).
pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42),
)

# Fit on the full training set so `pipeline` is left fitted after this cell.
# cross_val_score below fits its own clones and does not depend on this fit.
pipeline.fit(X, y)

# 5-fold stratified cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')

print("Accuracy Scores:", scores)
print("Mean Accuracy:", scores.mean())

Accuracy Scores: [0.70549085 0.69717138 0.70715474 0.715      0.69833333]
Mean Accuracy: 0.7046300610094287
