In [40]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import clone
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [41]:
# Read both splits; training rows containing any missing value are discarded,
# the test split is left untouched so every id still receives a prediction.
train_data = pd.read_csv('train_dataset.csv').dropna(axis=0)
test_data = pd.read_csv('test_dataset.csv')

# Keep the test ids for the Kaggle submission file, then strip the id column
# from both frames so it is never used as a feature.
id_col = test_data['id']
train_data = train_data.drop(columns=['id'])
test_data = test_data.drop(columns=['id'])

# Restrict to highly rated titles (imdb_score_binned >= 4) to see which
# countries and directors occur most often among them.
is_high_rated = train_data['imdb_score_binned'] >= 4
filtered_df = train_data.loc[is_high_rated]

# Occurrence counts per country / director
country_counts = filtered_df['country'].value_counts()
director_counts = filtered_df['director_name'].value_counts()

# Same counts, explicitly ordered from most to least frequent
sorted_country_counts = country_counts.sort_values(ascending=False)
sorted_director_counts = director_counts.sort_values(ascending=False)

# Only the director ranking is displayed
print(sorted_director_counts)

# Replace top 3 rated countries with ordered values.
# The ranks live in one table next to the function, so the encoding no longer
# depends on a global list being assigned before the first call.
_COUNTRY_RANKS = {'USA': 3, 'UK': 2, 'France': 1}


def map_country(country):
    """Encode a raw country value as an ordinal rank.

    Parameters
    ----------
    country : hashable
        A single value from the ``country`` column (a string, or NaN/None
        when the value is missing).

    Returns
    -------
    int
        3 for 'USA', 2 for 'UK', 1 for 'France' and 0 for every other value,
        including missing ones.
    """
    return _COUNTRY_RANKS.get(country, 0)

# Countries singled out as the top three among highly rated titles
high_rated_countries = ['USA', 'UK', 'France']

# Numeric columns treated as redundant and removed from both splits
redundant_attributes = [
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'facenumber_in_poster',
    'average_degree_centrality',
]

# Encode country through map_country (keeps the column in place, now numeric)
# and drop the redundant columns in a single pass per split.
train_data = (
    train_data
    .assign(country=train_data['country'].map(map_country))
    .drop(columns=redundant_attributes)
)
test_data = (
    test_data
    .assign(country=test_data['country'].map(map_country))
    .drop(columns=redundant_attributes)
)

# The last training column is taken as the label (presumably imdb_score_binned
# -- confirm against the CSV); every column before it is a candidate feature.
y = train_data.iloc[:, -1]

# Keep only numeric features, i.e. drop categorical/nominal columns, and
# align the test split to exactly the same columns in the same order.
X = train_data.iloc[:, :-1].select_dtypes(include='number')
numeric_attributes = X.columns
test_data = test_data[numeric_attributes]

#print(X.head())


director_name
Steven Spielberg     5
Christopher Nolan    5
James Cameron        3
Peter Jackson        3
Martin Scorsese      3
                    ..
Joss Whedon          1
Majid Majidi         1
Katsuhiro Ôtomo      1
Steve McQueen        1
Stanley Kubrick      1
Name: count, Length: 96, dtype: int64


In [46]:
# Stacked ensemble: each base learner's cross-validated predictions (cv=5)
# become the inputs of a random-forest meta-learner.
base_estimators = [
    ('gradient_boosting', GradientBoostingClassifier(n_estimators=200, random_state=42)),
    ('random_forest', RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)),
    # Logistic regression is fitted on standardised features inside its own
    # branch: on raw features the solver stops at the iteration limit (see the
    # ConvergenceWarning, which recommends scaling). The tree-based learners
    # still receive the unscaled columns.
    ('logistic_regression', make_pipeline(StandardScaler(), LogisticRegression(max_iter=10000))),
]
chosen_model = StackingClassifier(
    estimators=base_estimators,
    # Seeded like the base learners so the submission is reproducible.
    final_estimator=RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42),
    cv=5,
)
chosen_model.fit(X, y)

# Predict the held-out test set and write the Kaggle submission file,
# pairing every prediction with the id saved before the id column was dropped.
predictions = chosen_model.predict(test_data)
predictions_df = pd.DataFrame({'id': id_col, 'imdb_score_binned': predictions})
predictions_df.to_csv('predictions.csv', index=False)

In [32]:
# PIPELINE: STANDARDISE THE FEATURES, THEN APPLY THE STACKED ENSEMBLE (chosen_model)
# make_pipeline stores the estimator object it is given rather than a copy, so
# fitting the pipeline would refit `chosen_model` itself on standardised inputs
# and break any later chosen_model.predict() on raw features. clone() gives the
# pipeline its own unfitted copy with the same hyper-parameters.
pipeline = make_pipeline(StandardScaler(), clone(chosen_model))
pipeline.fit(X, y)

# 5-fold stratified CV with a fixed shuffle seed; cross_val_score fits a fresh
# copy of the pipeline per fold, so the scaler is learned on each training fold only.
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X, y, cv=kfolds, scoring='accuracy')

print("CV Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

CV Scores: [0.71547421 0.72878536 0.72545757 0.725      0.73833333]
Mean CV Score: 0.726610094287299


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
# Baseline for comparison: standardised features fed to a single random forest.
# The forest is seeded so the reported accuracies are reproducible; without a
# random_state the scores change on every run even though the folds are fixed.
pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42),
)

pipeline.fit(X, y)

# 5-fold stratified CV with a fixed shuffle seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')

print("Accuracy Scores:", scores)
print("Mean Accuracy:", scores.mean())

Accuracy Scores: [0.72379368 0.70216306 0.7171381  0.71       0.725     ]
Mean Accuracy: 0.7156189683860232
