In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import numpy as np
import pandas as pd

In [66]:
# Load the numerical features from train_dataset.csv into df_train
df = pd.read_csv('train_dataset.csv')

# Load the pre-extracted features from the .npy files into separate arrays
actor1_features = np.load('features_countvec/train_countvec_features_actor_1_name.npy')
actor2_features = np.load('features_countvec/train_countvec_features_actor_2_name.npy')
director_features = np.load('features_countvec/train_countvec_features_director_name.npy')
plot_features = np.load('features_doc2vec/train_doc2vec_features_plot_keywords.npy')
title_features = np.load('features_fasttext/train_fasttext_title_embeddings.npy')

In [67]:
# Concatenate the features
df = pd.concat((df, pd.DataFrame(actor1_features), pd.DataFrame(actor2_features), pd.DataFrame(director_features), pd.DataFrame(plot_features), pd.DataFrame(title_features)), axis=1)
df.columns = df.columns.astype(str)
df = df.drop(['director_name', 'genres', 'actor_1_name', 'actor_2_name', 'plot_keywords', 'movie_title', 'actor_3_name', 'language', 'country', 'content_rating', 'title_embedding'], axis=1)
# Split the dataset into features (X) and target (y)
X = df.drop('imdb_score_binned', axis=1)
y = df['imdb_score_binned']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1003)
df

Unnamed: 0,id,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,...,90,91,92,93,94,95,96,97,98,99
0,1,186,73,28,847,2000,422783777,644348,6458,0,...,0.004746,0.003770,0.001390,0.003076,-0.008132,0.001598,0.006643,0.002400,-0.003933,-0.005527
1,2,252,97,0,233,654,20433940,78883,1876,8,...,0.005036,0.003941,0.003205,0.001162,-0.005804,0.002159,0.003575,-0.000710,-0.003438,-0.006788
2,3,232,117,234,221,12000,371897,36494,13607,2,...,-0.000173,0.000985,-0.004908,0.000435,0.002376,0.000944,0.003305,0.000132,-0.002589,-0.002180
3,4,297,109,0,145,957,13782838,258078,1757,0,...,0.005158,0.005430,0.003289,0.002486,-0.010253,0.003423,0.006926,0.001835,-0.005065,-0.008602
4,5,297,171,0,857,16000,313837577,1238746,22342,2,...,0.004583,0.005102,0.002862,0.000808,-0.007791,0.002772,0.005651,-0.000516,-0.004016,-0.006797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999,3000,161,129,42,49,97,93952276,132048,318,7,...,0.001290,0.002782,-0.000041,0.001678,-0.002992,-0.000076,0.002639,-0.000180,-0.001575,-0.002464
3000,3001,393,123,2000,471,26000,26903709,312629,37206,0,...,0.000591,-0.000388,0.001460,-0.000175,-0.000439,0.000435,-0.000345,0.003004,0.000988,-0.001010
3001,3002,216,118,473,963,18000,73343413,217480,22517,0,...,-0.000081,0.001027,-0.000284,0.000425,-0.000426,0.000349,0.002354,0.000964,-0.000928,-0.000762
3002,3003,109,95,0,0,227,1060591,9750,231,0,...,-0.001824,0.001365,-0.000980,-0.000108,-0.002340,0.002191,0.000235,0.000919,-0.000518,-0.001860


In [68]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler to the training data and transform both datasets
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [69]:
# Initialize the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=1003)

# Train the model on the training data
rf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.64891846921797


In [70]:
# Initialize the Naive Bayes classifier
nb = MultinomialNB()

# Train the model on the training data
nb.fit(X_train + abs(X_train.min()), y_train)

# Make predictions on the test data
y_pred = nb.predict(X_test + abs(X_test.min()))

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6405990016638935


In [71]:
# Initialize the Decision Tree classifier
dt = DecisionTreeClassifier(random_state=1003)

# Train the model on the training data
dt.fit(X_train, y_train)

# Make predictions on the test data
y_pred = dt.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6239600665557404


In [72]:
# Initialize the k-NN classifier with k=40
knn = KNeighborsClassifier(n_neighbors=40)

# Train the model on the training data
knn.fit(X_train, y_train)

# Make predictions on the test data
y_pred = knn.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6289517470881864


In [73]:
# Initialize the SVM classifier
svm = SVC(random_state=1003)

# Train the model on the training data
svm.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6372712146422629


In [74]:
# Initialize the Neural Network classifier
nn = MLPClassifier(random_state=1003)

# Train the model on the training data
nn.fit(X_train, y_train)

# Make predictions on the test data
y_pred = nn.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.5990016638935108


In [75]:
# Initialize the Gradient Boosting classifier
gb = GradientBoostingClassifier(random_state=1003)

# Train the model on the training data
gb.fit(X_train, y_train)

# Make predictions on the test data
y_pred = gb.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6938435940099834


In [76]:
# Initialize the Logistic Regression classifier
lr = LogisticRegression(random_state=1003)

# Train the model on the training data
lr.fit(X_train, y_train)

# Make predictions on the test data
y_pred = lr.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6173044925124792


In [77]:
# Initialize the Ridge Classifier
rc = RidgeClassifier(random_state=1003)

# Train the model on the training data
rc.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rc.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.5607321131447587


In [78]:
# Initialize the AdaBoost classifier
ada = AdaBoostClassifier(
    estimator=rf,
    n_estimators=100,
    random_state=1003,
    algorithm='SAMME'
)

# Train the model on the training data
ada.fit(X_train, y_train)

# Make predictions on the test data
y_pred = ada.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6389351081530782


In [80]:
train_accuracy_scores = [
    accuracy_score(y_train, rf.predict(X_train)),  # Random Forest
    accuracy_score(y_train, nb.predict(X_train + abs(X_train.min()))),  # Naive Bayes
    accuracy_score(y_train, dt.predict(X_train)),  # Decision Tree
    accuracy_score(y_train, knn.predict(X_train)),  # k-NN
    accuracy_score(y_train, svm.predict(X_train)),  # SVM
    accuracy_score(y_train, nn.predict(X_train)),  # Neural Network
    accuracy_score(y_train, gb.predict(X_train)),  # Gradient Boosting
    accuracy_score(y_train, lr.predict(X_train)),  # Logistic Regression
    accuracy_score(y_train, rc.predict(X_train)),  # Ridge Classifier
    accuracy_score(y_train, ada.predict(X_train)), # AdaBoost Classifier
]

test_accuracy_scores = [
    accuracy_score(y_test, rf.predict(X_test)),  # Random Forest
    accuracy_score(y_test, nb.predict(X_test + abs(X_test.min()))),  # Naive Bayes
    accuracy_score(y_test, dt.predict(X_test)),  # Decision Tree
    accuracy_score(y_test, knn.predict(X_test)),  # k-NN
    accuracy_score(y_test, svm.predict(X_test)),  # SVM
    accuracy_score(y_test, nn.predict(X_test)),  # Neural Network
    accuracy_score(y_test, gb.predict(X_test)),  # Gradient Boosting
    accuracy_score(y_test, lr.predict(X_test)),  # Logistic Regression
    accuracy_score(y_test, rc.predict(X_test)),  # Ridge Classifier
    accuracy_score(y_test, ada.predict(X_test)), # AdaBoost Classifier
]

# Create a dictionary to store the model names and their accuracies
model_accuracies = {
    "Model": ["Random Forest", "Naive Bayes", "Decision Tree", "k-NN", "SVM", "Neural Network", "Gradient Boosting", "Logistic Regression", "Ridge Classifier", "AdaBoost Classifier"],
    "Train Accuracy": train_accuracy_scores,
    "Test Accuracy": test_accuracy_scores,
}

# Create the DataFrame
df_models = pd.DataFrame(model_accuracies)
df_models

Unnamed: 0,Model,Train Accuracy,Test Accuracy
0,Random Forest,1.0,0.648918
1,Naive Bayes,0.831045,0.640599
2,Decision Tree,1.0,0.62396
3,k-NN,0.617145,0.628952
4,SVM,0.876404,0.637271
5,Neural Network,1.0,0.599002
6,Gradient Boosting,0.893467,0.693844
7,Logistic Regression,1.0,0.617304
8,Ridge Classifier,1.0,0.560732
9,AdaBoost Classifier,1.0,0.638935


In [None]:
# df = pd.concat((df, pd.DataFrame(actor1_features), pd.DataFrame(actor2_features), pd.DataFrame(director_features), pd.DataFrame(plot_features), pd.DataFrame(title_features)), axis=1)
# df.columns = df.columns.astype(str)

# # Split the dataset into features (X) and target (y)
# X = df.drop('imdb_score_binned', axis=1)
# y = df['imdb_score_binned']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1003)

In [None]:
test_df = pd.read_csv('test_dataset.csv')

actor1_features = np.load('features_countvec/test_countvec_features_actor_1_name.npy')
actor2_features = np.load('features_countvec/test_countvec_features_actor_2_name.npy')
director_features = np.load('features_countvec/test_countvec_features_director_name.npy')
plot_features = np.load('features_doc2vec/test_doc2vec_features_plot_keywords.npy')
title_features = np.load('features_fasttext/test_fasttext_title_embeddings.npy')

# test_df = pd.concat((test_df, pd.DataFrame(actor1_features), pd.DataFrame(actor2_features), pd.DataFrame(director_features), pd.DataFrame(plot_features), pd.DataFrame(title_features)), axis=1)
test_df.columns = test_df.columns.astype(str)
test_df = test_df.drop(['director_name', 'genres', 'actor_1_name', 'actor_2_name', 'plot_keywords', 'movie_title', 'actor_3_name', 'language', 'country', 'content_rating', 'title_embedding'], axis=1)

test_df

Unnamed: 0,id,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,title_year,actor_2_facebook_likes,movie_facebook_likes,average_degree_centrality
0,1,27,118,14,400,2000,2246000,2302,3384,4,20,2015,769,0,0.000375
1,2,339,141,0,404,749,47307550,104301,1948,4,269,2012,463,28000,0.002176
2,3,78,95,89,388,963,37606,31836,2658,0,90,2009,654,0,0.000900
3,4,226,117,0,818,15000,104054514,200359,16828,0,1009,2002,1000,0,0.003452
4,5,97,104,38,690,801,3447339,29517,2667,7,79,2013,727,0,0.000450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
747,748,179,93,0,766,13000,17096053,134458,15716,2,640,1998,933,5000,0.002777
748,749,393,105,335,911,3000,37516013,128629,8281,0,348,2012,3000,98000,0.001801
749,750,55,117,133,249,687,20966644,29610,1665,0,94,1985,443,0,0.001126
750,751,85,72,0,384,3000,47887943,11634,4480,0,58,2003,455,227,0.000825


In [None]:
import os
if os.path.isfile("submission.csv"):
    os.remove("submission.csv")

In [None]:
chosen_model = rf

# Make predictions on the test data
test_pred = chosen_model.predict(test_df)

# Create the submission file
submission = pd.DataFrame({'id': test_df.id, 'imdb_score_binned': test_pred})

# Save the submission file

submission.to_csv('submission.csv', index=False)

submission



Unnamed: 0,id,imdb_score_binned
0,1,4
1,2,4
2,3,4
3,4,4
4,5,4
...,...,...
747,748,4
748,749,4
749,750,4
750,751,4
