In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

df = pd.read_pickle("df.pkl")

In [2]:
# Classification
# Create popularity categories (e.g. low/medium/high) based on popularity
df['popularity_class'] = pd.qcut(df['popularity'], q=3, labels=['low', 'medium', 'high'])
X = df.drop(columns=['popularity', 'popularity_class'])
y_class = df['popularity_class']

In [3]:
# Regression
# The goal is to predict the continuous value of popularity
y_reg = df['popularity']

In [4]:
# Create test and train set
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, y_class, test_size=0.2, random_state=42, stratify=y_class)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.2, random_state=42)

In [5]:
numeric_features = ['budget', 'runtime', 'vote_average', 'vote_count', 'revenue']
categorical_features = ['genres', 'original_language']

# Preprocessing definition
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [6]:
# Models
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor

classifiers = {
    'RandomForest': RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'MLP': MLPClassifier(max_iter=300)
}

regressors = {
    'RandomForest': RandomForestRegressor(),
    'DecisionTree': DecisionTreeRegressor(),
    'KNN': KNeighborsRegressor(),
    'SVR': SVR(),
    'MLP': MLPRegressor(max_iter=300)
}


In [7]:
# Resampling (for classification)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

smote = SMOTE(random_state=42)


In [8]:
# Train classificator with resampling
from sklearn.metrics import classification_report, mean_squared_error

for name, model in classifiers.items():
    clf_pipeline = ImbPipeline(steps=[('preprocess', preprocessor),
                                      ('resample', smote),
                                      ('model', model)])
    clf_pipeline.fit(X_train_class, y_train_class)
    y_pred = clf_pipeline.predict(X_test_class)
    print(f"{name} Classification Report:\n", classification_report(y_test_class, y_pred))


RandomForest Classification Report:
               precision    recall  f1-score   support

        high       0.90      0.94      0.92       318
         low       0.90      0.90      0.90       318
      medium       0.84      0.81      0.82       319

    accuracy                           0.88       955
   macro avg       0.88      0.88      0.88       955
weighted avg       0.88      0.88      0.88       955

DecisionTree Classification Report:
               precision    recall  f1-score   support

        high       0.91      0.89      0.90       318
         low       0.86      0.87      0.86       318
      medium       0.78      0.79      0.78       319

    accuracy                           0.85       955
   macro avg       0.85      0.85      0.85       955
weighted avg       0.85      0.85      0.85       955

KNN Classification Report:
               precision    recall  f1-score   support

        high       0.69      0.74      0.71       318
         low       0.72    

In [9]:
# Regression with SMOTER
import smogn

# Data preparation for SMOTER
train_reg = pd.concat([X_train_reg, y_train_reg], axis=1)
print(train_reg.columns)
print(train_reg.shape)
print(train_reg.isnull().sum())
train_reg = train_reg.reset_index(drop=True)
train_reg['popularity'] = train_reg['popularity'].astype(float)
train_smogn = smogn.smoter(data=train_reg, y='popularity')

# Separate features from target
X_train_smogn = train_smogn.drop(columns=['popularity'])
y_train_smogn = train_smogn['popularity']


Index(['budget', 'genres', 'original_title', 'runtime', 'original_language',
       'vote_average', 'vote_count', 'revenue', 'release_year', 'popularity'],
      dtype='object')
(3818, 10)
budget               0
genres               0
original_title       0
runtime              0
original_language    0
vote_average         0
vote_count           0
revenue              0
release_year         0
popularity           0
dtype: int64


dist_matrix: 100%|##########| 608/608 [00:47<00:00, 12.78it/s]
synth_matrix: 100%|##########| 608/608 [00:01<00:00, 368.94it/s]
r_index: 100%|##########| 84/84 [00:00<00:00, 715.39it/s]
1       Thriller Drama History
2       Thriller Drama History
3       Thriller Drama History
4       Thriller Drama History
                 ...          
1295    Thriller Drama History
1296    Thriller Drama History
1297    Thriller Drama History
1298    Thriller Drama History
1299    Thriller Drama History
Name: 1, Length: 1300, dtype: object' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  data_new.iloc[:, j] = data_new.iloc[:, j].replace(x, cat_list[x])
1       Zero Dark Thirty
2       Zero Dark Thirty
3       Zero Dark Thirty
4       Zero Dark Thirty
              ...       
1295    Zero Dark Thirty
1296    Zero Dark Thirty
1297    Zero Dark Thirty
1298    Zero Dark Thirty
1299    Zero Dark Thirty
Name: 2, Length: 1300, dtype: object' has dtype incompatibl

In [10]:
# Separate features from target
X_train_smogn = train_smogn.drop(columns=['popularity'])
y_train_smogn = train_smogn['popularity']


In [11]:
# Train regressor
for name, model in regressors.items():
    reg_pipeline = Pipeline(steps=[('preprocess', preprocessor),
                                   ('model', model)])
    reg_pipeline.fit(X_train_smogn, y_train_smogn)
    y_pred_reg = reg_pipeline.predict(X_test_reg)
    mse = mean_squared_error(y_test_reg, y_pred_reg)
    print(f"{name} MSE:", mse)


RandomForest MSE: 574.4190166936638
DecisionTree MSE: 748.1148075138908
KNN MSE: 1076.0844581299527
SVR MSE: 766.3269304639949
MLP MSE: 29846899.298398346
