In [36]:
import pandas as pd
import numpy as np

In [37]:
# Load the movies table; pandas infers the zip compression from the file extension
movies_csv_path = 'data/movies.csv.zip'
df = pd.read_csv(movies_csv_path)

In [38]:
# Columns kept for modelling, grouped by how they are handled further down
categorical_columns_yes = ['genres', 'original_language', 'production_companies', 'release_date', 'credits']
numerical_columns_yes = ['budget', 'runtime']
target = 'revenue'

# Keep only the selected columns. The explicit .copy() makes `df` an independent
# frame, so the column assignments that follow do not raise SettingWithCopyWarning.
df = df[categorical_columns_yes + numerical_columns_yes + [target]].copy()

In [39]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn the "-"-separated genres string into a list of normalised tokens
# (trimmed, lower-cased, inner spaces removed). Rows whose split result is
# not a list (e.g. missing genres) become an empty list.
split_genres = df['genres'].str.split('-')
df['genres'] = split_genres.apply(
    lambda tokens: [token.strip().lower().replace(' ', '') for token in tokens]
    if isinstance(tokens, list)
    else []
)

# Multi-hot encode the genre lists: one 0/1 column per distinct genre token
MLB = MultiLabelBinarizer()
genre_encoded = MLB.fit_transform(df['genres'])
# Built with a default 0..n-1 index, which lines up with the reset index of mod_df below
genre_encoded_df = pd.DataFrame(genre_encoded, columns=MLB.classes_).reset_index(drop=True)

# Remaining columns, re-indexed 0..n-1 so the two frames align row by row
mod_df = df.drop(columns=['genres']).reset_index(drop=True)

# Put the original columns and the genre indicator columns side by side
df = pd.concat([mod_df, genre_encoded_df], axis=1)

In [40]:
# Production companies and credits are later reduced to "has-famous-company" and
# "has-famous-credits" flags. As a first step, split each cell (individual names
# connected by "-") into a list of names.
for list_col in ("production_companies", "credits"):
    df[list_col] = df[list_col].str.split("-")


In [41]:

# Build the sets of "famous" companies and credits.
# A name is famous if it appears on at least one movie whose revenue is above
# the mean revenue of all movies.
# NOTE(review): the sets are derived from the target over every row currently in
# `df`; if a train/test split happens later, this leaks target information into
# the features - confirm whether that is acceptable.
mean_revenue = df["revenue"].mean()
above_average = df[df["revenue"] > mean_revenue]
famous_companies = set(above_average["production_companies"].explode().dropna().unique())
famous_credits = set(above_average["credits"].explode().dropna().unique())
print(famous_companies)

{'Little Shark Entertainment', 'Dee Gee Entertainment', 'One Two Films', 'Nadiadwala Grandson Entertainment', 'The Virginia Company LLC', 'A Steven Haft Production', 'Monsoon Pictures', 'Desert Flower', 'Cinema Four', 'La Compagnie Cinématographique', 'Siren Pictures', 'Prain Global', 'Solomon Films Production', 'Framestore', 'MFF (Sound of Thunder)', "D'Antoni Productions", 'Cecchi Gori Europa N.V.', 'Bold Films', 'Paradox Film Group', 'Future Films', 'Neutrinos Productions', 'Cecchi Gori Pictures', 'Runteam III Ltd.', 'Ozumi Films', 'Khorgas New Splendid Entertainment', 'Balcor Film Investors', 'Anto Joseph Film Company', 'Paranoid Pictures', 'Fabula', 'Argosy Pictures', 'Svengali Productions', 'Seven Arts Productions', 'Timnick Films', 'kth', 'Lord Miller Productions', 'Gloria Sanchez Productions', 'Third Elm Street Venture', 'Code Productions', 'Salem Street Entertainment', 'Schubert International Filmproduktions', 'WanDa Productions', 'Giai Phong Film Studio', 'Realizaciones Sol S

In [42]:

# Helper used to flag rows that contain at least one famous company / credit
def is_famous(value, famous_list):
    """Return True if `value` is a list with at least one member of `famous_list`.

    Parameters
    ----------
    value : object
        Cell content. Only actual `list` instances are inspected; anything
        else (NaN, None, a plain string, ...) yields False.
    famous_list : container
        Collection supporting the `in` operator (a set in this notebook).

    Returns
    -------
    bool
    """
    if not isinstance(value, list):
        return False
    for item in value:
        if item in famous_list:
            return True
    return False

# Derive the boolean "has-famous-company" and "has-famous-credits" columns by
# running is_famous over every row's list of names
df["has-famous-company"] = df["production_companies"].apply(is_famous, args=(famous_companies,))
df["has-famous-credits"] = df["credits"].apply(is_famous, args=(famous_credits,))


In [43]:
# The raw list columns are no longer needed now that the boolean flags exist
df = df.drop(columns=["production_companies", "credits"])

In [44]:
# Feature groups used by the imputation and encoding steps.
# Categorical features (release_date and budget are binned into categories below).
cat_cols = [
    'original_language',
    'has-famous-company',
    'release_date',
    'has-famous-credits',
    'budget',
]

# Genre indicator columns produced by the multi-label encoding of `genres`
genre_flag_cols = [
    'action', 'adventure', 'animation', 'comedy', 'crime', 'documentary',
    'drama', 'family', 'fantasy', 'history', 'horror', 'music', 'mystery',
    'romance', 'sciencefiction', 'thriller', 'tvmovie', 'war', 'western',
]

# Numerical features: runtime followed by the 0/1 genre flags
num_cols = ['runtime'] + genre_flag_cols


In [45]:
# Preprocessing of the dataframe

# Reduce the release date to its year, then keep rows with a year up to 2025.
# Rows with a missing year are dropped as well, because NaN <= 2025 is False.
df['release_date'] = pd.to_datetime(df['release_date']).dt.year
# .copy() gives an independent frame, so the assignments below do not write
# into a slice of the previous frame (SettingWithCopyWarning).
df = df[df['release_date'] <= 2025].copy()

# Bucket years into left-closed decades: [1870, 1880) -> '1870s', ..., [2020, 2030) -> '2020s'
decade_starts = list(range(1870, 2030, 10))
df['release_date'] = pd.cut(
    df['release_date'],
    bins=decade_starts + [2030],
    labels=[f'{x}s' for x in decade_starts],
    right=False,
)

# Bucket budget into $10M-wide bands plus an open-ended '> $100M' band.
# include_lowest=True keeps budget == 0 inside the first band; with the default
# right-closed bins (0, 10M] a zero budget falls outside every bin and silently
# becomes NaN.
budget_starts = list(range(0, 100000000, 10000000))
budget_bins = budget_starts + [100000000, np.inf]
budget_labels = [f'${x/1000000:.0f}M - ${x/1000000 + 10:.0f}M' for x in budget_starts] + ['> $100M']
df['budget'] = pd.cut(df['budget'], bins=budget_bins, labels=budget_labels, include_lowest=True)


In [46]:
# Finally, impute the missing values.
# NOTE(review): the fill values are computed on every row of `df`; if a train/test
# split happens later, test rows influence them - confirm this is intended.

# Categorical columns are filled with their most frequent value ...
fill_values = {col: df[col].mode()[0] for col in cat_cols}
# ... and numerical columns with their mean
fill_values.update({col: df[col].mean() for col in num_cols})

for col, fill_value in fill_values.items():
    df[col] = df[col].fillna(fill_value)

In [47]:
# Restrict the frame to the model inputs followed by the target column
model_columns = cat_cols + num_cols + [target]
df = df[model_columns]

In [48]:
# Lets create the final pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from category_encoders import BinaryEncoder


In [None]:



# Create the preprocessor:
#   - one-hot encode the categorical columns
#   - standardise runtime
#   - pass every other column (the 0/1 genre flags) through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category that was not present when the encoder
        # was fitted (e.g. a rare language seen only at prediction time) is encoded
        # as all zeros instead of raising an error in transform/predict.
        ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('scaler', StandardScaler(), ['runtime']),
    ],
    remainder='passthrough'
)


In [15]:
# Try the preprocessor out on a hold-out split

# 1. Separate features from the target and hold out 20% of the rows for testing
from sklearn.model_selection import train_test_split
feature_frame = df.drop(columns=[target])
target_series = df[target]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=42,
)

In [16]:
# 2. Fit the preprocessor on the training features, then
# 3. transform the training features with it (the test split is not transformed here)
fitted_preprocessor = preprocessor.fit(X_train, y_train)
X_train_transformed = fitted_preprocessor.transform(X_train)

In [17]:
# Inspect the transformed training matrix (the bare last expression is displayed by the notebook)
X_train_transformed

array([[0, 0, 0, ..., -0.17040853774712147, -0.11477391315098616,
        -0.10889659236461688],
       [0, 0, 0, ..., -0.17040853774712147, -0.11477391315098616,
        -0.10889659236461688],
       [0, 0, 0, ..., -0.17040853774712147, -0.11477391315098616,
        -0.10889659236461688],
       ...,
       [0, 0, 0, ..., -0.17040853774712147, -0.11477391315098616,
        -0.10889659236461688],
       [0, 0, 0, ..., -0.17040853774712147, -0.11477391315098616,
        -0.10889659236461688],
       [0, 0, 0, ..., -0.17040853774712147, -0.11477391315098616,
        -0.10889659236461688]], dtype=object)

In [69]:
# Compare a few regressors on the held-out test split. MSE is the headline metric;
# we also count how many test predictions miss the true revenue by more than 10%.
# NOTE: this is a single train/test evaluation, not cross-validation.

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score


from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Define models to test
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'XGBoost': XGBRegressor(),
}

# TODO: a log transformation of the target (on a y_train without negatives) was
# planned here but is not implemented.


from sklearn.metrics import mean_squared_error
# Evaluate each model on the test split
results = {}
for model_name, model in models.items():
    # Create a pipeline with the preprocessor and the model
    pipeline = make_pipeline(preprocessor, model)
    # Fit the pipeline to the training data
    pipeline.fit(X_train, y_train)
    # Make predictions on the test data
    y_pred = pipeline.predict(X_test)
    # Calculate the mean squared error
    mse = mean_squared_error(y_test, y_pred)

    # Flag predictions whose absolute error exceeds 10% of the true value.
    # Written as |error| > 0.1 * |y| instead of |error| / |y| > 0.1 so that rows
    # with a true value of 0 need no division (no inf/NaN intermediates): such a
    # row is flagged exactly when its prediction is not 0.
    off_by_more_than_10_percent = np.abs(y_test - y_pred) > 0.1 * np.abs(y_test)

    # Find the number of records with a difference greater than 10%
    num_records_greater_than_10_percent = int(np.sum(off_by_more_than_10_percent))
    total_records = len(y_test)

    # Print the result
    print(f'N greater than 10%: {num_records_greater_than_10_percent}', f'Total records: {total_records}', f'% greater than 10%: {num_records_greater_than_10_percent / total_records * 100:.2f}%')

    # Store the results
    results[model_name] = mse

# Print the results. Each entry is a single test-set MSE, so there is no spread
# to report; the RMSE is shown as well because it is in the units of the target.
for model_name, mse in results.items():
    print(f'{model_name}: MSE = {mse:.3f}, RMSE = {np.sqrt(mse):.3f}')
    

N greater than 10%: 134010 Total records: 134136 % greater than 10%: 99.91%
N greater than 10%: 18387 Total records: 134136 % greater than 10%: 13.71%
N greater than 10%: 133999 Total records: 134136 % greater than 10%: 99.90%
Linear Regression: 232575188697312.906 +/- 0.000
Decision Tree: 355992112948108.312 +/- 0.000
XGBoost: 176451923935986.312 +/- 0.000


In [None]:
# Grid search over decision-tree hyperparameters, then build the final pipeline

from sklearn.model_selection import GridSearchCV
# The decision tree is the model being tuned. In the recorded comparison run it had
# by far the fewest test predictions off by more than 10%, although XGBoost had the
# lowest MSE - keep that in mind when reading the scores below.
# random_state makes the fitted trees reproducible across runs.
model = DecisionTreeRegressor(random_state=42)
# Define the hyperparameters to search
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Create the grid search object (5-fold CV, scored by negative RMSE, all cores)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
# Fit the grid search to the data
# NOTE(review): X_train_transformed comes from a preprocessor fitted on all of
# X_train, so each validation fold has already influenced the scaler/encoder.
# Searching over a full preprocessor+model pipeline would avoid this.
grid_search.fit(X_train_transformed, y_train)
# Get the best hyperparameters
best_params = grid_search.best_params_
print(f'Best hyperparameters: {best_params}')
# Get the best model
best_model = grid_search.best_estimator_


# create the final pipeline
final_pipeline = make_pipeline(preprocessor, best_model)
# Fit the pipeline to the data
final_pipeline.fit(X_train, y_train)
# Make predictions on the test data
y_pred = final_pipeline.predict(X_test)
# Report the test-set error of the tuned pipeline
mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared error: {mse:.3f}')


Best hyperparameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
Mean squared error: 198695236958323.219


In [77]:
# Persist the fitted pipeline to disk with pickle
import pickle

model_path = 'final_pipeline.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(final_pipeline, model_file)
    