Using GridSearchCV to tune the hyperparameters

# Part 1: Import libraries

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import numpy as np


# Part 2: Load the dataset

In [None]:
# Load the modeling dataset; the CSV is looked up relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='movies_modeling.csv')

# Show the first rows as a quick check that the load worked.
df.head()

In the `rated` column, group **G**, **Not Rated**, **Unrated**, **NC-17**, and **TV-MA** into a single category called **Other**.

In [None]:
# Rating labels that are collapsed into a single "Other" bucket.
to_replace = ["G", "Not Rated", "Unrated", "NC-17", "TV-MA"]

# Any value of `rated` found in the list becomes "Other"; all other ratings
# are left unchanged.
df["rated"] = df["rated"].replace(to_replace, "Other")

In the `genre` column, group **Fantasy**, **Mystery**, **Thriller**, **Family**, **Sci-Fi**, and **Romance** into a single category called **Other**.

In [None]:
# Genre labels that are collapsed into a single "Other" bucket.
to_replace_genre = ["Fantasy", "Mystery", "Family", "Sci-Fi", "Thriller", "Romance"]

# Any value of `genre` found in the list becomes "Other"; all other genres
# are left unchanged.
df["genre"] = df["genre"].replace(to_replace_genre, "Other")

Drop the helper columns that are not used as model features

In [None]:
# Columns removed before modeling: the IMDb score, the domestic/non-domestic
# gross figures, the three ROI figures and the success label.
# NOTE(review): presumably dropped because they are derived from (or only known
# together with) the revenue target — confirm.
to_drop_columns = [
    "imdb_score",
    "domestic_gross_usd",
    "non_dom_gross_usd",
    "worldwide_roi",
    "domestic_roi",
    "non_dom_roi",
    "success_level_ww",
]

# `df` itself is left untouched; the reduced frame gets its own name.
new_df = df.drop(columns=to_drop_columns)

# Part 3: Split the data

In [None]:
# Target is the worldwide gross; every other remaining column is a feature.
y = new_df['worldwide_gross_usd']
X = new_df.drop(columns='worldwide_gross_usd')

# 70/30 train/test split; the fixed random_state (37) makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=37,
)

In [None]:
# Give every split a fresh 0..n-1 index (discarding the shuffled original row
# labels). Frames built later with a default index are concatenated to these
# along axis=1, which aligns on index labels.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Part 4: One Hot Encoding 

In [None]:
# One-hot encode the `rated` column of the training data.

# Keep the encoder's default (sparse) output and densify explicitly below:
# the `sparse=` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# later removed, so `OneHotEncoder(sparse=False)` fails on current releases,
# while `.toarray()` works on old and new versions alike.
ohe = OneHotEncoder()

# One-column frame holding just the ratings of the training rows.
movie_rating = X_train[['rated']].copy()

# Learn the rating categories from the training rows.
ohe.fit(movie_rating)

# Dense frame with one `rating_<category>` column per learned category. It
# shares X_train's index so the later axis=1 concat lines up row-for-row.
movie_rating_ohe = pd.DataFrame(
    data=ohe.transform(movie_rating).toarray(),
    columns=[f"rating_{rating}" for rating in ohe.categories_[0]],
    index=X_train.index)

In [None]:
# One-hot encode the `genre` column of the training data.

# Keep the encoder's default (sparse) output and densify explicitly below:
# the `sparse=` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# later removed, so `OneHotEncoder(sparse=False)` fails on current releases,
# while `.toarray()` works on old and new versions alike.
ohe = OneHotEncoder()

# One-column frame holding just the genres of the training rows.
movie_genre = X_train[['genre']].copy()

# Learn the genre categories from the training rows.
ohe.fit(movie_genre)

# Dense frame with one `genre_<category>` column per learned category. It
# shares X_train's index so the later axis=1 concat lines up row-for-row.
movie_genre_ohe = pd.DataFrame(
    data=ohe.transform(movie_genre).toarray(),
    columns=[f"genre_{genre}" for genre in ohe.categories_[0]],
    index=X_train.index)

In [None]:
# Swap the raw categorical columns for their one-hot encodings: remove `rated`
# and `genre`, then append the encoded columns side by side. The concat aligns
# rows on index labels.
X_train_ohe = pd.concat(
    [X_train.drop(columns=['rated', 'genre']), movie_rating_ohe, movie_genre_ohe],
    axis=1,
)

In [None]:
# One-hot encode the `rated` column of the test data.

# The encoder learns its categories from the TRAINING rows and is only applied
# to the test rows. Fitting a separate encoder on the test set would derive the
# columns from whatever ratings happen to occur in the test split, so the test
# matrix could end up with different columns than the training matrix.
# handle_unknown='ignore' encodes a rating never seen in training as all zeros
# instead of raising. The default sparse output is densified below because the
# `sparse=` keyword is no longer accepted by current scikit-learn releases.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train[['rated']])

# One-column frame holding just the ratings of the test rows.
movie_rating = X_test[['rated']].copy()

# Dense frame with one `rating_<category>` column per training category. It
# shares X_test's index so the later axis=1 concat lines up row-for-row.
movie_rating_ohe = pd.DataFrame(
    data=ohe.transform(movie_rating).toarray(),
    columns=[f"rating_{rating}" for rating in ohe.categories_[0]],
    index=X_test.index)

In [None]:
# One-hot encode the `genre` column of the test data.

# The encoder learns its categories from the TRAINING rows and is only applied
# to the test rows. Fitting a separate encoder on the test set would derive the
# columns from whatever genres happen to occur in the test split, so the test
# matrix could end up with different columns than the training matrix.
# handle_unknown='ignore' encodes a genre never seen in training as all zeros
# instead of raising. The default sparse output is densified below because the
# `sparse=` keyword is no longer accepted by current scikit-learn releases.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train[['genre']])

# One-column frame holding just the genres of the test rows.
movie_genre = X_test[['genre']].copy()

# Dense frame with one `genre_<category>` column per training category. It
# shares X_test's index so the later axis=1 concat lines up row-for-row.
movie_genre_ohe = pd.DataFrame(
    data=ohe.transform(movie_genre).toarray(),
    columns=[f"genre_{genre}" for genre in ohe.categories_[0]],
    index=X_test.index)

In [None]:
# Same assembly as for the training features: remove the raw `rated` and
# `genre` columns and append their one-hot encodings (rows aligned on index).
X_test_ohe = pd.concat(
    [X_test.drop(columns=['rated', 'genre']), movie_rating_ohe, movie_genre_ohe],
    axis=1,
)

# Part 5: Scale the features

In [None]:
# Standardize the features. The scaling statistics are learned from the
# training set only and then reused, unchanged, for the test set.
scaler = StandardScaler()
scaler.fit(X_train_ohe)

X_train_scaled = scaler.transform(X_train_ohe)
X_test_scaled = scaler.transform(X_test_ohe)

In [None]:
# Wrap the scaled values back into DataFrames that carry the same column names
# and row index as the unscaled feature frames.
X_train_scaled = pd.DataFrame(
    data=X_train_scaled,
    index=X_train_ohe.index,
    columns=X_train_ohe.columns,
)
X_test_scaled = pd.DataFrame(
    data=X_test_scaled,
    index=X_test_ohe.index,
    columns=X_test_ohe.columns,
)

# Part 6: Build and fit the model

In [None]:
# Hyperparameter grid for the search: 2 * 3 * 3 * 3 * 3 = 162 combinations.
param_grid = dict(
    colsample_bytree=[0.3, 0.7],      # fraction of columns sampled per tree
    learning_rate=[0.01, 0.1, 0.2],   # boosting step size
    max_depth=[4, 5, 6],              # maximum depth of each tree
    alpha=[5, 10, 15],                # L1 regularization (XGBoost alias of reg_alpha)
    n_estimators=[5, 10, 20],         # number of boosting rounds
)

In [None]:
# Exhaustive search over `param_grid` with 3-fold cross-validation, using all
# available cores (n_jobs=-1) and per-fit progress messages (verbose=2).
# No `scoring` is passed, so candidates are ranked by the estimator's own
# `score` method.
base_model = xgb.XGBRegressor(objective='reg:squarederror')
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the cross-validated search on the scaled training data. Every parameter
# combination is fitted once per CV fold, so this is the expensive cell.
grid_search.fit(X_train_scaled, y_train)

In [None]:
# Show the parameter combination that achieved the best cross-validated score.
best_params = grid_search.best_params_
print(best_params)

In [None]:
# Build the final regressor from the hyperparameters the grid search actually
# selected. Hand-copied values go stale as soon as the data, the grid or the
# CV outcome changes, and then silently disagree with the printed best params.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', **grid_search.best_params_)

In [None]:
# Train the final model on the full (scaled) training set.
xg_reg.fit(X_train_scaled, y_train)

In [None]:
# Predict the target for the held-out (scaled) test features.
preds = xg_reg.predict(X_test_scaled)

In [None]:
# Root mean squared error of the test predictions (same units as the target).
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
print('RMSE: ', rmse)

In [None]:
# Coefficient of determination (R^2) of the test predictions.
r2 = r2_score(y_test, preds)
print('R2: ', r2)

In [None]:
# Spread of the actual test targets (max minus min).
# NOTE(review): presumably printed as a scale reference for the RMSE above — confirm.
target_min, target_max = y_test.min(), y_test.max()
range_target = target_max - target_min
print(range_target)