### This notebook builds on model V2 and adds a Ridge regression model

# Part 1: Import necessary libraries

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt
from sklearn.linear_model import LinearRegression, Ridge

# Part 2: Load the dataset

In [5]:
# Read the modeling table from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='movies_modeling.csv')
df.head(n=5)

Unnamed: 0,month_released,rated,genre,runtime_minutes,belongs_to_collection,production_budget_usd,domestic_gross_usd,worldwide_gross_usd,imdb_score,dir_acted,...,dir_special_effects,dir_stunts,dir_visual_effects,dir_writer,non_dom_gross_usd,title_length,worldwide_roi,domestic_roi,non_dom_roi,success_level_ww
0,5,R,Drama,146.0,0,19000000,44568631,45613093,8.4,0,...,0,0,0,1,1044462,11,1.400689,1.345717,-0.945028,2
1,6,R,Adventure,104.0,1,4500000,47923795,47923795,5.8,1,...,0,0,0,1,0,15,9.649732,9.649732,-1.0,2
2,7,PG,Comedy,88.0,1,3500000,83453539,83453539,7.7,0,...,0,0,0,0,0,9,22.843868,22.843868,-1.0,2
3,7,R,Comedy,98.0,1,6000000,39846344,39849764,7.3,1,...,0,0,0,1,3420,10,5.641627,5.641057,-0.99943,2
4,5,R,Horror,95.0,1,550000,39754601,59754601,6.4,0,...,0,0,0,1,20000000,15,107.644729,71.281093,35.363636,2


In the `rated` column, replace **G**, **Not Rated**, **Unrated**, **NC-17**, and **TV-MA** with a single category called **Other**.

In [6]:
# Rating labels that get folded into the single "Other" bucket.
to_replace = ["G", "Not Rated", "Unrated", "NC-17", "TV-MA"]

# Swap the listed labels on the column itself; every other value is kept as-is.
df["rated"] = df["rated"].replace(to_replace, "Other")

In the `genre` column, replace **Fantasy**, **Mystery**, **Thriller**, **Family**, **Sci-Fi**, and **Romance** with a single category called **Other**.

In [7]:
# Genre labels that get folded into the single "Other" bucket.
to_replace_genre = ["Fantasy", "Mystery", "Family", "Sci-Fi", "Thriller", "Romance"]

# Swap the listed labels on the column itself; every other value is kept as-is.
df["genre"] = df["genre"].replace(to_replace_genre, "Other")

Drop helper columns that should not be used as features

In [8]:
# Drop revenue-related and roi-related columns (plus imdb_score and the
# success label) so they cannot be used as predictors of worldwide gross.
to_drop_columns = ["imdb_score","domestic_gross_usd", "non_dom_gross_usd", "worldwide_roi", "domestic_roi", 
                   "non_dom_roi", "success_level_ww"]
new_df = df.drop(to_drop_columns, axis = 1)

# 'Unnamed: 0' shows up in df.head() as a plain 0, 1, 2, ... counter, i.e. it
# looks like the row index that was saved into the CSV. It carries no
# information about a movie, so keep it out of the feature matrix.
# errors="ignore" keeps this cell working if the CSV is later re-exported
# without that column.
new_df = new_df.drop(columns=["Unnamed: 0"], errors="ignore")

# Part 3: Split the data

In [9]:
# Target: worldwide gross. Features: every remaining column.
y = new_df['worldwide_gross_usd']
X = new_df.drop(columns='worldwide_gross_usd')

# Split 70/30 with a random state of 37 to ensure reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=37,
)

In [10]:
# Renumber all four splits from 0 so the one-hot frames built later (which get
# a fresh 0..n-1 index) line up row-for-row when concatenated.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Part 4: Perform one-hot encoding on 'rated' and 'genre'

In [11]:
# OneHotEncode the rating train data

# Start the OneHotEncoder.
# The encoder is left at its default (sparse) output and densified with
# .toarray() below. The former `sparse=False` keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so this form runs on
# both older and current releases.
ohe = OneHotEncoder()

# Work on a copy of the single column so X_train itself is left untouched.
movie_rating = X_train[['rated']].copy()

# Fit the Encoder to the training ratings
ohe.fit(movie_rating)

# Convert the results to a data frame and make the column names have the rating they are representing.
# The index is taken from the source rows so the later concat with X_train
# aligns row-for-row regardless of how X_train is indexed.
movie_rating_ohe = pd.DataFrame(
     data=ohe.transform(movie_rating).toarray(),
     columns=[f"rating_{rating}" for rating in ohe.categories_[0]],
     index=movie_rating.index)

In [12]:
# OneHotEncode the genre train data

# Initializing the Encoder.
# The encoder is left at its default (sparse) output and densified with
# .toarray() below. The former `sparse=False` keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so this form runs on
# both older and current releases.
ohe = OneHotEncoder()

# Work on a copy of the single column so X_train itself is left untouched.
movie_genre = X_train[['genre']].copy()

# Fit the encoder to the training genres
ohe.fit(movie_genre)

# Convert the results to a data frame and make the column names have the genres they are representing.
# The index is taken from the source rows so the later concat with X_train
# aligns row-for-row regardless of how X_train is indexed.
movie_genre_ohe = pd.DataFrame(
     data=ohe.transform(movie_genre).toarray(),
     columns=[f"genre_{genre}" for genre in ohe.categories_[0]],
     index=movie_genre.index)

In [13]:
# Training feature matrix: the original columns without the two raw
# categorical ones, followed by the rating dummies and then the genre dummies.
train_without_categoricals = X_train.drop(columns=['rated', 'genre'])
X_train_ohe = pd.concat(
    [train_without_categoricals, movie_rating_ohe, movie_genre_ohe],
    axis=1,
)

In [14]:
# OneHotEncode the rating test data

# Start the OneHotEncoder.
# The encoder must learn its categories from the TRAINING data only. Fitting
# it on the test split would leak test information into preprocessing and can
# produce a different set or order of dummy columns than the training matrix.
# handle_unknown="ignore" encodes a rating that never occurs in training as an
# all-zero row instead of raising.
# Output is left sparse and densified with .toarray(), because the
# `sparse=False` keyword was removed in scikit-learn 1.4.
ohe = OneHotEncoder(handle_unknown="ignore")

# Fit the Encoder to the training ratings
ohe.fit(X_train[['rated']])

# Work on a copy of the test column so X_test itself is left untouched.
movie_rating = X_test[['rated']].copy()

# Convert the results to a data frame and make the column names have the rating they are representing.
# The index is taken from the source rows so the later concat with X_test
# aligns row-for-row.
movie_rating_ohe = pd.DataFrame(
     data=ohe.transform(movie_rating).toarray(),
     columns=[f"rating_{rating}" for rating in ohe.categories_[0]],
     index=movie_rating.index)

In [15]:
# OneHotEncode the genre test data

# Initializing the Encoder.
# The encoder must learn its categories from the TRAINING data only. Fitting
# it on the test split would leak test information into preprocessing and can
# produce a different set or order of dummy columns than the training matrix.
# handle_unknown="ignore" encodes a genre that never occurs in training as an
# all-zero row instead of raising.
# Output is left sparse and densified with .toarray(), because the
# `sparse=False` keyword was removed in scikit-learn 1.4.
ohe = OneHotEncoder(handle_unknown="ignore")

# Fit the encoder to the training genres
ohe.fit(X_train[['genre']])

# Work on a copy of the test column so X_test itself is left untouched.
movie_genre = X_test[['genre']].copy()

# Convert the results to a data frame and make the column names have the genres they are representing.
# The index is taken from the source rows so the later concat with X_test
# aligns row-for-row.
movie_genre_ohe = pd.DataFrame(
     data=ohe.transform(movie_genre).toarray(),
     columns=[f"genre_{genre}" for genre in ohe.categories_[0]],
     index=movie_genre.index)

In [16]:
# Test feature matrix: original columns plus the rating and genre dummies,
# with the two raw categorical columns removed.
X_test_ohe = pd.concat([X_test, movie_rating_ohe, movie_genre_ohe], axis=1)
X_test_ohe = X_test_ohe.drop(['rated', 'genre'], axis=1)

# Force the test matrix onto exactly the training columns, in the training
# order, before it reaches the scaler and the model. A dummy column that
# exists only in training is added as all zeros; a dummy column that exists
# only in the test split is dropped, since the model has no coefficient for it.
X_test_ohe = X_test_ohe.reindex(columns=X_train_ohe.columns, fill_value=0.0)

# Part 5: Scale the features

In [17]:
# Learn the per-column mean and standard deviation from the training matrix
# only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train_ohe)

X_train_scaled = scaler.transform(X_train_ohe)
X_test_scaled = scaler.transform(X_test_ohe)

In [18]:
# Put the scaled values back into DataFrames, restoring the column labels and
# row index of the unscaled matrices.
X_train_scaled = pd.DataFrame(
    data=X_train_scaled,
    index=X_train_ohe.index,
    columns=X_train_ohe.columns,
)
X_test_scaled = pd.DataFrame(
    data=X_test_scaled,
    index=X_test_ohe.index,
    columns=X_test_ohe.columns,
)

# Part 6: Build and fit the Ridge model

In [19]:
# Baseline Ridge regression (alpha=1.0) fitted on the scaled training features.
ridge = Ridge(alpha=1.0).fit(X_train_scaled, y_train)
ridge

Ridge()

In [20]:
# Predicted worldwide gross for the held-out test rows.
y_pred = ridge.predict(X=X_test_scaled)

In [22]:
# Test-set metrics for the current Ridge fit, printed one per line.
for label, value in (
    ('Mean Squared Error : ', mean_squared_error(y_test, y_pred)),
    ('Mean Absolute Error : ', mean_absolute_error(y_test, y_pred)),
    ('R-squared: ', ridge.score(X_test_scaled, y_test)),
):
    print(label, value)

Mean Squared Error :  1.3382047512658524e+16
Mean Absolute Error :  76972409.55192854
R-squared:  0.6055410788183748


In [23]:
# Per the model V6 notebook, the best model is Ridge with an alpha of 10,
# so refit with that alpha and report the same test-set metrics.
ridge = Ridge(alpha=10).fit(X_train_scaled, y_train)
y_pred = ridge.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r_squared = ridge.score(X_test_scaled, y_test)

print('Mean Squared Error : ', mse)
print('Mean Absolute Error : ', mae)
print('R-squared: ', r_squared)

Mean Squared Error :  1.3373549541353328e+16
Mean Absolute Error :  76873382.71496634
R-squared:  0.605791571173159
