In [93]:
import numpy as np 
import pandas as pd 

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import math

In [271]:
# Load the pre-split training and validation sets from the working directory.
df_train = pd.read_csv('train.csv')
df_valid = pd.read_csv('validation.csv')
# df_test = pd.read_csv('test.csv')

# Show (rows, columns) of both frames as the cell output so the split sizes
# are visible and the displayed result can be reproduced on a fresh run.
df_train.shape, df_valid.shape

((6716, 62), (840, 62))

In [272]:
# Model inputs are every column of the training frame other than the target.
# NOTE(review): 'revenue' and 'budget' end up with nearly all of the feature
# importance; if profit_margin is derived from them this is target leakage -
# confirm before trusting the validation scores.
var_columns = [col for col in df_train.columns if col != 'profit_margin']

# Training split: feature matrix and target vector.
x_train = df_train[var_columns]
y_train = df_train['profit_margin']

# Validation split, selected with the same feature columns as training.
x_valid = df_valid[var_columns]
y_valid = df_valid['profit_margin']


In [284]:
# Baseline: GradientBoostingRegressor with library-default hyperparameters
# (random_state pinned to 42), fitted on the training split and scored on
# the validation split.
gbModel_baseline = GradientBoostingRegressor(random_state=42)
gbModel_baseline.fit(x_train, y_train)
y_pred_base = gbModel_baseline.predict(x_valid)

y_actual = df_valid['profit_margin'].to_numpy()

# Predicted vs. actual values side by side, ordered by the actual value,
# kept available for manual inspection.
results = pd.DataFrame({"predicted":y_pred_base, "actual":y_actual}).sort_values('actual')

# Count the negative predictions with a single vectorized comparison rather
# than a Python-level loop; int() keeps the printed value a plain integer.
num_neg = int(np.count_nonzero(y_pred_base < 0))
print(num_neg)

# Validation error metrics for the baseline model.
MSE = mean_squared_error(y_actual, y_pred_base)
print(f'Baseline MSE: {MSE}')

RMSE = math.sqrt(MSE)
print(f'Baseline RMSE: {RMSE}')

MAE = mean_absolute_error(y_actual, y_pred_base)
print(f'Baseline MAE: {MAE}')

42
Baseline MSE: 0.004530369673540047
Baseline RMSE: 0.06730802087077027
Baseline MAE: 0.050196899957259754


In [283]:
# Tuned variant: Huber loss, learning rate 0.01 and 2000 estimators
# (random_state pinned to 42), fitted on the training split and scored on
# the validation split.
gbModel = GradientBoostingRegressor(random_state=42, loss='huber', learning_rate=0.01, n_estimators=2000)
gbModel.fit(x_train, y_train)
y_pred_changed = gbModel.predict(x_valid)

y_actual = df_valid['profit_margin'].to_numpy()

# Predicted vs. actual values side by side, ordered by the actual value,
# kept available for manual inspection.
results = pd.DataFrame({"predicted":y_pred_changed, "actual":y_actual}).sort_values('actual')

# Count the negative predictions with a single vectorized comparison rather
# than a Python-level loop; int() keeps the printed value a plain integer.
num_neg = int(np.count_nonzero(y_pred_changed < 0))
print(num_neg)

# Validation error metrics for the tuned model.
MSE = mean_squared_error(y_actual, y_pred_changed)
print(f'Changed MSE: {MSE}')

RMSE = math.sqrt(MSE)
print(f'Changed RMSE: {RMSE}')

MAE = mean_absolute_error(y_actual, y_pred_changed)
print(f'Changed MAE: {MAE}')

48
Changed MSE: 0.002948401879044686
Changed RMSE: 0.054299188567092656
Changed MAE: 0.03685989304279326


In [244]:
# Pair each input column with the importance reported by the fitted gbModel
# and list them from most to least important.
features_importance = (
    pd.DataFrame(
        {
            'Variable_Name': var_columns,
            'Importance': gbModel.feature_importances_,
        }
    )
    .sort_values(by='Importance', ascending=False)
)

print(features_importance)

           Variable_Name  Importance
2                revenue    0.730891
1                 budget    0.268547
4           vote_average    0.000498
3                runtime    0.000015
50  original_language_hi    0.000011
..                   ...         ...
21         overview: war    0.000000
35      genres: TV Movie    0.000000
25      overview: school    0.000000
26   genres: Documentary    0.000000
60  original_language_zh    0.000000

[61 rows x 2 columns]
