In [1]:
import numpy as np 
import pandas as pd 

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import math

In [24]:
df_train = pd.read_csv('train2.csv')
df_valid = pd.read_csv('validation2.csv')
# df_test = pd.read_csv('test.csv')

In [25]:
var_columns = [c for c in df_train.columns if c not in ['profit_margin']]

x_train = df_train.loc[:,var_columns]
y_train = df_train.loc[:,'profit_margin']

x_valid = df_valid.loc[:,var_columns]
y_valid = df_valid.loc[:,'profit_margin']


In [26]:
# baseline
gbModel_baseline = GradientBoostingRegressor(random_state=42)
gbModel_baseline.fit(x_train, y_train)
y_pred_base = gbModel_baseline.predict(x_valid)

y_actual = df_valid['profit_margin'].to_numpy()

results = pd.DataFrame({"predicted":y_pred_base, "actual":y_actual}).sort_values('actual')
num_neg = 0
for i in results['predicted']:
    if i < 0:
        num_neg += 1
print(num_neg)
print(results.to_string())

MSE = mean_squared_error(y_actual, y_pred_base)
print(f'Baseline MSE: {MSE}')

RMSE = math.sqrt(MSE)
print(f'Baseline RMSE: {RMSE}')

MAE = mean_absolute_error(y_actual, y_pred_base)
print(f'Baseline MAE: {MAE}')

0
     predicted    actual
504   1.064847  0.007511
236   1.191245  0.007580
511   1.448075  0.007783
247   1.762341  0.008446
3     1.149498  0.009240
288   1.439533  0.010107
165   1.005425  0.010943
366   0.967382  0.012468
336   1.622937  0.014253
523   0.453567  0.014418
128   0.385391  0.016634
272   1.126247  0.017791
461   0.549303  0.020083
251   0.574671  0.020416
577   0.391591  0.021325
353   0.840919  0.022019
419   1.239348  0.022420
670   0.778839  0.023780
524   1.888514  0.023886
654   0.898690  0.025278
67    0.973231  0.026061
325   1.081152  0.026177
402   0.665065  0.027915
197   1.719665  0.029860
129   0.920933  0.030042
439   0.775256  0.031329
668   1.094685  0.031352
76    1.183949  0.033110
15    0.815955  0.035054
717   1.002048  0.036306
33    1.043413  0.039731
160   1.235447  0.041249
337   1.009588  0.043159
522   1.356730  0.049039
25    1.869107  0.049787
356   1.768848  0.050153
321   1.362164  0.052707
565   1.844013  0.052874
144   1.549559  0.05406

In [27]:
# changed
gbModel = GradientBoostingRegressor(random_state=42, loss='huber', learning_rate=0.01, n_estimators=2000)
gbModel.fit(x_train, y_train)
y_pred_changed = gbModel.predict(x_valid)

y_actual = df_valid['profit_margin'].to_numpy()

results = pd.DataFrame({"predicted":y_pred_changed, "actual":y_actual}).sort_values('actual')
num_neg = 0
for i in results['predicted']:
    if i < 0:
        num_neg += 1
print(num_neg)
# print(results.to_string())

MSE = mean_squared_error(y_actual, y_pred_changed)
print(f'Changed MSE: {MSE}')

RMSE = math.sqrt(MSE)
print(f'Changed RMSE: {RMSE}')

MAE = mean_absolute_error(y_actual, y_pred_changed)
print(f'Changed MAE: {MAE}')

0
Changed MSE: 0.3687111792557262
Changed RMSE: 0.6072159247382485
Changed MAE: 0.4716382575621015


In [23]:
features_importance  = pd.DataFrame({'Variable_Name':var_columns, 'Importance':gbModel.feature_importances_}).sort_values('Importance', ascending=False)

print(features_importance.to_string())

              Variable_Name  Importance
1                    budget    0.352076
4                vote_count    0.340729
3              vote_average    0.075991
0                popularity    0.049010
2                   runtime    0.029824
41           genres: Comedy    0.023589
5             release_month    0.016358
49     original_language_fr    0.013949
26            genres: Drama    0.009215
60     original_language_zh    0.008080
39  genres: Science Fiction    0.007935
40           genres: Horror    0.006662
50     original_language_hi    0.005466
38           genres: Family    0.005175
56     original_language_ru    0.003707
31          genres: Romance    0.003638
32          genres: Fantasy    0.003427
58     original_language_te    0.003396
37           genres: Action    0.002857
51     original_language_it    0.002683
42          genres: History    0.002673
47     original_language_en    0.002492
11         overview: family    0.002007
53     original_language_ko    0.001943


In [34]:
from sklearn.model_selection import cross_val_score, KFold

n_folds = 5

kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

scores = cross_val_score(gbModel, x_train, y_train, cv=kf)
mean_score = scores.mean()
print('Mean RSME: ', mean_score)

Mean RSME:  0.3587708000049852


In [35]:
print(scores)

[0.36550055 0.33003535 0.36778973 0.3879404  0.34258796]
