In [64]:
import numpy as np 
import pandas as pd 

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error as mae

import math

In [65]:
df_train = pd.read_csv('train.csv')
df_valid = pd.read_csv('validation.csv')
# df_test = pd.read_csv('test.csv')

In [66]:
var_columns = [c for c in df_train.columns if c not in ['profit_margin']]

x_train = df_train.loc[:,var_columns]
y_train = df_train.loc[:,'profit_margin']

x_valid = df_valid.loc[:,var_columns]
y_valid = df_valid.loc[:,'profit_margin']


In [67]:
gbModel = GradientBoostingRegressor(random_state=42)

gbModel.fit(x_train, y_train)

y_pred_gb = gbModel.predict(x_valid)

In [68]:
print(y_pred_gb)

y_actual = df_valid['profit_margin'].to_numpy()
print(y_actual)

results = pd.DataFrame({"predicted":y_pred_gb, "actual":y_actual}).sort_values('actual')
print(results.to_string())

[0.91705706 0.88720266 0.97699842 ... 0.82345339 0.92072499 0.93997648]
[0.99167184 0.98743912 0.99362103 ... 0.98124769 1.         0.9914045 ]
      predicted    actual
743    0.637402  0.000000
3337   0.575134  0.000000
959    0.566137  0.000000
956    0.736295  0.000000
2200   0.569915  0.000000
2207   0.643180  0.000000
2748   0.586863  0.000000
943    0.520509  0.000000
2741   0.799083  0.000000
144    0.724999  0.000000
3581   0.668053  0.000000
2215   0.508584  0.000000
147    0.849079  0.000000
4133   0.513775  0.000000
2238   0.699116  0.000000
3182   0.619779  0.000000
3183   0.621626  0.000000
153    0.763078  0.000000
3186   0.473546  0.000000
4005   0.791139  0.000000
2285   0.876102  0.000000
568    0.661558  0.000000
553    0.712706  0.000000
551    0.932410  0.000000
540    0.591466  0.000000
2188   0.524056  0.000000
3965   0.842008  0.000000
1087   0.630197  0.000000
2064   0.935119  0.000000
2068   0.852904  0.000000
3787   0.623200  0.000000
2100   0.670743  0.00000

In [69]:
MSE = mean_squared_error(y_actual, y_pred_gb)
print(f'MSE: {MSE}')

RMSE = math.sqrt(MSE)
print(f'RMSE: {RMSE}')

MAE = mae(y_actual, y_pred_gb)
print(f'MAE: {MAE}')

MSE: 0.03865806943459188
RMSE: 0.19661655432488861
MAE: 0.11637969117177989


In [70]:
features_importance  = pd.DataFrame({'Variable_Name':var_columns, 'Importance':gbModel.feature_importances_}).sort_values('Importance', ascending=False)

print(features_importance)

       Variable_Name  Importance
4         vote_count    0.367133
1             budget    0.207189
3       vote_average    0.132147
2            runtime    0.089507
60    genres: Comedy    0.033348
..               ...         ...
34    overview: girl    0.000000
61     genres: Crime    0.000000
27   overview: group    0.000000
43    overview: york    0.000000
67  genres: TV Movie    0.000000

[90 rows x 2 columns]
