In [1]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
import numpy as np

from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error, mean_absolute_percentage_error



  import pandas.util.testing as tm


In [2]:
def evaluate_predictions(model, dataset_type, y_pred, y_truth):
    rmse = np.round(np.sqrt(mean_squared_error(y_truth, y_pred)), 0)
    mae = np.round(median_absolute_error(y_truth, y_pred), 0)
    mape = np.round(mean_absolute_percentage_error(y_truth, y_pred), 4)
    return {'model': [model], 'dataset_type': [dataset_type], 'rmse':[rmse], 'mae': [mae], 'mape':[mape]}

class PredictionEvaluation:
    
    def __init__(self, y_pred_train, y_truth_train, y_pred_test, y_truth_test, model_name, results_df):
        self.y_pred_train = y_pred_train
        self.y_truth_train = y_truth_train
        self.y_pred_test = y_pred_test
        self.y_truth_test = y_truth_test
        self.model_name = model_name
        self.results_df = results_df        
    
    def fill_results_df(self):
        train_results = pd.DataFrame(evaluate_predictions(self.model_name, 'train',
                                                          self.y_pred_train, self.y_truth_train))
        test_results = pd.DataFrame(evaluate_predictions(self.model_name, 'test',
                                                         self.y_pred_test, self.y_truth_test))
        return pd.concat([self.results_df, train_results, test_results])

    def diagnostics_plots(y_pred, y_truth):
    
        diag_plot = pd.DataFrame({'y_pred':y_pred,'y':y_truth, 'error': y_pred-y_truth})
        diag_plot.plot.scatter(x='y',y='y_pred')
        plt.plot([0,max(y_truth)], [0,max(y_truth)], c='black')
        plt.show()

In [3]:
STATS_COLUMNS = ["player", "position", "age","team_id","g","gs","mp_per_g","fg_per_g",
"fga_per_g","fg_pct","fg3_per_g","fg3a_per_g","fg3_pct","fg2_per_g","fg2a_per_g",
"fg2_pct","efg_pct","ft_per_g","fta_per_g","ft_pct","orb_per_g","drb_per_g","trb_per_g",
"ast_per_g","stl_per_g","blk_per_g","tov_per_g","pf_per_g","pts_per_g"]
NUM_STATS_COLUMNS = [col for col in STATS_COLUMNS if col not in ('"player", "position","team_id"')]

ADVANCED_STATS_COLUMNS =["player","position","age","team_id","g","mp","per","ts_pct","fg3a_per_fga_pct",
"fta_per_fga_pct","orb_pct","drb_pct","trb_pct","ast_pct","stl_pct","blk_pct","tov_pct","usg_pct",
                         "ows","dws","ws","ws_per_48","obpm","dbpm","bpm","vorp"]

NUM_ADVANCED_STATS_COLUMNS = [col for col in ADVANCED_STATS_COLUMNS if col not in ('"player", "position","team_id"')]
INJURIES_COLUMNS = ['out_for_season', 'out_indefinitely','acum_out_for_season', 'acum_out_indefinitely']

In [5]:
df = pd.read_csv('datasets/preprocessed_dataset.csv').fillna(0)
#results_df = pd.read_csv('datasets/results/performance_metrics.csv')
results_df = pd.DataFrame(columns=['model', 'dataset_type', 'rmse', 'mae', 'mape'])

In [6]:
relevant_cols = list(set(NUM_STATS_COLUMNS + NUM_ADVANCED_STATS_COLUMNS + ['position'] + INJURIES_COLUMNS))

In [7]:
X = df[relevant_cols+['free_agency_year']]
y = df[['mean_salary','free_agency_year']]

In [8]:
X['position'] = X.position.str.split('-').apply(lambda x: x[0])
X = pd.get_dummies(X, columns=["position"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [9]:
x_columns = X.columns.to_list()
x_columns.remove('free_agency_year')

In [10]:
train_X, test_X = X.query('free_agency_year < 2020').drop('free_agency_year', axis=1).values,\
                  X.query('free_agency_year == 2020').drop('free_agency_year', axis=1).values
train_y, test_y = y.query('free_agency_year < 2020').drop('free_agency_year', axis=1).mean_salary.values,\
                    y.query('free_agency_year == 2020').drop('free_agency_year', axis=1).mean_salary.values

### Baseline

In [11]:
baseline_prediction = train_y.mean()

In [12]:
train_y_baseline = np.full((len(train_y)), baseline_prediction)
test_y_baseline = np.full((len(test_y)), baseline_prediction)

In [13]:
baseline_evaluation = PredictionEvaluation(train_y_baseline, train_y, test_y_baseline,
                                           test_y, 'baseline', results_df)

In [14]:
results_df = baseline_evaluation.fill_results_df()

### Regresión lineal

In [15]:
ols = linear_model.LinearRegression(fit_intercept=True)
ols.fit(train_X, train_y)

LinearRegression()

In [16]:
train_y_pred = ols.predict(train_X)

In [17]:
test_y_pred = ols.predict(test_X)

In [18]:
ols_evaluation = PredictionEvaluation(train_y_pred, train_y, test_y_pred,
                                           test_y, 'modelo lineal', results_df)

results_df = ols_evaluation.fill_results_df()

### Regresion lineal no negativa

In [19]:
nn_ols = linear_model.LinearRegression(positive=True)
nn_ols.fit(train_X, train_y)
train_y_pred = nn_ols.predict(train_X)

In [20]:
test_y_pred = nn_ols.predict(test_X)

In [21]:
nn_ols_evaluation = PredictionEvaluation(train_y_pred, train_y, test_y_pred,
                                           test_y, 'modelo lineal no negativo', results_df)

results_df = nn_ols_evaluation.fill_results_df()

### Transformación logaritmica

In [22]:
log_ols = linear_model.LinearRegression(fit_intercept=True)
log_ols.fit(train_X, np.log(train_y))

LinearRegression()

In [23]:
train_y_pred = np.exp(log_ols.predict(train_X))

In [24]:
test_y_pred = np.exp(log_ols.predict(test_X))

In [25]:
log_ols_evaluation = PredictionEvaluation(train_y_pred, train_y, test_y_pred,
                                           test_y, 'modelo lineal logaritmo', results_df)

results_df = log_ols_evaluation.fill_results_df()

In [28]:
print(results_df.to_latex())

\begin{tabular}{lllrrr}
\toprule
{} &                      model & dataset\_type &       rmse &        mae &    mape \\
\midrule
0 &                   baseline &        train &  7046669.0 &  4191739.0 &  1.7206 \\
0 &                   baseline &         test &  6136372.0 &  3988694.0 &  1.5356 \\
0 &              modelo lineal &        train &  3924033.0 &  2111549.0 &  0.8302 \\
0 &              modelo lineal &         test &  3559228.0 &  2088352.0 &  0.8496 \\
0 &  modelo lineal no negativo &        train &  4343415.0 &  2166715.0 &  0.8812 \\
0 &  modelo lineal no negativo &         test &  3798000.0 &  1817943.0 &  0.7398 \\
0 &    modelo lineal logaritmo &        train &  4492837.0 &  1382346.0 &  0.5387 \\
0 &    modelo lineal logaritmo &         test &  4095432.0 &  1059238.0 &  0.4941 \\
\bottomrule
\end{tabular}



In [29]:
results_df.to_csv('datasets/results/performance_metrics.csv', index=False)