In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, classification_report

from sklearn.model_selection import train_test_split,  KFold, cross_val_score
from sklearn.neural_network import MLPRegressor, MLPClassifier

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
DATASET = 'SEASON'
# DATASET = 'TEAM'

if DATASET == 'TEAM':
    train = pd.read_csv('../data/train_team.csv')
    test = pd.read_csv('../data/test_team.csv')

if DATASET == 'TEAM':
    train = pd.read_csv('../data/train_season.csv')
    test = pd.read_csv('../data/test_season.csv')

In [3]:
target_column = "shot_made_flag"  # y_column_name

In [4]:
def confusion(true, pred):
    """
    Function for pretty printing confusion matrices
    """
    pred = pd.Series(pred)
    true = pd.Series(true)
    
    true.name = 'target'
    pred.name = 'predicted'
    cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))
    print(cm)
    cm = cm[cm.index]
    return cm

In [5]:
def compute_metrics(y_true,y_pred):
    accuracy = accuracy_score(y_true,y_pred)
    f1_score_1 = f1_score(y_true,y_pred,average='binary',pos_label=1)
    f1_score_0 = f1_score(y_true,y_pred,average='binary',pos_label=0)
    f1_score_macro = f1_score(y_true,y_pred,average='macro')
    return [accuracy,f1_score_1,f1_score_0,f1_score_macro]
    
    

results = pd.DataFrame(columns=['Accuracy', 'F1-score (class 1)', 'F1-score (class 0)', 'F1-score (macro avg)'])

In [6]:
def split_x_y(df, target):
    y = df.loc[:, [target]]
    x = df.drop([target], axis=1)
    return x, y

def split_regular_playoff(df, season, train=None, test=None, validation=None):

    # if(train + test + validation != 1):
    #     print("Error dataset split")
    #     return None

    df_validaiton = None

    df_season = df[df['season'] == season]
    isPlayoffs = df_season.playoffs == 1

    df_train = df_season[~isPlayoffs]
    df_test  = df_season[isPlayoffs]

    print(f"Size of training dataset {len(df_train)}" )
    print(f"Size of test dataset {len(df_test)}" )
    print(f"Size of train vs test ratio {len(df_train) / (len(df_train) + len(df_test)) }" )

    return df_train, df_test
#     train = split_x_y(df_train, 'shot_made_flag')
#     test = split_x_y(df_test, 'shot_made_flag')
#     return train[0], train[1], test[0], test[1]
#     return train, test

# x_train, y_train, x_test, y_test = split_regular_playoff(df, '2010-11')
# train, test = split_regular_playoff(df, '2010-11')

In [7]:
train.head()

Unnamed: 0,period,playoffs,shot_distance,shot_made_flag,shot_zone_basic,shot_zone_range,time_remaining,last_5_games_avg,streak_before_shot,points_before_shot,...,shot_type_3PT Field Goal,shot_zone_area_Center(C),shot_zone_area_Left Side Center(LC),shot_zone_area_Left Side(L),shot_zone_area_Right Side Center(RC),shot_zone_area_Right Side(R),matchup_away,matchup_home,shot_zone_area_Back Court(BC),combined_shot_type_Tip Shot
0,1,0,17,1.0,2,2,652,0.0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,4,0,11,1.0,1,1,119,0.0,1,30,...,0,1,0,0,0,0,0,1,0,0
2,4,0,1,1.0,0,0,189,0.0,0,28,...,0,1,0,0,0,0,0,1,0,0
3,4,0,26,0.0,4,3,193,0.0,4,28,...,1,0,0,0,1,0,0,1,0,0
4,4,0,2,1.0,0,0,239,0.0,3,26,...,0,1,0,0,0,0,0,1,0,0


## Split X & Y

In [8]:
x_train, y_train = split_x_y(train, target_column)
x_test, y_test = split_x_y(test, target_column)

In [9]:
lambdas = [1e-10,1e-5,1e-4,1e-3,1e-2,0.1, 0.5,1,5,10,50,100]

# Linear Regression

In [10]:
reg = LinearRegression()
a = reg.fit(x_train, y_train)
reg.score(x_train, y_train)

0.5045136977256841

#### train data

In [11]:
y_train_pred = reg.predict(x_train)
y_train_pred = y_train_pred >= 0.6
y_train_pred = np.multiply(y_train_pred, 1)
accuracy_score(y_train.values, y_train_pred)

0.7916666666666666

#### Test Data

In [12]:
y_test_pred = reg.predict(x_test)
y_test_pred = y_test_pred >= 0.5
y_test_pred = np.multiply(y_test_pred, 1)
accuracy_score(y_test.values, y_test_pred)
results.loc['Linear',:] = compute_metrics(y_test.values, y_test_pred)
results

Feature names must be in the same order as they were in fit.



0.5808080808080808

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.580808,0.442953,0.663968,0.55346


# Lasso Regression

In [13]:
lasso_cv = LassoCV(alphas=lambdas, cv=5, max_iter=100_000)
a = lasso_cv.fit(x_train.values, y_train.values.flatten())

lasso_r2 =  np.mean(cross_val_score(lasso_cv, x_train.values, y_train.values.flatten()))

print('Best lambda:', lasso_cv.alpha_, 'R2 score:',lasso_r2)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Best lambda: 0.01 R2 score: 0.03297614519299037


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


In [14]:
y_train_pred = lasso_cv.predict(x_train.values)
y_train_pred = y_train_pred >= 0.5
y_train_pred = np.multiply(y_train_pred, 1)
accuracy_score(y_train.values, y_train_pred)

0.8055555555555556

In [15]:
y_test_pred = lasso_cv.predict(x_test.values)
y_test_pred = y_test_pred >= 0.5
y_test_pred = np.multiply(y_test_pred, 1)
accuracy_score(y_test.values, y_test_pred)
results.loc['Lasso',:] = compute_metrics(y_test.values, y_test_pred)
results

0.6919191919191919

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.580808,0.442953,0.663968,0.55346
Lasso,0.691919,0.625767,0.738197,0.681982


# Ridge Regression

In [16]:
ridge_cross_val_metrics = pd.DataFrame(columns=['mean MSE', 'mean norm_MSE', 'mean R2'])
# We calculate the cross-validation metrics for each lambda 
for lambda_val in lambdas:
    kf = KFold(n_splits=5)
    i=1
    cv_mse = []
    cv_nmse = []
    cv_r2 = []
    # We compute the metrics for each fold and then perform the mean.
    for train_index, test_index in kf.split(x_train):
        x_train_fold = x_train.values[train_index]
        y_train_fold = y_train.values[train_index]
        x_test_fold = x_train.values[test_index,:]
        y_test_fold = y_train.values[test_index]

        ridge = Ridge(alpha=lambda_val)
        a = ridge.fit(x_train_fold,y_train_fold)
        y_pred_fold = ridge.predict(x_test_fold)
        fold_mse = mean_squared_error(y_test_fold, y_pred_fold)
        fold_nmse =  1-r2_score(y_test_fold, y_pred_fold)
        fold_r2 = r2_score(y_test_fold, y_pred_fold)
        cv_mse.append(fold_mse)
        cv_nmse.append(fold_nmse)
        cv_r2.append(fold_r2)
    ridge_cross_val_metrics.loc['Lambda={}'.format(lambda_val),:] = [np.mean(cv_mse),np.mean(cv_nmse),np.mean(cv_r2)]

ridge_cross_val_metrics.loc['Mean', :] = ridge_cross_val_metrics.mean()
ridge_cv_r2 = ridge_cross_val_metrics.loc["Mean", "mean R2"]

ridge_cross_val_metrics.sort_values(by='mean R2',ascending=False)

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


Unnamed: 0,mean MSE,mean norm_MSE,mean R2
Lambda=0.5,0.204973,0.833508,0.166492
Lambda=1,0.207672,0.844531,0.155469
Lambda=0.1,0.208347,0.847196,0.152804
Lambda=0.01,0.211551,0.860261,0.139739
Lambda=0.001,0.211975,0.861992,0.138008
Lambda=0.0001,0.212019,0.862171,0.137829
Lambda=1e-05,0.212023,0.862189,0.137811
Lambda=1e-10,0.212024,0.862191,0.137809
Mean,0.219136,0.890891,0.109109
Lambda=5,0.227101,0.92327,0.07673


In [17]:
temp_df = ridge_cross_val_metrics.sort_values(by='mean R2',ascending=False)
temp_df.iloc[0]

mean MSE         0.204973
mean norm_MSE    0.833508
mean R2          0.166492
Name: Lambda=0.5, dtype: object

In [18]:
ridge = Ridge(alpha=5)

ridge.fit(x_train.values, y_train.values.flatten())

y_train_pred = ridge.predict(x_train)
y_train_pred = y_train_pred >= 0.5
y_train_pred = np.multiply(y_train_pred, 1)
accuracy_score(y_train.values, y_train_pred)

Ridge(alpha=5)



0.8055555555555556

In [19]:
y_test_pred = ridge.predict(x_test)
y_test_pred = y_test_pred >= 0.5
y_test_pred = np.multiply(y_test_pred, 1)
accuracy_score(y_test.values, y_test_pred)
results.loc['Ridge',:] = compute_metrics(y_test.values, y_test_pred)
results



0.5909090909090909

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.580808,0.442953,0.663968,0.55346
Lasso,0.691919,0.625767,0.738197,0.681982
Ridge,0.590909,0.584615,0.597015,0.590815


# Neural Network

In [20]:
regr = MLPClassifier(hidden_layer_sizes=(12, ), learning_rate_init=0.00001,  max_iter=1_000_000, tol=1e-8)
regr.fit(x_train.values, y_train.values.flatten())
y_train_pred = regr.predict(x_train.values)
regr.score(x_train.values, y_train.values.flatten())

MLPClassifier(hidden_layer_sizes=(12,), learning_rate_init=1e-05,
              max_iter=1000000, tol=1e-08)

0.8611111111111112

In [21]:
y_train_pred = regr.predict(x_train.values)
y_train_pred = y_train_pred >= 0.5
y_train_pred = np.multiply(y_train_pred, 1)
accuracy_score(y_train.values, y_train_pred)

0.8611111111111112

In [22]:
y_test_pred = regr.predict(x_test.values)
y_test_pred = y_test_pred >= 0.5
y_test_pred = np.multiply(y_test_pred, 1)
accuracy_score(y_test.values, y_test_pred)
results.loc['Neural Net',:] = compute_metrics(y_test.values, y_test_pred)
results

0.5

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
Linear,0.580808,0.442953,0.663968,0.55346
Lasso,0.691919,0.625767,0.738197,0.681982
Ridge,0.590909,0.584615,0.597015,0.590815
Neural Net,0.5,0.60241,0.326531,0.46447


In [25]:
# confusion(y_test.values.flatten(), y_test_pred)
confusion(y_test_pred, y_test.values.flatten())

print( classification_report(y_test.values, y_test_pred))

predicted  0.0  1.0
target             
0           24   18
1           81   75


predicted,0.0,1.0
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,24,18
1,81,75


              precision    recall  f1-score   support

         0.0       0.57      0.23      0.33       105
         1.0       0.48      0.81      0.60        93

    accuracy                           0.50       198
   macro avg       0.53      0.52      0.46       198
weighted avg       0.53      0.50      0.46       198



In [24]:
# Get numerical feature importances
importances = list(regr.feature_importances_)# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(list(x_train.columns), importances)]# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)# Print out the feature and importances
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

AttributeError: 'MLPClassifier' object has no attribute 'feature_importances_'