In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

import warnings
warnings.filterwarnings("ignore")

In [7]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import matthews_corrcoef
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('processed_0323.csv')

In [3]:
del df['Unnamed: 0']

In [4]:
len(df)

8666

In [5]:
df.shape

(8666, 414)

In [6]:
## replace wrongly mapped ages to average age
cols = ['HOME_F_PLAYER_AGE1', 'HOME_C_PLAYER_AGE0', 'HOME_G_PLAYER_AGE0', 'HOME_G_PLAYER_AGE1',
        'HOME_F_PLAYER_AGE0', 'AWAY_F_PLAYER_AGE1', 'AWAY_C_PLAYER_AGE0', 'AWAY_G_PLAYER_AGE0',
        'AWAY_G_PLAYER_AGE1', 'AWAY_F_PLAYER_AGE0']
for i in cols:
    df.loc[df[i]>40, i] = 28

In [8]:
numeric_cols = list(df.select_dtypes(include=['float64', 'int']).columns)

In [9]:
## remove columns that cannot be used for prediction 
cols=[]
for i in numeric_cols:
    if ('TEAM_WINS' not in i)&('SECONDS' not in i)&('ID' not in i)&('MINS' not in i)&('WINNING' not in i)&('GAME_YEAR' not in i)&('born' not in i)&('PTS' not in i):
        cols.append(i)

In [10]:
len(cols)

240

In [11]:
# Split the dataset
output_variable = 'HOME_TEAM_WINS'
X = df[cols] # Here no need to set inplace=True
y = df['HOME_TEAM_WINS']

X_train_noscale, X_valid_noscale, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=1234)

In [12]:
from sklearn.preprocessing import StandardScaler
# Normalize features
scaler = StandardScaler()
scaler.fit(X_train_noscale.fillna(0))

StandardScaler(copy=True, with_mean=True, with_std=True)

#### 1. LASSO to shrink insignificant coefficients

In [13]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

In [14]:
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
sel_.fit(scaler.transform(X_train_noscale.fillna(0)), y_train)

SelectFromModel(estimator=LogisticRegression(C=1, class_weight=None, dual=False,
                                             fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=100, multi_class='auto',
                                             n_jobs=None, penalty='l1',
                                             random_state=None,
                                             solver='liblinear', tol=0.0001,
                                             verbose=0, warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [15]:
sel_.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,

In [16]:
len(sel_.get_support())

240

In [17]:
selected_feat = X_train_noscale.columns[(sel_.get_support())]
print('total features: {}'.format((X_train_noscale.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(np.sum(sel_.estimator_.coef_ == 0)))

total features: 240
selected features: 213
features with coefficients shrank to zero: 27


In [18]:
## 0 coef 
shrink_feats = X_train_noscale.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]
shrink_feats

Index(['HOME_F_FTA0', 'HOME_F_REB0', 'HOME_F_OREB1', 'HOME_F_REB1',
       'HOME_F_weight1', 'HOME_C_FG3A0', 'HOME_C_FTA0', 'HOME_C_REB0',
       'HOME_G_REB0', 'HOME_G_weight0', 'HOME_G_FG3A1', 'HOME_G_FGM1',
       'HOME_G_REB1', 'AWAY_F_FTM0', 'AWAY_F_OREB0', 'AWAY_F_PLAYER_AGE0',
       'AWAY_F_DREB1', 'AWAY_F_FGM1', 'AWAY_F_FT_PCT1', 'AWAY_F_weight1',
       'AWAY_C_AST0', 'AWAY_C_REB0', 'AWAY_G_FGA0', 'AWAY_G_FTA0',
       'AWAY_G_OREB0', 'AWAY_G_DREB1', 'AWAY_G_MIN1'],
      dtype='object')

In [19]:
selected_feats = X_train_noscale.columns[(sel_.estimator_.coef_ != 0).ravel().tolist()]
selected_feats

Index(['FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home',
       'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away',
       ...
       'AWAY_G_OREB1', 'AWAY_G_PERC_WIN1', 'AWAY_G_PF1', 'AWAY_G_PLAYER_AGE1',
       'AWAY_G_PLUS_MINUS1', 'AWAY_G_REB1', 'AWAY_G_STL1', 'AWAY_G_TO1',
       'AWAY_G_height1', 'AWAY_G_weight1'],
      dtype='object', length=213)

In [20]:
selected_coefs = []
for i in list(sel_.estimator_.coef_[0]):
    if i !=0:
        selected_coefs.append(i)

In [21]:
lasso_feats = pd.DataFrame()

In [22]:
lasso_feats['feature'] = selected_feats

In [23]:
lasso_feats['coef'] = selected_coefs
lasso_feats['abs_coef'] = abs(lasso_feats['coef'])

In [24]:
x = lasso_feats.sort_values(by='abs_coef', ascending=False)
x

Unnamed: 0,feature,coef,abs_coef
5,FG_PCT_away,-1.659477,1.659477
0,FG_PCT_home,1.540879,1.540879
9,REB_away,-0.747484,0.747484
2,FG3_PCT_home,0.712547,0.712547
197,AWAY_G_FGA1,-0.665851,0.665851
...,...,...,...
19,HOME_F_FTM0,0.002417,0.002417
182,AWAY_G_MIN0,0.001975,0.001975
93,HOME_G_BLK1,-0.001890,0.001890
43,HOME_F_MIN1,0.001561,0.001561


In [25]:
x.reset_index()

Unnamed: 0,index,feature,coef,abs_coef
0,5,FG_PCT_away,-1.659477,1.659477
1,0,FG_PCT_home,1.540879,1.540879
2,9,REB_away,-0.747484,0.747484
3,2,FG3_PCT_home,0.712547,0.712547
4,197,AWAY_G_FGA1,-0.665851,0.665851
...,...,...,...,...
208,19,HOME_F_FTM0,0.002417,0.002417
209,182,AWAY_G_MIN0,0.001975,0.001975
210,93,HOME_G_BLK1,-0.001890,0.001890
211,43,HOME_F_MIN1,0.001561,0.001561


In [28]:
## take features of individual players instead of team feature
features = list(x[(x.feature.str.contains('HOME_'))|(x.feature.str.contains('AWAY_'))].feature)

In [36]:
len(features)

203

In [37]:
X_train_noscale = X_train_noscale[features]
X_valid_noscale = X_valid_noscale[features]

In [38]:
scaler = StandardScaler()
scaler.fit(X_train_noscale.fillna(0))

StandardScaler(copy=True, with_mean=True, with_std=True)

## Recursive Feature Elimination

In [39]:
from sklearn.feature_selection import RFE

# input:
# model - the model used for this run
# n - number of features should be selected
# output:
# the F1 score for this particular model with n features
def rfe_selection(model, n, X_train, X_test, Y_train, Y_test):
    # fitting the model to RFE 
    rfe = RFE(model, n)
    fit= rfe.fit(X_train, Y_train)
    
    # transform the x inputs based on feature selection
    X_train_transformed = rfe.fit_transform(X_train, Y_train)
    X_test_transformed = rfe.fit_transform(X_test, Y_test)
    
    model.fit(X_train_transformed, Y_train)
    Y_pred = model.predict(X_test_transformed)
    result = f1_score(Y_test, Y_pred, average = 'micro')
    ranks = fit.ranking_
    
    return [ranks, result]

In [40]:
# input:
# model - the model used for this run
# X_train, X_test: the transformed X_training and testing dataset from the result of recursive feature elimination
# Output: the F1 score for the current run
def selection_df(model, m, X_train, X_test, Y_train, Y_test):
    columns = list(range(2))
    index = list(range(m+1)[1:])
    df = pd.DataFrame(columns = columns, index = index)
    df = df.fillna(0)
    
    for i in range(1, m+1):
        rank, result = rfe_selection(model, i, X_train, X_test, Y_train, Y_test)
    
        df.iloc[i-1,0] = i
        df.iloc[i-1,1] = result
        
        print(str(i) + 'done')
    
    return df

In [43]:
import sklearn.linear_model

X_train_scaled = scaler.transform(X_train_noscale.fillna(0))
X_valid_scaled = scaler.transform(X_valid_noscale.fillna(0))

logreg= sklearn.linear_model.LogisticRegression()

In [45]:
df_logreg = selection_df(logreg, 30, X_train_scaled, X_valid_scaled, y_train, y_valid)

In [46]:
df_logreg

In [47]:
max_row = np.argmax(df_rf.iloc[:,1].values) + 1

### final model

In [211]:

#max_row = np.argmax(df_rf.iloc[:,1].values) + 1
max_row = 17   ## take directly 
print ('The optimal number of parameters chosen is ', max_row)

# fitting the model to RFE 
rfe = RFE(logreg, max_row)
fit= rfe.fit(X_train_scaled, y_train)
    
# transform the x inputs based on feature selection
X_train_transformed = rfe.fit_transform(X_train_scaled, y_train)
X_test_transformed = rfe.fit_transform(X_valid_scaled, y_valid)
    
rf.fit(X_train_transformed, y_train)
Y_pred = rf.predict(X_test_transformed)
result = f1_score(y_valid, Y_pred, average = 'micro')

The optimal number of parameters chosen is  17


In [212]:
print (result)

0.6442307692307693
