# NBA Predictor: Basketball Advanced Regression, Kyle’s Loss Estimate Yielder (BARKLEY)

## Importing Modules

In [1079]:
import pandas as pd
pd.options.display.max_columns = None

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold,f_regression, SelectKBest

## Loading Data

In [1047]:
# 2018 season will be used as validation
games2018 = pd.read_csv('data/games_2018_season.csv')
stats2018 = pd.read_csv('data/stats_2018_season.csv')

In [1048]:
df_games = pd.concat(
    [pd.read_csv('data/games_2008_season.csv'),
     pd.read_csv('data/games_2009_season.csv'),
     pd.read_csv('data/games_2010_season.csv'),
     pd.read_csv('data/games_2011_season.csv'),
     pd.read_csv('data/games_2012_season.csv'),
     pd.read_csv('data/games_2013_season.csv'),
     pd.read_csv('data/games_2014_season.csv'),
     pd.read_csv('data/games_2015_season.csv'),
     pd.read_csv('data/games_2016_season.csv'),
     pd.read_csv('data/games_2017_season.csv')]
)

In [1049]:
df_stats = pd.concat(
    [pd.read_csv('data/stats_2010_season.csv'),
     pd.read_csv('data/stats_2011_season.csv'),
     pd.read_csv('data/stats_2012_season.csv'),
     pd.read_csv('data/stats_2013_season.csv'),
     pd.read_csv('data/stats_2014_season.csv'),
     pd.read_csv('data/stats_2015_season.csv'),
     pd.read_csv('data/stats_2016_season.csv'),
     pd.read_csv('data/stats_2017_season.csv')]
)

I retrieved both games and stats from the Ball Don't Lie API, however I'll be working mainly with df_stats as it has much more info.

In [1050]:
df_games.shape

(12893, 23)

In [1051]:
df_games.columns

Index(['id', 'date', 'home_team_score', 'period', 'postseason', 'season',
       'status', 'time', 'visitor_team_score', 'home_team.id',
       'home_team.abbreviation', 'home_team.city', 'home_team.conference',
       'home_team.division', 'home_team.full_name', 'home_team.name',
       'visitor_team.id', 'visitor_team.abbreviation', 'visitor_team.city',
       'visitor_team.conference', 'visitor_team.division',
       'visitor_team.full_name', 'visitor_team.name'],
      dtype='object')

In [239]:
games_columns = ['id', 'date', 'home_team_score', 'postseason', 'season', 'visitor_team_score', 'home_team.id',
       'home_team.abbreviation', 'home_team.conference',
       'home_team.division',
       'visitor_team.id', 'visitor_team.abbreviation',
       'visitor_team.conference', 'visitor_team.division']

In [660]:
#filter only desired columns and regular season games, and sort by date
df_games = df_games[df_games['postseason'] == False].filter(games_columns).sort_values('date')

In [924]:
df_games.head()

Unnamed: 0,id,date,home_team_score,postseason,season,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,home_team.division,visitor_team.id,visitor_team.abbreviation,visitor_team.conference,visitor_team.division
91,21472,2008-10-28T00:00:00.000Z,90,False,2008,85,2,BOS,East,Atlantic,6,CLE,East,Central
124,21556,2008-10-28T00:00:00.000Z,108,False,2008,95,5,CHI,East,Central,17,MIL,East,Central
92,21473,2008-10-28T00:00:00.000Z,96,False,2008,76,14,LAL,West,Pacific,25,POR,West,Northwest
95,21476,2008-10-29T00:00:00.000Z,98,False,2008,103,27,SAS,West,Southwest,24,PHX,West,Pacific
94,21475,2008-10-29T00:00:00.000Z,100,False,2008,94,9,DET,East,Central,12,IND,East,Central


In [934]:
df_stats.head()

Unnamed: 0,id,ast,blk,dreb,fg3_pct,fg3a,fg3m,fg_pct,fga,fgm,ft_pct,fta,ftm,min,oreb,pf,pts,reb,stl,turnover,game.id,game.date,game.home_team_id,game.home_team_score,game.period,game.postseason,game.season,game.status,game.time,game.visitor_team_id,game.visitor_team_score,player.id,player.first_name,player.height_feet,player.height_inches,player.last_name,player.position,player.team_id,player.weight_pounds,team.id,team.abbreviation,team.city,team.conference,team.division,team.full_name,team.name
0,581359,3.0,0.0,2.0,0.0,0.0,0.0,0.75,4.0,3.0,0.5,2.0,1.0,21:02,0.0,4.0,7.0,2.0,0.0,2.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,735,Shaquille,,,O'Neal,,22,,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics
1,581358,1.0,1.0,14.0,0.0,0.0,0.0,0.375,8.0,3.0,0.75,4.0,3.0,29:08:00,1.0,3.0,9.0,15.0,2.0,3.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,971,Kevin,,,Garnett,,18,,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics
2,581360,4.0,0.0,2.0,0.0,5.0,0.0,0.308,13.0,4.0,0.8,5.0,4.0,35:26:00,2.0,3.0,12.0,4.0,0.0,1.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,1036,Ray,,,Allen,G,17,,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics
3,581364,0.0,0.0,2.0,0.0,0.0,0.0,0.5,2.0,1.0,0.0,0.0,0.0,12:22,0.0,6.0,2.0,2.0,0.0,3.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,1059,Jermaine,,,O'Neal,,25,,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics
4,581372,6.0,0.0,2.0,0.5,4.0,2.0,0.333,12.0,4.0,0.0,2.0,0.0,34:54:00,0.0,1.0,10.0,2.0,0.0,2.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,1139,Anthony,,,Parker,,23,,6,CLE,Cleveland,East,Central,Cleveland Cavaliers,Cavaliers


In [935]:
df_stats.columns

Index(['id', 'ast', 'blk', 'dreb', 'fg3_pct', 'fg3a', 'fg3m', 'fg_pct', 'fga',
       'fgm', 'ft_pct', 'fta', 'ftm', 'min', 'oreb', 'pf', 'pts', 'reb', 'stl',
       'turnover', 'game.id', 'game.date', 'game.home_team_id',
       'game.home_team_score', 'game.period', 'game.postseason', 'game.season',
       'game.status', 'game.time', 'game.visitor_team_id',
       'game.visitor_team_score', 'player.id', 'player.first_name',
       'player.height_feet', 'player.height_inches', 'player.last_name',
       'player.position', 'player.team_id', 'player.weight_pounds', 'team.id',
       'team.abbreviation', 'team.city', 'team.conference', 'team.division',
       'team.full_name', 'team.name'],
      dtype='object')

## Feature Engineering

In [931]:
df_stats

Unnamed: 0,game.season,game.date,game.id,home,ast,blk,dreb,fg3a,fg3m,fga,fgm,fta,ftm,oreb,pf,pts,reb,stl,turnover,opp.id,team.id_copy
0,2010,2010-10-27T00:00:00.000Z,24362,1,24.0,1.0,32.0,12.0,3.0,72.0,34.0,21.0,16.0,6.0,24.0,87.0,38.0,8.0,19.0,6,2
1,2010,2010-10-27T00:00:00.000Z,24362,0,24.0,4.0,29.0,20.0,6.0,81.0,36.0,21.0,17.0,8.0,20.0,95.0,37.0,5.0,14.0,2,6
2,2010,2010-10-27T00:00:00.000Z,24363,1,20.0,9.0,35.0,16.0,8.0,80.0,42.0,31.0,27.0,9.0,20.0,119.0,44.0,7.0,15.0,15,1
3,2010,2010-10-27T00:00:00.000Z,24363,0,13.0,4.0,30.0,9.0,4.0,86.0,40.0,25.0,20.0,9.0,26.0,104.0,39.0,7.0,14.0,1,15
4,2010,2010-10-27T00:00:00.000Z,24364,1,21.0,6.0,36.0,14.0,2.0,93.0,40.0,22.0,13.0,15.0,28.0,95.0,51.0,6.0,14.0,21,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18719,2017,2018-04-11T00:00:00.000Z,34959,1,22.0,1.0,26.0,24.0,8.0,92.0,37.0,19.0,16.0,12.0,17.0,98.0,38.0,9.0,17.0,19,27
18720,2017,2018-04-11T00:00:00.000Z,34960,1,11.0,6.0,32.0,46.0,13.0,82.0,31.0,12.0,8.0,8.0,24.0,83.0,40.0,2.0,14.0,26,11
18721,2017,2018-04-11T00:00:00.000Z,34960,0,22.0,3.0,42.0,26.0,7.0,80.0,38.0,20.0,13.0,6.0,14.0,96.0,48.0,6.0,11.0,11,26
18722,2017,2018-04-11T00:00:00.000Z,35631,0,26.0,7.0,37.0,26.0,11.0,106.0,46.0,20.0,13.0,19.0,18.0,116.0,56.0,8.0,9.0,28,16


In [942]:
# function to group all stats by game

def group_stats_by_game(df):
    #remove postseason games
    df = df[df['game.postseason'] == False]
    
    #list of columns to keep
    stats_columns_to_keep = ['game.season','game.date','game.id','team.id','game.home_team_id','game.visitor_team_id','ast', 'blk', 'dreb', 'fg3a', 'fg3m', 'fga',
       'fgm', 'fta', 'ftm', 'min', 'oreb', 'pf', 'pts', 'reb', 'stl', 'turnover']
    
    #keep only specified columns and sort by date
    df = df[stats_columns_to_keep].sort_values('game.date')
    
    # new field indicating whether team is at home
    df['home'] = np.where(df['team.id'] == df['game.home_team_id'], 1, 0)
    
    # group all stats by game/team
    df = df.groupby(['game.season','game.date','game.id','team.id','game.home_team_id','game.visitor_team_id','home']).sum().reset_index()
    
    # create column to indicate opponent id
    df['opp.id'] = np.where(df['team.id'] == df['game.home_team_id'], df['game.visitor_team_id'], df['game.home_team_id'])
    df.drop(['game.home_team_id','game.visitor_team_id'], axis=1,inplace=True)
    
    return df

In [943]:
#function that creates dataframe with opponent stats for each team
def opponent_stats(df):
    df['team.id_copy'] = df['team.id']
    df['home'] = np.where(df['home'] == 1, 0, 1)
    df.drop(['team.id'], axis=1,inplace=True)
    df.rename(columns={'opp.id':'team.id','team.id_copy':'opp.id','ast':'opp_ast', 'blk':'opp_blk',
       'dreb':'opp_dreb', 'fg3a':'opp_fg3a', 'fg3m':'opp_fg3m', 'fga':'opp_fga', 'fgm':'opp_fgm', 'fta':'opp_fta', 'ftm':'opp_ftm', 'oreb':'opp_oreb', 'pf':'opp_pf', 'pts':'opp_pts',
       'reb':'opp_reb', 'stl':'opp_stl', 'turnover':'opp_turnover'},inplace=True)

    return df

In [973]:
#function that creates a dataframe with opponent stats grouped by game
def prepare_stats_df(df):
    df_team = group_stats_by_game(df)
    df_opponent = opponent_stats(group_stats_by_game(df))
    #merge df and df_opponents
    df = pd.merge(df_team, df_opponent, how='outer', on=['game.season', 'game.date','game.id','team.id','opp.id','home'])
    return df

In [1058]:
#creates columns with average team and opponent point totals
def create_average_scoring(df):
    #team stats
    # group by season and team and calculates a rolling mean of last x number of games with period of 1. Shift() to exclude the current row to only look at past rows
    df['score_avg_season'] = df.groupby(['game.season', 'team.id'])['pts'].transform(lambda x: x.rolling(82, 1).mean().shift())

    # last 10 games
    df['score_avg_l10'] = df.groupby(['game.season', 'team.id'])['pts'].transform(lambda x: x.rolling(10, 1).mean().shift())

    # average score for each team for the season by its home/away status(window size 41)
    df['pt_diff_avg_home'] = df.groupby(['game.season', 'team.id','home'])['pts'].transform(lambda x: x.rolling(41, 1).mean().shift())

    # vs same opponent same team last 8 games (not limited by season)
    df['score_avg_h2hl8'] = df.groupby(['team.id','opp.id'])['pts'].transform(lambda x: x.rolling(8, 1).mean().shift())
    
    #opponent stats
    df['opp_score_avg_season'] = df.groupby(['game.season', 'team.id'])['opp_pts'].transform(lambda x: x.rolling(82, 1).mean().shift())

    # last 10 games
    df['opp_score_avg_l10'] = df.groupby(['game.season', 'team.id'])['opp_pts'].transform(lambda x: x.rolling(10, 1).mean().shift())

    # average score for each team for the season by its home/away status(window size 41)
    df['opp_pt_diff_avg_home'] = df.groupby(['game.season', 'team.id','home'])['opp_pts'].transform(lambda x: x.rolling(41, 1).mean().shift())

    # vs same opponent same team last 8 games (not limited by season)
    df['opp_score_avg_h2hl8'] = df.groupby(['team.id','opp.id'])['opp_pts'].transform(lambda x: x.rolling(8, 1).mean().shift())
    
    return df

In [1059]:
def create_point_diff(df):
    # calculate point differential for the current game
    df['pt_diff'] = df['pts'] - df['opp_pts']

    # calculate average point differential for each team for the season (window size 82)
    df['pt_diff_avg_season'] = df.groupby(['game.season', 'team.id'])['pt_diff'].transform(lambda x: x.rolling(82, 1).mean().shift())

    # calculate average point differential for each team for the season by its home/away status(window size 41)
    df['pt_diff_avg_home'] = df.groupby(['game.season', 'team.id','home'])['pt_diff'].transform(lambda x: x.rolling(41, 1).mean().shift())

    #avg diff over the last 10 games (same season)
    df['pt_diff_avg_l10'] = df.groupby(['game.season', 'team.id'])['pt_diff'].transform(lambda x: x.rolling(10, 1).mean().shift())

    # vs same opponent same team last 8 games (not limited by season)
    df['pt_diff_avg_h2hl8'] = df.groupby(['team.id','opp.id'])['pt_diff'].transform(lambda x: x.rolling(8, 1).mean().shift())

    df.drop(['pt_diff'], axis=1, inplace=True)
    
    return df

In [1060]:
def create_efg_pct(df):

    #Effective field goal percentage = (fgm + (.5 * 3pm)) / fga
    df['fgm_total'] = df.groupby(['game.season', 'team.id'])['fgm'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['fg3m_total'] = df.groupby(['game.season', 'team.id'])['fg3m'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['fga_total'] = df.groupby(['game.season', 'team.id'])['fga'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['efg_pct'] = (df['fgm_total'] + (0.5 * df['fg3m_total'])) / df['fga_total']
    df.drop(['fgm_total','fg3m_total','fga_total'], axis=1, inplace=True)

    df['opp_fgm_total'] = df.groupby(['game.season', 'team.id'])['opp_fgm'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['opp_fg3m_total'] = df.groupby(['game.season', 'team.id'])['opp_fg3m'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['opp_fga_total'] = df.groupby(['game.season', 'team.id'])['opp_fga'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['opp_efg_pct'] = (df['opp_fgm_total'] + (0.5 * df['opp_fg3m_total'])) / df['opp_fga_total']
    df.drop(['opp_fgm_total','opp_fg3m_total','opp_fga_total'], axis=1, inplace=True)
    
    return df


In [1061]:
def create_avg_boxscr_stats(df):
    df['fg3m_avg'] = df.groupby(['game.season', 'team.id'])['fg3m'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_fg3m_avg'] = df.groupby(['game.season', 'team.id'])['opp_fg3m'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['ftm_avg'] = df.groupby(['game.season', 'team.id'])['ftm'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_ftm_avg'] = df.groupby(['game.season', 'team.id'])['opp_ftm'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['ast_avg'] = df.groupby(['game.season', 'team.id'])['ast'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_ast_avg'] = df.groupby(['game.season', 'team.id'])['opp_ast'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['turnover_avg'] = df.groupby(['game.season', 'team.id'])['turnover'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_turnover_avg'] = df.groupby(['game.season', 'team.id'])['opp_turnover'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['reb_avg'] = df.groupby(['game.season', 'team.id'])['reb'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_reb_avg'] = df.groupby(['game.season', 'team.id'])['opp_reb'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['oreb_avg'] = df.groupby(['game.season', 'team.id'])['oreb'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_reb_avg'] = df.groupby(['game.season', 'team.id'])['opp_oreb'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['stl_avg'] = df.groupby(['game.season', 'team.id'])['stl'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_stl_avg'] = df.groupby(['game.season', 'team.id'])['opp_stl'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['blk_avg'] = df.groupby(['game.season', 'team.id'])['blk'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_blk_avg'] = df.groupby(['game.season', 'team.id'])['opp_blk'].transform(lambda x: x.rolling(82,1).mean().shift())

    return df

In [1062]:
def feat_engineering(df):
    df = create_average_scoring(df)
    df = create_point_diff(df)
    df = create_efg_pct(df)
    df = create_avg_boxscr_stats(df)
    
    #drop some of the raw non-aggregated stats
    df.drop(['ast', 'blk', 'dreb', 'fg3a', 'fg3m', 'fga', 'fgm', 'fta', 'ftm', 'oreb', 'pf', 'reb', 'stl', 'turnover',
               'opp_ast', 'opp_blk', 'opp_dreb', 'opp.id','opp_fg3a', 'opp_fg3m', 'opp_fga', 'opp_fgm', 'opp_fta', 'opp_ftm', 'opp_oreb', 'opp_pf', 'opp_reb', 'opp_stl', 'opp_turnover','opp_pts'],
              axis=1, inplace=True)
    return df

In [1063]:
# puts the stats for each team in a game onto the same row
def combine_games(df):
    df_home = df[df['home'] == 1]
    df_away = df[df['home'] == 0]
    df = pd.merge(df_home, df_away, how='outer', on=['game.season', 'game.date','game.id'])
    return df

In [1064]:
def preprocess_df(df):
    df = prepare_stats_df(df)
    df = feat_engineering(df)
    df = combine_games(df)
    # winner column. 1 is when home team wins
    df['winner'] = np.where(df['pts_x'] > df['pts_y'], 1, 0)
    return df

Apply the above functions to prepare the stats df and the validation dataset

In [1100]:
df_stats_prepped = preprocess_df(df_stats)
df_val = preprocess_df(stats2018)

## Feature Selection

In [1101]:
columns_to_drop_clfX = ['game.season','game.date','game.id','team.id_x','home_x','pts_x','team.id_y','home_y','pts_y','winner']
columns_to_drop_regrX = ['game.season','game.date','game.id','team.id_x','home_x','team.id_y','home_y','winner']

df_stats_prepped.dropna(axis=0, inplace=True)
y = df_stats_prepped['winner']
df_X = df_stats_prepped.drop(columns_to_drop_clfX, axis =1)

val_X = df_val.drop(columns_to_drop_clfX, axis=1)
val_X = val_X.apply(lambda x: x.fillna(x.mean()),axis=0)

df_X.dropna(axis=0, inplace=True)

In [1102]:
val_X

Unnamed: 0,score_avg_season_x,score_avg_l10_x,pt_diff_avg_home_x,score_avg_h2hl8_x,opp_score_avg_season_x,opp_score_avg_l10_x,opp_pt_diff_avg_home_x,opp_score_avg_h2hl8_x,pt_diff_avg_season_x,pt_diff_avg_l10_x,pt_diff_avg_h2hl8_x,efg_pct_x,opp_efg_pct_x,fg3m_avg_x,opp_fg3m_avg_x,ftm_avg_x,opp_ftm_avg_x,ast_avg_x,opp_ast_avg_x,turnover_avg_x,opp_turnover_avg_x,reb_avg_x,opp_reb_avg_x,oreb_avg_x,stl_avg_x,opp_stl_avg_x,blk_avg_x,opp_blk_avg_x,score_avg_season_y,score_avg_l10_y,pt_diff_avg_home_y,score_avg_h2hl8_y,opp_score_avg_season_y,opp_score_avg_l10_y,opp_pt_diff_avg_home_y,opp_score_avg_h2hl8_y,pt_diff_avg_season_y,pt_diff_avg_l10_y,pt_diff_avg_h2hl8_y,efg_pct_y,opp_efg_pct_y,fg3m_avg_y,opp_fg3m_avg_y,ftm_avg_y,opp_ftm_avg_y,ast_avg_y,opp_ast_avg_y,turnover_avg_y,opp_turnover_avg_y,reb_avg_y,opp_reb_avg_y,oreb_avg_y,stl_avg_y,opp_stl_avg_y,blk_avg_y,opp_blk_avg_y
0,112.483101,115.929957,3.245949,112.040444,112.604295,116.157907,110.906184,113.301778,-0.121194,-0.22795,-1.261333,0.522317,0.523049,11.316247,11.287214,18.181024,18.253559,24.490289,24.530380,14.211326,14.229032,45.599803,10.484771,10.538358,7.846649,7.852063,5.193749,5.213983,112.553235,116.412823,-3.361352,113.301778,112.374244,116.168792,114.240309,112.040444,0.178991,0.244031,1.261333,0.522582,0.522543,11.313831,11.302078,18.209417,18.134653,24.534285,24.506736,14.207353,14.220059,45.631167,10.496559,10.527877,7.847483,7.833055,5.217468,5.183375
1,112.483101,115.929957,3.245949,112.040444,112.604295,116.157907,110.906184,113.301778,-0.121194,-0.22795,-1.261333,0.522317,0.523049,11.316247,11.287214,18.181024,18.253559,24.490289,24.530380,14.211326,14.229032,45.599803,10.484771,10.538358,7.846649,7.852063,5.193749,5.213983,112.553235,116.412823,-3.361352,113.301778,112.374244,116.168792,114.240309,112.040444,0.178991,0.244031,1.261333,0.522582,0.522543,11.313831,11.302078,18.209417,18.134653,24.534285,24.506736,14.207353,14.220059,45.631167,10.496559,10.527877,7.847483,7.833055,5.217468,5.183375
2,112.483101,115.929957,3.245949,112.040444,112.604295,116.157907,110.906184,113.301778,-0.121194,-0.22795,-1.261333,0.522317,0.523049,11.316247,11.287214,18.181024,18.253559,24.490289,24.530380,14.211326,14.229032,45.599803,10.484771,10.538358,7.846649,7.852063,5.193749,5.213983,112.553235,116.412823,-3.361352,113.301778,112.374244,116.168792,114.240309,112.040444,0.178991,0.244031,1.261333,0.522582,0.522543,11.313831,11.302078,18.209417,18.134653,24.534285,24.506736,14.207353,14.220059,45.631167,10.496559,10.527877,7.847483,7.833055,5.217468,5.183375
3,112.483101,115.929957,3.245949,112.040444,112.604295,116.157907,110.906184,113.301778,-0.121194,-0.22795,-1.261333,0.522317,0.523049,11.316247,11.287214,18.181024,18.253559,24.490289,24.530380,14.211326,14.229032,45.599803,10.484771,10.538358,7.846649,7.852063,5.193749,5.213983,112.553235,116.412823,-3.361352,113.301778,112.374244,116.168792,114.240309,112.040444,0.178991,0.244031,1.261333,0.522582,0.522543,11.313831,11.302078,18.209417,18.134653,24.534285,24.506736,14.207353,14.220059,45.631167,10.496559,10.527877,7.847483,7.833055,5.217468,5.183375
4,112.483101,115.929957,3.245949,112.040444,112.604295,116.157907,110.906184,113.301778,-0.121194,-0.22795,-1.261333,0.522317,0.523049,11.316247,11.287214,18.181024,18.253559,24.490289,24.530380,14.211326,14.229032,45.599803,10.484771,10.538358,7.846649,7.852063,5.193749,5.213983,112.553235,116.412823,-3.361352,113.301778,112.374244,116.168792,114.240309,112.040444,0.178991,0.244031,1.261333,0.522582,0.522543,11.313831,11.302078,18.209417,18.134653,24.534285,24.506736,14.207353,14.220059,45.631167,10.496559,10.527877,7.847483,7.833055,5.217468,5.183375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1180,123.358974,121.800000,12.230769,112.000000,114.128205,113.000000,112.461538,118.000000,9.230769,8.80000,-6.000000,0.545924,0.499413,14.012821,13.448718,18.961538,15.935897,27.243590,25.743590,13.820513,13.679487,52.269231,10.423077,9.782051,7.820513,7.192308,6.256410,5.000000,118.428571,113.400000,1.189189,118.000000,115.025974,111.700000,122.405405,112.000000,3.402597,1.700000,6.000000,0.512979,0.524164,11.415584,11.701299,18.337662,18.961039,24.025974,25.545455,14.207792,16.402597,49.974026,10.012987,13.142857,9.649351,8.493506,5.454545,5.337662
1181,114.628205,110.600000,7.210526,110.000000,113.230769,109.100000,106.289474,104.666667,1.397436,1.50000,5.333333,0.534049,0.531681,10.179487,12.179487,17.705128,15.333333,25.269231,25.487179,11.884615,12.089744,45.487179,9.461538,9.294872,6.358974,7.294872,4.833333,4.243590,114.397436,114.300000,-5.459459,104.666667,115.884615,111.000000,118.891892,110.000000,-1.487179,3.300000,-5.333333,0.518675,0.523582,13.205128,11.948718,20.038462,17.679487,24.397436,25.833333,14.551282,13.346154,47.320513,10.717949,10.500000,6.846154,8.371795,4.307692,4.717949
1182,113.346154,102.300000,10.105263,114.333333,110.000000,107.200000,106.789474,104.666667,3.346154,-4.90000,9.666667,0.525742,0.521915,11.141026,11.102564,16.205128,17.487179,28.166667,25.358974,13.384615,13.525641,47.371795,10.051282,12.076923,7.987179,7.807692,4.525641,5.179487,117.230769,113.800000,-6.282051,104.666667,118.717949,118.300000,123.000000,114.333333,-1.487179,-4.500000,-9.666667,0.514599,0.538298,10.538462,13.346154,19.615385,17.807692,25.641026,27.423077,13.153846,14.692308,46.833333,11.564103,11.897436,8.589744,6.987179,5.448718,5.858974
1183,119.923077,115.800000,4.078947,107.000000,118.564103,116.100000,114.289474,120.000000,1.358974,-0.30000,-13.000000,0.530653,0.512225,10.461538,10.692308,23.820513,21.333333,24.987179,24.884615,14.474359,13.012821,47.371795,11.641026,10.089744,7.064103,8.576923,4.884615,6.205128,114.164557,117.100000,2.615385,120.000000,108.746835,103.400000,109.974359,107.000000,5.417722,13.700000,13.000000,0.537917,0.506859,12.392405,10.126582,18.987342,17.075949,26.582278,22.227848,14.987342,13.620253,47.518987,9.088608,10.189873,8.215190,8.734177,5.936709,4.759494


In [1103]:
# remove features with small variance
vt = VarianceThreshold(0.1)
df_transformed = vt.fit_transform(df_X)
selected_columns = df_X.columns[vt.get_support()]
# transforming an array back to a data-frame preserves column labels
df_transformed = pd.DataFrame(df_transformed, columns = selected_columns)

# remove highly correlated pairs
df_corr = df_transformed.corr().abs()
indices = np.where(df_corr > 0.9)
indices = [(df_corr.index[x], df_corr.columns[y]) for x, y in zip(*indices)
              if x != y and x < y]
for idx in indices:
    try:
        df_transformed.drop(idx[1], axis = 1, inplace=True)
    except KeyError:
        pass

#select kbest
skb = SelectKBest(f_regression, k=15)
X = skb.fit_transform(df_transformed, y)
X = pd.DataFrame(X,columns=df_transformed.columns[skb.get_support()])

In [1104]:
feats = X.columns.to_list()
val_X = val_X[feats]

In [1075]:
def standard_scale(df):
    scaler = StandardScaler()
    X = scaler.fit_transform(df)
    return X

In [1105]:
X = standard_scale(X)
val_X = standard_scale(val_X)


## Classification Model

In [1106]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.80,random_state=1)

In [1107]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [1108]:
# Train the model using the training sets
model.fit(X_train,y_train)


GaussianNB()

In [1109]:
y_pred=model.predict(X_test)

In [1110]:
#print(f'MSE is: {mean_squared_error(y_test, y_pred)}')
#print(f'R2 score is: {r2_score(y_test, y_pred)}')
print(f'Accuracy score is: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC score is: {roc_auc_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

Accuracy score is: 0.6527777777777778
ROC AUC score is: 0.6429905503057254
[[414 286]
 [314 714]]


In [1131]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
grid_search = GridSearchCV(
    clf,
    {
        'n_estimators': [20,50,80],
        'max_depth': [2,4, 6],
        'min_samples_leaf': [10,30,50,70],
    },
    cv=5,
    n_jobs=-1,
    scoring='f1' #['precision', 'recall', 'f1']
)  
grid_search.fit(X_train,y_train)
print('Best params:',grid_search.best_params_)
print('Best score:',grid_search.best_score_)

Best params: {'max_depth': 2, 'min_samples_leaf': 10, 'n_estimators': 80}
Best score: 0.7475614744629155


In [1132]:
y_pred=grid_search.predict(X_test)

In [1133]:
print(f'Accuracy score is: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC score is: {roc_auc_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

Accuracy score is: 0.6701388888888888
ROC AUC score is: 0.6195219566425791
[[247 453]
 [117 911]]


In [1114]:
from sklearn.svm import SVC

In [1143]:
clf = SVC(kernel='rbf')

grid_search = GridSearchCV(
    clf,
    {
        'C': [.1, .5, 1, 10],
        'kernel': ['poly', 'rbf'],
    },
    cv=5,
    n_jobs=-1,
    scoring='recall' #['precision', 'recall', 'f1']
)  
grid_search.fit(X_train,y_train)
print('Best params:',grid_search.best_params_)
print('Best score:',grid_search.best_score_)
clf.fit(X_train,y_train)

Best params: {'C': 0.1, 'kernel': 'poly'}
Best score: 0.9636138613861386


SVC()

In [1144]:
y_pred=clf.predict(X_test)

In [1145]:
print(f'Accuracy score is: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC score is: {roc_auc_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

Accuracy score is: 0.6660879629629629
ROC AUC score is: 0.6318426903835465
[[316 384]
 [193 835]]
