In [1]:
# Modeling Code

import pandas as pd
import numpy as np
import time
#from basketball_reference_web_scraper import client
import os
import gc
import datetime
import sklearn
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from itertools import product
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
import xgboost
from matplotlib import pyplot
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR

# Run a full garbage-collection pass right after the imports.
# gc.collect() returns the number of unreachable objects it found, which is the
# value displayed as this cell's output.
gc.collect()

0

In [2]:
# Configuration
# Input (feature) and output (target) folders.
# The defaults are the original locations; set the MODEL_DATA_DIR and/or
# TUNING_RESULTS_DIR environment variables to run the notebook on another machine.
# os.path.join(path, '') guarantees a trailing separator, which is required because
# file names are appended to these folders by plain string concatenation.
feature = os.path.join(os.environ.get('MODEL_DATA_DIR', '/home/iocak/model_data/'), '')
target = os.path.join(os.environ.get('TUNING_RESULTS_DIR', '/home/iocak/tuning_results/'), '')

# Read Data
dataset = pd.read_parquet(feature + 'all_data.parquet')

In [3]:
# Feature subset used for modelling.
# Per the original note: selected from former XGBoost trainings on all features,
# keeping features up to a 0.9 cumulative f-score.
# NOTE(review): the list is presumably ordered by decreasing importance - confirm.
# ts_cv_tuner reads this list from the global namespace to filter train/test columns,
# so every name here must be a column of `dataset`.

selected_features = ['predictor_double_count_wma_30',
                    'predictor_fantasy_point_wma_30',
                    'predictor_seconds_played_wma_30',
                    'salary_edited',
                    'predictor_points_scored_wma_30',
                    'starter_yes',
                    'player_ly_total_turnovers',
                    'starter_no',
                    'predictor_made_field_goals_wma_30',
                    'fantasy_point_lag_1_rollmean_4',
                    'predictor_attempted_free_throws_wma_30',
                    'no_salary_info',
                    'team_date_sum_predictor_triple_double_wma_30',
                    'team_date_sum_predictor_points_scored_wma_30',
                    'team_date_sum_predictor_fantasy_point_wma_30',
                    'predictor_attempted_field_goals_wma_30',
                    'team_date_sum_predictor_days_since_last_game_wma_30',
                    'team_date_sum_predictor_rebounds_wma_30',
                    'per_minute_player_ly_points',
                    'predictor_turnovers_wma_30',
                    'per_minute_player_ly_games_played',
                    'team_date_sum_predictor_seconds_played_wma_30',
                    'seconds_played_lag_1',
                    'team_date_sum_predictor_made_field_goals_wma_30',
                    'team_date_sum_predictor_attempted_field_goals_wma_30',
                    'team_date_sum_predictor_personal_fouls_wma_30',
                    'team_date_sum_seconds_played_lag_1',
                    'opponent_date_sum_predictor_days_since_last_game_wma_30',
                    'team_date_sum_predictor_double_count_wma_30',
                    'fantasy_point_lag_1_rollmean_5',
                    'team_date_sum_predictor_attempted_free_throws_wma_30',
                    'team_date_sum_predictor_made_free_throws_wma_30',
                    'team_date_sum_predictor_attempted_three_point_field_goals_wma_30',
                    'team_date_sum_seconds_played_lag_1_rollmean_4',
                    'team_date_sum_predictor_turnovers_wma_30',
                    'player_ly_total_attempted_free_throws',
                    'days_since_last_game',
                    'fantasy_point_lag_1_rollmean_3',
                    'team_date_sum_seconds_played_lag_2',
                    'team_date_sum_predictor_defensive_rebounds_wma_30',
                    'team_date_sum_predictor_made_three_point_field_goals_wma_30',
                    'opponent_date_sum_predictor_attempted_three_point_field_goals_wma_30',
                    'double_double_lag_1_rollmean_4',
                    'player_ly_total_steals',
                    'odd_team',
                    'per_minute_player_ly_turnovers',
                    'opponent_date_sum_predictor_made_three_point_field_goals_wma_30',
                    'per_minute_player_ly_made_field_goals',
                    'team_date_sum_predictor_double_double_wma_30',
                    'steals_lag_1_rollmean_5',
                    'player_ly_total_offensive_rebounds',
                    'turnovers_lag_1_rollmean_4',
                    'attempted_field_goals_lag_3',
                    'opponent_date_sum_predictor_assists_wma_30',
                    'opponent_avg_player_ly_total_personal_fouls',
                    'opponent_avg_player_ly_total_points',
                    'team_avg_player_ly_total_made_three_point_field_goals',
                    'per_minute_player_cumul_assists',
                    'team_avg_player_ly_total_points',
                    'opponent_date_sum_seconds_played_lag_1_rollmean_5',
                    'team_date_sum_seconds_played_lag_3',
                    'team_date_sum_attempted_field_goals_lag_1_rollmean_3',
                    'opponent_avg_player_ly_total_attempted_field_goals',
                    'odd_opponent',
                    'opponent_avg_player_ly_total_attempted_free_throws',
                    'rebounds_lag_1_rollmean_5',
                    'team_avg_per_minute_player_cumul_offensive_rebounds',
                    'is_home',
                    'team_date_sum_predictor_assists_wma_30',
                    'team_date_sum_made_field_goals_lag_1_rollmean_4',
                    'predictor_double_double_wma_30',
                    'double_count_lag_1',
                    'predictor_assists_wma_30',
                    'team_date_sum_rebounds_lag_1_rollmean_3',
                    'per_minute_player_ly_steals',
                    'opponent_avg_per_minute_player_cumul_defensive_rebounds',
                    'opponent_date_sum_rebounds_lag_1_rollmean_5',
                    'team_date_sum_seconds_played_lag_1_rollmean_3',
                    'opponent_date_sum_predictor_steals_wma_30',
                    'team_date_sum_points_scored_lag_1_rollmean_5',
                    'per_minute_player_cumul_made_field_goals',
                    'team_date_sum_fantasy_point_lag_1',
                    'turnovers_lag_3',
                    'defensive_rebounds_lag_1_rollmean_5',
                    'opponent_avg_player_ly_total_blocks',
                    'opponent_date_sum_predictor_made_free_throws_wma_30',
                    'team_date_sum_assists_lag_1_rollmean_5',
                    'team_date_sum_turnovers_lag_1_rollmean_3',
                    'team_avg_per_minute_player_ly_made_free_throws',
                    'team_date_sum_turnovers_lag_1_rollmean_5',
                    'opponent_avg_age',
                    'fantasy_point_lag_3',
                    'turnovers_lag_1_rollmean_3',
                    'team_date_sum_rebounds_lag_1_rollmean_5',
                    'team_date_sum_made_field_goals_lag_2',
                    'per_minute_player_cumul_steals',
                    'opponent_date_sum_defensive_rebounds_lag_2',
                    'team_avg_per_minute_player_cumul_attempted_free_throws',
                    'per_minute_player_ly_assists',
                    'steals_lag_1_rollmean_3',
                    'team_date_sum_rebounds_lag_1',
                    'predictor_days_since_last_game_wma_30',
                    'opponent_avg_per_minute_player_ly_attempted_three_point_field_goals',
                    'opponent_date_sum_attempted_three_point_field_goals_lag_1',
                    'made_three_point_field_goals_lag_1_rollmean_5',
                    'team_date_sum_attempted_free_throws_lag_1_rollmean_4',
                    'team_avg_player_ly_total_blocks',
                    'per_minute_player_cumul_made_three_point_field_goals',
                    'player_ly_total_assists',
                    'attempted_field_goals_lag_2',
                    'team_avg_per_minute_player_cumul_turnovers',
                    'team_date_sum_attempted_field_goals_lag_1_rollmean_4',
                    'per_minute_player_ly_blocks',
                    'team_date_sum_rebounds_lag_1_rollmean_4',
                    'opponent_date_sum_seconds_played_lag_1_rollmean_3',
                    'opponent_avg_per_minute_player_cumul_blocks',
                    'team_avg_player_ly_total_defensive_rebounds',
                    'team_avg_per_minute_player_cumul_games_played',
                    'opponent_date_sum_predictor_personal_fouls_wma_30',
                    'team_date_sum_points_scored_lag_3',
                    'team_avg_per_minute_player_ly_blocks',
                    'opponent_avg_player_ly_total_offensive_rebounds',
                    'per_minute_player_cumul_turnovers',
                    'opponent_date_sum_predictor_rebounds_wma_30',
                    'player_ly_total_blocks',
                    'per_minute_player_cumul_made_free_throws',
                    'team_date_sum_attempted_three_point_field_goals_lag_1_rollmean_4',
                    'team_avg_per_minute_player_cumul_games_started',
                    'rebounds_lag_1_rollmean_4',
                    'opponent_avg_player_ly_total_made_field_goals',
                    'made_field_goals_lag_1_rollmean_4',
                    'player_ly_total_defensive_rebounds',
                    'opponent_avg_per_minute_player_ly_made_free_throws',
                    'opponent_date_sum_predictor_attempted_field_goals_wma_30',
                    'points_scored_lag_2',
                    'opponent_date_sum_attempted_three_point_field_goals_lag_1_rollmean_5',
                    'Pos_SF',
                    'points_scored_lag_1',
                    'opponent_avg_per_minute_player_cumul_made_free_throws',
                    'team_avg_Pos_PF',
                    'team_date_sum_predictor_steals_wma_30',
                    'opponent_avg_player_ly_total_attempted_three_point_field_goals',
                    'opponent_date_sum_predictor_double_double_wma_30',
                    'opponent_date_sum_attempted_free_throws_lag_1_rollmean_5',
                    'team_date_sum_made_free_throws_lag_1_rollmean_5',
                    'team_avg_player_ly_total_made_field_goals',
                    'team_date_sum_rebounds_lag_2',
                    'age',
                    'team_date_sum_assists_lag_1_rollmean_4',
                    'assists_lag_1_rollmean_5',
                    'opponent_avg_per_minute_player_cumul_attempted_free_throws',
                    'defensive_rebounds_lag_1',
                    'per_minute_player_ly_games_started',
                    'opponent_date_sum_points_scored_lag_1',
                    'opponent_date_sum_seconds_played_lag_1',
                    'opponent_date_sum_points_scored_lag_3',
                    'predictor_made_free_throws_wma_30',
                    'opponent_avg_Pos_PG',
                    'per_minute_player_ly_defensive_rebounds',
                    'opponent_avg_per_minute_player_cumul_steals',
                    'team_avg_per_minute_player_ly_points',
                    'opponent_date_sum_turnovers_lag_1_rollmean_5',
                    'team_date_sum_double_count_lag_2',
                    'team_date_sum_double_double_lag_2',
                    'opponent_date_sum_made_three_point_field_goals_lag_2',
                    'opponent_date_sum_steals_lag_2',
                    'predictor_triple_double_wma_30',
                    'opponent_date_sum_predictor_defensive_rebounds_wma_30',
                    'opponent_date_sum_double_count_lag_1',
                    'team_date_sum_fantasy_point_lag_1_rollmean_5',
                    'seconds_played_lag_2',
                    'opponent_avg_per_minute_player_cumul_personal_fouls',
                    'opponent_date_sum_fantasy_point_lag_3',
                    'attempted_free_throws_lag_1_rollmean_4',
                    'per_minute_player_cumul_attempted_free_throws',
                    'turnovers_lag_1',
                    'opponent_date_sum_predictor_fantasy_point_wma_30',
                    'team_avg_per_minute_player_ly_attempted_free_throws',
                    'opponent_date_sum_attempted_free_throws_lag_2',
                    'opponent_date_sum_predictor_triple_double_wma_30',
                    'per_minute_player_cumul_offensive_rebounds',
                    'team_avg_age',
                    'team_avg_per_minute_player_cumul_made_free_throws',
                    'opponent_avg_Pos_C',
                    'team_avg_player_ly_total_minutes_played',
                    'attempted_three_point_field_goals_lag_1_rollmean_3',
                    'opponent_avg_cumulative_season_experience_past',
                    'per_minute_player_cumul_defensive_rebounds',
                    'opponent_date_sum_predictor_turnovers_wma_30',
                    'predictor_defensive_rebounds_wma_30']

In [4]:
## Trial param for func
#season_end_year_list = [2015, 2017]
#model_obj = linear_model.Lasso()
#parameters = {'alpha' : [0.001, 0.01, 0.05, 0.2]}

# tuner function
def ts_cv_tuner(dataset, season_end_year_list, model_obj, parameters, features=None):
    '''
    Tune a model with time-series cross-validation and score it on a hold-out season.

    Training rows are the seasons in [season_end_year_list[0], season_end_year_list[1]);
    test rows are the season equal to season_end_year_list[1]. Features are standardised
    with a scaler fitted on the training rows only, the hyper-parameter grid is searched
    with GridSearchCV over a 3-split TimeSeriesSplit, and the best estimator (already
    refit on the whole training set by GridSearchCV) is evaluated on the test season.

    dataset: all data before train-test split (dataframe); must contain the columns
             'date', 'opponent', 'team', 'name', 'slug', 'season_end_year',
             'fantasy_point' plus every requested feature,
    season_end_year_list: first and last season end year to be used in training - testing
                          (list of ints); the last one is the test season,
    model_obj: your model object (scikit-learn compatible estimator),
    parameters: your hyper parameters in a dict (GridSearchCV param_grid),
    features: optional list of feature columns; when None the module-level
              `selected_features` list is used (original behaviour)

    Returns (winner_model, test_metrics) where test_metrics is a dict with the float
    entries 'test_mae', 'test_rmse' and 'test_bias' (bias = mean of prediction - actual).

    Raises ValueError if the train or the test split contains no rows.

    NOTE(review): the scaler is fitted on the full training window before the CV split,
    so each CV validation fold has influenced the scaling statistics. Wrapping scaler and
    model in a Pipeline would remove this, but changes the returned object and the
    parameter-grid key names, so it is left as is here.
    '''
    if features is None:
        # Backward compatible default: fall back to the notebook-level feature list.
        features = selected_features

    label_col = 'fantasy_point'
    non_feature_cols = ['date', 'opponent', 'team', 'name', 'slug', 'season_end_year', label_col]

    first_year = season_end_year_list[0]
    test_year = season_end_year_list[1]

    # Train Test Split
    seasons = dataset['season_end_year']
    train_rows = dataset[(seasons >= first_year) & (seasons < test_year)]
    test_rows = dataset[seasons == test_year]

    if len(train_rows) == 0:
        raise ValueError(f'No training rows for seasons [{first_year}, {test_year}).')
    if len(test_rows) == 0:
        raise ValueError(f'No test rows for season {test_year}.')

    # Dropping the identifier/label columns first keeps the original guard: asking for
    # one of them (e.g. the label) as a feature fails instead of silently leaking it.
    train = train_rows.drop(columns=non_feature_cols)[features]
    test = test_rows.drop(columns=non_feature_cols)[features]

    train_labels = train_rows[[label_col]]
    test_labels = test_rows[[label_col]]

    # Scale data: fit on the training set only, then apply to both sets.
    scaler = StandardScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)

    # Cross Validation for Best Param
    clf = GridSearchCV(model_obj,
                       parameters,
                       n_jobs=-1,
                       cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
                       verbose=2)
    clf.fit(X=train, y=train_labels)

    # GridSearchCV uses refit=True by default, so best_estimator_ has already been
    # trained on the complete training set with the winning parameters; fitting it a
    # second time would only repeat that (expensive) work.
    winner_model = clf.best_estimator_

    ### Make predictions using the testing set
    # Flatten both sides to plain 1-D float arrays. Subtracting a freshly built
    # DataFrame of predictions from `test_labels` is unsafe: their index/column labels
    # differ, so label alignment turns every error into NaN.
    y_pred = np.asarray(winner_model.predict(test), dtype=float).ravel()
    y_true = np.asarray(test_labels, dtype=float).ravel()

    ### General Error & Bias
    err = y_pred - y_true

    test_metrics = {'test_mae': float(np.mean(np.abs(err))),
                    'test_rmse': float(np.sqrt(np.mean(err ** 2))),
                    'test_bias': float(np.mean(err))}

    return winner_model, test_metrics

In [5]:
# Models to tune and the hyper-parameter grid to search for each of them

# Season end years handed to ts_cv_tuner: [first training season, test season]
season_end_year_list = [2015, 2017]

# One (estimator, parameter grid) tuple per candidate model.
# NOTE(review): newer xgboost releases deprecate the 'reg:linear' objective in favour
# of 'reg:squarederror' - confirm against the installed version before changing it.
model_param_list = [
    # Ordinary least squares baseline: nothing to tune, hence the empty grid
    (linear_model.LinearRegression(), dict()),
    #(linear_model.Lasso(), {'alpha' : [0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 0.9]}),
    # Gradient boosted trees: 4 * 2 * 2 * 2 * 2 = 64 parameter combinations
    (
        xgboost.XGBRegressor(),
        dict(
            objective=['reg:linear'],
            learning_rate=[0.005, 0.01, 0.05, 0.3],  # so called `eta` value
            max_depth=[3, 5],
            min_child_weight=[1, 10],
            subsample=[0.7],
            colsample_bytree=[0.7],
            n_estimators=[500, 1000],
            reg_lambda=[0.3, 0.5],
            alpha=[0.9],
            eval_metric=['rmse'],
            booster=['gbtree'],
        ),
    ),
]

In [None]:
# Tune every (model, grid) candidate, keep the results in memory and persist the
# test metrics of each winner as one CSV file per candidate.
result_list = []

# Create the output folder up front: the searches below can run for a long time and
# a missing folder would otherwise only surface when the first result is written.
os.makedirs(target, exist_ok=True)

# call function
for counter, (candidate_model, candidate_grid) in enumerate(model_param_list):
    temp_winner, temp_metrics = ts_cv_tuner(dataset, season_end_year_list, candidate_model, candidate_grid)

    # Store the winning configuration next to its metrics (also ends up in the CSV).
    temp_metrics['model'] = str(temp_winner)
    result_list.append([temp_winner, temp_metrics])

    pd.DataFrame(temp_metrics, index = [0]).to_csv(target + f'model_{counter}_selectedfeat_newfeat_smartlag_wma.csv')

    print('\n Result: \n', temp_winner, '\n', temp_metrics, '\n \n')

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    9.0s finished



 Result: 
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False) 
 {'test_mae': 6.8937795295247986, 'test_rmse': 8.775631894267095, 'test_bias': -0.11679470551660551, 'model': 'LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)'} 
 

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 28.8min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 39.8min finished


