In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load Data - pandas
# Clean Data - clean_data: clean_data()
# Process Data - adv_processing: convert_percentages(), convert_to_matchups()
# Train & Evaluate model - train_model: split_data_to_test(), learn_from_data()
# Make Predictions - train_model: get_nearest_neighbors(), get_norm_team_stats(), 
#                   get_matchup_sample(), sim_convert_to_matchups(), process_matchups_for_model(),
#                   simulate_and_predict()
# Summarize Predictions - ?

In [2]:
df = pd.read_csv('data/all_team_data.csv') # Load Data 

In [3]:
# Clean Data 
from clean_data import clean_data 
from data_types import columns

df = df[columns] # subset to selected columms
df = clean_data(df) # adjust feature types, match school names, create game_ids, drop NAs 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['school'] = df['school'].map(team_labels)


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10809 entries, 0 to 11330
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   game         10809 non-null  int64  
 1   game_result  10809 non-null  int64  
 2   date         10809 non-null  object 
 3   school       10809 non-null  object 
 4   opp_team_id  10809 non-null  object 
 5   pts          10809 non-null  int64  
 6   opp_pts      10809 non-null  int64  
 7   ortg         10809 non-null  float64
 8   drtg         10809 non-null  float64
 9   pace         10809 non-null  float64
 10  ftr          10809 non-null  float64
 11  3par         10809 non-null  float64
 12  ts_perc      10809 non-null  float64
 13  trb_perc     10809 non-null  float64
 14  ast_perc     10809 non-null  float64
 15  stl_perc     10809 non-null  float64
 16  blk_perc     10809 non-null  float64
 17  efg_perc     10809 non-null  float64
 18  tov_perc     10809 non-null  float64
 19  orb_

In [5]:
# Need a function for using means that gets the mean of the last k games
k = 3
list_of_games = []
for game, team, id in zip(df['game'], df['school'], df['game_id']):
    if game <= k:
        continue
    else: 
        current_stats = df[['game', 'date', 'game_result', 'school', 'opp_team_id', 'home', 'away', 'srs']][(df['school'] == team) & (df['game'] == game)]
        current_stats['game_id'] = id
        as_of_last_game = df[['streak', 'wins', 'losses']][(df['school'] == team) & (df['game'] == (game - 1))]
        as_of_last_game['game_id'] = id
        average_of_last_k = df[['pace', 'pts', 'opp_pts', 'ortg', 'drtg', 'ftr', '3par', 'ts_perc', 'trb_perc', 'ast_perc', 'stl_perc', 
            'blk_perc', 'efg_perc', 'tov_perc', 'orb_perc', 'ft_fga', 'overtimes']][(df['game'] >= (game - k)) & (df['game'] < game) & (df['school'] == team)].mean().to_frame().T
        average_of_last_k['game_id'] = id
        merged_row = pd.merge(pd.merge(current_stats, as_of_last_game, on='game_id'), average_of_last_k, on='game_id')
        list_of_games.append(merged_row)


In [6]:
df = pd.concat(list_of_games) # merge list into df
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9525 entries, 0 to 0
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   game         9525 non-null   int64  
 1   date         9525 non-null   object 
 2   game_result  9525 non-null   int64  
 3   school       9525 non-null   object 
 4   opp_team_id  9525 non-null   object 
 5   home         9525 non-null   uint8  
 6   away         9525 non-null   uint8  
 7   srs          9525 non-null   float64
 8   streak       9525 non-null   int64  
 9   wins         9525 non-null   int64  
 10  losses       9525 non-null   int64  
 11  pace         9525 non-null   float64
 12  pts          9525 non-null   float64
 13  opp_pts      9525 non-null   float64
 14  ortg         9525 non-null   float64
 15  drtg         9525 non-null   float64
 16  ftr          9525 non-null   float64
 17  3par         9525 non-null   float64
 18  ts_perc      9525 non-null   float64
 19  trb_perc 

In [7]:
# Process data 
from adv_processing import only_duplicates, convert_percentages, convert_to_matchups

df = only_duplicates(df) # keep only duplicate game_ids
df = convert_percentages(df) # convert 0-100 percentages to 0-1
df_match = convert_to_matchups(df) # convert df table to matchup table

In [10]:
# Train Test Split

from train_model import split_data_to_test

# Split data based on date string for training and predictions
df_match_train = split_data_to_test(df_match, date='2023-02-28', type='train')
df_match_pred = split_data_to_test(df_match, date='2023-02-28', type='pred')

In [11]:
df_match_train.columns

Index(['game', 'date', 'game_result', 'school', 'opp_team_id', 'home', 'away',
       'srs', 'streak', 'wins', 'losses', 'pace', 'pts', 'opp_pts', 'ortg',
       'drtg', 'ftr', '3par', 'ts_perc', 'trb_perc', 'ast_perc', 'stl_perc',
       'blk_perc', 'efg_perc', 'tov_perc', 'orb_perc', 'ft_fga', 'overtimes',
       'game_id', 'opp_pts_self', 'opp_pts_team', 'opp_ortg', 'opp_drtg',
       'opp_pace', 'opp_ftr', 'opp_3par', 'opp_ts_perc', 'opp_trb_perc',
       'opp_ast_perc', 'opp_stl_perc', 'opp_blk_perc', 'opp_efg_perc',
       'opp_tov_perc', 'opp_orb_perc', 'opp_ft_fga', 'opp_srs', 'opp_wins',
       'opp_losses', 'opp_streak'],
      dtype='object')

In [12]:
# Train Model 
# Subset training data 
df_match_train = df_match_train.drop(['game', 'date', 'school', 'opp_team_id', 'game_id'], axis=1)


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors

In [14]:

# Standardization Pipeline
# Create a pipeline for numerical features
num_transformer = Pipeline(steps=[('scaler', StandardScaler())]) 

# Create a column transformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, ['pts', 'opp_pts', 'ortg', 'drtg','pace', 'ftr','3par', 'ts_perc', 
                                  'trb_perc', 'ast_perc', 'stl_perc', 'blk_perc',
                                'efg_perc', 'tov_perc', 'orb_perc', 'ft_fga', 'opp_pts_self', 'opp_pts_team',
                                'opp_ortg', 'opp_drtg', 'opp_pace','opp_ftr', 'opp_3par', 'opp_ts_perc', 
                                'opp_trb_perc', 'opp_ast_perc', 'opp_stl_perc', 'opp_blk_perc', 
                                'opp_efg_perc', 'opp_tov_perc', 'opp_orb_perc', 'opp_ft_fga'])
        ], 
        remainder='passthrough'
    )  


In [110]:
X = df_match_train.drop('game_result', axis=1)
y = df_match_train['game_result']
y = y.astype('int')

# split, train, test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=235)


# Scale Training Data
X_train_scaled = preprocessor.fit_transform(X_train) # Scale Data
    
# Transform Test data 
X_test_scaled = preprocessor.transform(X_test) # Scale Data


# Define the hyperparameters to use with GridSearchCV
param_grid = {
    'n_estimators': [64],
    'max_depth': [10],
    'min_samples_split': [2],
    'min_samples_leaf': [2],
    'max_features': ['log2'],
    'bootstrap': [True], 
    'oob_score': [True]
    }

# Define Random Forest Classifier 
rfc = RandomForestClassifier()

# Define Grid Search CV
grid = GridSearchCV(rfc, param_grid, scoring='accuracy', cv=5, n_jobs=4)

# Fit Grid Search
grid.fit(X_train_scaled, y_train)  

from sklearn.calibration import CalibratedClassifierCV
# Calibrate the classsifier using CalibratedClassifierCV
calibrated_clf = CalibratedClassifierCV(grid, method='sigmoid', cv=5, n_jobs=4)
calibrated_clf.fit(X_train_scaled, y_train)
    
# Predict probabilities for the test set using the calibrated classifier 
y_proba = calibrated_clf.predict_proba(X_test_scaled)



# Evaluate performance of Random Forests Model 
from sklearn.metrics import classification_report

predictions = grid.predict(X_test_scaled)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.71      0.65      0.68       399
           1       0.68      0.74      0.71       401

    accuracy                           0.69       800
   macro avg       0.70      0.69      0.69       800
weighted avg       0.70      0.69      0.69       800



In [112]:
y_proba[1]

array([0.37437603, 0.62562397])

In [17]:
grid.best_estimator_

In [10]:
# Prediction of rolling mean data
df_match_pred


In [18]:
actual_outcomes = df_match_pred[['game_id', 'school', 'opp_team_id', 'game_result']] # dataframe of actual results

In [19]:
# Subset prediction data 
X_pred = df_match_pred.drop(['game', 'date', 'school', 'opp_team_id', 'game_id', 'game_result'], axis=1)


In [20]:
# Scale data
X_pred_scaled = preprocessor.transform(X_pred) # Scale Data

In [22]:
actual_outcomes['pred_pos'] = grid.predict(X_pred_scaled) # Add prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_outcomes['pred_pos'] = grid.predict(X_pred_scaled) # Add prediction


In [24]:
actual_outcomes['tp'] = np.vectorize(lambda x, y: 1 if x == 1 & y == 1 else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])
actual_outcomes['accuracy'] = np.vectorize(lambda x, y: 1 if x == y else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_outcomes['tp'] = np.vectorize(lambda x, y: 1 if x == 1 & y == 1 else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_outcomes['accuracy'] = np.vectorize(lambda x, y: 1 if x == y else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])


In [26]:
actual_outcomes['accuracy'].mean()

0.6890459363957597

In [31]:
recall = actual_outcomes['tp'].sum() / actual_outcomes['game_result'].sum()
precision = actual_outcomes['tp'].sum() / actual_outcomes['pred_pos'].sum()
fp_rate = 1 - (actual_outcomes['tp'].sum() / actual_outcomes['pred_pos'].sum())

print('Recall: {:.2f} \nPrecision: {:.2f}\nFP Rate: {:.2f}'.format(recall, precision, fp_rate))

Recall: 0.78 
Precision: 0.70
FP Rate: 0.30


In [32]:
# LOOP TO FIGURE OUT BEST K 

# Need a function for using means that gets the mean of the last k games
games = [5, 7, 9, 11, 13, 15, 17]
dict_of_pred_tables = {}
k_report = {'k': [], 'recall': [], 'precision': [], 'fp_rate': []}

for k in games:
    df = pd.read_csv('data/all_team_data.csv') # Load Data 
    # Clean Data 
    from clean_data import clean_data 
    from data_types import columns

    df = df[columns] # subset to selected columms
    df = clean_data(df) # adjust feature types, match school names, create game_ids, drop NAs 

    list_of_games = []
    for game, team, id in zip(df['game'], df['school'], df['game_id']):
        if game <= k:
            continue
        else: 
            current_stats = df[['game', 'date', 'game_result', 'school', 'opp_team_id', 'home', 'away', 'srs']][(df['school'] == team) & (df['game'] == game)]
            current_stats['game_id'] = id
            as_of_last_game = df[['streak', 'wins', 'losses']][(df['school'] == team) & (df['game'] == (game - 1))]
            as_of_last_game['game_id'] = id
            average_of_last_k = df[['pace', 'pts', 'opp_pts', 'ortg', 'drtg', 'ftr', '3par', 'ts_perc', 'trb_perc', 'ast_perc', 'stl_perc', 
                'blk_perc', 'efg_perc', 'tov_perc', 'orb_perc', 'ft_fga', 'overtimes']][(df['game'] >= (game - k)) & (df['game'] < game) & (df['school'] == team)].mean().to_frame().T
            average_of_last_k['game_id'] = id
            merged_row = pd.merge(pd.merge(current_stats, as_of_last_game, on='game_id'), average_of_last_k, on='game_id')
            list_of_games.append(merged_row)
        
    
    df = pd.concat(list_of_games) # merge list into df


    # Process data 
    from adv_processing import only_duplicates, convert_percentages, convert_to_matchups

    df = only_duplicates(df) # keep only duplicate game_ids
    df = convert_percentages(df) # convert 0-100 percentages to 0-1
    df_match = convert_to_matchups(df) # convert df table to matchup table

    # Train Test Split

    from train_model import split_data_to_test

    # Split data based on date string for training and predictions
    df_match_train = split_data_to_test(df_match, date='2023-02-28', type='train')
    df_match_pred = split_data_to_test(df_match, date='2023-02-28', type='pred')

    # Train Model 
    # Subset training data 
    df_match_train = df_match_train.drop(['game', 'date', 'school', 'opp_team_id', 'game_id'], axis=1)  

    X = df_match_train.drop('game_result', axis=1)
    y = df_match_train['game_result']
    y = y.astype('int')

    # split, train, test 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=235)


    # Scale Training Data
    X_train_scaled = preprocessor.fit_transform(X_train) # Scale Data
    
    # Transform Test data 
    X_test_scaled = preprocessor.transform(X_test) # Scale Data


    # Define the hyperparameters to use with GridSearchCV
    param_grid = {
        'n_estimators': [64, 100, 128, 200],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False], 
        'oob_score': [True, False]
        }

    # Define Random Forest Classifier 
    rfc = RandomForestClassifier()

    # Define Grid Search CV
    grid = GridSearchCV(rfc, param_grid, scoring='accuracy', cv=20, n_jobs=4)

    # Fit Grid Search
    grid.fit(X_train_scaled, y_train)  


    # Evaluate performance of Random Forests Model 
    from sklearn.metrics import classification_report

    predictions = grid.predict(X_test_scaled)
    print(classification_report(y_test, predictions))
    
    print(grid.best_estimator_)

    dict_of_pred_tables[k] = df_match_pred[['game_id', 'school', 'opp_team_id', 'game_result']] # dataframe of actual results

    # Subset prediction data 
    X_pred = df_match_pred.drop(['game', 'date', 'school', 'opp_team_id', 'game_id', 'game_result'], axis=1)

    # Scale data
    X_pred_scaled = preprocessor.transform(X_pred) # Scale Data

    dict_of_pred_tables[k]['pred_pos'] = grid.predict(X_pred_scaled) # Add prediction

    dict_of_pred_tables[k]['tp'] = np.vectorize(lambda x, y: 1 if x == 1 & y == 1 else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])
    dict_of_pred_tables[k]['accuracy'] = np.vectorize(lambda x, y: 1 if x == y else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])

    k_report['k'].append(k)
    k_report['recall'].append(dict_of_pred_tables[k]['tp'].sum() / dict_of_pred_tables[k]['game_result'].sum())
    k_report['precision'].append(dict_of_pred_tables[k]['tp'].sum() / dict_of_pred_tables[k]['pred_pos'].sum())
    k_report['accuracy'].append(dict_of_pred_tables[k]['accuracy'].mean())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['school'] = df['school'].map(team_labels)
4320 fits failed out of a total of 17280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4320 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jeffreysachs/Documents/ncaa_predictor/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jeffreysachs/Documents/ncaa_predictor/lib/python3.8/si

              precision    recall  f1-score   support

           0       0.70      0.67      0.69       380
           1       0.71      0.74      0.73       420

    accuracy                           0.71       800
   macro avg       0.71      0.71      0.71       800
weighted avg       0.71      0.71      0.71       800

RandomForestClassifier(max_depth=5, min_samples_leaf=2, min_samples_split=5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['pred_pos'] = grid.predict(X_pred_scaled) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['tp'] = np.vectorize(lambda x, y: 1 if x == 1 & y == 1 else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

              precision    recall  f1-score   support

           0       0.70      0.73      0.72       366
           1       0.72      0.69      0.71       372

    accuracy                           0.71       738
   macro avg       0.71      0.71      0.71       738
weighted avg       0.71      0.71      0.71       738

RandomForestClassifier(max_depth=5, min_samples_leaf=2, min_samples_split=10,
                       n_estimators=200)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['pred_pos'] = grid.predict(X_pred_scaled) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['tp'] = np.vectorize(lambda x, y: 1 if x == 1 & y == 1 else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

              precision    recall  f1-score   support

           0       0.69      0.69      0.69       341
           1       0.69      0.69      0.69       340

    accuracy                           0.69       681
   macro avg       0.69      0.69      0.69       681
weighted avg       0.69      0.69      0.69       681

RandomForestClassifier(max_depth=10, max_features='log2', n_estimators=200,
                       oob_score=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['pred_pos'] = grid.predict(X_pred_scaled) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['tp'] = np.vectorize(lambda x, y: 1 if x == 1 & y == 1 else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

              precision    recall  f1-score   support

           0       0.66      0.66      0.66       309
           1       0.67      0.67      0.67       319

    accuracy                           0.66       628
   macro avg       0.66      0.66      0.66       628
weighted avg       0.66      0.66      0.66       628

RandomForestClassifier(max_depth=5, min_samples_leaf=2, min_samples_split=10,
                       n_estimators=200, oob_score=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['pred_pos'] = grid.predict(X_pred_scaled) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['tp'] = np.vectorize(lambda x, y: 1 if x == 1 & y == 1 else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

              precision    recall  f1-score   support

           0       0.69      0.75      0.72       280
           1       0.74      0.68      0.71       297

    accuracy                           0.71       577
   macro avg       0.71      0.71      0.71       577
weighted avg       0.71      0.71      0.71       577

RandomForestClassifier(max_depth=10, max_features='log2', min_samples_leaf=4,
                       n_estimators=64)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['pred_pos'] = grid.predict(X_pred_scaled) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['tp'] = np.vectorize(lambda x, y: 1 if x == 1 & y == 1 else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

              precision    recall  f1-score   support

           0       0.67      0.68      0.68       257
           1       0.68      0.67      0.67       258

    accuracy                           0.68       515
   macro avg       0.68      0.68      0.68       515
weighted avg       0.68      0.68      0.68       515

RandomForestClassifier(max_depth=10, min_samples_leaf=4, oob_score=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['pred_pos'] = grid.predict(X_pred_scaled) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['tp'] = np.vectorize(lambda x, y: 1 if x == 1 & y == 1 else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

              precision    recall  f1-score   support

           0       0.68      0.67      0.68       235
           1       0.64      0.65      0.64       209

    accuracy                           0.66       444
   macro avg       0.66      0.66      0.66       444
weighted avg       0.66      0.66      0.66       444

RandomForestClassifier(max_features='log2', min_samples_leaf=2,
                       n_estimators=200)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['pred_pos'] = grid.predict(X_pred_scaled) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_pred_tables[k]['tp'] = np.vectorize(lambda x, y: 1 if x == 1 & y == 1 else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

In [36]:
del k_report['fp_rate']

In [41]:
dict_of_pred_tables[5]['accuracy'].mean()

0.7208480565371025

In [43]:
k_report['accuracy'] = []

In [44]:
for k in games: 
    k_report['accuracy'].append(dict_of_pred_tables[k]['accuracy'].mean())

In [45]:
k_report_df = pd.DataFrame(k_report)

In [46]:
k_report_df

Unnamed: 0,k,recall,precision,accuracy
0,5,0.731034,0.726027,0.720848
1,7,0.598592,0.765766,0.707746
2,9,0.810127,0.719101,0.71831
3,11,0.765517,0.711538,0.721831
4,13,0.642857,0.767442,0.700704
5,15,0.694656,0.722222,0.735915
6,17,0.693431,0.703704,0.711268


In [52]:
# Standardization Pipeline
# Create a pipeline for numerical features
num_transformer = Pipeline(steps=[('scaler', StandardScaler())]) 

# Create a column transformer to apply different transformations to different columns
preprocessor_pca = ColumnTransformer(
    transformers=[
        ('num', num_transformer, ['pts', 'opp_pts', 'ortg', 'drtg','pace', 'ftr','3par', 'ts_perc', 
                                  'trb_perc', 'ast_perc', 'stl_perc', 'blk_perc', 'streak', 'wins', 'losses',
                                  'overtimes', 'srs',
                                'efg_perc', 'tov_perc', 'orb_perc', 'ft_fga', 'opp_pts_self', 'opp_pts_team',
                                'opp_ortg', 'opp_drtg', 'opp_pace','opp_ftr', 'opp_3par', 'opp_ts_perc', 
                                'opp_trb_perc', 'opp_ast_perc', 'opp_stl_perc', 'opp_blk_perc', 
                                'opp_efg_perc', 'opp_tov_perc', 'opp_orb_perc', 'opp_ft_fga', 'opp_streak', 'opp_wins', 'opp_losses','opp_srs',])
        ], 
        remainder='passthrough'
    )  


In [58]:
# PCA LOGISTIC REGRESSION WITH K = 5 


# Need a function for using means that gets the mean of the last k games
k = 5
components = [2,3,4,5,6,7,8,9,10,11,12,13,14,15]
dict_of_log_pred_tables = {}
component_report = {'components': [], 'recall': [], 'precision': [], 'accuracy': []}

for c in components:
    df = pd.read_csv('data/all_team_data.csv') # Load Data 
    # Clean Data 
    from clean_data import clean_data 
    from data_types import columns

    df = df[columns] # subset to selected columms
    df = clean_data(df) # adjust feature types, match school names, create game_ids, drop NAs 

    list_of_games = []
    for game, team, id in zip(df['game'], df['school'], df['game_id']):
        if game <= k:
            continue
        else: 
            current_stats = df[['game', 'date', 'game_result', 'school', 'opp_team_id', 'home', 'away', 'srs']][(df['school'] == team) & (df['game'] == game)]
            current_stats['game_id'] = id
            as_of_last_game = df[['streak', 'wins', 'losses']][(df['school'] == team) & (df['game'] == (game - 1))]
            as_of_last_game['game_id'] = id
            average_of_last_k = df[['pace', 'pts', 'opp_pts', 'ortg', 'drtg', 'ftr', '3par', 'ts_perc', 'trb_perc', 'ast_perc', 'stl_perc', 
                'blk_perc', 'efg_perc', 'tov_perc', 'orb_perc', 'ft_fga', 'overtimes']][(df['game'] >= (game - 5)) & (df['game'] < game) & (df['school'] == team)].mean().to_frame().T
            average_of_last_k['game_id'] = id
            merged_row = pd.merge(pd.merge(current_stats, as_of_last_game, on='game_id'), average_of_last_k, on='game_id')
            list_of_games.append(merged_row)
        
    
    df = pd.concat(list_of_games) # merge list into df


    # Process data 
    from adv_processing import only_duplicates, convert_percentages, convert_to_matchups

    df = only_duplicates(df) # keep only duplicate game_ids
    df = convert_percentages(df) # convert 0-100 percentages to 0-1
    df_match = convert_to_matchups(df) # convert df table to matchup table

    # Train Test Split

    from train_model import split_data_to_test

    # Split data based on date string for training and predictions
    df_match_train = split_data_to_test(df_match, date='2023-02-28', type='train')
    df_match_pred = split_data_to_test(df_match, date='2023-02-28', type='pred')

    # Train Model 
    # Subset training data 
    df_match_train = df_match_train.drop(['game', 'date', 'school', 'opp_team_id', 'game_id'], axis=1)  

    X = df_match_train.drop('game_result', axis=1)
    y = df_match_train['game_result']
    y = y.astype('int')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=235)

    # Standardize the data 
    X_train_scaled = preprocessor_pca.fit_transform(X_train)
    X_test_scaled = preprocessor_pca.transform(X_test)

    from sklearn.decomposition import PCA
    #  Apply PCA
    pca = PCA(n_components = c)

    X_train_pca = pca.fit_transform(X_train_scaled)

    # Transform the data 
    X_test_pca = pca.transform(X_test_scaled)

    # FOR PERFORMING LOGISTIC REGRESSION
    from sklearn.linear_model import LogisticRegression
    # Define the model 
    model = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=100000)

    # Define the hyperparameters 
    params = {'C': np.logspace(-3, 3, 7),
              'l1_ratio': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}

    from sklearn.model_selection import GridSearchCV
    # Perform a grid search over the hyperparameters 
    grid = GridSearchCV(model, param_grid=params, cv=20, n_jobs=4, scoring='accuracy')
    grid.fit(X_train_pca, y_train)


    predictions = grid.predict(X_test_pca)
    print(classification_report(y_test, predictions))
    
    print(grid.best_estimator_)

    dict_of_log_pred_tables[c] = df_match_pred[['game_id', 'school', 'opp_team_id', 'game_result']] # dataframe of actual results

    # Subset prediction data 
    X_pred = df_match_pred.drop(['game', 'date', 'school', 'opp_team_id', 'game_id', 'game_result'], axis=1)

    # Scale data
    X_pred_scaled = preprocessor.transform(X_pred) # Scale Data
    X_pred_pca = pca.transform(X_pred_scaled)

    dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction

    prediction_probs = grid.predict_proba(X_pred_pca)
    pos_class = []
    for pair in prediction_probs:
        pos_class.append(pair[1])

    dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
    dict_of_log_pred_tables[c]['tp'] = np.vectorize(lambda x, y: 1 if x == 1 & y == 1 else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])
    dict_of_log_pred_tables[c]['accuracy'] = np.vectorize(lambda x, y: 1 if x == y else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])

    component_report['components'].append(c)
    component_report['recall'].append(dict_of_log_pred_tables[c]['tp'].sum() / dict_of_log_pred_tables[c]['game_result'].sum())
    component_report['precision'].append(dict_of_log_pred_tables[c]['tp'].sum() / dict_of_log_pred_tables[c]['pred_pos'].sum())
    component_report['accuracy'].append(dict_of_log_pred_tables[c]['accuracy'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['school'] = df['school'].map(team_labels)


              precision    recall  f1-score   support

           0       0.63      0.62      0.62       373
           1       0.67      0.68      0.68       427

    accuracy                           0.65       800
   macro avg       0.65      0.65      0.65       800
weighted avg       0.65      0.65      0.65       800

LogisticRegression(C=0.1, l1_ratio=0.5, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

              precision    recall  f1-score   support

           0       0.63      0.59      0.61       386
           1       0.64      0.68      0.66       414

    accuracy                           0.64       800
   macro avg       0.64      0.63      0.63       800
weighted avg       0.64      0.64      0.64       800

LogisticRegression(C=0.001, l1_ratio=0.9, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

              precision    recall  f1-score   support

           0       0.65      0.65      0.65       378
           1       0.68      0.68      0.68       422

    accuracy                           0.67       800
   macro avg       0.66      0.66      0.66       800
weighted avg       0.67      0.67      0.67       800

LogisticRegression(C=0.001, l1_ratio=0, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

              precision    recall  f1-score   support

           0       0.65      0.68      0.67       408
           1       0.65      0.62      0.64       392

    accuracy                           0.65       800
   macro avg       0.65      0.65      0.65       800
weighted avg       0.65      0.65      0.65       800

LogisticRegression(C=0.01, l1_ratio=0.1, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

              precision    recall  f1-score   support

           0       0.68      0.67      0.67       424
           1       0.63      0.64      0.64       376

    accuracy                           0.66       800
   macro avg       0.66      0.66      0.66       800
weighted avg       0.66      0.66      0.66       800

LogisticRegression(C=0.001, l1_ratio=0, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

              precision    recall  f1-score   support

           0       0.64      0.64      0.64       402
           1       0.64      0.64      0.64       398

    accuracy                           0.64       800
   macro avg       0.64      0.64      0.64       800
weighted avg       0.64      0.64      0.64       800

LogisticRegression(C=0.01, l1_ratio=0.3, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

              precision    recall  f1-score   support

           0       0.63      0.65      0.64       393
           1       0.65      0.63      0.64       407

    accuracy                           0.64       800
   macro avg       0.64      0.64      0.64       800
weighted avg       0.64      0.64      0.64       800

LogisticRegression(C=0.01, l1_ratio=0.4, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

              precision    recall  f1-score   support

           0       0.65      0.63      0.64       407
           1       0.63      0.65      0.64       393

    accuracy                           0.64       800
   macro avg       0.64      0.64      0.64       800
weighted avg       0.64      0.64      0.64       800

LogisticRegression(C=0.01, l1_ratio=0.1, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

              precision    recall  f1-score   support

           0       0.68      0.62      0.65       436
           1       0.59      0.65      0.62       364

    accuracy                           0.64       800
   macro avg       0.64      0.64      0.64       800
weighted avg       0.64      0.64      0.64       800

LogisticRegression(C=0.1, l1_ratio=0, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

              precision    recall  f1-score   support

           0       0.66      0.64      0.65       410
           1       0.63      0.65      0.64       390

    accuracy                           0.64       800
   macro avg       0.64      0.64      0.64       800
weighted avg       0.64      0.64      0.64       800

LogisticRegression(l1_ratio=0, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

              precision    recall  f1-score   support

           0       0.66      0.64      0.65       409
           1       0.63      0.65      0.64       391

    accuracy                           0.65       800
   macro avg       0.65      0.65      0.65       800
weighted avg       0.65      0.65      0.65       800

LogisticRegression(C=0.1, l1_ratio=0.2, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

              precision    recall  f1-score   support

           0       0.62      0.64      0.63       368
           1       0.68      0.67      0.68       432

    accuracy                           0.66       800
   macro avg       0.65      0.65      0.65       800
weighted avg       0.66      0.66      0.66       800

LogisticRegression(C=10.0, l1_ratio=0.3, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

              precision    recall  f1-score   support

           0       0.65      0.66      0.66       403
           1       0.65      0.64      0.64       397

    accuracy                           0.65       800
   macro avg       0.65      0.65      0.65       800
weighted avg       0.65      0.65      0.65       800

LogisticRegression(C=0.1, l1_ratio=0.5, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

              precision    recall  f1-score   support

           0       0.65      0.61      0.63       399
           1       0.63      0.68      0.66       401

    accuracy                           0.64       800
   macro avg       0.64      0.64      0.64       800
weighted avg       0.64      0.64      0.64       800

LogisticRegression(C=0.01, l1_ratio=0.9, max_iter=100000, penalty='elasticnet',
                   solver='saga')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_pos'] = grid.predict(X_pred_pca) # Add prediction
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['pred_prob'] = pos_class  # Add prediction prob
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_of_log_pred_tables[c]['tp'] = np.vectorize

In [109]:
grid.classes_

array([0, 1])

In [61]:
c_report_df = pd.DataFrame(component_report)

c_report_df

Unnamed: 0,components,recall,precision,accuracy
0,2,0.422535,0.638298,0.590106
1,3,0.460993,0.575221,0.561837
2,4,0.520548,0.603175,0.575972
3,5,0.503311,0.66087,0.598592
4,6,0.467626,0.560345,0.558304
5,7,0.443609,0.59,0.59507
6,8,0.489362,0.560976,0.556338
7,9,0.48951,0.56,0.549296
8,10,0.567164,0.539007,0.566901
9,11,0.585714,0.5,0.5053


In [62]:
dict_of_pred_tables[5]

Unnamed: 0,game_id,school,opp_team_id,game_result,pred_pos,tp,accuracy
0,226646,Air Force,San Jose State,0,1,0,0
0,313825,Akron,Kent State,0,0,0,1
0,410568,Grambling,Alabama A&M,1,1,1,1
0,428446,Southern,Alabama A&M,0,1,0,0
0,51776,Auburn,Alabama,0,0,0,1
...,...,...,...,...,...,...,...
0,32132268,UC-Irvine,UC-Riverside,1,0,0,0
0,32735846,Wofford,UNC Greensboro,1,0,0,0
0,32933376,UT Arlington,Utah Valley,0,0,0,1
0,33435268,Western Kentucky,UTEP,1,0,0,0


In [63]:
dict_of_log_pred_tables[5]

Unnamed: 0,game_id,school,opp_team_id,game_result,pred_pos,pred_prob,tp,accuracy
0,226646,Air Force,San Jose State,0,1,0.873962,0,0
0,31955,Akron,Ball State,1,1,0.632055,1,1
0,313825,Kent State,Akron,1,1,0.876769,1,1
0,410568,Alabama A&M,Grambling,0,0,0.131703,0,1
0,428446,Alabama A&M,Southern,1,0,0.236819,0,0
...,...,...,...,...,...,...,...,...
0,32132268,UC-Irvine,UC-Riverside,1,1,0.674466,1,1
0,32735846,Wofford,UNC Greensboro,1,1,0.502435,1,1
0,32933376,UT Arlington,Utah Valley,0,0,0.411495,0,1
0,33435268,UTEP,Western Kentucky,0,0,0.240118,0,1


In [70]:
best_models_df = pd.merge(dict_of_pred_tables[5], dict_of_log_pred_tables[5].drop(['school', 'opp_team_id', 'game_result', 'tp', 'accuracy', 'pred_pos'], axis=1), on='game_id')

In [71]:
best_models_df

Unnamed: 0,game_id,school,opp_team_id,game_result,pred_pos,tp,accuracy,pred_prob
0,226646,Air Force,San Jose State,0,1,0,0,0.873962
1,313825,Akron,Kent State,0,0,0,1,0.876769
2,410568,Grambling,Alabama A&M,1,1,1,1,0.131703
3,428446,Southern,Alabama A&M,0,1,0,0,0.236819
4,51776,Auburn,Alabama,0,0,0,1,0.831950
...,...,...,...,...,...,...,...,...
278,32132268,UC-Irvine,UC-Riverside,1,0,0,0,0.674466
279,32735846,Wofford,UNC Greensboro,1,0,0,0,0.502435
280,32933376,UT Arlington,Utah Valley,0,0,0,1,0.411495
281,33435268,Western Kentucky,UTEP,1,0,0,0,0.240118


In [98]:
best_models_df['return'] = best_models_df['accuracy'].apply(lambda x: 10 if x == 1 else -100)


-5860

In [86]:
np.linspace(0.5, 1.0, 51)

array([0.5 , 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 ,
       0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71,
       0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82,
       0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93,
       0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.  ])

In [104]:
calibration_threshold = {'thresh': [], 'accuracy': [], 'fp_rate': [], 'precision': [], 'games': [], 'return': []}
for thresh in np.linspace(0.5, 1.0, 51): 
    best_models_df['pos_neg_thresh'] = best_models_df['pred_prob'].apply(lambda x: 1 if (x > thresh) | (x <= 1-thresh) else 0)
    best_models_df['return'] = best_models_df['accuracy'].apply(lambda x: 50 if x == 1 else -100) # Figure out how to attach actual returns
    subset_df = best_models_df[best_models_df['pos_neg_thresh'] == 1]
    fp_rate = 1 - (subset_df['tp'].sum() / subset_df['pred_pos'].sum())
    calibration_threshold['thresh'].append(thresh)
    calibration_threshold['accuracy'].append(subset_df['accuracy'].mean())
    calibration_threshold['fp_rate'].append(fp_rate)
    calibration_threshold['precision'].append((subset_df['tp'].sum() / subset_df['pred_pos'].sum()))
    calibration_threshold['games'].append(subset_df['pos_neg_thresh'].sum())
    calibration_threshold['return'].append(subset_df['return'].sum())


thresh_cal_df = pd.DataFrame(calibration_threshold)


  fp_rate = 1 - (subset_df['tp'].sum() / subset_df['pred_pos'].sum())
  calibration_threshold['precision'].append((subset_df['tp'].sum() / subset_df['pred_pos'].sum()))


In [107]:
thresh_cal_df['ROI'] = thresh_cal_df['return'] / (thresh_cal_df['games']*100)

In [108]:
thresh_cal_df

Unnamed: 0,thresh,accuracy,fp_rate,precision,games,return,ROI
0,0.5,0.720848,0.273973,0.726027,283,2300,0.081272
1,0.51,0.725,0.273973,0.726027,280,2450,0.0875
2,0.52,0.723636,0.272727,0.727273,275,2350,0.085455
3,0.53,0.726937,0.269504,0.730496,271,2450,0.090406
4,0.54,0.727612,0.271429,0.728571,268,2450,0.091418
5,0.55,0.730038,0.266667,0.733333,263,2500,0.095057
6,0.56,0.730769,0.268657,0.731343,260,2500,0.096154
7,0.57,0.73622,0.267176,0.732824,254,2650,0.104331
8,0.58,0.732,0.271318,0.728682,250,2450,0.098
9,0.59,0.72428,0.275591,0.724409,243,2100,0.08642


In [None]:
# # Split df at same date as before
# df_train = split_data_to_test(df, date='2023-02-28', type='train')


In [15]:
# # Normalization processor
# # Create a pipeline for numerical features
# num_transformer2 = Pipeline(steps=[('scaler', MinMaxScaler(feature_range=(0,1)))])

# # Create a column transformer to apply different transformations to different columns
# preprocessor2 = ColumnTransformer(
#     transformers=[
#             ('num', num_transformer2, ['pace', 'ftr','3par', 'ts_perc', 
#                                   'trb_perc', 'ast_perc', 'stl_perc', 'blk_perc',
#                                 'efg_perc', 'tov_perc', 'orb_perc', 'ft_fga', 'srs',
#                                 'wins', 'losses', 'streak'])
#         ], 
#     remainder='passthrough'
#     )


In [16]:
# # fit a nearest neighbors model on all training games    
# nn_df = df_train[['pace', 'ftr', '3par', 'ts_perc', 'trb_perc', 'ast_perc', 'stl_perc', 'blk_perc', 
#                  'efg_perc', 'tov_perc', 'orb_perc', 'ft_fga', 'srs', 'wins', 'losses', 'streak']] # Subset df to wanted features

# # Use pipeline to scale numerical data Use sklearn.preprocessing MinMaxScaler
# # scaler = MinMaxScaler(feature_range=(0,1))
# nn_df = preprocessor2.fit_transform(nn_df)

# # Fit NearestNeighbors with k neighbors (k number of similar matchups

# knn = NearestNeighbors(n_neighbors=10) # 10 most similar neighbors 

# knn.fit(nn_df) # Fit matchup stats  


In [17]:
# season_stats_dict = {}
# for name in df_train['school'].unique():
#     dataframe = df_train[df_train['school'] == name]
#     dataframe = dataframe[['pace', 'ftr', '3par', 'ts_perc', 'trb_perc', 'ast_perc', 'stl_perc', 'blk_perc', 
#                  'efg_perc', 'tov_perc', 'orb_perc', 'ft_fga', 'srs', 'wins', 'losses', 'streak']]
#     dataframe = preprocessor2.transform(dataframe)
#     season_stats_dict[name] = dataframe
    

In [57]:

# # Function to create matchup format df
# def sim_convert_to_matchups(df):
#     # Turn individual team data into matchup rows based on game_id

#     opponent_features = ['pace', 'ftr','3par', 'ts_perc', 'trb_perc', 
#                      'ast_perc', 'stl_perc', 'blk_perc','efg_perc', 
#                      'tov_perc', 'orb_perc', 'ft_fga', 'srs', 'wins', 
#                      'losses', 'streak', 'game_id',]

#     new_opp_feat_names_dict = {'pace': 'opp_pace',
#                            'ftr': 'opp_ftr', 
#                            '3par': 'opp_3par',
#                            'ts_perc': 'opp_ts_perc',
#                            'trb_perc': 'opp_trb_perc',
#                            'ast_perc': 'opp_ast_perc',
#                            'stl_perc': 'opp_stl_perc',
#                            'blk_perc': 'opp_blk_perc',
#                            'efg_perc': 'opp_efg_perc',
#                            'tov_perc': 'opp_tov_perc',
#                            'orb_perc': 'opp_orb_perc', 
#                            'tov_perc': 'opp_tov_perc',
#                            'orb_perc': 'opp_orb_perc',
#                            'ft_fga': 'opp_ft_fga',
#                            'srs': 'opp_srs', 
#                            'wins': 'opp_wins', 
#                            'losses': 'opp_losses', 
#                            'streak': 'opp_streak',}
    
#     list_of_matchups = []
#     for id, school in zip(df['game_id'], df['school']):
#         team_row = df[(df['game_id'] == id) & (df['school'] == school)]
#         # transform team row to normal
#         opp_row = df[opponent_features][(df['game_id'] == id) & (df['opp_team_id'] != school)] # Gets opponent features 
#         # transform team row to normal
#         opp_row = opp_row.rename(columns=new_opp_feat_names_dict) # new column names
#         new_row = pd.merge(team_row, opp_row, on='game_id', how='inner') # merge team and opponent data
#         list_of_matchups.append(new_row) # add row to matchup list

#     match_up_df = pd.concat(list_of_matchups)

#     return match_up_df


# Function that bootstrap samples similar matchups n times 
# and generates a predicted probability of team_a winning
# def simulate_and_predict(df, model, n: int):
#     n_simulations = n
#     simulated_outcomes = []

#     for i in range(n_simulations):
#         random_row = np.random.choice(df.shape[0])
#         matchup = df[random_row]
#         outcome = model.predict(matchup.reshape(1, -1))
#         simulated_outcomes.append(outcome)

#     team_win_prob = np.mean(simulated_outcomes)

#     return team_win_prob

In [55]:
# # Create table of actual outcomes to test 
# actual_outcomes = df_match_pred[['game_id', 'school', 'opp_team_id', 'game_result']]
# win_prob = []
# for a, b in zip(actual_outcomes['school'], actual_outcomes['opp_team_id']):
#     # Get similar teams and games for each team 
#     distances_a, indices_a = knn.kneighbors(season_stats_dict[a]) # get distances and indices for team a
#     distances_b, indices_b = knn.kneighbors(season_stats_dict[b]) 

#     similar_teams = [] # similar teams to team_a
#     for i, indx in enumerate(indices_a): 
#         similar_teams.extend(list(indx[1:11]))

#     similar_opps = []
#     for i, indx in enumerate(indices_b): 
#         similar_opps.extend(list(indx[1:11]))

#     # Game ids that involve both teams
#     similar_game_ids = [x for x in list(df_train['game_id'].iloc[similar_teams]) if x in list(df_train['game_id'].iloc[similar_opps])]

#     similar_school_df = df_train.iloc[similar_teams] # df subset by teams similar to team_a
#     # subset of teams similar that have game_ids in similar_game_ids
#     similar_df = similar_school_df[similar_school_df['game_id'].isin(similar_game_ids)]
#     # drop duplicate games if they exist
#     similar_df = similar_df.drop_duplicates()

#     # Convert similar df to matchups 

#     # Turn individual team data into matchup rows based on game_id
#     opponent_features = ['pace', 'ftr','3par', 'ts_perc', 'trb_perc', 
#                      'ast_perc', 'stl_perc', 'blk_perc','efg_perc', 
#                      'tov_perc', 'orb_perc', 'ft_fga', 'srs', 'wins', 
#                      'losses', 'streak','game_id']

#     new_opp_feat_names_dict = {
#                            'ftr': 'opp_ftr', 
#                            '3par': 'opp_3par',
#                            'ts_perc': 'opp_ts_perc',
#                            'trb_perc': 'opp_trb_perc',
#                            'ast_perc': 'opp_ast_perc',
#                            'stl_perc': 'opp_stl_perc',
#                            'blk_perc': 'opp_blk_perc',
#                            'efg_perc': 'opp_efg_perc',
#                            'tov_perc': 'opp_tov_perc',
#                            'orb_perc': 'opp_orb_perc', 
#                            'tov_perc': 'opp_tov_perc',
#                            'orb_perc': 'opp_orb_perc',
#                            'ft_fga': 'opp_ft_fga',
#                            'srs': 'opp_srs', 
#                            'wins': 'opp_wins', 
#                            'losses': 'opp_losses', 
#                            'streak': 'opp_streak',}
    
#     list_of_matchups = []
#     for id, school in zip(similar_df['game_id'], similar_df['school']):
#         team_row = df_train[(df_train['game_id'] == id) & (df_train['school'] == school)]
#         # transform team row to normal
#         team_row = team_row.drop(['game', 'date', 'game_result', 'school', 'opp_team_id'], axis=1)

#         opp_row = df_train[opponent_features][(df_train['game_id'] == id) & (df_train['school'] != school)] # Gets opponent features 
#         # Rename rows 
#         opp_row = opp_row.rename(columns=new_opp_feat_names_dict)
#         # drop redundant feature
#         opp_row = opp_row.drop('pace', axis=1) 

#         # merge team and opp
#         team_opp_com = pd.merge(team_row, opp_row, on='game_id', how='inner')

#         # add row to list 
#         list_of_matchups.append(team_opp_com)
    
#     # Concatenate list into df rows
#     similar_matchups_df = pd.concat(list_of_matchups)

#     # Scale the data 
#     similar_matchups_df = preprocessor.transform(similar_matchups_df)

#     # Simulate and predict
#     win_prob.append(simulate_and_predict(similar_matchups_df, model=grid, n=1000))    


# # Add win probs to actual outcomes 
# actual_outcomes['pred_prob'] = win_prob

In [72]:
# win_prob

In [73]:
# actual_outcomes['pred_pos'] = actual_outcomes['pred_prob'].apply(lambda x: 1 if x > 0.5 else 0)

In [74]:
# actual_outcomes['accuracy'] = np.vectorize(lambda x, y: 1 if x == y else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])

In [75]:
# actual_outcomes['accuracy'].mean()

In [76]:
# actual_outcomes

In [77]:
# actual_outcomes['tp'] = np.vectorize(lambda x, y: 1 if x == 1 & y == 1 else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])

In [78]:
# actual_outcomes['tp'].sum() / actual_outcomes['pred_pos'].sum()

In [79]:
# calibration_threshold = {'thresh': [], 'accuracy': [], 'fp_rate': [], 'precision': [], 'games': []}
# for thresh in np.linspace(0, 1.0, 101): 
#     actual_outcomes['pred_pos'] = actual_outcomes['pred_prob'].apply(lambda x: 1 if x >= thresh else 0)
#     actual_outcomes['pred_neg'] = actual_outcomes['pred_prob'].apply(lambda x: 1 if x < thresh else 0)
#     actual_outcomes['tp'] = np.vectorize(lambda x, y: 1 if x == 1 & y == 1 else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])
#     actual_outcomes['accuracy'] = np.vectorize(lambda x, y: 1 if x == y else 0)(actual_outcomes['game_result'], actual_outcomes['pred_pos'])
#     fp_rate = 1 - (actual_outcomes['tp'].sum() / actual_outcomes['pred_pos'].sum())
#     calibration_threshold['thresh'].append(thresh)
#     calibration_threshold['accuracy'].append(actual_outcomes['accuracy'].mean())
#     calibration_threshold['fp_rate'].append(fp_rate)
#     calibration_threshold['precision'].append((actual_outcomes['tp'].sum() / actual_outcomes['pred_pos'].sum()))
#     calibration_threshold['games'].append(actual_outcomes['pred_pos'].sum())


# thresh_cal_df = pd.DataFrame(calibration_threshold)


In [80]:
# thresh_cal_df['accuracy'].argmax()

In [81]:
# thresh_cal_df['precision'].argmax()

In [82]:
# thresh_cal_df.iloc[51]