In [276]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
np.set_printoptions(precision=2)

In [277]:
folder_path = './team_adv_stats_avg/'

train_df = pd.DataFrame()

team_dfs = {}
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)

    df = pd.read_csv(file_path)

    df.loc[:, ~df.columns.str.startswith('Unnamed')]

    

    team_dfs[filename.split('.')[0]] = df

In [278]:

for key in team_dfs:
    for index, row in team_dfs[key].iterrows():
        date = row['Date']
        opp = row['Opp']            

        # Get the opponents stats based on the opponent's name
        try:
            opp = opp.lower().replace(' ', '-').replace('&', '').replace('.', '')
            opp_stats = team_dfs[opp]
        except Exception as e:
            continue

        # Get the opponent's stats for the specified date
        opp_stats = opp_stats[opp_stats['Date'] == date]

        # Check if opp_stats is not empty before accessing the first row
        if not opp_stats.empty:
            opp_stats = opp_stats.iloc[0]
        else:
            continue

        # Get Purdue's stats for the current date
        team_stats = row

        # Concatenate the stats into the same row for Purdue and the opponent
        combined_stats = pd.concat([team_stats, opp_stats], axis=0, keys=['Team', 'Opp'])

        # Append the combined row to the purdue_train_df
        train_df = train_df.append(combined_stats, ignore_index=True)

    train_df.reset_index(drop=True, inplace=True)


train_df


  train_df = train_df.append(combined_stats, ignore_index=True)


Unnamed: 0_level_0,Team,Team,Team,Team,Team,Team,Team,Team,Team,Team,...,Opp,Opp,Opp,Opp,Opp,Opp,Opp,Opp,Opp,Opp
Unnamed: 0_level_1,Date,Opp,Tm_Score,ORtg,DRtg,Pace,FTr,3PAr,TS%,TRB%,...,STL%,BLK%,eFG%,TOV%,ORB%,FT/FGA,Opp_eFG%.1,Opp_TOV%,DRB%,Opp_FT/FGA
0,2022-11-27,Northern Arizona,92.0,95.66,96.66,73.96,0.2470,0.3392,0.5402,50.92,...,7.86,6.32,0.4808,15.20,31.12,0.1704,0.4720,17.66,74.12,0.3246
1,2022-12-06,Kansas State,64.0,114.82,91.34,70.80,0.2812,0.3640,0.5788,52.96,...,9.26,10.24,0.5114,18.02,29.58,0.2524,0.4988,19.16,74.56,0.2564
2,2022-12-10,Texas A&M-Commerce,83.0,115.02,93.78,70.02,0.2832,0.3688,0.5692,53.30,...,10.48,11.94,0.5148,13.52,32.04,0.1224,0.4568,18.26,74.56,0.3652
3,2022-12-17,Cal State Bakersfield,65.0,121.60,91.46,69.12,0.3104,0.3434,0.5994,54.24,...,9.26,14.18,0.4620,17.82,34.58,0.1788,0.4692,17.04,73.58,0.2722
4,2022-12-29,Stephen F. Austin,68.0,114.88,85.24,71.32,0.3136,0.3726,0.5868,54.82,...,13.24,3.50,0.6168,20.28,31.92,0.2800,0.4512,23.54,72.08,0.2394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8102,2023-02-21,Robert Morris,64.0,119.28,95.20,69.70,0.2474,0.3490,0.5872,54.20,...,11.02,15.82,0.4798,15.68,37.94,0.2026,0.4714,17.52,75.18,0.1916
8103,2023-02-25,IUPUI,93.0,112.86,100.66,71.42,0.2934,0.3334,0.5676,51.50,...,8.58,5.36,0.5268,21.44,31.54,0.2410,0.5592,11.44,63.76,0.3446
8104,2023-03-02,Detroit Mercy,71.0,116.62,102.96,71.32,0.3024,0.3426,0.5808,50.54,...,10.64,7.16,0.5274,14.30,40.26,0.2468,0.5440,16.28,76.90,0.2330
8105,2023-03-06,Northern Kentucky,63.0,115.00,99.14,71.08,0.3452,0.3560,0.5898,49.96,...,11.54,8.06,0.5790,16.28,29.32,0.1798,0.4808,13.00,77.08,0.2770


In [279]:
train_df['Total'] = train_df['Team']['Tm_Score'] + train_df['Opp']['Tm_Score']

In [280]:
X_train = train_df.copy()

X_train = X_train.dropna()

y_train = X_train['Total']

X_train.drop(['Total'], axis=1, inplace=True)

# Renaming columns
X_train.columns = [f'{team}_{stat}' if team != '' else stat for team, stat in X_train.columns]

X_train.drop(['Team_Tm_Score', 'Opp_Tm_Score', 'Team_Date', 'Opp_Date', 'Team_Opp', 'Opp_Opp'], axis=1, inplace=True)

In [281]:
print(str(len(X_train)) + " " + str(len(y_train)))

8064 8064


In [282]:

scaler = StandardScaler()
X_norm = scaler.fit_transform(X_train)

sgdr = SGDRegressor(max_iter=50000)
sgdr.fit(X_norm, y_train)

b_norm = sgdr.intercept_
w_norm = sgdr.coef_

print(f'W_norm = {w_norm}')
print(f'b_norm = {b_norm}')

# Print the w_norm values compared to the features they are associated with
for col in range(len(X_train.columns)):
    print(f'{X_train.columns[col]}: {w_norm[col]}')

# make a prediction using sgdr.predict()
y_pred_sgd = sgdr.predict(X_norm)


# find the mean squared error
mse = ((y_train - y_pred_sgd)**2).mean()

# find the max error
max_error = np.max(np.abs(y_train - y_pred_sgd))

# find the mean of the absolute errors
mae = np.abs(y_train - y_pred_sgd).mean()

print(f'MSE: {mse:.2f}, Max Error: {max_error:.2f}, Average error: {mae:.2f}')

W_norm = [ 1.63 -0.34  4.87 -0.69 -0.46  0.57 -0.57 -0.47 -0.53 -0.67 -0.61 -0.55
  0.2   0.53  1.24 -1.08 -0.19 -0.44  1.37  0.07  4.82 -0.86  0.22  0.74
 -0.45  0.19 -0.21 -0.47 -0.07 -0.7   0.38  0.45  0.44 -1.16 -0.49 -0.19]
b_norm = [141.18]
Team_ORtg: 1.6255737506100993
Team_DRtg: -0.3378888135975956
Team_Pace: 4.870780251177403
Team_FTr: -0.6891184969727471
Team_3PAr: -0.4603594466740023
Team_TS%: 0.5721475672992595
Team_TRB%: -0.5656496754632774
Team_AST%: -0.4677141478012276
Team_STL%: -0.5341166738324732
Team_BLK%: -0.6657181771369072
Team_eFG%: -0.6113612468448978
Team_TOV%: -0.5541842205521212
Team_ORB%: 0.20178383185879434
Team_FT/FGA: 0.5256341580341269
Team_Opp_eFG%.1: 1.2433316173369604
Team_Opp_TOV%: -1.07502191528348
Team_DRB%: -0.18568884184139064
Team_Opp_FT/FGA: -0.44081699010730024
Opp_ORtg: 1.3722520025000369
Opp_DRtg: 0.06829736333473685
Opp_Pace: 4.821160369811132
Opp_FTr: -0.8622343768128576
Opp_3PAr: 0.21817139975703947
Opp_TS%: 0.7388392756402798
Opp_TRB%: -

In [283]:
# Let's put our money where our mouth is

indiana_df = pd.read_csv('./tests/indiana_adv_2024.csv')
wisconsin_df = pd.read_csv('./tests/wisconsin_adv_2024.csv')

indiana_df = indiana_df.loc[:, ~indiana_df.columns.str.startswith('Unnamed')]

indiana_df = indiana_df.drop(['W/L', 'G', 'Date', 'Opp', 'Tm', 'Opp.1'], axis=1)
indiana_df = indiana_df.dropna()

avg_ind = indiana_df.copy()

for i in range(5, len(indiana_df)):
    # Calculate the mean of the previous 5 rows
    avg_ind.loc[i, indiana_df.columns[3:]] = indiana_df.loc[i-5:i-1, indiana_df.columns[1:]].mean(numeric_only=True)


# get last row
avg_ind = avg_ind.iloc[-1]


#do same for wisconsin
wisconsin_df = wisconsin_df.loc[:, ~wisconsin_df.columns.str.startswith('Unnamed')]
wisconsin_df = wisconsin_df.drop(['W/L', 'G', 'Date', 'Opp', 'Tm', 'Opp.1'], axis=1)
wisconsin_df = wisconsin_df.dropna()

avg_wis = wisconsin_df.copy()

for i in range(5, len(wisconsin_df)):
    # Calculate the mean of the previous 5 rows
    avg_wis.loc[i, wisconsin_df.columns[3:]] = wisconsin_df.loc[i-5:i-1, wisconsin_df.columns[1:]].mean(numeric_only=True)

# get last row
avg_wis = avg_wis.iloc[-1]

# Concatenate the stats into the same row for Purdue and the opponent
combined = pd.concat([avg_ind, avg_wis], axis=0, keys=['Team', 'Opp'])

test_df = pd.DataFrame()

# Append the combined row to the purdue_train_df
test_df = test_df.append(combined, ignore_index=True)

test_df.reset_index(drop=True, inplace=True)

column_rename_mapping = {'Date': 'Date', 'Opp': 'Opp', 'Tm': 'Tm_Score', 'Opp.1': 'Opp_Score', 'ORtg': 'ORtg', 'DRtg': 'DRtg', 'Pace': 'Pace', 'FTr': 'FTr', '3PAr': '3PAr',
       'TS%': 'TS%', 'TRB%': 'TRB%', 'AST%': 'AST%', 'STL%': 'STL%', 'BLK%': 'BLK%', 'eFG%': 'eFG%', 'TOV%': 'TOV%', 'ORB%': 'ORB%', 'FT/FGA': 'FT/FGA',
       'eFG%.1': 'Opp_eFG%.1', 'TOV%.1': 'Opp_TOV%', 'DRB%': 'DRB%', 'FT/FGA.1': 'Opp_FT/FGA'}
test_df.rename(columns=column_rename_mapping, inplace=True)

test_df.columns = [f'{team}_{stat}' if team != '' else stat for team, stat in test_df.columns]


test_norm = scaler.transform(test_df)

pred = sgdr.predict(test_norm)

print(f'Predicted: {pred[0]:.2f}')


Predicted: 147.64


  test_df = test_df.append(combined, ignore_index=True)
