In [1]:
### Setup Environment ###
import numpy as np
import pandas as pd
import plotly_express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import chi2_contingency
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from itertools import combinations

np.set_printoptions(suppress=True, formatter={'float_kind':'{:0.4f}'.format})
pd.set_option('display.float_format', lambda x: '%.4f' % x)
pd.set_option('display.max_rows', None)

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectKBest, SelectFromModel, SequentialFeatureSelector
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('matchups_portal.csv')
df = df[df['round'].isin(['Final Four', 'Championship'])]
print(df.shape)
df.head()

(12, 87)


Unnamed: 0,year,region,round,high_bracket_seed,high_bracket_team,low_bracket_seed,low_bracket_team,win,wab,prpg!,...,D_B,D_C,D_D,D_F,F_S,F_A,F_B,F_C,F_D,F_F
4,2021,Final Four,Final Four,1,2021 Gonzaga,11,2021 UCLA,1,5.8598,6.8,...,0,0,0,0,0,0,0,0,0,0
7,2021,Final Four,Championship,1,2021 Gonzaga,1,2021 Baylor,0,1.8167,6.6,...,0,0,0,0,0,0,0,0,0,0
34,2021,Final Four,Final Four,1,2021 Baylor,2,2021 Houston,1,2.6364,0.7,...,0,0,0,0,0,0,0,0,0,0
76,2022,Final Four,Final Four,2,2022 Duke,8,2022 North Carolina,0,2.9282,1.6,...,0,0,0,0,0,0,0,0,0,0
86,2022,Final Four,Championship,8,2022 North Carolina,1,2022 Kansas,0,-6.1487,-0.2,...,0,0,0,0,0,0,0,0,0,0


In [2]:
eda_df = df[[
    'win', 
    'wab', 
    'prpg!', 
    'dprpg',
    'bpm', 
    'obpm', 
    'dbpm', 
    'kenpom_adjem',
    'kenpom_adjoe', 
    'kenpom_adjde', 
    'barthag', 
    'barthag_oe', 
    'barthag_de',
    'off_eff', 
    'def_eff', 
    'efg%',
    'ftr', 
    'tor', 
    'orb%', 
    'efgd%', 
    'ftrd',
    'tord', 
    'drb%', 
    '2p%', 
    '3p%', 
    'blked%', 
    'ast%', 
    '3pr', 
    'ft%', 
    '2p%d',
    '3p%d', 
    'blk%', 
    'ast%d', 
    '3prd', 
    'ft%d', 
    'height', 
    'size', 
    'experience',
    'bench']]

In [3]:
corr_matrix = eda_df.corr()
correlation_with_target = round(abs(corr_matrix['win'])*100,2)
correlation_with_target = correlation_with_target[correlation_with_target >= 25]
print(correlation_with_target)

win            100.0000
wab             59.5500
prpg!           35.8000
dprpg           63.1600
bpm             59.4600
obpm            40.4200
dbpm            48.0400
kenpom_adjem    57.0000
kenpom_adjoe    51.7600
kenpom_adjde    27.6500
barthag         54.2800
barthag_oe      51.1500
off_eff         38.9100
efg%            27.3800
orb%            38.7600
efgd%           47.4300
ftrd            42.0300
3p%             33.8400
ast%            31.1000
3pr             41.5300
ft%             28.9700
2p%d            46.3600
blk%            27.9800
ast%d           47.5000
size            25.4100
Name: win, dtype: float64


In [4]:
X = eda_df.drop('win', axis=1)
y = eda_df['win']

# ANOVA
f_values, p_values = f_classif(X, y)
# Mutual Information
mi = mutual_info_regression(X, y)

column_names = np.array(X.columns)
stats_df = pd.DataFrame({
    'feature': column_names,
    'f_values': f_values,
    'p_values': p_values,
    'mutual_info': mi
})
stats_df

Unnamed: 0,feature,f_values,p_values,mutual_info
0,wab,5.4958,0.041,0.0919
1,prpg!,1.4703,0.2532,0.0579
2,dprpg,6.636,0.0276,0.1978
3,bpm,5.4692,0.0414,0.1184
4,obpm,1.9533,0.1925,0.1912
5,dbpm,2.9999,0.1139,0.0702
6,kenpom_adjem,4.8139,0.053,0.1782
7,kenpom_adjoe,3.6596,0.0848,0.0743
8,kenpom_adjde,0.8276,0.3844,0.0
9,barthag,4.1772,0.0682,0.1483


In [5]:
X_train = eda_df.drop('win', axis=1)
y_train = eda_df['win']

X_test = eda_df.drop('win', axis=1)
y_test = eda_df['win']

float64_columns = X_train.select_dtypes(include=['float64']).columns

scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_train_scaled[float64_columns] = scaler.fit_transform(X_train[float64_columns])

X_test_scaled = X_test.copy()
X_test_scaled[float64_columns] = scaler.transform(X_test[float64_columns])

In [6]:
sel_ = SelectFromModel(
    LogisticRegression(C=0.5, penalty='l1', solver='liblinear', random_state=10))

sel_.fit(X_train_scaled, y_train)

In [7]:
selected_feat = X_train.columns[(sel_.get_support())]

print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(sel_.estimator_.coef_ == 0)))

total features: 38
selected features: 4
features with coefficients shrank to zero: 34


In [8]:
# Transform datasets based on selected features
X_train_selected = sel_.transform(X_train_scaled)
X_test_selected = sel_.transform(X_test_scaled)

# Train a Lasso regression model
lasso = Lasso(alpha=0.1, random_state=42)
lasso.fit(X_train_selected, y_train)

# Make predictions
y_train_pred = lasso.predict(X_train_selected)
y_test_pred = lasso.predict(X_test_selected)

# Convert predictions to binary using a threshold
threshold = 0.5
y_train_pred_binary = (y_train_pred >= threshold).astype(int)
y_test_pred_binary = (y_test_pred >= threshold).astype(int)

# Evaluate F1 scores
f1_train = f1_score(y_train, y_train_pred_binary)
f1_test = f1_score(y_test, y_test_pred_binary)

print(f"Lasso Regression F1 Score (Train): {f1_train:.2f}")
print(f"Lasso Regression F1 Score (Test): {f1_test:.2f}")

Lasso Regression F1 Score (Train): 0.89
Lasso Regression F1 Score (Test): 0.89
