In [1]:
### Setup Environment ###
import numpy as np
import pandas as pd
import plotly_express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import chi2_contingency
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from itertools import combinations

np.set_printoptions(suppress=True, formatter={'float_kind':'{:0.4f}'.format})
pd.set_option('display.float_format', lambda x: '%.4f' % x)
pd.set_option('display.max_rows', None)

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectKBest, SelectFromModel, SequentialFeatureSelector
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('womens_matchups.csv')
df = df[df['round'].isin(['Elite Eight', 'Final Four', 'Championship'])]
print(df.shape)
df.head()

(28, 27)


Unnamed: 0,year,region,round,high_bracket_seed,high_team_id,low_bracket_seed,low_team_id,win,adj_oe,adj_de,...,ftr,ftrd,2p%,2p%d,3p%,3p%d,3pr,3prd,adj_tempo,wab
3,2021,East,Elite Eight,1,2021 Stanford,2,2021 Louisville,1,3.8,13.3,...,-8.9,3.1,10.4,-16.6,9.5,-9.1,6.8,1.8,0.6,5.0
6,2021,Final Four,Final Four,1,2021 Stanford,1,2021 South Carolina,1,-4.5,17.2,...,-1.4,-7.9,12.6,-10.3,9.9,-7.6,12.2,15.2,-1.5,2.3
7,2021,Final Four,Championship,1,2021 Stanford,3,2021 Arizona,1,-5.0,23.9,...,-6.6,-1.4,13.3,-8.4,4.9,-5.8,5.7,5.1,0.4,6.8
8,2021,South,Elite Eight,4,2021 Indiana,3,2021 Arizona,0,-18.9,18.0,...,4.0,-2.2,12.1,-4.1,-2.5,-1.1,-5.8,-3.2,1.0,0.0
22,2021,West,Elite Eight,1,2021 South Carolina,6,2021 Texas,1,-9.0,21.7,...,0.3,-7.0,0.0,-4.0,1.3,-5.1,-5.9,-2.0,2.9,6.5


In [2]:
df['win'].value_counts(normalize=True)

1   0.6786
0   0.3214
Name: win, dtype: float64

In [3]:
df.columns

Index(['year', 'region', 'round', 'high_bracket_seed', 'high_team_id',
       'low_bracket_seed', 'low_team_id', 'win', 'adj_oe', 'adj_de', 'barthag',
       'efg%', 'efgd%', 'tor', 'tord', 'orb%', 'drb%', 'ftr', 'ftrd', '2p%',
       '2p%d', '3p%', '3p%d', '3pr', '3prd', 'adj_tempo', 'wab'],
      dtype='object')

In [4]:
eda_df = df[[
    'win',
    'adj_oe', 
    'adj_de', 
    'barthag',
    'efg%', 
    'efgd%', 
    'tor', 
    'tord', 
    'orb%', 
    'drb%', 
    'ftr', 
    'ftrd',
    '2p%', 
    '2p%d', 
    '3p%', 
    '3p%d', 
    '3pr', 
    '3prd', 
    'adj_tempo', 
    'wab']]

### I. Numerical Tests

In [7]:
corr_matrix = eda_df.corr()
correlation_with_target = round(abs(corr_matrix['win'])*100,2)
correlation_with_target = correlation_with_target[correlation_with_target >= 25]
print(correlation_with_target)

win       100.0000
adj_de     38.4000
barthag    49.0000
3p%d       30.9900
3pr        25.3200
wab        33.5300
Name: win, dtype: float64


In [9]:
X = eda_df.drop('win', axis=1)
y = eda_df['win']

# ANOVA
f_values, p_values = f_classif(X, y)
# Mutual Information
mi = mutual_info_regression(X, y)

column_names = np.array(X.columns)
stats_df = pd.DataFrame({
    'feature': column_names,
    'f_values': f_values,
    'p_values': p_values,
    'mutual_info': mi
})
stats_df

Unnamed: 0,feature,f_values,p_values,mutual_info
0,adj_oe,0.8869,0.355,0.0
1,adj_de,4.4964,0.0437,0.0325
2,barthag,8.2153,0.0081,0.1007
3,efg%,0.0101,0.9208,0.0
4,efgd%,0.6608,0.4237,0.1097
5,tor,0.1993,0.659,0.0336
6,tord,0.7474,0.3952,0.0
7,orb%,0.5696,0.4572,0.0
8,drb%,0.1705,0.683,0.0
9,ftr,1.5318,0.2269,0.0977


In [10]:
stats_df[stats_df['mutual_info'] > .05]

Unnamed: 0,feature,f_values,p_values,mutual_info
2,barthag,8.2153,0.0081,0.1007
4,efgd%,0.6608,0.4237,0.1097
9,ftr,1.5318,0.2269,0.0977
12,2p%d,0.1627,0.6899,0.1024


In [11]:
stats_df[stats_df['p_values'] < .05]

Unnamed: 0,feature,f_values,p_values,mutual_info
1,adj_de,4.4964,0.0437,0.0325
2,barthag,8.2153,0.0081,0.1007


In [12]:
stats_df[stats_df['f_values'] > 5]

Unnamed: 0,feature,f_values,p_values,mutual_info
2,barthag,8.2153,0.0081,0.1007


### II. Wrapper Methods

In [13]:
X_train = eda_df.drop('win', axis=1)
y_train = eda_df['win']

X_test = eda_df.drop('win', axis=1)
y_test = eda_df['win']

float64_columns = X_train.select_dtypes(include=['float64']).columns

scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_train_scaled[float64_columns] = scaler.fit_transform(X_train[float64_columns])

X_test_scaled = X_test.copy()
X_test_scaled[float64_columns] = scaler.transform(X_test[float64_columns])

In [14]:
sel_ = SelectFromModel(
    LogisticRegression(C=0.5, penalty='l1', solver='liblinear', random_state=10))

sel_.fit(X_train_scaled, y_train)

In [15]:
selected_feat = X_train.columns[(sel_.get_support())]

print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(sel_.estimator_.coef_ == 0)))

total features: 19
selected features: 4
features with coefficients shrank to zero: 15


In [16]:
# Transform datasets based on selected features
X_train_selected = sel_.transform(X_train_scaled)
X_test_selected = sel_.transform(X_test_scaled)

# Train a Lasso regression model
lasso = Lasso(alpha=0.1, random_state=42)
lasso.fit(X_train_selected, y_train)

# Make predictions
y_train_pred = lasso.predict(X_train_selected)
y_test_pred = lasso.predict(X_test_selected)

# Convert predictions to binary using a threshold
threshold = 0.5
y_train_pred_binary = (y_train_pred >= threshold).astype(int)
y_test_pred_binary = (y_test_pred >= threshold).astype(int)

# Evaluate F1 scores
f1_train = f1_score(y_train, y_train_pred_binary)
f1_test = f1_score(y_test, y_test_pred_binary)

print(f"Lasso Regression F1 Score (Train): {f1_train:.2f}")
print(f"Lasso Regression F1 Score (Test): {f1_test:.2f}")

Lasso Regression F1 Score (Train): 0.84
Lasso Regression F1 Score (Test): 0.84


In [17]:
selected_feat

Index(['adj_de', 'barthag', '3p%d', '3pr'], dtype='object')