In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from itertools import combinations

df = pd.read_csv('matchups_women_elite.csv')
print(df.shape)
df.head()

(105, 49)


Unnamed: 0,game_id,year,round,region,high_team_id,low_team_id,win,ppg,opp_ppg,net_ppg,...,3prd,%pts_ftd,%pts_2pd,%pts_3pd,astd%,tovd%,opp_ast_tov,opp_stl%,opp_blk%,opp_pf%
0,20173113,2017,Sweet 16,East,2017 Connecticut,2017 UCLA,1,30.9,-9.0,21.8,...,0.027,0.078,0.03,0.047,-0.08,0.078,-0.7,-0.044,-0.059,-0.003
1,20173114,2017,Sweet 16,East,2017 Maryland,2017 Oregon,0,35.1,-15.4,19.7,...,0.029,0.006,0.023,0.017,-0.088,0.035,-0.47,-0.015,-0.011,0.0
2,20174115,2017,Elite Eight,East,2017 Connecticut,2017 Oregon,1,32.9,-5.8,27.1,...,0.075,0.043,0.005,0.047,-0.128,0.041,-0.59,-0.009,-0.005,0.008
3,20173213,2017,Sweet 16,West,2017 South Carolina,2017 Quinnipiac,1,14.9,-5.3,9.8,...,-0.067,0.021,0.09,0.069,-0.176,0.013,-0.41,-0.061,-0.064,-0.005
4,20173214,2017,Sweet 16,West,2017 Florida St.,2017 Oregon St.,1,13.4,-3.9,9.4,...,-0.024,0.025,0.002,0.026,-0.176,0.055,-0.58,0.006,-0.044,0.051


In [2]:
df['win'].value_counts(normalize=True)

1    0.666667
0    0.333333
Name: win, dtype: float64

In [3]:
split_df = df.drop(['game_id', 'year', 'round', 'region', 'high_team_id', 'low_team_id'],axis=1)
print(split_df.shape)
split_df.head()

(105, 43)


Unnamed: 0,win,ppg,opp_ppg,net_ppg,off_rtg,def_rtg,net_rtg,pace,hhs_net_rtg,hhs_off_rtg,...,3prd,%pts_ftd,%pts_2pd,%pts_3pd,astd%,tovd%,opp_ast_tov,opp_stl%,opp_blk%,opp_pf%
0,1,30.9,-9.0,21.8,90.9,-60.1,30.9,-1.7,29.1,88.6,...,0.027,0.078,0.03,0.047,-0.08,0.078,-0.7,-0.044,-0.059,-0.003
1,0,35.1,-15.4,19.7,92.9,-67.5,25.4,5.9,17.1,85.9,...,0.029,0.006,0.023,0.017,-0.088,0.035,-0.47,-0.015,-0.011,0.0
2,1,32.9,-5.8,27.1,97.1,-59.7,37.4,1.1,35.2,93.6,...,0.075,0.043,0.005,0.047,-0.128,0.041,-0.59,-0.009,-0.005,0.008
3,1,14.9,-5.3,9.8,71.7,-57.5,14.2,-2.7,24.9,79.7,...,-0.067,0.021,0.09,0.069,-0.176,0.013,-0.41,-0.061,-0.064,-0.005
4,1,13.4,-3.9,9.4,70.7,-60.3,10.4,6.8,10.8,72.8,...,-0.024,0.025,0.002,0.026,-0.176,0.055,-0.58,0.006,-0.044,0.051


In [4]:
split_df.columns

Index(['win', 'ppg', 'opp_ppg', 'net_ppg', 'off_rtg', 'def_rtg', 'net_rtg',
       'pace', 'hhs_net_rtg', 'hhs_off_rtg', 'hhs_def_rtg', 'rpi', 'ppp',
       'ppsa', 'efg%', 'ftr', '3pr', '%pts_ft', '%pts_2p', '%pts_3p', 'orb%',
       'drb%', 'trb%', 'ast%', 'tov%', 'ast_tov', 'stl%', 'blk%', 'pf%',
       'pppd', 'opp_ppsa', 'efgd%', 'ftrd', '3prd', '%pts_ftd', '%pts_2pd',
       '%pts_3pd', 'astd%', 'tovd%', 'opp_ast_tov', 'opp_stl%', 'opp_blk%',
       'opp_pf%'],
      dtype='object')

In [5]:
X = df[['ppg', 'opp_ppg', 'net_ppg', 'off_rtg', 'def_rtg', 'net_rtg',
       'pace', 'hhs_net_rtg', 'hhs_off_rtg', 'hhs_def_rtg', 'rpi', 'ppp',
       'ppsa', 'efg%', 'ftr', '3pr', '%pts_ft', '%pts_2p', '%pts_3p', 'orb%',
       'drb%', 'trb%', 'ast%', 'tov%', 'ast_tov', 'stl%', 'blk%', 'pf%',
       'pppd', 'opp_ppsa', 'efgd%', 'ftrd', '3prd', '%pts_ftd', '%pts_2pd',
       '%pts_3pd', 'astd%', 'tovd%', 'opp_ast_tov', 'opp_stl%', 'opp_blk%',
       'opp_pf%']]
Y = df['win']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

print(train_df.shape)
print(test_df.shape)

(73, 43)
(32, 43)


## I. Filter Methods

### A. Box Plots

### B. Correlation

In [6]:
corr_matrix = train_df.corr()
correlation_with_target = round(abs(corr_matrix['win'])*100,2)
correlation_with_target = correlation_with_target[correlation_with_target >= 5]
print(correlation_with_target)

ppg             13.58
opp_ppg         19.86
net_ppg         29.22
off_rtg         10.24
def_rtg         30.49
net_rtg         29.21
pace             6.26
hhs_net_rtg     40.44
hhs_off_rtg     24.76
hhs_def_rtg     37.41
rpi             50.34
ppp             12.16
ppsa            17.08
ftr             14.53
3pr              5.69
orb%            14.93
drb%            22.63
trb%             8.74
ast%            12.86
tov%            15.92
ast_tov         20.08
stl%            11.85
blk%            15.53
pf%             19.91
pppd            34.37
opp_ppsa        33.48
efgd%            6.58
ftrd             9.25
3prd            10.30
%pts_ftd        11.05
%pts_2pd         7.26
opp_ast_tov     10.17
opp_pf%         27.10
win            100.00
Name: win, dtype: float64


### C. Statistical Tests

In [7]:
X = train_df[['ppg', 'opp_ppg', 'net_ppg', 'off_rtg', 'def_rtg', 'net_rtg', 'pace',
             'hhs_net_rtg', 'hhs_off_rtg', 'hhs_def_rtg', 'rpi', 'ppp', 'ppsa',
             'ftr', '3pr', 'orb%', 'drb%', 'ast%', 'tov%', 'ast_tov', 'stl%', 'blk%', 'pf%',
             'pppd', 'opp_ppsa', 'efgd%', 'ftrd', '3prd', '%pts_ftd', 'opp_ast_tov', 'opp_pf%']]
#X = train_df.drop('win', axis=1)
y = train_df['win']

# ANOVA
f_values, p_values = f_classif(X, y)
# Mutual Information
mi = mutual_info_regression(X, y)

column_names = np.array(X.columns)
stats_df = pd.DataFrame({
    'feature': column_names,
    'f_values': f_values,
    'p_values': p_values,
    'mutual_info': mi
})
stats_df
#stats_df.to_csv('stats.csv')

Unnamed: 0,feature,f_values,p_values,mutual_info
0,ppg,1.334495,0.25188,0.0
1,opp_ppg,2.913925,0.092188,0.030855
2,net_ppg,6.62966,0.012115,0.023922
3,off_rtg,0.75296,0.388464,0.014875
4,def_rtg,7.277013,0.008719,0.078506
5,net_rtg,6.622504,0.012159,0.049616
6,pace,0.279117,0.598928,0.0
7,hhs_net_rtg,13.878952,0.000388,0.042297
8,hhs_off_rtg,4.635587,0.034714,0.043299
9,hhs_def_rtg,11.550392,0.001114,0.126154


In [8]:
feature_df = stats_df[stats_df['mutual_info'] > 0]
feature_df

Unnamed: 0,feature,f_values,p_values,mutual_info
1,opp_ppg,2.913925,0.092188,0.030855
2,net_ppg,6.62966,0.012115,0.023922
3,off_rtg,0.75296,0.388464,0.014875
4,def_rtg,7.277013,0.008719,0.078506
5,net_rtg,6.622504,0.012159,0.049616
7,hhs_net_rtg,13.878952,0.000388,0.042297
8,hhs_off_rtg,4.635587,0.034714,0.043299
9,hhs_def_rtg,11.550392,0.001114,0.126154
10,rpi,24.103773,6e-06,0.087236
12,ppsa,2.132721,0.148596,0.005747


In [9]:
feature_df = feature_df[feature_df['p_values'] < .25]
feature_df.sort_values(by='f_values', ascending=False)

Unnamed: 0,feature,f_values,p_values,mutual_info
10,rpi,24.103773,6e-06,0.087236
7,hhs_net_rtg,13.878952,0.000388,0.042297
9,hhs_def_rtg,11.550392,0.001114,0.126154
23,pppd,9.510444,0.00291,0.02701
24,opp_ppsa,8.96053,0.003794,0.097666
4,def_rtg,7.277013,0.008719,0.078506
2,net_ppg,6.62966,0.012115,0.023922
5,net_rtg,6.622504,0.012159,0.049616
30,opp_pf%,5.62939,0.020374,0.016521
8,hhs_off_rtg,4.635587,0.034714,0.043299


## II. Wrapper Methods

### A. Forward Selection

### B. Recursive Feature Elimination

In [47]:
# Normalize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert scaled data back to DataFrame to keep track of column names
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Create a Logistic Regression model
log_reg = LogisticRegression(solver='liblinear')

# Initialize RFE with logistic regression as the model and the number of features to select
rfe = RFE(estimator=log_reg, n_features_to_select=8)  # Select top 12 features

# Fit RFE on the scaled training data
rfe.fit(X_train_scaled, y_train)

# Get the selected features (column names)
selected_features = X_train_scaled.columns[rfe.support_]
print("Selected Features:", selected_features)

# Train a logistic regression model using only the selected features
log_reg.fit(X_train_scaled[selected_features], y_train)

# Make predictions on both training and testing datasets
y_train_pred = log_reg.predict(X_train_scaled[selected_features])
y_test_pred = log_reg.predict(X_test_scaled[selected_features])

# Evaluate the F1 score on both training and testing datasets
f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

# Output the results
print("F1 Score on Training Set:", f1_train)
print("F1 Score on Test Set:", f1_test)

Selected Features: Index(['rpi', 'hhs_def_rtg', 'pppd', 'opp_ppsa', 'def_rtg', 'opp_pf%', 'drb%',
       'opp_ppg'],
      dtype='object')
F1 Score on Training Set: 0.8431372549019608
F1 Score on Test Set: 0.8444444444444444


### C. Lasso Regression

In [48]:
# Create a Lasso Logistic Regression model (penalty='l1' for Lasso)
lasso_log_reg = LogisticRegression(solver='liblinear', penalty='l1', max_iter=1000)

# Fit the model on the training data
lasso_log_reg.fit(X_train_scaled, y_train)

# Make predictions on both training and testing datasets
y_train_pred = lasso_log_reg.predict(X_train_scaled)
y_test_pred = lasso_log_reg.predict(X_test_scaled)

# Evaluate the F1 score on both training and testing datasets
f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

# Output the results
print("F1 Score on Training Set:", f1_train)
print("F1 Score on Test Set:", f1_test)

# Get the non-zero coefficients (selected features) from the Lasso model
selected_features = X.columns[(lasso_log_reg.coef_ != 0).flatten()]
print("Selected Features (Columns used in the model):", selected_features)

F1 Score on Training Set: 0.8543689320388349
F1 Score on Test Set: 0.8444444444444444
Selected Features (Columns used in the model): Index(['rpi', 'pppd', 'opp_pf%', 'drb%', 'opp_ppg', 'tov%', 'orb%'], dtype='object')


In [49]:
len(selected_features)

7

## III. Tree Methods

### A. Decision Tree Information Gain

In [50]:
X_train_scaled = scaler.fit_transform(X_train)  # Scaled training features
X_test_scaled = scaler.transform(X_test)  # Scaled test features

# Create a Decision Tree Classifier model
decision_tree = DecisionTreeClassifier(random_state=42)

# Fit the model on the training data
decision_tree.fit(X_train_scaled, y_train)

# Make predictions on both training and testing datasets
y_train_pred = decision_tree.predict(X_train_scaled)
y_test_pred = decision_tree.predict(X_test_scaled)

# Evaluate the F1 score on both training and testing datasets
f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

# Output the F1 scores
print("F1 Score on Training Set:", f1_train)
print("F1 Score on Test Set:", f1_test)

# Get feature importances (information gain)
feature_importances = decision_tree.feature_importances_

# Get the selected features based on their importance (non-zero importance)
selected_features = X.columns[feature_importances > 0]
print("Selected Features (Columns used in the model):", selected_features)

F1 Score on Training Set: 1.0
F1 Score on Test Set: 0.6511627906976744
Selected Features (Columns used in the model): Index(['rpi', 'opp_ppsa', 'opp_pf%', 'tov%', 'blk%', 'orb%'], dtype='object')


In [51]:
len(selected_features)

6

### B. Random Forest

In [52]:
# Create a Random Forest Classifier model
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
random_forest.fit(X_train_scaled, y_train)

# Make predictions on both training and testing datasets
y_train_pred = random_forest.predict(X_train_scaled)
y_test_pred = random_forest.predict(X_test_scaled)

# Evaluate the F1 score on both training and testing datasets
f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

# Output the F1 scores
print("F1 Score on Training Set:", f1_train)
print("F1 Score on Test Set:", f1_test)

# Get feature importances (information gain)
feature_importances = random_forest.feature_importances_

# Get the columns (features) that were selected (those with non-zero importance)
selected_features = X.columns[feature_importances > .02]
print("Selected Features (Columns used in the model):", selected_features)

F1 Score on Training Set: 1.0
F1 Score on Test Set: 0.7659574468085107
Selected Features (Columns used in the model): Index(['rpi', 'hhs_net_rtg', 'hhs_def_rtg', 'pppd', 'opp_ppsa', 'def_rtg',
       'net_ppg', 'net_rtg', 'opp_pf%', 'hhs_off_rtg', 'drb%', 'ast_tov',
       'opp_ppg', 'ppsa', 'tov%', 'blk%', 'orb%', 'ftr'],
      dtype='object')


In [53]:
len(selected_features)

18

### C. Extra Tree

In [54]:
# Create an Extra Trees Classifier model
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
extra_trees.fit(X_train_scaled, y_train)

# Make predictions on both training and testing datasets
y_train_pred = extra_trees.predict(X_train_scaled)
y_test_pred = extra_trees.predict(X_test_scaled)

# Evaluate the F1 score on both training and testing datasets
f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

# Output the F1 scores
print("F1 Score on Training Set:", f1_train)
print("F1 Score on Test Set:", f1_test)

# Get feature importances (information gain)
feature_importances = extra_trees.feature_importances_

F1 Score on Training Set: 1.0
F1 Score on Test Set: 0.75


In [55]:
column_names = X_train.columns
tree_df = pd.DataFrame({
    'feature': column_names,
    'importance': feature_importances
})
tree_df
#tree_df.to_csv('tree.csv')

Unnamed: 0,feature,importance
0,rpi,0.07714
1,hhs_net_rtg,0.070135
2,hhs_def_rtg,0.052475
3,pppd,0.060424
4,opp_ppsa,0.060567
5,def_rtg,0.045271
6,net_ppg,0.052895
7,net_rtg,0.060126
8,opp_pf%,0.062021
9,hhs_off_rtg,0.060453


## IV. Exhaustive Feature Selection

In [58]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

#X_train_scaled2 = X_train_scaled[[
#'off_rtg', 'rpi', 'efg%', '3pr', '%pts_3p', 'drb%', 'trb%',
#       'ast%', 'stl%', 'pf%', 'ftrd', '%pts_ftd', '%pts_2pd', '%pts_3pd',
#       'astd%', 'opp_stl%', 'opp_pf%', 'hhs_net_rtg', 'hhs_off_rtg', 'hhs_def_rtg',
#        'pppd', 'opp_ppsa'
#]]

#X_test_scaled2 = X_test_scaled[[
#'off_rtg', 'rpi', 'efg%', '3pr', '%pts_3p', 'drb%', 'trb%',
#       'ast%', 'stl%', 'pf%', 'ftrd', '%pts_ftd', '%pts_2pd', '%pts_3pd',
#       'astd%', 'opp_stl%', 'opp_pf%', 'hhs_net_rtg', 'hhs_off_rtg', 'hhs_def_rtg',
#        'pppd', 'opp_ppsa'
#]]

In [59]:
# Initialize variables to track the best model
best_f1 = 0
best_features = []
best_train_f1 = 0

# Iterate over all possible subsets of features
for k in range(1, len(X_train_scaled.columns) + 1):
    for subset in combinations(X_train_scaled.columns, k):
        X_train_subset = X_train_scaled[list(subset)]
        X_test_subset = X_test_scaled[list(subset)]
        
        # Train the model with the subset of features
        log_reg.fit(X_train_subset, y_train)
        
        # Make predictions on the test set
        y_test_pred = log_reg.predict(X_test_subset)
        test_f1 = f1_score(y_test, y_test_pred)
        
        # Make predictions on the training set
        y_train_pred = log_reg.predict(X_train_subset)
        train_f1 = f1_score(y_train, y_train_pred)
        
        # Track the best subset based on test F1 score
        if test_f1 > best_f1:
            best_f1 = test_f1
            best_features = subset
            best_train_f1 = train_f1

# Output the best feature subset and the F1 scores
print(f"Best Feature Subset: {best_features}")
print(f"Best Test F1 Score: {best_f1:.4f}")
print(f"Training F1 Score of Best Model: {best_train_f1:.4f}")

Best Feature Subset: ('rpi', 'hhs_def_rtg', 'pppd', 'opp_ppsa', 'def_rtg', 'hhs_off_rtg', 'orb%')
Best Test F1 Score: 0.9524
Training F1 Score of Best Model: 0.8544


In [60]:
rfe_df = df[['game_id', 'year', 'round', 'region', 'high_team_id', 'low_team_id', 'win', 
               'rpi', 'hhs_def_rtg', 'pppd', 'opp_ppsa', 'def_rtg', 'opp_pf%', 'drb%', 'opp_ppg']]
rfe_df.to_csv('rfe_elite.csv')
lasso_df = df[['game_id', 'year', 'round', 'region', 'high_team_id', 'low_team_id', 'win', 
               'rpi', 'pppd', 'opp_pf%', 'drb%', 'opp_ppg', 'tov%', 'orb%']]
lasso_df.to_csv('lasso_elite.csv')
exhaustive_df = df[['game_id', 'year', 'round', 'region', 'high_team_id', 'low_team_id', 'win', 
               'rpi', 'hhs_def_rtg', 'pppd', 'opp_ppsa', 'def_rtg', 'hhs_off_rtg', 'orb%']]
exhaustive_df.to_csv('exhaustive_elite.csv')