In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from itertools import combinations

#df = pd.read_csv('matchups_women_first_weekend.csv')
df = pd.read_csv('matchups_womens_first_weekend.csv')
print(df.shape)
df.head()

(336, 49)


Unnamed: 0,game_id,year,round,region,high_team_id,low_team_id,win,ppg,opp_ppg,net_ppg,...,3prd,%pts_ftd,%pts_2pd,%pts_3pd,astd%,tovd%,opp_ast_tov,opp_stl%,opp_blk%,opp_pf%
0,20171101,2017,First Round,East,2017 Connecticut,2017 Albany,1,31.1,-0.6,30.6,...,0.081,0.05,0.04,0.09,-0.117,-0.011,-0.18,-0.048,-0.051,-0.029
1,20171108,2017,First Round,East,2017 Syracuse,2017 Iowa St.,1,27.5,-21.2,6.3,...,-0.044,0.016,0.043,0.027,0.08,0.103,-0.47,-0.002,0.0,0.01
2,20171106,2017,First Round,East,2017 Texas A&M,2017 Penn,1,1.1,-3.4,-2.3,...,-0.061,0.007,0.094,0.088,-0.059,0.022,-0.19,-0.012,-0.07,0.05
3,20171104,2017,First Round,East,2017 UCLA,2017 Boise St.,1,17.8,-11.7,6.2,...,-0.037,0.043,0.012,0.032,-0.024,0.035,-0.29,-0.022,-0.027,-0.007
4,20171103,2017,First Round,East,2017 West Virginia,2017 Elon,1,13.4,-12.4,0.9,...,0.003,0.012,0.023,0.012,-0.051,0.017,-0.24,0.0,-0.032,-0.014


In [2]:
df['win'].value_counts(normalize=True)

1    0.720238
0    0.279762
Name: win, dtype: float64

In [3]:
split_df = df.drop(['game_id', 'year', 'round', 'region', 'high_team_id', 'low_team_id'],axis=1)
print(split_df.shape)
split_df.head()

(336, 43)


Unnamed: 0,win,ppg,opp_ppg,net_ppg,off_rtg,def_rtg,net_rtg,pace,hhs_net_rtg,hhs_off_rtg,...,3prd,%pts_ftd,%pts_2pd,%pts_3pd,astd%,tovd%,opp_ast_tov,opp_stl%,opp_blk%,opp_pf%
0,1,31.1,-0.6,30.6,90.9,-47.4,43.6,-0.8,54.9,98.0,...,0.081,0.05,0.04,0.09,-0.117,-0.011,-0.18,-0.048,-0.051,-0.029
1,1,27.5,-21.2,6.3,81.7,-73.9,7.8,4.2,7.6,79.5,...,-0.044,0.016,0.043,0.027,0.08,0.103,-0.47,-0.002,0.0,0.01
2,1,1.1,-3.4,-2.3,66.6,-70.9,-4.2,4.9,6.2,73.3,...,-0.061,0.007,0.094,0.088,-0.059,0.022,-0.19,-0.012,-0.07,0.05
3,1,17.8,-11.7,6.2,72.4,-63.3,9.0,1.7,20.2,79.5,...,-0.037,0.043,0.012,0.032,-0.024,0.035,-0.29,-0.022,-0.027,-0.007
4,1,13.4,-12.4,0.9,67.3,-65.1,2.2,2.8,6.8,71.9,...,0.003,0.012,0.023,0.012,-0.051,0.017,-0.24,0.0,-0.032,-0.014


In [7]:
split_df.columns

Index(['win', 'ppg', 'opp_ppg', 'net_ppg', 'off_rtg', 'def_rtg', 'net_rtg',
       'pace', 'hhs_net_rtg', 'hhs_off_rtg', 'hhs_def_rtg', 'rpi', 'ppp',
       'ppsa', 'efg%', 'ftr', '3pr', '%pts_ft', '%pts_2p', '%pts_3p', 'orb%',
       'drb%', 'trb%', 'ast%', 'tov%', 'ast_tov', 'stl%', 'blk%', 'pf%',
       'pppd', 'opp_ppsa', 'efgd%', 'ftrd', '3prd', '%pts_ftd', '%pts_2pd',
       '%pts_3pd', 'astd%', 'tovd%', 'opp_ast_tov', 'opp_stl%', 'opp_blk%',
       'opp_pf%'],
      dtype='object')

In [8]:
X = split_df.drop('win', axis=1)
X = split_df[['ppg', 'opp_ppg', 'net_ppg', 'off_rtg', 'def_rtg', 'net_rtg',
       'pace', 'hhs_net_rtg', 'hhs_off_rtg', 'hhs_def_rtg', 'rpi', 'ppp',
       'ppsa', 'efg%', 'ftr', '3pr', '%pts_ft', '%pts_2p', '%pts_3p', 'orb%',
       'drb%', 'trb%', 'ast%', 'tov%', 'ast_tov', 'stl%', 'blk%', 'pf%',
       'pppd', 'opp_ppsa', 'efgd%', 'ftrd', '3prd', '%pts_ftd', '%pts_2pd',
       '%pts_3pd', 'astd%', 'tovd%', 'opp_ast_tov', 'opp_stl%', 'opp_blk%',
       'opp_pf%']]
Y = df['win']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

print(train_df.shape)
print(test_df.shape)

(235, 43)
(101, 43)


## I. Filter Methods

### A. Box Plots

### B. Correlation

In [9]:
corr_matrix = train_df.corr()
correlation_with_target = round(abs(corr_matrix['win'])*100,2)
correlation_with_target = correlation_with_target[correlation_with_target >= 10]
print(correlation_with_target)

ppg             15.65
opp_ppg         28.20
net_ppg         40.45
off_rtg         19.30
def_rtg         35.13
net_rtg         39.70
pace            12.16
hhs_net_rtg     56.10
hhs_off_rtg     45.70
hhs_def_rtg     51.54
rpi             59.75
ppp             12.85
ppsa            14.03
efg%            12.44
ftr             22.00
3pr             20.00
%pts_3p         15.91
trb%            34.15
ast%            12.33
tov%            17.12
ast_tov         14.89
stl%            14.32
blk%            16.49
pppd            32.20
opp_ppsa        32.44
efgd%           31.93
ftrd            11.80
tovd%           15.27
opp_ast_tov     17.92
opp_pf%         10.18
win            100.00
Name: win, dtype: float64


### C. Statistical Tests

In [10]:
X = train_df[['ppg', 'opp_ppg', 'net_ppg', 'off_rtg', 'def_rtg', 'net_rtg',
       'pace', 'hhs_net_rtg', 'hhs_off_rtg', 'hhs_def_rtg', 'rpi', 'ppp',
       'ppsa', 'efg%', 'ftr', '3pr', '%pts_ft', '%pts_2p', '%pts_3p', 'orb%',
       'drb%', 'trb%', 'ast%', 'tov%', 'ast_tov', 'stl%', 'blk%', 'pf%',
       'pppd', 'opp_ppsa', 'efgd%', 'ftrd', '3prd', '%pts_ftd', '%pts_2pd',
       '%pts_3pd', 'astd%', 'tovd%', 'opp_ast_tov', 'opp_stl%', 'opp_blk%',
       'opp_pf%']]
#X = train_df.drop('win', axis=1)
y = train_df['win']

# ANOVA
f_values, p_values = f_classif(X, y)
# Mutual Information
mi = mutual_info_regression(X, y)

column_names = np.array(X.columns)
stats_df = pd.DataFrame({
    'feature': column_names,
    'f_values': f_values,
    'p_values': p_values,
    'mutual_info': mi
})
stats_df
#stats_df.to_csv('stats.csv')

Unnamed: 0,feature,f_values,p_values,mutual_info
0,ppg,5.851772,0.01632865,0.014406
1,opp_ppg,20.136089,1.133581e-05,0.033082
2,net_ppg,45.572335,1.157757e-10,0.089978
3,off_rtg,9.010296,0.002976932,0.009751
4,def_rtg,32.793562,3.143704e-08,0.060184
5,net_rtg,43.605349,2.690247e-10,0.109977
6,pace,3.498545,0.06267654,0.0
7,hhs_net_rtg,107.005122,6.979263999999999e-21,0.245777
8,hhs_off_rtg,61.517352,1.572748e-13,0.119482
9,hhs_def_rtg,84.269756,2.4056360000000002e-17,0.191677


In [11]:
feature_df = stats_df[stats_df['mutual_info'] > 0]
feature_df

Unnamed: 0,feature,f_values,p_values,mutual_info
0,ppg,5.851772,0.01632865,0.014406
1,opp_ppg,20.136089,1.133581e-05,0.033082
2,net_ppg,45.572335,1.157757e-10,0.089978
3,off_rtg,9.010296,0.002976932,0.009751
4,def_rtg,32.793562,3.143704e-08,0.060184
5,net_rtg,43.605349,2.690247e-10,0.109977
7,hhs_net_rtg,107.005122,6.979263999999999e-21,0.245777
8,hhs_off_rtg,61.517352,1.572748e-13,0.119482
9,hhs_def_rtg,84.269756,2.4056360000000002e-17,0.191677
10,rpi,129.354262,3.945765e-24,0.223487


In [12]:
feature_df = feature_df[feature_df['p_values'] < .05]
feature_df.sort_values(by='f_values', ascending=False)

Unnamed: 0,feature,f_values,p_values,mutual_info
10,rpi,129.354262,3.945765e-24,0.223487
7,hhs_net_rtg,107.005122,6.979263999999999e-21,0.245777
9,hhs_def_rtg,84.269756,2.4056360000000002e-17,0.191677
8,hhs_off_rtg,61.517352,1.572748e-13,0.119482
2,net_ppg,45.572335,1.157757e-10,0.089978
5,net_rtg,43.605349,2.690247e-10,0.109977
4,def_rtg,32.793562,3.143704e-08,0.060184
21,trb%,30.761386,7.892812e-08,0.086209
29,opp_ppsa,27.400928,3.688404e-07,0.063624
28,pppd,26.944237,4.557127e-07,0.059229


In [13]:
feature_df = feature_df[feature_df['f_values'] > 5]
feature_df.sort_values(by='f_values', ascending=False)

Unnamed: 0,feature,f_values,p_values,mutual_info
10,rpi,129.354262,3.945765e-24,0.223487
7,hhs_net_rtg,107.005122,6.979263999999999e-21,0.245777
9,hhs_def_rtg,84.269756,2.4056360000000002e-17,0.191677
8,hhs_off_rtg,61.517352,1.572748e-13,0.119482
2,net_ppg,45.572335,1.157757e-10,0.089978
5,net_rtg,43.605349,2.690247e-10,0.109977
4,def_rtg,32.793562,3.143704e-08,0.060184
21,trb%,30.761386,7.892812e-08,0.086209
29,opp_ppsa,27.400928,3.688404e-07,0.063624
28,pppd,26.944237,4.557127e-07,0.059229


## II. Wrapper Methods

### A. Forward Selection

In [14]:
# Normalize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Forward Selection using AIC
def forward_selection_logistic_aic(X_train, y_train, X_test, y_test):
    selected_features = []
    remaining_features = list(X_train.columns)  # Column names
    best_f1_train = 0
    best_f1_test = 0
    best_aic = np.inf  # Start with a large AIC value
    
    while remaining_features:
        aic_list = []
        f1_train_list = []
        f1_test_list = []
        
        for feature in remaining_features:
            current_features = selected_features + [feature]
            X_train_subset = X_train[current_features]
            X_test_subset = X_test[current_features]
            
            # Add constant for intercept (statsmodels requires constant)
            X_train_subset_sm = sm.add_constant(X_train_subset)
            X_test_subset_sm = sm.add_constant(X_test_subset)
            
            # Fit the logistic regression model using statsmodels (to compute AIC)
            model = sm.Logit(y_train, X_train_subset_sm)
            result = model.fit()
            
            aic = result.aic  # Extract AIC value
            aic_list.append(aic)
            
            # Now, use sklearn for F1 score calculation
            log_reg_model = LogisticRegression(solver='liblinear')
            log_reg_model.fit(X_train_subset, y_train)  # Fit model without constant term
            
            # Make predictions on both train and test data
            y_train_pred = log_reg_model.predict(X_train_subset)
            y_test_pred = log_reg_model.predict(X_test_subset)
            
            # Calculate F1 scores
            f1_train = f1_score(y_train, y_train_pred)
            f1_test = f1_score(y_test, y_test_pred)
            
            f1_train_list.append(f1_train)
            f1_test_list.append(f1_test)
        
        # Select the feature that gives the lowest AIC
        best_feature_idx = np.argmin(aic_list)
        selected_features.append(remaining_features[best_feature_idx])
        remaining_features.remove(remaining_features[best_feature_idx])
        
        best_aic = aic_list[best_feature_idx]
        best_f1_train = f1_train_list[best_feature_idx]
        best_f1_test = f1_test_list[best_feature_idx]
        
    return selected_features, best_aic, best_f1_train, best_f1_test

# Perform forward selection using AIC
selected_features, best_aic, best_f1_train, best_f1_test = forward_selection_logistic_aic(
    X_train, y_train, X_test, y_test
)

# Output the results
print("Selected Features:", selected_features)
print("Best AIC:", best_aic)
print("Best F1 Score on Training Set:", best_f1_train)
print("Best F1 Score on Test Set:", best_f1_test)

# Create the final Logistic Regression model using selected features
log_reg_final = LogisticRegression(solver='liblinear')
X_train_final = X_train[selected_features]
X_test_final = X_test[selected_features]

# Fit the final model
log_reg_final.fit(X_train_final, y_train)

# Make predictions on training and testing sets
y_train_pred_final = log_reg_final.predict(X_train_final)
y_test_pred_final = log_reg_final.predict(X_test_final)

# Evaluate using F1 score
final_f1_train = f1_score(y_train, y_train_pred_final)
final_f1_test = f1_score(y_test, y_test_pred_final)

print("Final F1 Score on Training Set:", final_f1_train)
print("Final F1 Score on Test Set:", final_f1_test)

Optimization terminated successfully.
         Current function value: 0.592751
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.564486
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.507423
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.586014
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.538385
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.511422
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.597908
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.378846
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.478644
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.436548
  

In [15]:
len(selected_features)

42

### B. Recursive Feature Elimination

In [16]:
# Convert scaled data back to DataFrame to keep track of column names
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Create a Logistic Regression model
log_reg = LogisticRegression(solver='liblinear')

# Initialize RFE with logistic regression as the model and the number of features to select
rfe = RFE(estimator=log_reg, n_features_to_select=12)  # Select top 12 features

# Fit RFE on the scaled training data
rfe.fit(X_train_scaled, y_train)

# Get the selected features (column names)
selected_features = X_train_scaled.columns[rfe.support_]
print("Selected Features:", selected_features)

# Train a logistic regression model using only the selected features
log_reg.fit(X_train_scaled[selected_features], y_train)

# Make predictions on both training and testing datasets
y_train_pred = log_reg.predict(X_train_scaled[selected_features])
y_test_pred = log_reg.predict(X_test_scaled[selected_features])

# Evaluate the F1 score on both training and testing datasets
f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

# Output the results
print("F1 Score on Training Set:", f1_train)
print("F1 Score on Test Set:", f1_test)

Selected Features: Index(['opp_ppg', 'hhs_net_rtg', 'hhs_def_rtg', 'rpi', 'efg%', '3pr',
       '%pts_3p', 'orb%', 'ast%', '%pts_ftd', '%pts_2pd', '%pts_3pd'],
      dtype='object')
F1 Score on Training Set: 0.9194029850746269
F1 Score on Test Set: 0.8933333333333332


### C. Lasso Regression

In [18]:
# Create a Lasso Logistic Regression model (penalty='l1' for Lasso)
lasso_log_reg = LogisticRegression(solver='liblinear', penalty='l1', max_iter=1000)

# Fit the model on the training data
lasso_log_reg.fit(X_train_scaled, y_train)

# Make predictions on both training and testing datasets
y_train_pred = lasso_log_reg.predict(X_train_scaled)
y_test_pred = lasso_log_reg.predict(X_test_scaled)

# Evaluate the F1 score on both training and testing datasets
f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

# Output the results
print("F1 Score on Training Set:", f1_train)
print("F1 Score on Test Set:", f1_test)

# Get the non-zero coefficients (selected features) from the Lasso model
selected_features = X.columns[(lasso_log_reg.coef_ != 0).flatten()]
print("Selected Features (Columns used in the model):", selected_features)

F1 Score on Training Set: 0.9285714285714286
F1 Score on Test Set: 0.8648648648648649
Selected Features (Columns used in the model): Index(['opp_ppg', 'pace', 'hhs_net_rtg', 'hhs_def_rtg', 'rpi', 'efg%', 'ftr',
       '3pr', '%pts_ft', '%pts_2p', '%pts_3p', 'orb%', 'trb%', 'ast%',
       'ast_tov', 'stl%', 'pf%', 'opp_ppsa', 'efgd%', '3prd', '%pts_ftd',
       '%pts_2pd', '%pts_3pd', 'astd%', 'opp_stl%', 'opp_blk%'],
      dtype='object')


In [19]:
len(selected_features)

26

## III. Tree Methods

### A. Decision Tree Information Gain

In [20]:
X_train_scaled = scaler.fit_transform(X_train)  # Scaled training features
X_test_scaled = scaler.transform(X_test)  # Scaled test features

# Create a Decision Tree Classifier model
decision_tree = DecisionTreeClassifier(random_state=42)

# Fit the model on the training data
decision_tree.fit(X_train_scaled, y_train)

# Make predictions on both training and testing datasets
y_train_pred = decision_tree.predict(X_train_scaled)
y_test_pred = decision_tree.predict(X_test_scaled)

# Evaluate the F1 score on both training and testing datasets
f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

# Output the F1 scores
print("F1 Score on Training Set:", f1_train)
print("F1 Score on Test Set:", f1_test)

# Get feature importances (information gain)
feature_importances = decision_tree.feature_importances_

# Get the selected features based on their importance (non-zero importance)
selected_features = X.columns[feature_importances > 0]
print("Selected Features (Columns used in the model):", selected_features)

F1 Score on Training Set: 1.0
F1 Score on Test Set: 0.8513513513513513
Selected Features (Columns used in the model): Index(['opp_ppg', 'net_ppg', 'pace', 'hhs_net_rtg', 'rpi', 'ftr', 'orb%',
       'ast%', 'stl%', 'blk%', '%pts_ftd', '%pts_2pd', 'astd%', 'tovd%',
       'opp_blk%'],
      dtype='object')


In [21]:
len(selected_features)

15

### B. Random Forest

In [22]:
# Create a Random Forest Classifier model
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
random_forest.fit(X_train_scaled, y_train)

# Make predictions on both training and testing datasets
y_train_pred = random_forest.predict(X_train_scaled)
y_test_pred = random_forest.predict(X_test_scaled)

# Evaluate the F1 score on both training and testing datasets
f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

# Output the F1 scores
print("F1 Score on Training Set:", f1_train)
print("F1 Score on Test Set:", f1_test)

# Get feature importances (information gain)
feature_importances = random_forest.feature_importances_

# Get the columns (features) that were selected (those with non-zero importance)
selected_features = X.columns[feature_importances > .02]
print("Selected Features (Columns used in the model):", selected_features)

F1 Score on Training Set: 1.0
F1 Score on Test Set: 0.9019607843137255
Selected Features (Columns used in the model): Index(['opp_ppg', 'net_ppg', 'net_rtg', 'hhs_net_rtg', 'hhs_off_rtg',
       'hhs_def_rtg', 'rpi', '3pr', 'trb%'],
      dtype='object')


In [23]:
len(selected_features)

9

### C. Extra Tree

In [24]:
# Create an Extra Trees Classifier model
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
extra_trees.fit(X_train_scaled, y_train)

# Make predictions on both training and testing datasets
y_train_pred = extra_trees.predict(X_train_scaled)
y_test_pred = extra_trees.predict(X_test_scaled)

# Evaluate the F1 score on both training and testing datasets
f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

# Output the F1 scores
print("F1 Score on Training Set:", f1_train)
print("F1 Score on Test Set:", f1_test)

# Get feature importances (information gain)
feature_importances = extra_trees.feature_importances_

F1 Score on Training Set: 1.0
F1 Score on Test Set: 0.9182389937106917


In [25]:
column_names = X_train.columns
tree_df = pd.DataFrame({
    'feature': column_names,
    'importance': feature_importances
})
tree_df
#tree_df.to_csv('tree.csv')

Unnamed: 0,feature,importance
0,ppg,0.013682
1,opp_ppg,0.018359
2,net_ppg,0.024426
3,off_rtg,0.015297
4,def_rtg,0.019519
5,net_rtg,0.028614
6,pace,0.014979
7,hhs_net_rtg,0.09266
8,hhs_off_rtg,0.049906
9,hhs_def_rtg,0.065346


## IV. Exhaustive Feature Selection

In [28]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

X_train_scaled2 = X_train_scaled[[
'opp_ppg', 'hhs_net_rtg', 'hhs_def_rtg', 'rpi', 'ppsa', '3pr',
       '%pts_3p', 'orb%', 'ast%', '%pts_ftd', '%pts_2pd', '%pts_3pd',
        'pppd', 'hhs_off_rtg', 'net_ppg', 'ftr', 'astd%', 'tovd%'
]]

X_test_scaled2 = X_test_scaled[[
'opp_ppg', 'hhs_net_rtg', 'hhs_def_rtg', 'rpi', 'ppsa', '3pr',
       '%pts_3p', 'orb%', 'ast%', '%pts_ftd', '%pts_2pd', '%pts_3pd',
        'pppd', 'hhs_off_rtg', 'net_ppg', 'ftr', 'astd%', 'tovd%'
]]

In [29]:
# Initialize variables to track the best model
best_f1 = 0
best_features = []
best_train_f1 = 0

# Iterate over all possible subsets of features
for k in range(1, len(X_train_scaled2.columns) + 1):
    for subset in combinations(X_train_scaled2.columns, k):
        X_train_subset = X_train_scaled2[list(subset)]
        X_test_subset = X_test_scaled2[list(subset)]
        
        # Train the model with the subset of features
        log_reg.fit(X_train_subset, y_train)
        
        # Make predictions on the test set
        y_test_pred = log_reg.predict(X_test_subset)
        test_f1 = f1_score(y_test, y_test_pred)
        
        # Make predictions on the training set
        y_train_pred = log_reg.predict(X_train_subset)
        train_f1 = f1_score(y_train, y_train_pred)
        
        # Track the best subset based on test F1 score
        if test_f1 > best_f1:
            best_f1 = test_f1
            best_features = subset
            best_train_f1 = train_f1

# Output the best feature subset and the F1 scores
print(f"Best Feature Subset: {best_features}")
print(f"Best Test F1 Score: {best_f1:.4f}")
print(f"Training F1 Score of Best Model: {best_train_f1:.4f}")

Best Feature Subset: ('hhs_net_rtg', '3pr', '%pts_3p', '%pts_3pd', 'pppd', 'hhs_off_rtg', 'net_ppg')
Best Test F1 Score: 0.9427
Training F1 Score of Best Model: 0.8968


In [30]:
lasso_df = df[['game_id', 'year', 'round', 'region', 'high_team_id', 'low_team_id', 'win', 
               'opp_ppg', 'hhs_net_rtg', 'hhs_def_rtg', 'rpi', 'efg%', '3pr',
               '%pts_3p', 'orb%', 'ast%', '%pts_ftd', '%pts_2pd', '%pts_3pd']]
lasso_df.to_csv('lasso_first_weekend.csv')
forest_df = df[['game_id', 'year', 'round', 'region', 'high_team_id', 'low_team_id', 'win', 
               'opp_ppg', 'net_ppg', 'net_rtg', 'hhs_net_rtg', 'hhs_off_rtg',
               'hhs_def_rtg', 'rpi', '3pr', 'trb%']]
forest_df.to_csv('forest_first_weekend.csv')

In [31]:
exhaustive_df = df[['game_id', 'year', 'round', 'region', 'high_team_id', 'low_team_id', 'win', 
               'hhs_net_rtg', '3pr', '%pts_3p', '%pts_3pd', 'pppd', 'hhs_off_rtg', 'net_ppg']]
exhaustive_df.to_csv('exhaustive_first_weekend2.csv')