In [20]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [36]:
# Location of the merged games CSV. The default is the original author's local
# path; set the MLB_MERGED_DATA_CSV environment variable to point at the file on
# another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "MLB_MERGED_DATA_CSV",
    "/Users/hannahwurzel/Desktop/MLB/data/merged_data.csv",
)

df = pd.read_csv(DATA_PATH)
# Show the available columns as the cell output.
df.columns

Index(['date', 'season', 'away_team', 'home_team', 'over_under', 'dayofweek',
       'overunder_margin', 'home_days_since_previous_game',
       'home_point_win_streak', 'home_point_loss_streak',
       'home_overunder_margin', 'home_over_streak', 'home_under_streak',
       'home_overunder_season_avg', 'away_days_since_previous_game',
       'away_point_win_streak', 'away_point_loss_streak',
       'away_overunder_margin', 'away_over_streak', 'away_under_streak',
       'away_overunder_season_avg', 'over_line', 'under_line',
       'delta_days_since_previous_game', 'delta_point_win_streak',
       'delta_point_loss_streak', 'delta_overunder_streak_avg', 'target',
       'game_loc', 'max_temp_C', 'min_temp_C', 'precipitation_mm',
       'max_wind_speed_kmh', 'dominant_wind_dir_deg'],
      dtype='object')

In [41]:
# Derive calendar features from the game date, then drop the columns that are
# not fed to the model.
# NOTE(review): `overunder_margin` is presumably dropped because it encodes the
# outcome the model is trying to predict -- confirm.
# NOTE: this cell reassigns `df` and removes `date`, so it can only be run once
# per load of the CSV.
game_dates = pd.to_datetime(df['date'])
weekday = game_dates.dt.weekday  # Monday=0 ... Sunday=6

df = (
    df.assign(
        year=game_dates.dt.year,
        month=game_dates.dt.month,
        day=game_dates.dt.day,
        day_of_week=weekday,
        # Saturday (5) and Sunday (6) are flagged as weekend games.
        is_weekend=weekday.isin([5, 6]).astype(int),
    )
    .drop(columns=['date', 'season', 'overunder_margin'])
)

In [42]:
# Baseline: XGBoost on integer-coded (label-encoded) team/location columns.
X = df.drop('target', axis=1)

# One encoder per nominal column so each integer code can be mapped back to its
# original label later via `inverse_transform`.
le_away = LabelEncoder()
X['away_team'] = le_away.fit_transform(X['away_team'])
le_home = LabelEncoder()
X['home_team'] = le_home.fit_transform(X['home_team'])
le_loc = LabelEncoder()
X['game_loc'] = le_loc.fit_transform(X['game_loc'])

# Encode the target classes as integers; `le.classes_` keeps the original names
# for the report below.
le = LabelEncoder()
y = le.fit_transform(df['target'])

# Stratify so the rare class keeps the same share in train and test.
# NOTE(review): this is a random split; if rows are chronological games, a
# time-based split may give a more honest estimate -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# `use_label_encoder` was removed: the installed XGBoost ignores it and only
# emits a "Parameters: { "use_label_encoder" } are not used." warning.
# `num_class` is derived from the fitted encoder instead of being hard-coded.
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    eval_metric='mlogloss',
    random_state=42,
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# zero_division=0 reports 0 (without a warning) for a class that is never
# predicted, which is the same number the default would print.
print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

        Over       0.50      0.50      0.50       966
        Push       0.17      0.01      0.02        83
       Under       0.51      0.54      0.52       988

    accuracy                           0.50      2037
   macro avg       0.39      0.35      0.35      2037
weighted avg       0.49      0.50      0.49      2037



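### Trying one-hot encoding for the team and location columns (accuracy is slightly better, 0.51 vs. 0.50, but macro F1 is unchanged at 0.35 and the Push class is never predicted)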

In [44]:
# Variant: XGBoost on one-hot encoded team/location columns.
#
# The features are rebuilt from `df` rather than from the `X` left behind by the
# label-encoding cell. That keeps this cell independent of the previous one and
# safe to re-run (the earlier version overwrote `X`, so a second run failed
# because the columns to encode no longer existed).
# drop_first=True drops one indicator per column to avoid a redundant level.
ONE_HOT_COLUMNS = ['away_team', 'home_team', 'game_loc']
X = pd.get_dummies(
    df.drop('target', axis=1),
    columns=ONE_HOT_COLUMNS,
    drop_first=True,
)

# Encode the target classes as integers; `le.classes_` keeps the original names
# for the report below.
le = LabelEncoder()
y = le.fit_transform(df['target'])

# Same split settings as the label-encoded run so the two reports are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# `use_label_encoder` was removed: the installed XGBoost ignores it and only
# emits a "Parameters: { "use_label_encoder" } are not used." warning.
# `num_class` is derived from the fitted encoder instead of being hard-coded.
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    eval_metric='mlogloss',
    random_state=42,
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# zero_division=0 reports 0 (without a warning) for a class that is never
# predicted, which is the same number the default would print.
print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

        Over       0.51      0.50      0.51       966
        Push       0.00      0.00      0.00        83
       Under       0.52      0.56      0.54       988

    accuracy                           0.51      2037
   macro avg       0.34      0.36      0.35      2037
weighted avg       0.49      0.51      0.50      2037

