In [166]:
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
from tqdm.auto import tqdm
from featureranker.utils import *
from featureranker.plots import *
from featureranker.rankers import *
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings('ignore')

In [167]:
player_path = './player_data/'
paths = [path.replace('\\', '/') for path in glob(player_path + '*.csv')]
for path in paths:
    df = pd.read_csv(path)
    view_data(df)

There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset


In [168]:
# functions
def get_names(df, start, stop):
    player_list = []
    drop_list = df.iloc[:, start:stop:3].columns.tolist()
    name_df = df.drop(drop_list, axis=1, inplace=False)
    name_df = name_df.iloc[:, start:stop-len(drop_list)]
    for i in range(len(name_df)):
        players = name_df.iloc[i].tolist()
        players_sorted = [players[j] for j in range(0, len(players), 2)]
        players_sorted.sort(key=lambda x: players[players.index(x) + 1])
        player_list.append(players_sorted)
    return player_list


def list_stacker(pitchers, players):
    for i in range(len(pitchers)):
        players[i].insert(0, pitchers[i])
    return players


def get_game_data(paths):
    all_data = []
    for i in range(len(paths)):
        df = pd.read_csv(paths[i], header=None)
        visiting_players = get_names(df, 105, 132)
        home_players = get_names(df, 132, 159)
        home_pitchers = df.iloc[:, 104].tolist()
        visiting_pitchers = df.iloc[:, 102].tolist()
        visiting_players = list_stacker(visiting_pitchers, visiting_players)
        home_players = list_stacker(home_pitchers, home_players)
        visiting_scores = df.iloc[:, 9].tolist()
        home_scores = df.iloc[:, 10].tolist()
        all_data.extend(list(zip(visiting_players, home_players, visiting_scores, home_scores)))
    return all_data


def get_player_data(paths):
    le = LabelEncoder()
    player_dfs = []
    for path in paths:
        df = pd.read_csv(path)
        df['Team'] = le.fit_transform(df['Team'])
        if 'Pos' in df.columns:
            df['Pos'] = le.fit_transform(df['Pos'])
        if 'Batters' not in path:
            df['ExtraCol1'] = 0
            df['ExtraCol2'] = 0
            df['ExtraCol3'] = 0
        df.columns = ['Player'] + list(range(len(df.columns) - 1))
        df['Player'] = df['Player'].str.lower()
        player_dfs.append(df)
    combined_player = pd.concat(player_dfs)
    combined_player = combined_player.groupby('Player', as_index=False).mean()
    final_player = combined_player.set_index('Player').T.to_dict('list')
    return final_player


def get_example(game, player_data):
    visiting_players = game[0]
    home_players = game[1]
    visiting_vector = [player_data[player.lower()] for player in visiting_players]
    home_vector = [player_data[player.lower()] for player in home_players]
    stacked_vector = [item for sublist in visiting_vector + home_vector for item in sublist]
    if game[2] > game[3]:
        label = 1
    else:
        label = 0
    return stacked_vector, label


def generate_vectors_and_labels(game_data, player_data):
    vectors, labels = [], []
    for game in tqdm(game_data):
        try:
            vector, label = get_example(game, player_data)
            if np.isnan(vector).any() or np.isnan(label):
                continue
            vectors.append(vector)
            labels.append(label)
        except:
            continue
    return pd.DataFrame(np.array(vectors)), pd.DataFrame(np.array(labels))

In [169]:
# load data
game_path = './game_data/'
player_path = './player_data/'

game_csvs_train = [path.replace('\\', '/') for path in glob(game_path + '*.txt') if '2022' if '2023' if '2021' not in path]
game_csvs_test = [path.replace('\\', '/') for path in glob(game_path + '*.txt') if '2021' in path]

player_csvs_train = [path.replace('\\', '/') for path in glob(player_path + '*.csv') if '2021' if '2022' not in path and '2023' not in path]
player_csvs_test = [path.replace('\\', '/') for path in glob(player_path + '*.csv') if '2021' in path]

train_game_data = [game for game in get_game_data(game_csvs_train) if game[2] != game[3]]
test_game_data = [game for game in get_game_data(game_csvs_test) if game[2] != game[3]]

train_player_data = get_player_data(player_csvs_train)
test_player_data = get_player_data(player_csvs_test)

In [170]:
print(list(train_player_data.items())[3])
print(train_game_data[0])
print(test_player_data)

('a.j. cole', [24.5, 31.0, 13.625, 2.375, 0.0, 0.0, 25.55, 26.625, 12.875, 26.0, 10.25, 4.875, 1.75, 1.25, 0.5, 0.625, 0.5, 5.015, 1.45, 0.0, 0.0, 0.0])
(['CC Sabathia', 'Jorge Posada', 'Mark Teixeira', 'Robinson Cano', 'Alex Rodriguez', 'Derek Jeter', 'Brett Gardner', 'Curtis Granderson', 'Nick Swisher', 'Nick Johnson'], ['Josh Beckett', 'Victor Martinez', 'Kevin Youkilis', 'Dustin Pedroia', 'Adrian Beltre', 'Marco Scutaro', 'Jacoby Ellsbury', 'Mike Cameron', 'J.D. Drew', 'David Ortiz'], 7, 9)
{'a.j. alexy': [27.0, 25.0, 5.0, 4.0, 0.0, 0.0, 23.0, 13.0, 12.0, 17.0, 17.0, 4.0, 3.0, 1.0, 0.0, 0.0, 0.0, 4.7, 1.3, 0.0, 0.0, 0.0], 'a.j. cole': [28.0, 31.0, 6.0, 0.0, 0.0, 0.0, 8.0, 6.0, 1.0, 7.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.13, 0.88, 0.0, 0.0, 0.0], 'a.j. minter': [1.0, 30.0, 61.0, 0.0, 0.0, 0.0, 52.1, 44.0, 22.0, 57.0, 20.0, 2.0, 3.0, 6.0, 0.0, 6.0, 23.0, 3.78, 1.22, 0.0, 0.0, 0.0], 'a.j. puk': [19.0, 28.0, 12.0, 0.0, 0.0, 0.0, 13.1, 18.0, 9.0, 16.0, 6.0, 1.0, 0.0, 3.0, 0.0, 2.0, 0

In [171]:
# get format
X_train, y_train = generate_vectors_and_labels(train_game_data, train_player_data)
X_test, y_test = generate_vectors_and_labels(test_game_data, test_player_data)

# instantiate the random undersampler
rus = RandomUnderSampler(random_state=42)

# instantiate SMOTE
smote = SMOTE(random_state=42)

# resample X_train and y_train using undersampling
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# resample X_train and y_train using SMOTE
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# print the new class distribution after undersampling
print('Resampled dataset shape after undersampling %s' % len(X_train_resampled))

# print the new class distribution after SMOTE
print('Resampled dataset shape after SMOTE %s' % len(X_train_smote))


100%|██████████| 27624/27624 [00:00<00:00, 68333.12it/s] 
100%|██████████| 2429/2429 [00:00<00:00, 36572.89it/s]


Resampled dataset shape after undersampling 11120
Resampled dataset shape after SMOTE 12950


In [172]:
class vector_dataset(Dataset):
    def __init__(self, X, y):
        self.X = np.array(X)
        self.y = np.array(y)

    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        vec = torch.tensor(self.X[idx], dtype=torch.float)
        label = torch.tensor(self.y[idx], dtype=torch.long)
        return vec, label


class neural_net(nn.Module):
    def __init__(self, input_size, hidden_size, hidden_num, output_size, dropout_rate):
        super(neural_net, self).__init__()
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(dropout_rate)
        self.hidden_num = hidden_num
        self.input_layer = nn.Linear(input_size, hidden_size)
        self.hidden_layers = nn.ModuleList()
        for i in range(hidden_num):
            self.hidden_layers.append(nn.Linear(hidden_size, hidden_size))
        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.gelu(self.input_layer(x))
        x = self.dropout(x)
        for i in range(self.hidden_num):
            x = self.gelu(self.hidden_layers[i](x))
            x = self.dropout(x)
        x = self.output_layer(x)
        return x

model = neural_net(X_train.shape[1], X_train.shape[1]*2, 2, 2, 0.15)
model

neural_net(
  (gelu): GELU(approximate='none')
  (dropout): Dropout(p=0.15, inplace=False)
  (input_layer): Linear(in_features=440, out_features=880, bias=True)
  (hidden_layers): ModuleList(
    (0-1): 2 x Linear(in_features=880, out_features=880, bias=True)
  )
  (output_layer): Linear(in_features=880, out_features=2, bias=True)
)

In [173]:
#train_dataset = vector_dataset(X_train_resampled, y_train_resampled)
train_dataset = vector_dataset(X_train_smote, y_train_smote)
test_dataset = vector_dataset(X_test, y_test)


train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [174]:
patience = 5
best_loss = float('inf')
patience_counter = 0
best_model_path = 'best_model.pth'  # File path for saving the best model

for epoch in range(50):  # number of epochs
    model.train()
    train_losses = []
    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.squeeze())
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    avg_train_loss = np.mean(train_losses)
    print(f'Training Loss: {avg_train_loss}')

    model.eval()
    valid_losses = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels.squeeze())
            valid_losses.append(loss.item())

    avg_valid_loss = np.mean(valid_losses)
    print(f'Validation Loss: {avg_valid_loss}')

    # Save the model at each epoch
    epoch_model_path = f'model_epoch_{epoch + 1}.pth'
    torch.save(model.state_dict(), epoch_model_path)

    # Update the best model if validation loss improves
    if avg_valid_loss < best_loss:
        best_loss = avg_valid_loss
        torch.save(model.state_dict(), best_model_path)
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Load the best model
model.load_state_dict(torch.load(best_model_path))

# Evaluate the best model
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate confusion matrix and classification report
conf_matrix = confusion_matrix(all_labels, all_preds)
class_report = classification_report(all_labels, all_preds)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Epoch 1: 100%|██████████| 203/203 [00:02<00:00, 80.11it/s]


Training Loss: 1.0943487997125523
Validation Loss: 0.6802015322096208


Epoch 2: 100%|██████████| 203/203 [00:02<00:00, 77.01it/s]


Training Loss: 0.6903160948471483
Validation Loss: 0.7021904622807222


Epoch 3: 100%|██████████| 203/203 [00:02<00:00, 79.01it/s]


Training Loss: 0.6900617635896054
Validation Loss: 0.6866653561592102


Epoch 4: 100%|██████████| 203/203 [00:02<00:00, 79.73it/s]


Training Loss: 0.6881615264075143
Validation Loss: 0.6945754587650299


Epoch 5: 100%|██████████| 203/203 [00:02<00:00, 76.34it/s]


Training Loss: 0.6858028191063792
Validation Loss: 0.6927404701709747


Epoch 6: 100%|██████████| 203/203 [00:02<00:00, 78.20it/s]


Training Loss: 0.6862011792624525
Validation Loss: 0.6790121337946724


Epoch 7: 100%|██████████| 203/203 [00:02<00:00, 69.10it/s]


Training Loss: 0.6827512163246794
Validation Loss: 0.7011193913571975


Epoch 8: 100%|██████████| 203/203 [00:02<00:00, 77.06it/s]


Training Loss: 0.6828221640563363
Validation Loss: 0.6800695254522211


Epoch 9: 100%|██████████| 203/203 [00:02<00:00, 76.83it/s]


Training Loss: 0.6859578590087703
Validation Loss: 0.6887122760800755


Epoch 10: 100%|██████████| 203/203 [00:02<00:00, 78.96it/s]


Training Loss: 0.6833478350357469
Validation Loss: 0.6800326687448165


Epoch 11: 100%|██████████| 203/203 [00:02<00:00, 76.47it/s]


Training Loss: 0.6866628609854599
Validation Loss: 0.6882622417281655
Early stopping triggered
Confusion Matrix:
[[1035  128]
 [ 791  198]]
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.89      0.69      1163
           1       0.61      0.20      0.30       989

    accuracy                           0.57      2152
   macro avg       0.59      0.55      0.50      2152
weighted avg       0.59      0.57      0.51      2152



In [175]:
def get_player_vector(players, player_data):
    return [player_data[player.lower()] for player in players]

In [184]:
def predict_game_outcome(player_list, model, player_data):
    if not all(isinstance(i, tuple) for i in player_list):
        raise ValueError("Each item in player_list must be a tuple of player names")
    results = []
    for i in range(len(player_list)):
        visiting_players = player_list[i][0]  # first tuple in the list
        home_players = player_list[i][1]  # second tuple in the list
        visiting_vector = get_player_vector(visiting_players, player_data)
        home_vector = get_player_vector(home_players, player_data)
        game_vector = visiting_vector + home_vector
        game_vector = torch.tensor([game_vector], dtype=torch.float)
        model.eval()
        with torch.no_grad():
            outputs = model(game_vector)
            _, preds = torch.max(outputs, 1)
        results.append("Game {}: {}".format(i+1, "Visiting team wins" if preds.item() == 1 else "Home team wins"))
    return results

In [187]:
def check_missing_players(player_list, player_data):
    missing_players = []
    for game in player_list:
        for team in game:
            for player in team:
                if player.lower() not in player_data:
                    missing_players.append(player)
    return missing_players

# Now you can use the check_missing_players function
missing_players = check_missing_players(player_list, test_player_data)
print("Missing players:", missing_players)
len(missing_players)

Missing players: ['l', 'i', 'n', 'e', 'u', 'p']


6

In [186]:
with open('player_list_2022.pkl', 'rb') as f:
    player_list = pickle.load(f)
print(player_list)
# Assuming the index of the game you want to predict is 10154
game_to_predict = player_list.loc[10154, 'lineup']

print(predict_game_outcome(game_to_predict, model, test_player_data))
print(predict_game_outcome(player_list, model, test_player_data))

                                                  lineup
10154  [(Max Fried, Max Fried, Travis d'Arnaud, Fredd...
10155  [(Shane Bieber, Roberto Perez, Yu Chang, Cesar...
10156  [(Zack Greinke, Martin Maldonado, Yulieski Gur...
10157  [(Kenta Maeda, Kenta Maeda, Mitch Garver, Migu...
10158  [(Cristian Javier, Martin Maldonado, Yulieski ...
...                                                  ...
11200  [(Ashton Goudeau, Ashton Goudeau, Dom Nunez, C...
11201  [(Aaron Civale, Austin Hedges, Yu Chang, Andre...
11202  [(Bruce Zimmermann, Pedro Severino, Ryan Mount...
11203  [(Cole Irvin, Yan Gomes, Seth Brown, Tony Kemp...
11204  [(Chris Sale, Chris Sale, Christian Vazquez, K...

[1051 rows x 1 columns]


KeyError: 'm'

In [179]:
# SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define hyperparameters for search
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf', 'poly', 'sigmoid']}

# Grid search for hyperparameters
clf = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=3)
clf.fit(X_train_scaled, y_train)

# Predict using the best model
y_pred = clf.predict(X_test_scaled)

class_report = classification_report(y_test, y_pred)

cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=clf.classes_)
disp.plot()

print("Classification Report:")
print(class_report)

NameError: name 'GridSearchCV' is not defined

In [None]:
# forests and feature ranking
hypers = classification_hyper_param_search(X, y, 10, 10)
xb_hypers = hypers[0]['best_params']
rf_hypers = hypers[1]['best_params']
ranking = classification_ranking(X, y, rf_hypers, xb_hypers)
scoring = voting(ranking)
#plot_ranking(scoring, title='Classification example')

Fitting 10 folds for each of 10 candidates, totalling 100 fits


KeyboardInterrupt: 

In [None]:
"""
EXTRA
def get_player_data(paths):
    player_dfs = [pd.read_csv(path) for path in paths]
    player_dfs = [df.assign(Name=df['last_name, first_name'].apply(lambda x: ' '.join(x.split(', ')[::-1]).lower())).drop(columns=['player_id', 'last_name, first_name']) for df in player_dfs]
    combined_player = pd.concat(player_dfs).groupby('Name', as_index=False).mean()
    final_player = combined_player.set_index('Name').T.to_dict('list')
    return final_player

df.iloc[:, 102] # visiting pitchers
df.iloc[:, 104] # home pitchers
df.iloc[:, 105:132] # visiting players
df.iloc[:, 132:159] # home players
df.iloc[:, 9] # visiting score
df.iloc[:, 10] # home score
df.iloc[:, 3] # visiting team
df.iloc[:, 6] # home team
visiting_teams = df.iloc[:, 3].tolist()
home_teams = df.iloc[:, 6].tolist()
"""