# Imports

In [145]:
import pandas as pd
import glob
import os
import numpy as np
from featureranker.utils import *
from featureranker.plots import *
from featureranker.rankers import *
from tqdm.auto import tqdm
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from glob import glob
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings('ignore')

# Functions

In [164]:
def check_missing_players(player_list, player_data):
    missing_players = []
    for game in player_list:
        for team in game:
            for player in team:
                if player.lower() not in player_data:
                    missing_players.append(player)
    return missing_players


def get_names(df, start, stop):
    player_list = []
    drop_list = df.iloc[:, start:stop:3].columns.tolist()
    name_df = df.drop(drop_list, axis=1, inplace=False)
    name_df = name_df.iloc[:, start:stop-len(drop_list)]
    for i in range(len(name_df)):
        players = name_df.iloc[i].tolist()
        players_sorted = [players[j] for j in range(0, len(players), 2)]
        players_sorted.sort(key=lambda x: players[players.index(x) + 1])
        player_list.append(players_sorted)
    return player_list


def list_stacker(pitchers, players):
    for i in range(len(pitchers)):
        players[i].insert(0, pitchers[i])
    return players


def get_game_data(paths):
    all_data = []
    for i in range(len(paths)):
        df = pd.read_csv(paths[i], header=None)
        visiting_players = get_names(df, 105, 132)
        home_players = get_names(df, 132, 159)
        home_pitchers = df.iloc[:, 104].tolist()
        visiting_pitchers = df.iloc[:, 102].tolist()
        visiting_players = list_stacker(visiting_pitchers, visiting_players)
        home_players = list_stacker(home_pitchers, home_players)
        visiting_scores = df.iloc[:, 9].tolist()
        home_scores = df.iloc[:, 10].tolist()
        all_data.extend(list(zip(visiting_players, home_players, visiting_scores, home_scores)))
    return all_data


def get_player_data(paths):
    le = LabelEncoder()
    player_dfs = []
    for path in paths:
        df = pd.read_csv(path)
        df['Team'] = le.fit_transform(df['Team'])     
        if 'Pos' in df.columns:
            df['Pos'] = le.fit_transform(df['Pos'])
        if 'batting' in path:
            df['ExtraCol1'] = 0
            df['ExtraCol2'] = 0
            df['ExtraCol3'] = 0
            df['ExtraCol4'] = 0
            df['ExtraCol5'] = 0
            df['ExtraCol6'] = 0
            df['ExtraCol7'] = 0
            df['ExtraCol8'] = 0
        df.columns = ['Player'] + list(range(len(df.columns) - 1))
        df['Player'] = df['Player'].str.lower()
        player_dfs.append(df)
    combined_player = pd.concat(player_dfs)
    combined_player = combined_player.groupby('Player', as_index=False).mean()
    final_player = combined_player.set_index('Player').T.to_dict('list')
    return final_player


def get_example(game, player_data):
    visiting_players = game[0]
    home_players = game[1]
    visiting_vector = [player_data[player.lower()] for player in visiting_players]
    home_vector = [player_data[player.lower()] for player in home_players]
    stacked_vector = [item for sublist in visiting_vector + home_vector for item in sublist]
    if game[2] > game[3]:
        label = 0 # might need to flip
    else:
        label = 1
    return stacked_vector, label


def generate_vectors_and_labels(game_data, player_data):
    vectors, labels = [], []
    for game in tqdm(game_data):
        try:
            vector, label = get_example(game, player_data)
            if np.isnan(vector).any() or np.isnan(label):
                continue
            vectors.append(vector)
            labels.append(label)
        except:
            continue
    return pd.DataFrame(np.array(vectors)), pd.DataFrame(np.array(labels))


def simulate_betting_season(bets, df, wallet_balance=1000.0):

    wallet_balance_history = [wallet_balance]  # Start with the initial wallet balance
    for bet in bets:
        bet_amount, bet_team, bet_date = bet
        row = df[(df['Date'] == bet_date) & ((df['HmTm'] == bet_team) | (df['VisTm'] == bet_team))]
        if not row.empty and not row.isnull().values.any():
            if row['winner'].values[0]==1:
                winner = row['HmTm'].values[0]
                if winner == bet_team:
                    if row['home_open'].values[0] > 0:
                        wallet_balance += bet_amount * (row['home_open'].values[0] / 100.0)
                    else:
                        wallet_balance += bet_amount * (100.0 / abs(row['home_open'].values[0]))
                else:
                    wallet_balance -= bet_amount
            else:
                winner = row['VisTm'].values[0]
                if winner == bet_team:
                    if row['visiting_open'].values[0] > 0:
                        wallet_balance += bet_amount * (row['visiting_open'].values[0] / 100.0)
                    else:
                        wallet_balance += bet_amount * (100.0 / abs(row['visiting_open'].values[0]))
                else:
                    wallet_balance -= bet_amount
            wallet_balance_history.append(wallet_balance)
        else:
            print(f"No betting data for date: {bet_date}")
    return wallet_balance, wallet_balance_history


def get_player_vector(players, player_data):
    return [player_data[player.lower()] for player in players]


def predict_game_outcome(player_list, player_data, model, augment=False, base_bet=100, scaler=1):
    results, bets, skipped = [], [], []
    for i in range(len(player_list)):
        try:
            visiting_players = player_list.iloc[i][0][0]
            home_players = player_list.iloc[i][0][1]
            visiting_vector = get_player_vector(visiting_players, player_data)
            home_vector = get_player_vector(home_players, player_data)
            game_vector = [item for sublist in visiting_vector + home_vector for item in sublist]  # Flatten the vectors
            game_vector = torch.tensor([game_vector], dtype=torch.float)
            model.eval()
            with torch.no_grad():
                logits = model(game_vector)
                probs = logits.softmax(dim=-1)
                _, pred = torch.max(logits, 1)
                pred = pred.item()
                prob = probs[0][pred].item()
            results.append(pred)
            bets.append(base_bet * prob * scaler if augment else base_bet)
        except:
            skipped.append(i)
    return results, bets, skipped


def generate_bets(results, bets, df, skipped):
    final_bets = []
    for i, (result, (_, row)) in enumerate(zip(results, df.iterrows())):
        if i in skipped:
            continue
        if result == 1:
            bet_team = row['HmTm']
        else:
            bet_team = row['VisTm']
        bet_date = row['Date']
        final_bets.append((bets[i], bet_team, bet_date))
    return final_bets

In [147]:
# import pandas as pd
# import os
# import glob

# path = 'C:/Users/vile3/.cursor-tutor/projects/python/Lahman_compiled_player_data/'
# all_files = glob.glob(os.path.join(path, "*.csv"))
# people_path = 'C:/Users/vile3/.cursor-tutor/projects/python/Lahman_MLB_People/Lahman_MLB_People.csv'
# people_df = pd.read_csv(people_path)

# for file in all_files:
#     df = pd.read_csv(file)
#     df = df[df['yearID'] >= 2000]
#     df = pd.merge(df, people_df[['playerID', 'nameFirst', 'nameLast']], on='playerID', how='left')
#     df['Player'] = df['nameFirst'] + ' ' + df['nameLast']
#     df.drop(['playerID', 'nameFirst', 'nameLast', 'Unnamed: 0','stint','lgID' ], axis=1, inplace=True)  # Remove 'Unnamed' column
#     df.rename(columns={'teamID': 'Team'}, inplace=True)  # Rename 'teamID' to 'Team'
#     cols = df.columns.tolist()
#     cols = cols[-1:] + cols[:-1]  # Make 'Player' the first column
#     df = df[cols]
#     split_files = df.groupby('yearID')
#     df.drop(['yearID'], axis=1, inplace=True)
#     for name, group in split_files:
#         group.to_csv(f"{path}{os.path.splitext(os.path.basename(file))[0]}_{name}.csv", index=False)



# Data

In [148]:
def load_data(game_path, player_path, train_years, test_year):
    game_csvs_train = [path.replace('\\', '/') for path in glob(game_path + '*.txt') if str(test_year) not in path and any(str(year) in path for year in train_years)]
    game_csvs_test = [path.replace('\\', '/') for path in glob(game_path + '*.txt') if str(test_year) in path]

    player_csvs_train = [path.replace('\\', '/') for path in glob(player_path + '*.csv') if str(test_year) not in path and any(str(year) in path for year in train_years)]
    player_csvs_test = [path.replace('\\', '/') for path in glob(player_path + '*.csv') if str(test_year) in path]
    print(game_csvs_train)
    print(player_csvs_train)
    print(game_csvs_test)
    print(player_csvs_test)
    train_game_data = [game for game in get_game_data(game_csvs_train) if game[2] != game[3]]
    test_game_data = [game for game in get_game_data(game_csvs_test) if game[2] != game[3]]

    train_player_data = get_player_data(player_csvs_train)
    test_player_data = get_player_data(player_csvs_test)
   
    X_train, y_train = generate_vectors_and_labels(train_game_data, train_player_data)
    X_test, y_test = generate_vectors_and_labels(test_game_data, test_player_data)

    return X_train, y_train, X_test, y_test, train_game_data, test_game_data, train_player_data, test_player_data

# load data
game_path = './game_data/'
player_path = './Lahman_compiled_player_data/'

test_year = 2021
train_years = range(2010, test_year) #train years from 2010 to test year -1 for some reason range reduces the year by 1.

X_train, y_train, X_test, y_test, train_game_data, test_game_data, train_player_data, test_player_data = load_data(game_path, player_path, train_years, test_year)

player_path = './Lahman_compiled_player_data/'
paths = [path.replace('\\', '/') for path in glob(player_path + '*.csv')]
for path in paths:
    df = pd.read_csv(path)
    view_data(df)

xlsx_path = './betting_odds/'
txt_path = './alldata/gamelogs/'
glfields_path = './alldata/gamelogs/glfields.txt'
paths = [path for path in glob(xlsx_path + '*.xlsx')]
paths_txt = [f'{txt_path}gl{test_year}.txt']
schedule_paths = [f'./alldata/schedules/{year}schedule.csv' for year in range(2010, 2024)]

['./game_data/gl2010.txt', './game_data/gl2011.txt', './game_data/gl2012.txt', './game_data/gl2013.txt', './game_data/gl2014.txt', './game_data/gl2015.txt', './game_data/gl2016.txt', './game_data/gl2017.txt', './game_data/gl2018.txt', './game_data/gl2019.txt', './game_data/gl2020.txt']
['./Lahman_compiled_player_data/Lahman_MLB_batting_2010.csv', './Lahman_compiled_player_data/Lahman_MLB_batting_2011.csv', './Lahman_compiled_player_data/Lahman_MLB_batting_2012.csv', './Lahman_compiled_player_data/Lahman_MLB_batting_2013.csv', './Lahman_compiled_player_data/Lahman_MLB_batting_2014.csv', './Lahman_compiled_player_data/Lahman_MLB_batting_2015.csv', './Lahman_compiled_player_data/Lahman_MLB_batting_2016.csv', './Lahman_compiled_player_data/Lahman_MLB_batting_2017.csv', './Lahman_compiled_player_data/Lahman_MLB_batting_2018.csv', './Lahman_compiled_player_data/Lahman_MLB_batting_2019.csv', './Lahman_compiled_player_data/Lahman_MLB_batting_2020.csv', './Lahman_compiled_player_data/Lahman_MLB

100%|██████████| 25194/25194 [00:00<00:00, 40745.53it/s]
100%|██████████| 2429/2429 [00:00<00:00, 46094.17it/s]


The column lgID has 0.7% NaN values.
The column RBI has 0.7% NaN values.
The column SB has 2.1% NaN values.
The column CS has 21.0% NaN values.
The column SO has 1.9% NaN values.
The column IBB has 32.7% NaN values.
The column HBP has 2.5% NaN values.
The column SH has 5.4% NaN values.
The column SF has 32.2% NaN values.
The column GIDP has 22.7% NaN values.
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no NaN values in the dataset
There are no Na

In [149]:
print(paths_txt)

['./alldata/gamelogs/gl2021.txt']


In [150]:
xlsx_dataframes = []
for i, path in enumerate(paths):
    temp_df = pd.read_excel(path, skiprows=0 if i > 0 else 0)
    temp_df.columns = temp_df.columns.str.replace('[^a-zA-Z0-9]', '')
    year = path[-9:-5]  # extract year from file name
    if 'Date' in temp_df.columns:
        temp_df['Date'] = year + temp_df['Date'].astype(str).str.zfill(4)  # format date as yyyymmdd
    xlsx_dataframes.append(temp_df)

try:
    xlsx_dataframes = pd.concat(xlsx_dataframes, ignore_index=True)
except pd.errors.InvalidIndexError:
    print('Error: Reindexing only valid with uniquely valued Index objects')

with open(glfields_path, 'r') as file:
    glfields_data = file.read()

xlsx_dataframes = xlsx_dataframes[['Date', 'VH', 'Team', 'Open']]
home_teams_df = xlsx_dataframes[xlsx_dataframes['VH'] == 'H'].copy()
visiting_teams_df = xlsx_dataframes[xlsx_dataframes['VH'] == 'V'].copy()

home_teams_df.rename(columns={'Team': 'HmTm', 'Open': 'home_open'}, inplace=True)
visiting_teams_df.rename(columns={'Team': 'VisTm', 'Open': 'visiting_open'}, inplace=True)

# Merge on 'date'
xlsx_dataframes = pd.concat([home_teams_df.reset_index(drop=True), visiting_teams_df.reset_index(drop=True)], axis=1)
xlsx_dataframes = xlsx_dataframes.loc[:,~xlsx_dataframes.columns.duplicated()]
xlsx_dataframes = xlsx_dataframes[['Date', 'HmTm', 'VisTm','home_open','visiting_open']]
print(xlsx_dataframes)
# Define the new path

path_csv = 'C:/Users/vile3/.cursor-tutor/projects/python/Lahman_MLB_per_game_data/Lahman_MLB_per_game_data.csv'

# Read the CSV file
df = pd.read_csv(path_csv)

# Replace the txt_dataframes with the new dataframe
txt_dataframes = [df]

# The rest of your code remains the same
txt_dataframes_concat = pd.concat(txt_dataframes)

# Determine the winning team for each game and add it as a new column 'winner'
txt_dataframes_concat['winner'] = txt_dataframes_concat.apply(lambda row: 1 if row["HmRuns"] > row["VisRuns"] else 2 if row["HmRuns"] == row["VisRuns"] else 0, axis=1)

# Adjust the column names to match the new CSV file
txt_dataframes_concat = txt_dataframes_concat[['Date', 'HmTm', 'VisTm', 'HmStPchNm', 'VisStPchNm', 
                                              'HmBat1Nm', 'HmBat2Nm', 'HmBat3Nm', 
                                              'HmBat4Nm', 'HmBat5Nm', 'HmBat6Nm', 
                                              'HmBat7Nm', 'HmBat8Nm', 'HmBat9Nm', 
                                              'VisBat1Nm', 'VisBat2Nm', 'VisBat3Nm', 
                                              'VisBat4Nm', 'VisBat5Nm', 'VisBat6Nm', 
                                              'VisBat7Nm', 'VisBat8Nm', 'VisBat9Nm',
                                              'winner']]

xlsx_dataframes['Date'] = xlsx_dataframes['Date'].astype(int)
# Now perform the merge operation
merged_dataframes = pd.merge(xlsx_dataframes, txt_dataframes_concat, on=['Date', 'HmTm', 'VisTm'])
merged_dataframes_full = pd.merge(txt_dataframes_concat, xlsx_dataframes, on=['Date', 'HmTm', 'VisTm'], how="left")

player_list_concat = pd.concat(txt_dataframes)
player_list_concat = player_list_concat[['Date', 'HmTm', 'VisTm', 'HmStPchNm', 'VisStPchNm', 
                                          'HmBat1Nm', 'HmBat2Nm', 'HmBat3Nm', 
                                          'HmBat4Nm', 'HmBat5Nm', 'HmBat6Nm', 
                                          'HmBat7Nm', 'HmBat8Nm', 'HmBat9Nm', 
                                          'VisBat1Nm', 'VisBat2Nm', 'VisBat3Nm', 
                                          'VisBat4Nm', 'VisBat5Nm', 'VisBat6Nm', 
                                          'VisBat7Nm', 'VisBat8Nm', 'VisBat9Nm',
                                          'HmRuns', 'VisRuns',
                                          'HmBat1Pos', 'HmBat2Pos', 
                                          'HmBat3Pos', 'HmBat4Pos', 
                                          'HmBat5Pos', 'HmBat6Pos', 
                                          'HmBat7Pos', 'HmBat8Pos', 
                                          'HmBat9Pos', 'VisBat1Pos', 
                                          'VisBat2Pos', 'VisBat3Pos', 
                                          'VisBat4Pos', 'VisBat5Pos', 
                                          'VisBat6Pos', 'VisBat7Pos', 
                                          'VisBat8Pos', 'VisBat9Pos']]

player_list_concat['lineup'] = player_list_concat.apply(lambda row: [
    (
        (row['VisStPchNm'],) +
        tuple(row[f'VisBat{i}Nm'] for i in sorted(range(1, 10), key=lambda i: row[f'VisBat{i}Pos']))
    ),
    (
        (row['HmStPchNm'],) +
        tuple(row[f'HmBat{i}Nm'] for i in sorted(range(1, 10), key=lambda i: row[f'HmBat{i}Pos']))
    ),
], axis=1).tolist()

merged_player_dataframe = pd.merge(xlsx_dataframes, player_list_concat, on=['Date', 'HmTm', 'VisTm'])

           Date HmTm VisTm home_open visiting_open
0      20100404  BOS   NYY      -114          -106
1      20100405  WAS   PHI       170          -200
2      20100405  NYM   MIA      -115          -105
3      20100405  CIN   STL       135          -155
4      20100405  PIT   LOS       135          -155
...         ...  ...   ...       ...           ...
28001  20211027  HOU   ATL      -115          -105
28002  20211029  ATL   HOU      -115          -105
28003  20211030  ATL   HOU      -115          -105
28004  20211031  ATL   HOU      -105          -115
28005  20211102  HOU   ATL      -120           100

[28006 rows x 5 columns]


In [151]:
print(xlsx_dataframes.columns)


Index(['Date', 'HmTm', 'VisTm', 'home_open', 'visiting_open'], dtype='object')


In [152]:


# # load data
# game_path = './game_data/'
# player_path = './player_data/'

# game_csvs_train = [path.replace('\\', '/') for path in glob(game_path + '*.txt') if '2019' not in path and '2020' not in path and '2021' not in path and '2022' not in path and '2023' not in path]
# game_csvs_test = [path.replace('\\', '/') for path in glob(game_path + '*.txt') if '2019' in path]

# player_csvs_train = [path.replace('\\', '/') for path in glob(player_path + '*.csv') if '2019' not in path and '2020' not in path and '2021' not in path and '2022' not in path and '2023' not in path]
# player_csvs_test = [path.replace('\\', '/') for path in glob(player_path + '*.csv') if '2019' in path]

# train_game_data = [game for game in get_game_data(game_csvs_train) if game[2] != game[3]]
# test_game_data = [game for game in get_game_data(game_csvs_test) if game[2] != game[3]]

# train_player_data = get_player_data(player_csvs_train)
# test_player_data = get_player_data(player_csvs_test)

# X_train, y_train = generate_vectors_and_labels(train_game_data, train_player_data)
# X_test, y_test = generate_vectors_and_labels(test_game_data, test_player_data)

In [153]:
print(merged_player_dataframe)

           Date HmTm VisTm home_open visiting_open        HmStPchNm  \
0      20100405  WAS   PHI       170          -200      John Lannan   
1      20100405  MIL   COL      -125           105  Yovani Gallardo   
2      20100405  TEX   TOR      -135           115    Scott Feldman   
3      20100405  OAK   SEA       105          -125       Ben Sheets   
4      20100406  MIL   COL      -140           120       Randy Wolf   
...         ...  ...   ...       ...           ...              ...   
10655  20211003  ARI   COL       105          -125   Humberto Mejia   
10656  20211003  TEX   CLE      -115          -105     Dane Dunning   
10657  20211003  TOR   BAL      -360           290     Hyun-Jin Ryu   
10658  20211003  HOU   OAK      -195           165     Jose Urquidy   
10659  20211003  WAS   BOS       180          -220        Joan Adon   

             VisStPchNm         HmBat1Nm            HmBat2Nm  \
0          Roy Halladay     Nyjer Morgan       Willie Harris   
1        Ubaldo Jim

In [154]:
# xlsx_dataframes.to_csv('output.csv', index=False)

In [155]:
# print(merged_player_dataframe['lineup'])

# NN

In [156]:
class vector_dataset(Dataset):
    def __init__(self, X, y):
        self.X = np.array(X)
        self.y = np.array(y)

    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        vec = torch.tensor(self.X[idx], dtype=torch.float)
        label = torch.tensor(self.y[idx], dtype=torch.long)
        return vec, label


class neural_net(nn.Module):
    def __init__(self, input_size, hidden_size, hidden_num, output_size, dropout_rate):
        super(neural_net, self).__init__()
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(dropout_rate)
        self.hidden_num = hidden_num
        self.input_layer = nn.Linear(input_size, hidden_size)
        self.hidden_layers = nn.ModuleList()
        for i in range(hidden_num):
            self.hidden_layers.append(nn.Linear(hidden_size, hidden_size))
        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.gelu(self.input_layer(x))
        x = self.dropout(x)
        for i in range(self.hidden_num):
            x = self.gelu(self.hidden_layers[i](x))
            x = self.dropout(x)
        x = self.output_layer(x)
        return x

model = neural_net(X_train.shape[1], X_train.shape[1]*2, 2, 2, 0.15)
model

neural_net(
  (gelu): GELU(approximate='none')
  (dropout): Dropout(p=0.15, inplace=False)
  (input_layer): Linear(in_features=520, out_features=1040, bias=True)
  (hidden_layers): ModuleList(
    (0-1): 2 x Linear(in_features=1040, out_features=1040, bias=True)
  )
  (output_layer): Linear(in_features=1040, out_features=2, bias=True)
)

In [157]:
train_dataset = vector_dataset(X_train, y_train)
test_dataset = vector_dataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss()

In [158]:
patience = 15
best_loss = float('inf')
patience_counter = 0
best_model_path = 'best_model.pth'  # File path for saving the best model

for epoch in range(50):  # number of epochs
    model.train()
    train_losses = []
    for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}'):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.squeeze())
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    avg_train_loss = np.mean(train_losses)
    print(f'Training Loss: {avg_train_loss}')

    model.eval()
    valid_losses = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels.squeeze())
            valid_losses.append(loss.item())

    avg_valid_loss = np.mean(valid_losses)
    print(f'Validation Loss: {avg_valid_loss}')

    # Save the model at each epoch
    epoch_model_path = f'model_epoch_{epoch + 1}.pth'
    torch.save(model.state_dict(), epoch_model_path)

    # Update the best model if validation loss improves
    if avg_valid_loss < best_loss:
        best_loss = avg_valid_loss
        torch.save(model.state_dict(), best_model_path)
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print('Early stopping triggered')
            break

# Load the best model
model.load_state_dict(torch.load(best_model_path))

# Evaluate the best model
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate confusion matrix and classification report
conf_matrix = confusion_matrix(all_labels, all_preds)
class_report = classification_report(all_labels, all_preds)
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Epoch 1: 100%|██████████| 224/224 [00:04<00:00, 47.01it/s]


Training Loss: 1.0440555435738392
Validation Loss: 0.6902901917695999


Epoch 2: 100%|██████████| 224/224 [00:04<00:00, 47.63it/s]


Training Loss: 0.68724750594369
Validation Loss: 0.6838438868522644


Epoch 3: 100%|██████████| 224/224 [00:04<00:00, 48.07it/s]


Training Loss: 0.6849597756351743
Validation Loss: 0.6948444932699204


Epoch 4: 100%|██████████| 224/224 [00:04<00:00, 47.58it/s]


Training Loss: 0.6821118510727372
Validation Loss: 0.6916826486587524


Epoch 5: 100%|██████████| 224/224 [00:04<00:00, 48.01it/s]


Training Loss: 0.6800714608814035
Validation Loss: 0.6774281829595565


Epoch 6: 100%|██████████| 224/224 [00:04<00:00, 48.61it/s]


Training Loss: 0.6814463955483266
Validation Loss: 0.6801083326339722


Epoch 7: 100%|██████████| 224/224 [00:04<00:00, 49.26it/s]


Training Loss: 0.680154184411679
Validation Loss: 0.6839700192213058


Epoch 8: 100%|██████████| 224/224 [00:04<00:00, 47.85it/s]


Training Loss: 0.6786999963223934
Validation Loss: 0.6808146268129349


Epoch 9: 100%|██████████| 224/224 [00:04<00:00, 47.23it/s]


Training Loss: 0.6789044657988208
Validation Loss: 0.6793304502964019


Epoch 10: 100%|██████████| 224/224 [00:04<00:00, 49.17it/s]


Training Loss: 0.6786603690790278
Validation Loss: 0.685047909617424


Epoch 11: 100%|██████████| 224/224 [00:04<00:00, 49.37it/s]


Training Loss: 0.6776197885296175
Validation Loss: 0.6816428899765015


Epoch 12: 100%|██████████| 224/224 [00:04<00:00, 50.02it/s]


Training Loss: 0.6775565150060824
Validation Loss: 0.6841548562049866


Epoch 13: 100%|██████████| 224/224 [00:04<00:00, 47.05it/s]


Training Loss: 0.6785384201045547
Validation Loss: 0.6789781302213669


Epoch 14: 100%|██████████| 224/224 [00:04<00:00, 48.94it/s]


Training Loss: 0.6774453209447009
Validation Loss: 0.6815139591693878


Epoch 15: 100%|██████████| 224/224 [00:04<00:00, 46.93it/s]


Training Loss: 0.6755487187100309
Validation Loss: 0.6783029109239578


Epoch 16: 100%|██████████| 224/224 [00:04<00:00, 47.75it/s]


Training Loss: 0.6770113581525428
Validation Loss: 0.6769505769014359


Epoch 17: 100%|██████████| 224/224 [00:04<00:00, 48.99it/s]


Training Loss: 0.6769713786031518
Validation Loss: 0.6830625116825104


Epoch 18: 100%|██████████| 224/224 [00:04<00:00, 47.78it/s]


Training Loss: 0.6750016965504203
Validation Loss: 0.6788893967866898


Epoch 19: 100%|██████████| 224/224 [00:04<00:00, 48.56it/s]


Training Loss: 0.6728615079607282
Validation Loss: 0.7063958495855331


Epoch 20: 100%|██████████| 224/224 [00:04<00:00, 48.43it/s]


Training Loss: 0.6749196358557258
Validation Loss: 0.6834502875804901


Epoch 21: 100%|██████████| 224/224 [00:04<00:00, 47.92it/s]


Training Loss: 0.6736613275217158
Validation Loss: 0.6820438265800476


Epoch 22: 100%|██████████| 224/224 [00:04<00:00, 48.47it/s]


Training Loss: 0.6727712444428887
Validation Loss: 0.6870870113372802


Epoch 23: 100%|██████████| 224/224 [00:04<00:00, 48.57it/s]


Training Loss: 0.6728542263486555
Validation Loss: 0.6825290143489837


Epoch 24: 100%|██████████| 224/224 [00:04<00:00, 48.53it/s]


Training Loss: 0.6751849693911416
Validation Loss: 0.7001162379980087


Epoch 25: 100%|██████████| 224/224 [00:04<00:00, 48.36it/s]


Training Loss: 0.6724312539611544
Validation Loss: 0.6790608316659927


Epoch 26: 100%|██████████| 224/224 [00:04<00:00, 48.96it/s]


Training Loss: 0.67036569517638
Validation Loss: 0.6857291400432587


Epoch 27: 100%|██████████| 224/224 [00:04<00:00, 48.84it/s]


Training Loss: 0.6696228494069406
Validation Loss: 0.6833903849124908


Epoch 28: 100%|██████████| 224/224 [00:04<00:00, 48.46it/s]


Training Loss: 0.6696379014423915
Validation Loss: 0.7008301615715027


Epoch 29: 100%|██████████| 224/224 [00:04<00:00, 47.82it/s]


Training Loss: 0.6706823384655374
Validation Loss: 0.6794489055871964


Epoch 30: 100%|██████████| 224/224 [00:04<00:00, 48.24it/s]


Training Loss: 0.667616116415177
Validation Loss: 0.6900422126054764


Epoch 31: 100%|██████████| 224/224 [00:04<00:00, 48.77it/s]


Training Loss: 0.6675857459860188
Validation Loss: 0.6862527161836625
Early stopping triggered
Confusion Matrix:
[[369 220]
 [295 348]]
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.63      0.59       589
           1       0.61      0.54      0.57       643

    accuracy                           0.58      1232
   macro avg       0.58      0.58      0.58      1232
weighted avg       0.59      0.58      0.58      1232



# Run Season

In [165]:
date1, date2 = int(str(test_year)+'0101'), int(str(test_year+1)+'0101')
subset_df = merged_player_dataframe[(merged_player_dataframe['Date'] >= date1) & (merged_player_dataframe['Date'] <= date2)]
player_list = subset_df['lineup'].to_frame()

results, bets, skipped = predict_game_outcome(player_list, train_player_data, model, augment=False, scaler=10, base_bet=20)

final_bets = generate_bets(results, bets, subset_df, skipped)

print('Number of games: ', len(subset_df))
print('Predictions: ', results)
print('Bets: ', bets)

wallet_balance, wallet_balance_history = simulate_betting_season(final_bets, merged_dataframes)
print(wallet_balance)
print(wallet_balance_history)
plt.plot(wallet_balance_history)
plt.title('Wallet Balance Over Betting Season')
plt.xlabel('Bets')
plt.ylabel('Wallet Balance')
plt.show()

Number of games:  967
Predictions:  [1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0]
Bets:  [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 

KeyError: 'date'

In [None]:
len(merged_dataframes)

967

In [None]:
print(final_bets)

[(20, 'CLE', 20210401), (20, 'MIN', 20210401), (20, 'DET', 20210403), (20, 'MIL', 20210403), (20, 'ATL', 20210404), (20, 'MIL', 20210404), (20, 'TEX', 20210405), (20, 'WAS', 20210406), (20, 'TEX', 20210406), (20, 'WAS', 20210407), (20, 'WAS', 20210407), (20, 'WAS', 20210407), (20, 'WAS', 20210407), (20, 'PHI', 20210410), (20, 'BAL', 20210410), (20, 'ARI', 20210411), (20, 'CLE', 20210411), (20, 'ATL', 20210412), (20, 'OAK', 20210412), (20, 'ATL', 20210413), (20, 'ARI', 20210413), (20, 'MIA', 20210414), (20, 'ATL', 20210415), (20, 'WAS', 20210416), (20, 'PIT', 20210416), (20, 'TEX', 20210416), (20, 'WAS', 20210417), (20, 'MIL', 20210417), (20, 'OAK', 20210417), (20, 'TEX', 20210417), (20, 'WAS', 20210418), (20, 'TEX', 20210418), (20, 'CIN', 20210418), (20, 'OAK', 20210420), (20, 'MIN', 20210420), (20, 'MIN', 20210420), (20, 'OAK', 20210420), (20, 'MIA', 20210420), (20, 'OAK', 20210421), (20, 'ARI', 20210423), (20, 'BAL', 20210423), (20, 'BAL', 20210424), (20, 'ATL', 20210425), (20, 'ATL'

In [None]:
# len(final_bets)
len(train_player_data)

3741

# Misc

In [None]:
missing_players = check_missing_players(player_list, train_player_data)
print('Missing players:', missing_players)
len(missing_players)

Missing players: ['l', 'i', 'n', 'e', 'u', 'p']


6

In [None]:
print(skipped)
# for index in skipped:missing_players_per_game = []


[0, 2, 4, 5, 6, 8, 9, 12, 13, 14, 16, 17, 20, 21, 22, 24, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, 43, 44, 47, 48, 50, 52, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 79, 80, 81, 86, 87, 89, 91, 92, 94, 95, 101, 102, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 115, 117, 118, 119, 120, 122, 123, 124, 125, 126, 132, 133, 135, 136, 138, 139, 140, 142, 143, 144, 146, 147, 148, 150, 151, 153, 155, 156, 157, 160, 161, 163, 166, 167, 168, 170, 173, 174, 175, 176, 178, 181, 182, 183, 185, 186, 187, 188, 189, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 205, 206, 207, 209, 211, 213, 214, 215, 216, 217, 218, 219, 221, 224, 225, 226, 227, 228, 229, 230, 231, 234, 235, 236, 237, 238, 240, 241, 242, 243, 244, 245, 246, 247, 248, 250, 251, 252, 253, 255, 256, 257, 259, 260, 261, 262, 263, 264, 265, 267, 268, 270, 271, 273, 274, 275, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 294, 295, 297, 298, 299, 300, 301, 306,

In [None]:
len(pd.DataFrame(train_player_data.keys()))
len(player_list)

967

In [None]:
# extra_players_per_game = []
# for game in player_list_df.iloc[:, 0]:
#     # Flatten the list of players for the current game
#     game_players_flat = [player for sublist in game for player in sublist]
#     # Convert the list into a set to remove duplicates
#     game_player_set = set(game_players_flat)
#     extra_players = train_player_set - game_player_set
#     extra_players_per_game.append(extra_players)

# print(extra_players_per_game)

In [None]:
def check_missing_players_per_game(player_list, player_data):
    missing_players_per_game = []
    for game in player_list:
        missing_players = []
        for team in game:
            for player in team:
                if player.lower() not in player_data:
                    missing_players.append(player)
        missing_players_per_game.append(missing_players)
    return missing_players_per_game

missing_players_per_game = check_missing_players_per_game(player_list['lineup'], train_player_data) #Check if there are any wrong player names between the two datasets
# missing_players_per_game = check_missing_players_per_game(player_list['lineup'], train_player_data) #Check if there are any players  
for i, missing_players in enumerate(missing_players_per_game):
    print(f'Missing players for game {i+1}:', missing_players)

Missing players for game 1: ['J.T. Realmuto']
Missing players for game 2: []
Missing players for game 3: ['Yulieski Gurriel']
Missing players for game 4: []
Missing players for game 5: ['Yulieski Gurriel', "Ka'ai Tom"]
Missing players for game 6: ['J.D. Martinez']
Missing players for game 7: ['J.T. Realmuto']
Missing players for game 8: []
Missing players for game 9: ['J.D. Martinez']
Missing players for game 10: ['Yulieski Gurriel']
Missing players for game 11: []
Missing players for game 12: []
Missing players for game 13: ['Akil Baddoo']
Missing players for game 14: ['J.D. Martinez']
Missing players for game 15: ['Yulieski Gurriel', 'Chas McCormick', "Ka'ai Tom"]
Missing players for game 16: []
Missing players for game 17: ['Jonathan India']
Missing players for game 18: ['Akil Baddoo']
Missing players for game 19: []
Missing players for game 20: []
Missing players for game 21: ['Jonathan India']
Missing players for game 22: ['Gerardo Perdomo', 'C.J. Cron']
Missing players for game 2

In [None]:
def remove_empty_and_duplicate_games(missing_players_per_game):
    filtered_games = []
    for game in missing_players_per_game:
        if game and game not in filtered_games:
            filtered_games.append(game)
    return filtered_games

filtered_games = remove_empty_and_duplicate_games(missing_players_per_game)
for i, missing_players in enumerate(filtered_games):
    print(f'Missing players for game {i+1}:', missing_players)



Missing players for game 1: ['J.T. Realmuto']
Missing players for game 2: ['Yulieski Gurriel']
Missing players for game 3: ['Yulieski Gurriel', "Ka'ai Tom"]
Missing players for game 4: ['J.D. Martinez']
Missing players for game 5: ['Akil Baddoo']
Missing players for game 6: ['Yulieski Gurriel', 'Chas McCormick', "Ka'ai Tom"]
Missing players for game 7: ['Jonathan India']
Missing players for game 8: ['Gerardo Perdomo', 'C.J. Cron']
Missing players for game 9: ['J.A. Happ']
Missing players for game 10: ['C.J. Cron', 'Josh Fuentes']
Missing players for game 11: ['Hyun-Jin Ryu']
Missing players for game 12: ['Gerardo Perdomo', 'Josh Fuentes']
Missing players for game 13: ['J.P. Crawford', 'Taylor Trammell']
Missing players for game 14: ["Ka'ai Tom", 'Yulieski Gurriel']
Missing players for game 15: ['Akil Baddoo', 'Yulieski Gurriel']
Missing players for game 16: ['J.D. Martinez', 'J.A. Happ']
Missing players for game 17: ['J.D. Martinez', 'J.T. Riddle', 'Alex Kirilloff']
Missing players for

In [None]:
# train_dataset = vector_dataset(X_train_resampled, y_train_resampled)
# instantiate the random undersampler
#rus = RandomUnderSampler(random_state=42)

# instantiate SMOTE
#smote = SMOTE(random_state=42)

# resample X_train and y_train using undersampling
#X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# resample X_train and y_train using SMOTE
#X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# print the new class distribution after undersampling
#print('Resampled dataset shape after undersampling %s' % len(X_train_resampled))

# print the new class distribution after SMOTE
#print('Resampled dataset shape after SMOTE %s' % len(X_train_smote))