In [264]:
import json
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.utils import shuffle as skshuffle
from sklearn.preprocessing import OneHotEncoder as OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [201]:
# Load data
teams = pd.read_csv('data/MTeams.csv').astype(object)
stats = pd.read_csv('data/cbb21.csv')
results = pd.read_csv('data/MRegularSeasonCompactResults.csv')

# Filter only this year's teams
results = results.loc[results['Season'].isin([2019, 2020, 2021])]
my_team_names = ['Gonzaga', 'Arizona', 'Kansas', 'Baylor', 'Auburn', 'Kentucky', 'Villanova', 'Duke', 'Wisconsin', 'Tennessee', 'Purdue', 'Texas Tech', 'UCLA', 'Illinois', 'Providence', 'Arkansas', 'Connecticut', 'Houston', 'Saint Marys', 'Iowa', 'Alabama', 'LSU ', 'Texas ', 'Colorado St.', 'USC', 'Murray St.', 'Michigan St', 'Ohio St', 'Boise St.', 'North Carolina', 'San Diego St', 'Seton Hall ', 'Creighton ', 'TCU ', 'Marquette ', 'Memphis ', 'San Francisco', 'Miami', 'Loyola Chicago', 'Davidson', 'Iowa St', 'Michigan ', 'Wyoming', 'Rutgers ', 'Indiana ', 'Virginia Tech ', 'Notre Dame ', 'UAB', 'Richmond ', 'New Mexico St', 'Chattanooga', 'South Dakota St', 'Vermont', 'Akron', 'Longwood', 'Yale ', 'Colgate ', 'Montana St.', 'Delaware ', 'Saint Peters ', 'Jacksonville St', 'Cal StFullerton ', 'Georgia St', 'Norfolk St.', 'Wright St', 'Bryant', 'Texas Southern ', 'A&M-Corpus Christian']
my_team_names = [team_name.lower().replace('.', '').replace(' ', '') for team_name in my_team_names]
all_team_names = list(set(my_team_names + list(teams['TeamName'])))
all_team_names = [team_name.lower().replace('.', '').replace(' ', '') for team_name in all_team_names]
print(all_team_names)

# Join teams with their stats
for real_team_name in all_team_names:
    #print(f'real_name: {real_team_name}')

    for team in teams.iterrows():
        team_name_og = team[1]['TeamName']
        team_name = team[1]['TeamName'].lower().replace('.', '').replace(' ', '')
        
        if real_team_name == team_name:
            #print(f'team_name: {team_name}')
            teams.loc[teams['TeamName'] == team_name_og, 'TeamName'] = real_team_name
            break
    else:
        print(f'Could not find teams match for {real_team_name}')
        
    for stat in stats.iterrows():
        stat_name_og = stat[1]['TEAM']
        stat_name = stat[1]['TEAM'].lower().replace('.', '').replace(' ', '')

        if stat_name == real_team_name:
            #print(f'stat_name: {stat_name}')
            stats.loc[stats['TEAM'] == stat_name_og, 'TEAM'] = real_team_name
            break
    else:
        print(f'Could not find stats match for {real_team_name}')
    
    #print('-----')        

['alabamast', 'washington', 'drexel', 'samhoustonst', 'gasouthern', 'vermont', 'vanderbilt', 'calpoly', 'indiana', 'alabama', "stjohn's", 'ucsandiego', 'ilchicago', 'lasalle', 'pepperdine', 'drake', 'houstonbap', 'vcu', 'marquette', 'richmond', 'utahvalley', 'liubrooklyn', 'oregon', 'texas', 'stbonaventure', 'cincinnati', 'sandiegost', 'samford', 'oklahomast', 'villanova', 'bowlinggreen', 'houston', 'utica', 'santaclara', 'ncolorado', 'arkansas', 'smu', 'texastech', 'sandiego', 'floridast', 'wichitast', 'manhattan', 'northcarolina', 'virginiatech', 'northtexas', 'alabamaa&m', 'arkpinebluff', 'utahst', 'mtstmarys', 'merrimack', 'longbeachst', 'chattanooga', 'eillinois', 'memphis', 'rutgers', 'wmichigan', 'bryant', 'a&m-corpuschristian', 'ucirvine', 'baylor', 'selouisiana', 'southdakotast', 'illinois', 'setonhall', 'emichigan', 'montanast', 'longwood', 'sandiegost', 'portland', 'liberty', 'utsanantonio', 'bethune-cookman', 'howard', 'sanfrancisco', 'mississippi', 'utarlington', 'syracuse

Could not find stats match for savannahst
Could not find stats match for cssacramento
Could not find stats match for floridaintl
Could not find stats match for abilenechr
Could not find stats match for semissourist
Could not find stats match for princeton
Could not find stats match for grambling
Could not find stats match for msvalleyst
Could not find stats match for nccentral
Could not find stats match for miamifl
Could not find stats match for neillinois
Could not find stats match for csbakersfield
Could not find stats match for stlouis
Could not find stats match for wcarolina
Could not find stats match for sunyalbany
Could not find stats match for yale
Could not find stats match for nca&t
Could not find stats match for csnorthridge
Could not find stats match for alliantintl
Could not find stats match for ewashington
Could not find stats match for wigreenbay
Could not find stats match for wku
Could not find stats match for armstrongst
Could not find stats match for stjoseph'spa
Could

In [202]:
teams_stats = teams.merge(stats, how='inner', left_on='TeamName', right_on='TEAM')
teams_stats = teams_stats.loc[teams_stats['TeamName'].isin(all_team_names)]
teams_stats = teams_stats.drop(columns=['TEAM', 'LastD1Season', 'FirstD1Season', 'SEED', 'CONF'])
teams_stats = teams_stats.reset_index()

features_columns = list(teams_stats.columns)
features_columns.remove('TeamID')
features_columns.remove('TeamName')
features_columns.remove('index')
print(features_columns)
features = teams_stats[features_columns]
scaled_features = StandardScaler().fit_transform(features)
teams_stats[features_columns] = scaled_features

teams_stats.head()

['G', 'W', 'ADJOE', 'ADJDE', 'BARTHAG', 'EFG_O', 'EFG_D', 'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P_O', '2P_D', '3P_O', '3P_D', 'ADJ_T', 'WAB']


Unnamed: 0,index,TeamID,TeamName,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,...,ORB,DRB,FTR,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB
0,0,1102,airforce,0.627713,-1.445459,-0.951124,1.365976,-1.379059,0.603395,2.720302,...,-2.859034,2.487068,-0.181529,0.238288,1.184041,2.866734,-0.246716,1.209288,-1.958782,-1.66995
1,1,1103,akron,-0.334487,0.445022,0.49166,0.186862,0.325783,0.414518,-0.451915,...,0.514122,0.329008,-0.22495,-0.782752,0.782558,-0.073472,-0.104024,-0.817039,0.036829,0.348154
2,2,1104,alabama,1.830463,2.146454,1.096255,-2.121545,1.544199,0.351559,-1.641496,...,0.858322,0.655986,-0.680873,0.071209,0.036946,-0.943533,0.502415,-1.951782,2.107745,2.269233
3,3,1105,alabamaa&m,-1.777788,-1.25641,-2.42139,0.635257,-1.776437,-1.694607,-2.110118,...,0.101082,0.525195,0.013866,1.092248,-1.282214,-1.423567,-1.744976,-2.154415,-0.339702,-0.641493
4,4,1106,alabamast,-1.056138,-1.634507,-2.325205,1.814372,-1.936486,-2.10384,-0.343771,...,0.399389,0.590591,2.380321,0.999426,-2.486664,-0.163479,-0.710463,-0.4523,0.67693,-1.611735


In [203]:
all_team_ids = teams_stats['TeamID']
cleaned_results = results.loc[results['WTeamID'].isin(all_team_ids) & results['LTeamID'].isin(all_team_ids)]
cleaned_results = cleaned_results.drop(columns=['Season', 'DayNum', 'WLoc', 'NumOT'])
cleaned_results = cleaned_results.reset_index()
cleaned_results = cleaned_results.drop(columns=['index'])
cleaned_results.head()

Unnamed: 0,WTeamID,WScore,LTeamID,LScore
0,1113,102,1168,94
1,1119,73,1265,69
2,1120,101,1375,58
3,1123,86,1232,69
4,1138,82,1384,67


In [204]:
class MatchData(torch.utils.data.Dataset):
    def __init__(self, teams_stats, results, is_train=True):        
        self.teams_stats = teams_stats
        self.results = results
        self.is_train = is_train
        
        all_team_ids = list(map(lambda d: [d], teams_stats['TeamID']))#np.reshape(teams_stats['TeamID'], (-1, 1))
        
        self.teamid_encoder = OneHotEncoder()
        self.teamid_encoder.fit(all_team_ids)
        
    def generateXData(self, winner_team_id, loser_team_id):
        winner_x = self.teams_stats[self.teams_stats['TeamID'] == winner_team_id]
        winner_x = winner_x.drop(columns=['TeamID', 'TeamName'])
        loser_x = self.teams_stats[self.teams_stats['TeamID'] == loser_team_id]
        loser_x = loser_x.drop(columns=['TeamID', 'TeamName'])
        
        winner_id = np.array([[winner_team_id]])
        loser_id = np.array([[loser_team_id]])
        winner_encoding = self.teamid_encoder.transform(winner_id).toarray()[0]
        loser_encoding = self.teamid_encoder.transform(loser_id).toarray()[0]
                
        winner_x = np.array(winner_x)[0]
        loser_x = np.array(loser_x)[0]
        
        return winner_encoding, loser_encoding, winner_x, loser_x
        
    def __getitem__(self, index):
        match = self.results.loc[index]
        winner_encoding, loser_encoding, winner_x, loser_x = self.generateXData(match['WTeamID'], match['LTeamID'])
        
        winner_y = match['WScore']
        loser_y = match['LScore']
        
        noise = np.random.randint(-2, 2, size=1)[0] if self.is_train else 0
        if np.random.randint(2, size=1)[0] == 0:
            x = np.concatenate((winner_encoding, loser_encoding, winner_x, loser_x))
            y = np.array([winner_y - loser_y + noise]) #np.array([winner_y, loser_y])
        else:
            x = np.concatenate((loser_encoding, winner_encoding, loser_x, winner_x))
            y = np.array([loser_y - winner_y + noise]) #np.array([loser_y, winner_y])
            
        return torch.tensor(x).float(), torch.tensor(y).float()
    
    def __len__(self):
        return len(self.results)

In [297]:
num_train_samples = int(len(cleaned_results) * 0.8)
num_test_samples = len(cleaned_results) - num_train_samples

shuffled_results = skshuffle(cleaned_results)
train_results = shuffled_results[:num_train_samples].reset_index().drop(columns='index')
test_results = shuffled_results[num_train_samples:].reset_index().drop(columns='index')

train_data = MatchData(teams_stats, train_results, is_train=True)
test_data = MatchData(teams_stats, test_results, is_train=False)

train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=5, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_data, batch_size=num_test_samples, shuffle=True)

print(len(train_data))
print(len(test_data))
print(next(iter(train_loader))[0][0])
print(len(next(iter(train_loader))[0][0]))

7212
1804
tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000

In [207]:
class Model(nn.Module):

    def __init__(self):
        super(Model, self).__init__()
        self.linear_0 = nn.Linear(588, 200)
        self.linear_1 = nn.Linear(200, 100)
        self.linear_2 = nn.Linear(100, 50)
        self.linear_3 = nn.Linear(50, 20)
        self.linear_4 = nn.Linear(20, 1)
        #self.dropout = nn.Dropout(0.0)
        
        self.linear_reg = nn.Linear(174, 1)
        
        self.relu = nn.ReLU()
        
        #nn.Tanh() #nn.SiLU() #nn.Tanhshrink()
        self.final_activatation = nn.Tanhshrink()
        

    def forward(self, x):
        #x = self.linear_reg(x)
        
        x = self.linear_0(x)
        x = self.relu(x)
        
        x = self.linear_1(x)
        #x = self.dropout(x)
        x = self.relu(x)
        
        x = self.linear_2(x)
        #x = self.dropout(x)
        x = self.relu(x)
        
        x = self.linear_3(x)
        #x = self.dropout(x)
        x = self.relu(x)
        
        x = self.linear_4(x)
        
        x = self.final_activatation(x)

        return x

In [208]:
model = Model()

criterion = nn.MSELoss()
#torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) #optim.Adam(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.003)
log_rate = 100

test_x, test_y = next(iter(test_loader))
for epoch in range(4):

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % log_rate == log_rate - 1:
            
            with torch.no_grad():
                test_out = model(test_x)
                test_loss = criterion(test_out, test_y).item()
                acc = accuracy_score(test_y > 0, test_out > 0)
            
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f} test_loss: {test_loss:.2f} acc: {acc:.3f}')
            running_loss = 0.0

print('Finished Training')


[1,   100] loss: 11.059 test_loss: 209.11 acc: 0.497
[1,   200] loss: 11.518 test_loss: 209.16 acc: 0.497
[1,   300] loss: 11.483 test_loss: 203.37 acc: 0.497
[1,   400] loss: 10.699 test_loss: 168.79 acc: 0.637
[1,   500] loss: 9.519 test_loss: 185.39 acc: 0.583
[1,   600] loss: 7.953 test_loss: 171.75 acc: 0.663
[1,   700] loss: 8.628 test_loss: 162.24 acc: 0.665
[1,   800] loss: 9.261 test_loss: 169.19 acc: 0.645
[1,   900] loss: 9.476 test_loss: 171.27 acc: 0.635
[1,  1000] loss: 10.384 test_loss: 188.47 acc: 0.634
[1,  1100] loss: 9.200 test_loss: 172.64 acc: 0.643
[1,  1200] loss: 9.524 test_loss: 161.44 acc: 0.654
[1,  1300] loss: 8.330 test_loss: 173.71 acc: 0.664
[1,  1400] loss: 8.755 test_loss: 157.78 acc: 0.666
[2,   100] loss: 8.150 test_loss: 158.09 acc: 0.669
[2,   200] loss: 8.493 test_loss: 160.66 acc: 0.661
[2,   300] loss: 9.495 test_loss: 167.23 acc: 0.671
[2,   400] loss: 8.499 test_loss: 160.88 acc: 0.673
[2,   500] loss: 8.108 test_loss: 155.74 acc: 0.673
[2,   6

In [209]:
record = {}

In [292]:
def getResult(team1_name, team2_name, print_history=False):
    with torch.no_grad():
        team1_id = int(teams_stats.loc[teams_stats['TeamName'] == team1_name]['TeamID'])
        team2_id = int(teams_stats.loc[teams_stats['TeamName'] == team2_name]['TeamID'])

        winner_encoding, loser_encoding, winner_x, loser_x = train_data.generateXData(team1_id, team2_id)
        x1 = np.concatenate((winner_encoding, loser_encoding, winner_x, loser_x))
        x2 = np.concatenate((loser_encoding, winner_encoding, loser_x, winner_x))

        real_history = cleaned_results.loc[cleaned_results['WTeamID'].isin([team1_id, team2_id]) & cleaned_results['LTeamID'].isin([team2_id, team1_id])]

        pred1 = model(torch.tensor(x1).float())
        pred2 = model(torch.tensor(x2).float())
        #print(pred1)
        #print(pred2)
        average = (pred1.numpy()[0] - pred2.numpy()[0]) / 2
        print(average)
        
        if print_history:
            print(f'{team1_name}: {team1_id}')
            print(f'{team2_name}: {team2_id}')
            print(real_history)
        
        # Write to record
        if not team1_name in record:
            record[team1_name] = {}
        if not team2_name in record:
            record[team2_name] = {}

        record[team1_name][team2_name] = average
        record[team2_name][team1_name] = -1 * average
    

In [295]:
print(my_team_names)

team1_name = 'gonzaga'
team2_name = 'auburn'
getResult(team1_name, team2_name, print_history=True)

['gonzaga', 'arizona', 'kansas', 'baylor', 'auburn', 'kentucky', 'villanova', 'duke', 'wisconsin', 'tennessee', 'purdue', 'texastech', 'ucla', 'illinois', 'providence', 'arkansas', 'connecticut', 'houston', 'saintmarys', 'iowa', 'alabama', 'lsu', 'texas', 'coloradost', 'usc', 'murrayst', 'michiganst', 'ohiost', 'boisest', 'northcarolina', 'sandiegost', 'setonhall', 'creighton', 'tcu', 'marquette', 'memphis', 'sanfrancisco', 'miami', 'loyolachicago', 'davidson', 'iowast', 'michigan', 'wyoming', 'rutgers', 'indiana', 'virginiatech', 'notredame', 'uab', 'richmond', 'newmexicost', 'chattanooga', 'southdakotast', 'vermont', 'akron', 'longwood', 'yale', 'colgate', 'montanast', 'delaware', 'saintpeters', 'jacksonvillest', 'calstfullerton', 'georgiast', 'norfolkst', 'wrightst', 'bryant', 'texassouthern', 'a&m-corpuschristian']
13.532773971557617
gonzaga: 1211
auburn: 1120
      WTeamID  WScore  LTeamID  LScore
6613     1211      90     1120      67


In [296]:
print(record)
with open('record.json', 'w') as file:
    json.dump(record, file)

{'alabamast': {'notredame': -40.86168670654297}, 'notredame': {'alabamast': 40.86168670654297, 'alabama': -8.063173294067383}, 'gonzaga': {'georgiast': 31.019922256469727, 'memphis': 15.931671142578125, 'connecticut': 7.094582557678223, 'alabama': 5.366922378540039, 'arizona': 11.725333213806152, 'baylor': 2.9373722076416016, 'houston': 1.8810545206069946, 'auburn': 13.532773971557617}, 'georgiast': {'gonzaga': -31.019922256469727}, 'boisest': {'memphis': -2.168485164642334}, 'memphis': {'boisest': 2.168485164642334, 'gonzaga': -15.931671142578125}, 'connecticut': {'newmexicost': 20.607955932617188, 'arkansas': 0.7410917282104492, 'gonzaga': -7.094582557678223}, 'newmexicost': {'connecticut': -20.607955932617188}, 'arkansas': {'vermont': 19.57164192199707, 'connecticut': -0.7410917282104492}, 'vermont': {'arkansas': -19.57164192199707}, 'alabama': {'notredame': 8.063173294067383, 'texastech': 1.4999794960021973, 'duke': 1.8349093198776245, 'gonzaga': -5.366922378540039}, 'texastech': {

In [291]:
team1_id = int(teams_stats.loc[teams_stats['TeamName'] == 'gonzaga']['TeamID'])
team2_id = int(teams_stats.loc[teams_stats['TeamName'] == 'houston']['TeamID'])

print(cleaned_results[cleaned_results['WTeamID'] == team1_id | ])