In [81]:
import sys
import pickle
from typing import List, Dict
import pandas as pd
import numpy as np
import xgboost

sys.path.insert(0, '../source/data_structures/')
sys.path.insert(0, '../data_structures/')
from teams import Teams, SpecificEntryImporter

In [82]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

In [83]:
# Pickled dataset stored under ../data/my_data.
# NOTE(review): pickle.load can execute arbitrary code; only open pickles created locally.
with open('../data/my_data/ds1.p', mode = 'rb') as dsFile:
    df = pickle.load(dsFile)

# Ordinal rankings table; later cells read its Season, RankingDayNum, TeamID,
# SystemName and OrdinalRank columns.
rks = pd.read_csv(filepath_or_buffer = '../data/kaggle_data/MMasseyOrdinals.csv')

### 1. Get locations on games

In [84]:
""" 
=============
A. Get Teams 
=============
- To see where teams belong in bracket
"""
# Build the Teams container from a SpecificEntryImporter. Both classes come from
# the project's `teams` module; what the importer actually loads is not visible
# from this notebook.
entryImporter = SpecificEntryImporter()
teams = Teams(teamImporter = entryImporter)
# NOTE(review): presumably fills in each team's `predId` from the MTeams_ file.
# A later cell reads `team.predId` as the Kaggle team id -- confirm in teams.py.
teams.setPredIds(file = '../data/MTeams_.csv')

In [100]:
"""
=============
A. Which game will teams meet in
=============
- i.e., which game of tournament
    - so we can determine location of matchup
"""

# 1. Lookup table: Kaggle team id (predId) -> bracket id (bracketId)
predToBrack = {team.predId: team.bracketId for team in teams.teams}

# 2. Load the predictions template
#    - one row per possible game, with an Id formatted like:
#        2022_1411_1506
#    - the Id is split on '_' into Year / Tm1 / Tm2; only the two team ids
#      are converted to integers (Year is left as split)
matchups = pd.read_csv('../data/kaggle_predictions/predTemplate2022.csv')
idParts = matchups['Id'].str.split('_', expand = True)
matchups[['Year', 'Tm1','Tm2']] = idParts
teamCols = ['Tm1','Tm2']
matchups[teamCols] = matchups[teamCols].astype(int)


# 3. For any game (pair of teams)
#      -Determine game in bracket that teams meet in
#      -This will determine physical location of game
#           i.e., Final 4 games in New Orleans, LA
def leftMostSetBit(num):
    """Return the 1-based position of the highest set bit of ``num``.

    Counts how many right-shifts are needed before ``num`` is no longer
    positive, so 1 -> 1, 2 -> 2, 31 -> 5, 32 -> 6.  Zero and negative
    inputs return 0.
    """
    pos = 0
    # Keep widening the shift until nothing positive is left of the value.
    while (num >> pos) > 0:
        pos += 1
    return pos
    
def getBracketNums(row):
    """Annotate one matchup row with bracket ids and the game where the teams meet.

    Reads ``row['Tm1']`` and ``row['Tm2']``, looks both up in the module-level
    ``predToBrack`` mapping and writes four new entries into ``row``:

    - ``brack_1`` / ``brack_2``: bracket ids of the two teams
    - ``rdsIn``: ``leftMostSetBit`` of the XOR of the two bracket ids
    - ``meetGame``: ``(brack_1 + 64) // 2 ** rdsIn``

    When both teams share a bracket id, ``rdsIn`` is 0 and ``meetGame`` equals
    ``brack_1 + 64``.  The offset 64 presumably reflects a 64-slot bracket --
    confirm against how bracket ids are assigned.

    Returns the same ``row`` object (mutated).  Raises ``KeyError`` if either
    team id is missing from ``predToBrack``.
    """
    firstId, secondId = int(row['Tm1']), int(row['Tm2'])

    slot1 = predToBrack[firstId]
    row['brack_1'] = slot1
    slot2 = predToBrack[secondId]
    row['brack_2'] = slot2

    # Highest differing bit of the two bracket ids.
    roundsIn = leftMostSetBit(slot1 ^ slot2)
    row['rdsIn'] = roundsIn
    row['meetGame'] = (slot1 + 64) // (2 ** roundsIn)
    return row
# Row-wise pass: adds brack_1, brack_2, rdsIn and meetGame to every matchup.
matchups = matchups.apply(getBracketNums, axis = 'columns')
matchups.head(5)

Unnamed: 0,Id,Pred,Year,Tm1,Tm2,brack_1,brack_2,rdsIn,meetGame
0,2022_1103_1104,,2022,1103,1104,23,8,5,2
1,2022_1103_1112,,2022,1103,1112,23,32,6,1
2,2022_1103_1116,,2022,1103,1116,23,6,5,2
3,2022_1103_1120,,2022,1103,1120,23,62,6,1
4,2022_1103_1124,,2022,1103,1124,23,16,3,10


In [99]:
"""
===========
A. Get physical game locations
===========
"""
# Full state name -> two-letter abbreviation; applied to the State column after the merge.
state_abbrvs = {
    'Louisiana': 'LA',
    'Pennsylvania': 'PA',
    'Oregon': 'OR',
    'California': 'CA',
    'Texas': 'TX',
    'New York': 'NY',
    'Illinois': 'IL',
    'South Carolina': 'SC',
    'Wisconsin': 'WI',
    'Indiana': 'IN',
}

# Hand-made spreadsheet of game locations, keyed by its `Index` column.
ncaaLocs = pd.read_excel('../data/my_data/ncaaGameLocations.xlsx')

# Attach the location of each matchup's meeting game. This is an inner join,
# so matchups whose meetGame has no row in ncaaLocs are dropped.
matchups1 = (
    matchups
    .merge(ncaaLocs, left_on = 'meetGame', right_on = 'Index')
    .drop(columns = 'Index')
)
matchups1['State'] = matchups1['State'].replace(state_abbrvs)
matchups1.head(5)

Unnamed: 0,Id,Pred,Year,Tm1,Tm2,brack_1,brack_2,rdsIn,meetGame,Description,City,State
0,2022_1103_1104,,2022,1103,1104,23,8,5,2,F4,New Orleans,LA
1,2022_1103_1116,,2022,1103,1116,23,6,5,2,F4,New Orleans,LA
2,2022_1103_1129,,2022,1103,1129,23,2,5,2,F4,New Orleans,LA
3,2022_1103_1163,,2022,1103,1163,23,4,5,2,F4,New Orleans,LA
4,2022_1103_1168,,2022,1103,1168,23,15,5,2,F4,New Orleans,LA


In [98]:
""" 
===============
A. Merge lat/long coords
===============
"""
# 1. Load both coordinate tables
# NOTE(review): pickle.load can execute arbitrary code; only open pickles created locally.
with open('../data/my_data/gameLocs.p', 'rb') as gameFile:
    gameLocs = pickle.load(gameFile).drop_duplicates()
with open('../data/my_data/teamLocs.p', 'rb') as teamFile:
    teamLocs = pickle.load(teamFile)

# 2. Attach game-site coordinates (by City/State), then each team's own
#    coordinates. The team table is joined twice, so the second join suffixes
#    the team columns with _tm1 / _tm2. All joins are inner joins.
siteCoords = gameLocs[['City','State','lat','long']]
teamCoords = teamLocs[['TmID','TmName','TmLat','TmLong']]
matchups2 = (
    matchups1
    .merge(siteCoords, on = ['City','State'])
    .merge(teamCoords, left_on = ['Tm1'], right_on = 'TmID')
    .merge(teamCoords, left_on = ['Tm2'], right_on = 'TmID', suffixes = ['_tm1','_tm2'])
    # 3. Bracket bookkeeping columns are no longer needed once the site is known
    .drop(columns = ['brack_1', 'brack_2', 'rdsIn', 'meetGame'])
)
matchups2.head(5)
# matchups2[~matchups2['Description'].isin(['NCG','F4'])].sample(15)

Unnamed: 0,Id,Pred,Year,Tm1,Tm2,Description,City,State,lat,long,TmID_tm1,TmName_tm1,TmLat_tm1,TmLong_tm1,TmID_tm2,TmName_tm2,TmLat_tm2,TmLong_tm2
0,2022_1103_1104,,2022,1103,1104,F4,New Orleans,LA,29.975998,-90.078213,1103.0,Akron,41.083064,-81.518485,1104.0,Alabama,33.209561,-87.567526
1,2022_1103_1116,,2022,1103,1116,F4,New Orleans,LA,29.975998,-90.078213,1103.0,Akron,41.083064,-81.518485,1116.0,Arkansas,36.062584,-94.157433
2,2022_1104_1116,,2022,1104,1116,E8,San Francisco,CA,37.779026,-122.419906,1104.0,Alabama,33.209561,-87.567526,1116.0,Arkansas,36.062584,-94.157433
3,2022_1112_1116,,2022,1112,1116,NCG,New Orleans,LA,29.975998,-90.078213,1112.0,Arizona,32.222876,-110.974848,1116.0,Arkansas,36.062584,-94.157433
4,2022_1103_1129,,2022,1103,1129,F4,New Orleans,LA,29.975998,-90.078213,1103.0,Akron,41.083064,-81.518485,1129.0,Boise St,43.616616,-116.200886


In [103]:
""" 
================
A. Calculate tm distance to loc
================
"""
import geopy.distance
def calcDist(row):
    """Add each team's geodesic distance (in miles) to the game site.

    Reads the game coordinates from ``row['lat']`` / ``row['long']`` and the
    team coordinates from ``TmLat_tm1`` / ``TmLong_tm1`` and ``TmLat_tm2`` /
    ``TmLong_tm2``.  Writes ``dist_tm1`` and ``dist_tm2`` into ``row`` using
    ``geopy.distance.geodesic(...).miles`` and returns the same (mutated) row.
    """
    site = (row['lat'], row['long'])
    columnSets = (
        ('TmLat_tm1', 'TmLong_tm1', 'dist_tm1'),
        ('TmLat_tm2', 'TmLong_tm2', 'dist_tm2'),
    )
    for latCol, longCol, distCol in columnSets:
        teamPoint = (row[latCol], row[longCol])
        row[distCol] = geopy.distance.geodesic(teamPoint, site).miles
    return row

# Row-wise pass: adds dist_tm1 / dist_tm2 (miles) to every matchup.
matchups2 = matchups2.apply(calcDist, axis = 'columns')
matchups2.head(5)

Unnamed: 0,Id,Pred,Year,Tm1,Tm2,Description,City,State,lat,long,TmID_tm1,TmName_tm1,TmLat_tm1,TmLong_tm1,TmID_tm2,TmName_tm2,TmLat_tm2,TmLong_tm2,dist_tm1,dist_tm2
0,2022_1103_1104,,2022,1103,1104,F4,New Orleans,LA,29.975998,-90.078213,1103.0,Akron,41.083064,-81.518485,1104.0,Alabama,33.209561,-87.567526,903.919061,267.474087
1,2022_1103_1116,,2022,1103,1116,F4,New Orleans,LA,29.975998,-90.078213,1103.0,Akron,41.083064,-81.518485,1116.0,Arkansas,36.062584,-94.157433,903.919061,481.554851
2,2022_1104_1116,,2022,1104,1116,E8,San Francisco,CA,37.779026,-122.419906,1104.0,Alabama,33.209561,-87.567526,1116.0,Arkansas,36.062584,-94.157433,1978.457825,1563.316692
3,2022_1112_1116,,2022,1112,1116,NCG,New Orleans,LA,29.975998,-90.078213,1112.0,Arizona,32.222876,-110.974848,1116.0,Arkansas,36.062584,-94.157433,1246.426187,481.554851
4,2022_1103_1129,,2022,1103,1129,F4,New Orleans,LA,29.975998,-90.078213,1103.0,Akron,41.083064,-81.518485,1129.0,Boise St,43.616616,-116.200886,903.919061,1715.66684


### 2. Get Rankings on Games

In [104]:
""" 
=============
A. Limit ranks data
=============
- To proper time frame
- And previously determined rating systems (ml_helper.ipynb)
"""
# Season being predicted and the ranking day to take the snapshot from.
# NOTE(review): day 128 is presumably the last ranking day available before
# the tournament in this data -- confirm.
SEASON = 2022
RANK_DAY = 128
# Rating systems kept as features, in output column order.
RANK_SYSTEMS = ['MAS', 'DOK', 'MOR', 'PGH', 'POM', 'SAG']

# Keep exactly one season/day snapshot. The filter must be `==`, not `>=`:
# Season is dropped from the result below, so keeping several seasons would
# leave several rows per TeamID and silently duplicate every matchup when
# ords_f is merged onto the matchups.
isSnapshot = (rks['Season'] == SEASON) & (rks['RankingDayNum'] == RANK_DAY)

# One row per team, one column per rating system (values are OrdinalRank).
ords_f = (
    rks[isSnapshot]
    .pivot_table(index = ['Season','RankingDayNum','TeamID'],
                 values = 'OrdinalRank',
                 columns = 'SystemName')
    .reset_index()
    .sort_values(['Season','RankingDayNum','TeamID'])
)
# Raises KeyError if any of the chosen systems has no ranking on RANK_DAY.
ords_f = ords_f[['TeamID'] + RANK_SYSTEMS]

In [106]:
"""" 
============
A. Merge on ranks
============
"""
# Join the ranking columns once per team. The first join adds them unsuffixed;
# the second join collides on those names, so they become *_tm1 / *_tm2.
matchups3 = (
    matchups2
    .merge(ords_f, left_on = 'TmID_tm1', right_on = 'TeamID')
    .drop(columns = 'TeamID')
    .merge(ords_f, left_on = 'TmID_tm2', right_on = 'TeamID', suffixes = ['_tm1','_tm2'])
    .drop(columns = 'TeamID')
)
# Same constant for every row.
# NOTE(review): 0.5 presumably encodes "neither team is at home" -- confirm.
matchups3['tm1home'] = 0.5

# Columns kept in reg_set, in this exact order.
siteCols = ['dist_tm1', 'dist_tm2', 'tm1home']
tm1RankCols = ['DOK_tm1', 'MAS_tm1', 'MOR_tm1', 'PGH_tm1', 'POM_tm1', 'SAG_tm1']
tm2RankCols = ['DOK_tm2', 'MAS_tm2', 'MOR_tm2', 'PGH_tm2', 'POM_tm2', 'SAG_tm2']
reg_set = matchups3[siteCols + tm1RankCols + tm2RankCols]