In [58]:
import pandas as pd
import numpy as np
from typing import List, Dict
from geopy.geocoders import Nominatim
import pickle

In [7]:
# Show every column when a wide DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [8]:
# Folder layout: all Kaggle inputs live under ../data/kaggle_data/
DATA = '../data/'
KAG = f'{DATA}kaggle_data/'

# Raw input tables (read once, reused by the cells below)
teams = pd.read_csv(f'{KAG}MTeams.csv')
reg = pd.read_csv(f'{KAG}MRegularSeasonCompactResults.csv')
post = pd.read_csv(f'{KAG}MNCAATourneyCompactResults.csv')
cities = pd.read_csv(f'{KAG}Cities.csv')
gameCities = pd.read_csv(f'{KAG}MGameCities.csv')
ords = pd.read_csv(f'{KAG}MMasseyOrdinals.csv')

## 1. Construct 'favorite' model

### Description: 

Set of games considered:
All regular season games

Inputs used:
Rating systems + Distance to games

### Why favored:

-Lots of data - Rating systems and results go back to 2003

-"Compact" variables - rating systems are designed to capture much of what we would otherwise have to feature-engineer ourselves from the raw data

### 1a. Get results first

In [9]:
# Both result tables must share the same columns before they are stacked.
assert all(post.columns == reg.columns)

# Stack regular-season and tournament games, add a constant "wins" flag and
# rename the W*/L* columns to the Tm1*/Tm2* naming used in the rest of the notebook.
all_games = (
    pd.concat([reg, post])
    .assign(wins=1)
    .rename(columns={
        'WTeamID': 'Tm1ID',
        'LTeamID': 'Tm2ID',
        'WScore': 'Tm1Score',
        'LScore': 'Tm2Score',
        'WLoc': 'Tm1Loc',
    })
)

In [10]:
"""
=======================
A. Merge on game cities
=======================
"""
def mergeGameCities(df, gameCities, cities):
    """
    Attach the game-city id and the city/state details to every game in ``df``.

    Parameters
    ----------
    df : DataFrame of games keyed by Season, DayNum, Tm1ID, Tm2ID.
    gameCities : DataFrame keyed by Season, DayNum, WTeamID, LTeamID with
        at least the columns CRType and CityID.
    cities : DataFrame keyed by CityID holding the city details.

    Returns
    -------
    DataFrame with one row per game of ``df`` plus the gameCities/cities
    columns and an integer ``has_city`` flag (1 when a game city was found).

    Raises
    ------
    AssertionError if any of the merge sanity checks below fails.
    """
    # 1. Merge on game cities
    # a. Outer merge so that unmatched rows on either side can be inspected
    res = df.merge(gameCities,
                   left_on=['Season', 'DayNum', 'Tm1ID', 'Tm2ID'],
                   right_on=['Season', 'DayNum', 'WTeamID', 'LTeamID'],
                   how='outer',
                   indicator='has_city')

    # b. Rows that exist only in gameCities must all share a single CRType,
    #    while the games we keep must carry exactly two CRType values
    only_right = res['has_city'] == 'right_only'
    assert res.loc[only_right, 'CRType'].nunique() == 1, \
        "expected exactly one CRType among gameCities rows without a game"
    assert res.loc[~only_right, 'CRType'].nunique() == 2, \
        "expected exactly two CRType values among the games being kept"

    # c. Check city merge result is complete for data from 2010 onwards
    assert len(df[df['Season'] >= 2010]) == len(res[res['has_city'] == 'both']), \
        "not every game from 2010 onwards was matched to a game city"

    # d. Keep the games only. The explicit copy avoids assigning into a view
    #    of the merged frame (SettingWithCopyWarning) when has_city is recoded.
    res = res[~only_right].copy()
    res['has_city'] = np.where(res['has_city'] == 'both', 1, 0)

    # 2. Merge on actual city information
    # a. Merge
    res = res.merge(cities, on='CityID', how='left', indicator=True)

    # b. Check that everyone that had city in last merge also got real city info
    assert (res.loc[res['has_city'] == 1, '_merge'] == 'both').all(), \
        "some games with a CityID have no matching row in cities"

    # c. Drop extraneous columns (returns a new frame instead of mutating in place)
    return res.drop(columns=['WTeamID', 'LTeamID', '_merge'])

# Attach city information to every game.
# NOTE(review): this rebinds all_games to the merged frame, so re-running this
# cell feeds an already-merged frame back into mergeGameCities - re-run the
# cell that builds all_games first.
all_games = mergeGameCities(all_games, gameCities, cities)

In [11]:
"""
==================
A. Copy and stack
==================
"""
# Mirror every game: team 1 and team 2 (ids and scores) trade places and the
# location flag is inverted (home <-> away; any other value is left unchanged).
# NOTE(review): "wins" stays 1 in the mirrored rows - confirm that downstream
# code derives the outcome from the scores rather than from this column.
all_games_c = all_games.rename(columns={
    'Tm1ID': 'Tm2ID',
    'Tm2ID': 'Tm1ID',
    'Tm1Score': 'Tm2Score',
    'Tm2Score': 'Tm1Score',
})
all_games_c['Tm1Loc'] = all_games_c['Tm1Loc'].replace({'H': 'A', 'A': 'H'})

# Original rows followed by their mirrored copies
all_games_f = pd.concat([all_games, all_games_c])

In [12]:
""" 
==============
A. Sanity checks
==============
"""
def checkFlips(all_games, all_games_c) -> bool:
    """
    Check that ``all_games_c`` is a team-flipped copy of ``all_games``.

    For every team, the number of games it plays as Tm1 in ``all_games`` must
    equal the number it plays as Tm2 in ``all_games_c`` and vice versa
    (counted through the ``wins`` column, which is summed per team).

    Returns True when all checks pass; raises AssertionError otherwise.
    """
    # Select the single column before aggregating so that text columns are
    # never summed.
    as_tm1 = all_games.groupby('Tm1ID')['wins'].sum()
    as_tm2 = all_games.groupby('Tm2ID')['wins'].sum()
    flipped_as_tm1 = all_games_c.groupby('Tm1ID')['wins'].sum()
    flipped_as_tm2 = all_games_c.groupby('Tm2ID')['wins'].sum()

    # Check kept from the earlier version of this function: every team that
    # appears as Tm1 also appears as Tm2.
    assert as_tm1.index.isin(as_tm2.index).all(), \
        "some teams appear as Tm1 but never as Tm2"

    # The actual flip check: per-team totals must swap sides.
    assert as_tm1.to_dict() == flipped_as_tm2.to_dict(), \
        "Tm1 totals of the original do not match Tm2 totals of the flipped copy"
    assert as_tm2.to_dict() == flipped_as_tm1.to_dict(), \
        "Tm2 totals of the original do not match Tm1 totals of the flipped copy"
    return True

def checkHomeAway(all_games, all_games_c) -> bool:
    """
    Check that the location flag was inverted in the flipped copy.

    The count of each ``Tm1Loc`` value in ``all_games`` must equal the count
    of its counterpart in ``all_games_c`` ('H' <-> 'A'; every other value maps
    to itself), and both frames must hold the same number of flagged rows.

    Returns True when the checks pass; raises AssertionError otherwise.
    """
    counterpart = {'H': 'A', 'A': 'H'}
    orig_counts = all_games['Tm1Loc'].value_counts()
    flip_counts = all_games_c['Tm1Loc'].value_counts()

    # Compare by label, not by position: value_counts is ordered by frequency,
    # so positional comparison cannot tell a flipped copy from an unflipped one.
    for loc, n_games in orig_counts.items():
        assert flip_counts.get(counterpart.get(loc, loc), 0) == n_games, \
            "location counts do not match for Tm1Loc == %r" % (loc,)

    # Same totals + matching per-label counts => no extra labels in the copy.
    assert orig_counts.sum() == flip_counts.sum(), \
        "flipped copy has a different number of located games"
    return True

def checkNoDups(all_games_f) -> bool:
    """
    Check that the stacked frame contains no fully duplicated rows.

    Returns True when every row is unique; raises AssertionError otherwise.
    """
    # duplicated() flags repeated rows directly. Calling all() on a DataFrame
    # would only iterate over its column names and is therefore always truthy.
    assert not all_games_f.duplicated().any(), "duplicate rows found"
    return True

# Run the three sanity checks one by one, in the same order as before.
assert checkFlips(all_games, all_games_c)
assert checkHomeAway(all_games, all_games_c)
assert checkNoDups(all_games_f)

In [13]:
# Display the stacked frame (original games followed by their flipped copies).
all_games_f

Unnamed: 0,Season,DayNum,Tm1ID,Tm1Score,Tm2ID,Tm2Score,Tm1Loc,NumOT,wins,CRType,CityID,has_city,City,State
0,1985,20,1228.0,81.0,1328.0,64.0,N,0.0,1.0,,,0,,
1,1985,25,1106.0,77.0,1354.0,70.0,H,0.0,1.0,,,0,,
2,1985,25,1112.0,63.0,1223.0,56.0,H,0.0,1.0,,,0,,
3,1985,25,1165.0,70.0,1432.0,54.0,H,0.0,1.0,,,0,,
4,1985,25,1192.0,86.0,1447.0,74.0,H,0.0,1.0,,,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178392,2021,148,1425.0,66.0,1211.0,85.0,N,0.0,1.0,NCAA,4161.0,1,Indianapolis,IN
178393,2021,148,1276.0,49.0,1417.0,51.0,N,0.0,1.0,NCAA,4161.0,1,Indianapolis,IN
178394,2021,152,1222.0,59.0,1124.0,78.0,N,0.0,1.0,NCAA,4161.0,1,Indianapolis,IN
178395,2021,152,1417.0,90.0,1211.0,93.0,N,1.0,1.0,NCAA,4161.0,1,Indianapolis,IN


#### 1b. Merge team names on

In [188]:
# teams = pd.read_csv(KAG + 'MTeams.csv')
# reg = pd.read_csv(KAG + 'MRegularSeasonCompactResults.csv')
# post = pd.read_csv(KAG  + 'MRegularSeasonCompactResults.csv')
# cities = pd.read_csv(KAG + 'Cities.csv')
# gameCities = pd.read_csv(KAG + 'MGameCities.csv')
# ords = pd.read_csv(KAG + 'MMasseyOrdinals.csv')dd

In [14]:
"""
=================
A. Merge team names
=================
"""
def mergeTeamNames(df, teams):
    """
    Add the names of both teams to ``df``.

    ``teams`` is looked up twice through its TeamID column: first for Tm1ID,
    then for Tm2ID. Because both lookups bring in a ``TeamName`` column, the
    result carries ``TeamName_x`` (team 1) and ``TeamName_y`` (team 2).
    Both merges are inner joins, so games whose ids are missing from
    ``teams`` are dropped.
    """
    names = teams[['TeamID', 'TeamName']]

    # Team 1 name; the helper key column is removed right after each lookup
    with_tm1 = df.merge(names, left_on='Tm1ID', right_on='TeamID')
    with_tm1 = with_tm1.drop(columns='TeamID')

    # Team 2 name
    with_both = with_tm1.merge(names, left_on='Tm2ID', right_on='TeamID')
    return with_both.drop(columns='TeamID')

# Add both team names to the stacked games and display the result.
int1 = mergeTeamNames(all_games_f, teams)
int1

Unnamed: 0,Season,DayNum,Tm1ID,Tm1Score,Tm2ID,Tm2Score,Tm1Loc,NumOT,wins,CRType,CityID,has_city,City,State,TeamName_x,TeamName_y
0,1985,20,1228.0,81.0,1328.0,64.0,N,0.0,1.0,,,0,,,Illinois,Oklahoma
1,1985,33,1228.0,73.0,1328.0,70.0,H,0.0,1.0,,,0,,,Illinois,Oklahoma
2,1986,33,1228.0,57.0,1328.0,59.0,N,0.0,1.0,,,0,,,Illinois,Oklahoma
3,1995,67,1106.0,68.0,1328.0,97.0,N,0.0,1.0,,,0,,,Alabama St,Oklahoma
4,1990,82,1112.0,78.0,1328.0,74.0,H,0.0,1.0,,,0,,,Arizona,Oklahoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356789,2022,26,1370.0,81.0,1472.0,64.0,H,0.0,1.0,Regular,4313.0,1,Seattle,WA,Seattle,St Thomas MN
356790,2022,82,1377.0,90.0,1472.0,79.0,A,0.0,1.0,Regular,4482.0,1,St. Paul,MN,South Dakota,St Thomas MN
356791,2022,108,1377.0,81.0,1472.0,60.0,H,0.0,1.0,Regular,4359.0,1,Vermillion,SD,South Dakota,St Thomas MN
356792,2022,49,1303.0,73.0,1472.0,80.0,H,0.0,1.0,Regular,4255.0,1,Omaha,NE,NE Omaha,St Thomas MN


#### 1c. Join Massey Ordinals

##### i. Select rating systems to use - which systems are from 2011-2022 with good data availability

In [18]:
# Last season covered; used below as the inclusive upper bound of season ranges.
END_YEAR = 2022

"""
=================
A. Decide scoring systems to keep
=================
"""
def keepScoringSystems(ords, start_year: int = 2011, end_year: int = None,
                       max_first_day: int = 23, skip_years=(2021,),
                       verbose: bool = True) -> pd.DataFrame:
    """
    Select the rating systems that publish early in every season of interest.

    Parameters
    ----------
    ords : DataFrame with the columns SystemName, Season and RankingDayNum.
    start_year : first season of the window (default 2011).
    end_year : last season of the window; defaults to the module-level END_YEAR.
    max_first_day : a system qualifies for a season when its first ranking day
        is strictly below this value (default 23).
    skip_years : seasons left out of the per-season check in step 3
        (default: 2021, the COVID-impacted season).
    verbose : when True, print the candidates that passed step 2.

    Returns
    -------
    DataFrame with one row per kept system: SystemName, one ``min_<season>``
    column per season of the window (first ranking day, NaN when the system
    has no rankings that season) and ``max_day``, the largest of those values
    outside ``skip_years``. The index holds the system's position in the
    alphabetically sorted list of all systems.

    Raises
    ------
    ValueError if ``start_year`` is greater than ``end_year``.
    """
    if end_year is None:
        end_year = END_YEAR
    if start_year > end_year:
        raise ValueError("start_year must not be greater than end_year")

    # 1. First ranking day per system (rows) and season (columns)
    first_day = (ords.groupby(['SystemName', 'Season'])['RankingDayNum']
                     .min()
                     .unstack('Season')
                     .astype(float))
    first_day.columns = ['min_' + str(season) for season in first_day.columns]

    # Keep the seasons of the window in chronological order; reindex adds an
    # all-NaN column for a season nobody ranked instead of raising KeyError.
    min_cols = ['min_' + str(season) for season in range(start_year, end_year + 1)]
    first_day = first_day.reindex(columns=min_cols).reset_index()

    # 2. Systems whose first rating arrives early in both the first and the
    #    last season of the window (NaN compares as False, so systems missing
    #    in either season are excluded here)
    early_at_both_ends = ((first_day[min_cols[0]] < max_first_day) &
                          (first_day[min_cols[-1]] < max_first_day))
    ord_sub = first_day.loc[early_at_both_ends, ['SystemName'] + min_cols].copy()

    if verbose:
        print(ord_sub)

    # 3. Keep systems with an early first rating in every season, barring the
    #    skipped ones. NOTE(review): max() ignores NaN, so a system with no
    #    rankings in a middle season still passes - confirm this is intended.
    skip_cols = ['min_' + str(season) for season in skip_years]
    ord_sub['max_day'] = (ord_sub.drop(columns=skip_cols, errors='ignore')
                                 .max(axis=1, numeric_only=True))
    return ord_sub[ord_sub['max_day'] < max_first_day]

"""
=================
b. Remove top-25 polls
=================
"""
def rmTop25Polls(ords : pd.DataFrame, systems : List[str]) -> List[str]:
    """
    Return the systems (out of ``systems``) that rank more than 25 teams.

    For every season and system the smallest number of teams ranked on any
    single ranking day is computed; a system is kept when that number is
    above 25 in the 2011 season.
    """
    candidates = ords[ords['SystemName'].isin(systems)]

    # Teams ranked on each individual ranking day
    teams_per_day = (candidates
                     .groupby(['Season', 'SystemName', 'RankingDayNum'])['TeamID']
                     .count())

    # Smallest daily count per season/system, one column per season
    fewest_ranked = (teams_per_day
                     .groupby(level=['Season', 'SystemName'])
                     .min()
                     .unstack('Season'))

    ranks_many = fewest_ranked[2011] > 25
    return list(fewest_ranked.loc[ranks_many].index)

# Step 1: systems with usable data in every season we care about
systems = keepScoringSystems(ords)
systemNames = systems['SystemName'].tolist()

# Step 2: of those, keep only the systems that pass the top-25 filter
# (i.e. drop the ones that rank no more than 25 teams)
kpSystems = rmTop25Polls(ords, systemNames)

# Step 3: restrict the ordinals to the kept systems, seasons 2011 onwards
ords_i = ords.loc[ords['SystemName'].isin(kpSystems) & (ords['Season'] >= 2011)]
ords_i

    SystemName  min_2011  min_2012  min_2013  min_2014  min_2015  min_2016  \
3           AP       9.0       7.0       8.0       9.0      16.0      16.0   
43         DES      13.0      13.0      16.0       9.0      23.0      23.0   
45         DOK       8.0      15.0       8.0       9.0      16.0      16.0   
97         MAS       0.0      12.0       7.0       9.0      16.0      16.0   
105        MOR       9.0       7.0       8.0       9.0      16.0      16.0   
117        PGH       9.0       7.0       8.0       9.0      16.0      16.0   
123        POM       9.0       7.0       8.0       9.0      16.0      16.0   
144        SAG      15.0      11.0       8.0       9.0      16.0      16.0   
176        USA       9.0       7.0       8.0       9.0      16.0      16.0   

     min_2017  min_2018  min_2019  min_2020  min_2021  min_2022  
3        16.0      16.0       9.0       9.0      30.0      16.0  
43       23.0      23.0      16.0       9.0      30.0      16.0  
45       16.0      16

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
1256303,2011,0,MAS,1102,233
1256304,2011,0,MAS,1103,109
1256305,2011,0,MAS,1104,64
1256306,2011,0,MAS,1105,332
1256307,2011,0,MAS,1106,278
...,...,...,...,...,...
4597524,2022,128,SAG,1468,203
4597525,2022,128,SAG,1469,294
4597526,2022,128,SAG,1470,227
4597527,2022,128,SAG,1471,256


In [19]:
# Explain the table displayed below (typo "geams" fixed in the message).
print("This is subset of systems we want to keep")
print("From here, we also remove AP and USA, which only rank top 25 teams")
systems

This is subset of systems we want to keep
From here, we also remove AP and USA, which only rank top 25 geams


Unnamed: 0,SystemName,min_2011,min_2012,min_2013,min_2014,min_2015,min_2016,min_2017,min_2018,min_2019,min_2020,min_2021,min_2022,max_day
3,AP,9.0,7.0,8.0,9.0,16.0,16.0,16.0,16.0,9.0,9.0,30.0,16.0,16.0
45,DOK,8.0,15.0,8.0,9.0,16.0,16.0,16.0,16.0,9.0,9.0,30.0,16.0,16.0
97,MAS,0.0,12.0,7.0,9.0,16.0,16.0,16.0,16.0,9.0,9.0,30.0,16.0,16.0
105,MOR,9.0,7.0,8.0,9.0,16.0,16.0,16.0,16.0,9.0,9.0,30.0,16.0,16.0
117,PGH,9.0,7.0,8.0,9.0,16.0,16.0,16.0,16.0,9.0,9.0,30.0,16.0,16.0
123,POM,9.0,7.0,8.0,9.0,16.0,16.0,16.0,16.0,9.0,9.0,30.0,16.0,16.0
144,SAG,15.0,11.0,8.0,9.0,16.0,16.0,16.0,16.0,9.0,9.0,30.0,16.0,16.0
176,USA,9.0,7.0,8.0,9.0,16.0,16.0,16.0,16.0,9.0,9.0,30.0,16.0,16.0


##### iii. Pick years of data to keep based on consistency of data

In [20]:
"""
=================
b. Choose season based on rk day
=================
"""

# 2014 is the first season in which ratings are published on a consistent
# weekly schedule, which makes the data much easier to work with - so the
# ranking data starts there.
RK_START_YEAR = 2014

# Number of distinct ranking days per season (supports the choice above)
print(
    ords_i
    .groupby('Season', as_index = False)['RankingDayNum']
    .nunique()
)

    Season  RankingDayNum
0     2011             42
1     2012             35
2     2013             36
3     2014             19
4     2015             18
5     2016             18
6     2017             18
7     2018             18
8     2019             19
9     2020             18
10    2021             16
11    2022             17


##### iv. Select ranking day associated with each unique season-day in data

In [21]:
"""
====================
A. Year-RankDay Dict
====================
For years 2014 onwards, rating start coming in on same days
- So create dict of {Year : [RkDay1, RkDay2, ...]}
     -for each of those years
"""     
def rankDaysDict(ords_f : pd.DataFrame, start_year : int = None, end_year : int = None) -> Dict[int, List[int]]:
    """
    Build ``{season: [0, <ranking days in ascending order>, 500]}``.

    Parameters
    ----------
    ords_f : DataFrame with the columns Season and RankingDayNum.
    start_year : first season to include; defaults to the module-level RK_START_YEAR.
    end_year : last season to include; defaults to the module-level END_YEAR.

    The leading 0 and the trailing 500 are sentinels that bracket the real
    ranking days of each season. A season without rankings maps to [0, 500].
    """
    if start_year is None:
        start_year = RK_START_YEAR
    if end_year is None:
        end_year = END_YEAR

    season_days = ords_f.loc[ords_f['Season'] >= start_year, ['Season', 'RankingDayNum']].drop_duplicates()
    yrDict = {}
    for season in range(start_year, end_year + 1):
        days = season_days.loc[season_days['Season'] == season, 'RankingDayNum'].unique()
        # Sort explicitly: consumers scan the list front to back and rely on
        # ascending order, which the row order of the input does not guarantee.
        yrDict[season] = [0] + sorted(int(day) for day in days) + [500]
    return yrDict

# Ranking days per season, taken from the ordinals of the kept systems.
yrDict= rankDaysDict(ords_i)


"""
====================
B. Rank day for each data day
====================
For each unique season-day present in the actual data, match to appropriate
 ranking data
"""
def getRkDayDf(df : pd.DataFrame, yrDict : Dict[int, List[int]]) -> pd.DataFrame:
    """
    Map every season/day present in ``df`` to the ranking day to use for it.

    Parameters
    ----------
    df : DataFrame with the columns Season and DayNum.
    yrDict : ``{season: [ranking days in ascending order]}``; only seasons
        that appear as keys are processed.

    Returns
    -------
    DataFrame with the unique (Season, DayNum) pairs sorted by season and day,
    plus RankingDayNum: the last entry of the season's list that lies strictly
    before DayNum. When no entry lies before DayNum, the first entry of the
    list is used.
    """
    # Filter on the frame that was passed in (not on a notebook-level global)
    # and only keep seasons that have an entry in yrDict.
    in_scope = df['Season'].isin(list(yrDict))
    days_df = (df.loc[in_scope, ['Season', 'DayNum']]
                 .drop_duplicates()
                 .sort_values(['Season', 'DayNum']))

    def getRkDay(season, day):
        ranking_days = yrDict[season]
        for pos, rk_day in enumerate(ranking_days):
            if rk_day >= day:
                # max(..., 0) keeps a day at or before the first entry from
                # wrapping around to the last element through index -1.
                return ranking_days[max(pos - 1, 0)]

    days_df['RankingDayNum'] = [getRkDay(season, day)
                                for season, day in zip(days_df['Season'], days_df['DayNum'])]
    return days_df

# Ranking day to use for every season/day in the games data
rkDayDf = getRkDayDf(int1, yrDict)

# A game must never be matched to a ranking day that comes after it
assert not (rkDayDf['DayNum'] < rkDayDf['RankingDayNum']).any()

##### v. Create final ords dataset

In [22]:
"""
====================
A. Pivot ordinals 
====================
"""
# Wide layout: one row per season / ranking day / team, one column per system.
# (pivot_table averages OrdinalRank if a key combination occurs more than once.)
ords_f = (
    ords_i.loc[ords_i['Season'] >= RK_START_YEAR]
    .pivot_table(index = ['Season','RankingDayNum','TeamID'],
                 values = 'OrdinalRank',
                 columns = 'SystemName')
    .reset_index()
    .sort_values(['Season','RankingDayNum','TeamID'])
)

# ords_f[ords_f.isnull().sum(1)  > 0]
# Note that there are rows with null data in the 2021 season

##### vi. Do big join

In [68]:
"""
===============
A. Merge rank lookup
===============
"""
def mergeRkLookup(df : pd.DataFrame, rk_days : pd.DataFrame = None, start_year : int = None) -> pd.DataFrame:
    """
    Merges rank day lookup onto working dataframe

    Parameters
    ----------
    df : games with the columns Season and DayNum.
    rk_days : lookup keyed by Season and DayNum; defaults to the
        notebook-level ``rkDayDf``.
    start_year : first season expected to be fully covered by the lookup;
        defaults to the notebook-level ``RK_START_YEAR``.

    Returns
    -------
    The rows of ``df`` that found a match in the lookup, sorted by Season and
    DayNum, with the lookup columns appended.

    Raises
    ------
    AssertionError if the number of matched rows differs from the number of
    rows of ``df`` from ``start_year`` onwards.
    """
    if rk_days is None:
        rk_days = rkDayDf
    if start_year is None:
        start_year = RK_START_YEAR

    merged = df.merge(rk_days, on = ['Season','DayNum'], how ='left', indicator = True)
    matched = merged[merged['_merge'] == 'both'].sort_values(['Season','DayNum'])

    # Check merge captures all data from start_year onwards - measured against
    # the frame that was passed in, not against a notebook-level global.
    assert len(matched) == int((df['Season'] >= start_year).sum()), \
        "rank-day lookup does not cover every game from start_year onwards"
    return matched.drop(columns = '_merge')
# Games with team names, now carrying the ranking day to use for each game.
int2 = mergeRkLookup(int1)

"""
===============
B. Merge ranks
===============
"""
def mergeRanks(df : pd.DataFrame, ords_f : pd.DataFrame, verbose : bool = True) -> pd.DataFrame:
    """
    Attach the rankings of both teams to every game.

    Parameters
    ----------
    df : games with the columns Season, RankingDayNum, DayNum, Tm1ID, Tm2ID.
    ords_f : rankings keyed by Season, RankingDayNum and TeamID.
    verbose : when True, print the sanity-check tables; the second table
        reads the notebook-level ``systems`` frame.

    Returns
    -------
    The games for which team 1 has rankings, with the ranking columns of
    team 1 (suffix ``_tm1``) and team 2 (suffix ``_tm2``) appended.

    Raises
    ------
    AssertionError if a game kept after the team-1 merge has no rankings for
    team 2.
    """
    # 1. Merge ranks by Tm1
    # -(left join with indicator, so unmatched games can be inspected below)
    tmp = df.merge(ords_f, left_on = ['Season','RankingDayNum','Tm1ID'],
                        right_on = ['Season','RankingDayNum','TeamID'],
                        how = 'left',
                        indicator = True)

    if verbose:
        # Sanity checks on what data is excluded
        # Excluded data is that which has DayNum occuring before first RankingDayNum
        print("Max value of DayNum that is ultimately excluded from dataset for each year...")
        print(tmp[tmp['_merge'] == 'left_only'].groupby(['Season']).max(numeric_only = True)['DayNum'].reset_index())

        print("\nCompare to min day of rankings as we saw in systems dataset above")
        system_show_cols = [i for i in systems.columns if i[-5 : -4]  == '_' and int(i[-4:]) >= 2014]
        print(systems[system_show_cols].max())

    # Now that point is made, exclude DayNums without valid RankingDayNum preceding them
    # (drop() returns a new frame, so the filtered slice is never mutated in place)
    tmp = tmp[tmp['_merge'] == 'both'].drop(columns = '_merge')

    # 2. Merge ranks by Tm2
    # -(inner join; the assert verifies that no game is lost)
    tmp2 = tmp.merge(ords_f, left_on = ['Season','RankingDayNum','Tm2ID'],
                        right_on = ['Season','RankingDayNum','TeamID'],
                        how = 'inner',
                        suffixes = ['_tm1','_tm2'])
    assert len(tmp) == len(tmp2)
    return tmp2

# Games with the rankings of both teams attached; displayed below.
int3 = mergeRanks(int2, ords_f)
int3

Max value of DayNum that is ultimately excluded from dataset for each year...
   Season  DayNum
0    2014       9
1    2015      16
2    2016      16
3    2017      16
4    2018      16
5    2019       9
6    2020       9
7    2021      30
8    2022      16

Compare to min day of rankings as we saw in systems dataset above
min_2014     9.0
min_2015    16.0
min_2016    16.0
min_2017    16.0
min_2018    16.0
min_2019     9.0
min_2020     9.0
min_2021    30.0
min_2022    16.0
dtype: float64


Unnamed: 0,Season,DayNum,Tm1ID,Tm1Score,Tm2ID,Tm2Score,Tm1Loc,NumOT,wins,CRType,CityID,has_city,City,State,TeamName_x,TeamName_y,RankingDayNum,TeamID_tm1,DOK_tm1,MAS_tm1,MOR_tm1,PGH_tm1,POM_tm1,SAG_tm1,TeamID_tm2,DOK_tm2,MAS_tm2,MOR_tm2,PGH_tm2,POM_tm2,SAG_tm2
0,2014,10,1390.0,71.0,1321.0,58.0,H,0.0,1.0,Regular,4331.0,1,Stanford,CA,Stanford,Northwestern,9.0,1390.0,65.0,45.0,37.0,44.0,55.0,58.0,1321,96.0,82.0,104.0,106.0,80.0,71.0
1,2014,13,1229.0,68.0,1321.0,64.0,A,0.0,1.0,Regular,4113.0,1,Evanston,IL,Illinois St,Northwestern,9.0,1229.0,137.0,89.0,213.0,86.0,178.0,121.0,1321,96.0,82.0,104.0,106.0,80.0,71.0
2,2014,16,1227.0,58.0,1321.0,93.0,H,0.0,1.0,Regular,4071.0,1,Chicago,IL,IL Chicago,Northwestern,9.0,1227.0,233.0,210.0,242.0,201.0,263.0,196.0,1321,96.0,82.0,104.0,106.0,80.0,71.0
3,2014,10,1427.0,62.0,1222.0,80.0,A,0.0,1.0,Regular,4157.0,1,Houston,TX,UT San Antonio,Houston,9.0,1427.0,276.0,292.0,301.0,291.0,304.0,252.0,1222,171.0,170.0,193.0,145.0,183.0,125.0
4,2014,13,1250.0,66.0,1222.0,80.0,A,0.0,1.0,Regular,4157.0,1,Houston,TX,Lehigh,Houston,9.0,1250.0,118.0,148.0,181.0,120.0,189.0,165.0,1222,171.0,170.0,193.0,145.0,183.0,125.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88491,2022,131,1463.0,67.0,1335.0,61.0,N,0.0,1.0,Regular,4039.0,1,Boston,MA,Yale,Penn,128.0,1463.0,160.0,138.0,149.0,146.0,147.0,135.0,1335,205.0,181.0,255.0,211.0,213.0,186.0
88492,2022,131,1335.0,61.0,1463.0,67.0,N,0.0,1.0,Regular,4039.0,1,Boston,MA,Penn,Yale,128.0,1335.0,205.0,181.0,255.0,211.0,213.0,186.0,1463,160.0,138.0,149.0,146.0,147.0,135.0
88493,2022,132,1343.0,64.0,1463.0,66.0,N,0.0,1.0,Regular,4052.0,1,Cambridge,MA,Princeton,Yale,128.0,1343.0,127.0,115.0,87.0,100.0,106.0,120.0,1463,160.0,138.0,149.0,146.0,147.0,135.0
88494,2022,131,1343.0,77.0,1165.0,73.0,N,0.0,1.0,Regular,4039.0,1,Boston,MA,Princeton,Cornell,128.0,1343.0,127.0,115.0,87.0,100.0,106.0,120.0,1165,233.0,190.0,176.0,210.0,195.0,205.0


In [66]:
def getTeamCity(df):
    """
    Pick one city per team: the city in which the team appears most often
    as team 1.

    Parameters
    ----------
    df : games with the columns Tm1ID, TeamName_x, City, State and Season.

    Returns
    -------
    DataFrame with the columns TmID, TeamName, City, State - one row per
    (Tm1ID, TeamName_x) pair. Rows with a missing City or State are not
    counted, because groupby drops missing keys.

    NOTE(review): games at every location (home, away, neutral) are counted;
    presumably the most frequent city is the home city - verify.
    """
    # Count games per team and city, using the frame that was passed in
    # (not a notebook-level global).
    games_per_city = (df.groupby(['Tm1ID', 'TeamName_x', 'City', 'State'])['Season']
                        .count()
                        .reset_index()
                        .sort_values(['Tm1ID', 'Season'], ascending = [True, False]))

    # The most frequent city comes first for each team, so keep that row only
    res_n = (games_per_city.drop_duplicates(['Tm1ID', 'TeamName_x'])
                           .drop(columns = 'Season'))
    res_n.columns = ['TmID', 'TeamName', 'City', 'State']
    return res_n

# One city per team, derived from the games with rankings attached.
tmCities_p = getTeamCity(int3)

def getCityCoords(row, geolocator=None):
    """
    Add the latitude and longitude of the row's city.

    Parameters
    ----------
    row : mapping/Series with the string fields City and State.
    geolocator : object with a ``geocode(query)`` method whose result exposes
        ``latitude`` and ``longitude``. When omitted, a Nominatim geocoder is
        created; pass one in to reuse it across rows.

    Returns
    -------
    A copy of ``row`` with the extra fields ``lat`` and ``long``. Both are NaN
    when the geocoder finds no match for "<City>, <State>".
    """
    if geolocator is None:
        geolocator = Nominatim(user_agent="geoapiExercises")

    loc = row['City'] + ', ' + row['State']
    res = geolocator.geocode(loc)

    # Work on a copy so the caller's row is left untouched
    row = row.copy()
    # geocode() returned nothing: record the miss as NaN instead of failing
    # with an AttributeError on None.
    row['lat'] = res.latitude if res is not None else np.nan
    row['long'] = res.longitude if res is not None else np.nan
    return row

# In the normal process the geocoding step is skipped; the block below shows
# how the saved file was produced:
# teamCities = tmCities_p.apply(getCityCoords, axis = 1)
# with open('../data/my_data/teamLocs.p', 'wb') as handle:
#     pickle.dump(teamCities, handle, protocol=pickle.HIGHEST_PROTOCOL)

# And load dataset from saved location
# NOTE(review): pickle.load can execute arbitrary code while loading - only
# load this file if it was produced by the dump above and is trusted.
with open('../data/my_data/teamLocs.p', 'rb') as handle:
    teamCities = pickle.load(handle)
teamCities

Unnamed: 0,TmID,TeamName,City,State,lat,long
0,1101.0,Abilene Chr,Abilene,TX,32.446450,-99.747591
83,1102.0,Air Force,USAF Academy,CO,38.774069,-104.301534
85,1103.0,Akron,Akron,OH,41.083064,-81.518485
161,1104.0,Alabama,Tuscaloosa,AL,33.209561,-87.567526
206,1105.0,Alabama A&M,Normal,AL,34.788979,-86.571937
...,...,...,...,...,...,...
15542,1468.0,Bellarmine,Louisville,KY,38.254238,-85.759407
15566,1469.0,Dixie St,Saint George,UT,37.080266,-113.578461
15584,1470.0,Tarleton St,Stephenville,TX,32.219184,-98.213063
15591,1471.0,UC San Diego,La Jolla,CA,32.840162,-117.274078


In [485]:
# Inspect the 2014 games in chronological order.
int1[int1['Season'] == 2014].sort_values(['Season','DayNum'])

Unnamed: 0,Season,DayNum,Tm1ID,Tm1Score,Tm2ID,Tm2Score,Tm1Loc,NumOT,wins,TeamName_x,TeamName_y
314,2014,4,1104,73,1328,82,N,0,1,Alabama,Oklahoma
3272,2014,4,1207,75,1332,82,N,0,1,Georgetown,Oregon
3680,2014,4,1231,100,1152,72,H,0,1,Indiana,Chicago St
4878,2014,4,1368,53,1281,89,A,0,1,SE Louisiana,Missouri
6854,2014,4,1444,70,1308,64,N,0,1,W Michigan,New Mexico St
...,...,...,...,...,...,...,...,...,...,...,...
46562,2014,152,1163,63,1196,53,N,0,1,Connecticut,Florida
201282,2014,152,1196,53,1163,63,N,0,1,Florida,Connecticut
282730,2014,152,1458,73,1246,74,N,0,1,Wisconsin,Kentucky
201621,2014,154,1246,54,1163,60,N,0,1,Kentucky,Connecticut


In [573]:
# List the per-season columns of systems (names ending in "_<year>") for 2014 onwards.
[i for i in systems.columns if i[-5 : -4]  == '_' and int(i[-4:]) >= 2014]

['min_2014',
 'min_2015',
 'min_2016',
 'min_2017',
 'min_2018',
 'min_2019',
 'min_2020',
 'min_2021',
 'min_2022']

In [475]:
# Display the season/day -> ranking-day lookup.
# NOTE(review): the saved output below shows a column named RkDay, while
# getRkDayDf creates RankingDayNum - the output looks stale; re-run to confirm.
rkDayDf

Unnamed: 0,Season,DayNum,RkDay
314,2014,4,0
6440,2014,5,0
20687,2014,6,0
1136,2014,7,0
4218,2014,8,0
...,...,...,...
2129,2022,128,121
442,2022,129,128
247,2022,130,128
7661,2022,131,128


In [367]:
# Inspect games from 2014 onwards that were played after day 16.
int1[(int1['Season'] >= 2014) & (int1['DayNum'] > 16)]

Unnamed: 0,Season,DayNum,Tm1ID,Tm1Score,Tm2ID,Tm2Score,Tm1Loc,NumOT,wins,TeamName_x,TeamName_y
11,2016,51,1218,81,1328,84,H,0,1,Hawaii,Oklahoma
46,2014,65,1242,90,1328,83,A,0,1,Kansas,Oklahoma
47,2014,112,1242,83,1328,75,H,0,1,Kansas,Oklahoma
48,2015,77,1242,85,1328,78,H,0,1,Kansas,Oklahoma
49,2016,63,1242,109,1328,106,H,3,1,Kansas,Oklahoma
...,...,...,...,...,...,...,...,...,...,...,...
356789,2022,26,1370,81,1472,64,H,0,1,Seattle,St Thomas MN
356790,2022,82,1377,90,1472,79,A,0,1,South Dakota,St Thomas MN
356791,2022,108,1377,81,1472,60,H,0,1,South Dakota,St Thomas MN
356792,2022,49,1303,73,1472,80,H,0,1,NE Omaha,St Thomas MN


In [354]:
# Show the last 100 rows of ords_f.
# NOTE(review): the saved output below has the long-format columns
# (SystemName, OrdinalRank), so it appears to predate the pivot - confirm.
ords_f.tail(100)

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
4597429,2022,128,SAG,1370,148
4597430,2022,128,SAG,1371,29
4597431,2022,128,SAG,1372,117
4597432,2022,128,SAG,1373,242
4597433,2022,128,SAG,1374,43
...,...,...,...,...,...
4597524,2022,128,SAG,1468,203
4597525,2022,128,SAG,1469,294
4597526,2022,128,SAG,1470,227
4597527,2022,128,SAG,1471,256


In [352]:
# Non-null counts per season and ranking day (last 100 groups).
ords_f.groupby(['Season','RankingDayNum']).count().reset_index().tail(100)

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
174,2017,58,2106,2106,2106
175,2017,65,2106,2106,2106
176,2017,72,2106,2106,2106
177,2017,79,2106,2106,2106
178,2017,86,2106,2106,2106
...,...,...,...,...,...
269,2022,100,2148,2148,2148
270,2022,107,2148,2148,2148
271,2022,114,2148,2148,2148
272,2022,121,2148,2148,2148


In [344]:
# Keep ranking days from day 16 onwards (from day 30 onwards for the 2021 season).
# Each comparison is parenthesised because & binds tighter than != / == / >=.
# The stray "o" line that raised a NameError was removed, so the sorted frame
# below is displayed again.
# NOTE(review): this rebinds ords_f, so the cell is not idempotent with respect
# to the cell that builds ords_f.
ords_f = ords_f[((ords_f['Season'] != 2021) & (ords_f['RankingDayNum'] >= 16)) | 
        ((ords_f['Season'] == 2021) & (ords_f['RankingDayNum'] >= 30))]
ords_f.sort_values(['Season','RankingDayNum'])
# ords_g = ords_f.pivot_table(index = ['Season', 'RankingDayNum', 'TeamID'], columns = 'SystemName', values = 'OrdinalRank').reset_index()
# ords_g[700:1000]



Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
1263008,2011,21,MAS,1102,187
1263009,2011,21,MAS,1103,121
1263010,2011,21,MAS,1104,129
1263011,2011,21,MAS,1105,334
1263012,2011,21,MAS,1106,296
...,...,...,...,...,...
4597524,2022,128,SAG,1468,203
4597525,2022,128,SAG,1469,294
4597526,2022,128,SAG,1470,227
4597527,2022,128,SAG,1471,256


In [343]:
# Look for 2011 rankings published on day 16 (the saved output below is empty).
ords_f[(ords_f['Season'] == 2011) & (ords_f['RankingDayNum'] == 16)]

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank


In [291]:
# Distinct season / system / ranking-day combinations.
# NOTE(review): this needs a SystemName column, which the pivoted ords_f no
# longer has - presumably run on the long-format frame; confirm.
ords_f[['Season','SystemName','RankingDayNum']].drop_duplicates()

Unnamed: 0,Season,SystemName,RankingDayNum
1256303,2011,MAS,0
1256993,2011,DOK,8
1257363,2011,MOR,9
1257708,2011,PGH,9
1258053,2011,POM,9
...,...,...,...
4592517,2022,MAS,128
4593233,2022,MOR,128
4594665,2022,PGH,128
4595381,2022,POM,128
