In [1]:

import pandas as pd
import numpy as np
import pickle
from bs4 import BeautifulSoup
from bs4 import NavigableString

from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
from sklearn import model_selection 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## Import Data

In [2]:
# Load necessary data
data_dir = 'mens-machine-learning-competition-2019/DataFiles/'
df_seeds = pd.read_csv(data_dir + 'NCAATourneySeeds.csv')
df_tour = pd.read_csv(data_dir + 'NCAATourneyCompactResults.csv')
df_regular_season = pd.read_csv(data_dir + 'RegularSeasonCompactResults.csv')
df_conference_games = pd.read_csv(data_dir + 'ConferenceTourneyGames.csv')
df_team_rpis = pd.read_csv("RPI/TeamRPI.csv") # scrapped from the web
df_teams = pd.read_csv(data_dir + "Teams.csv")

In [3]:
# collect win counts for each team per season
df_regular_season.drop(labels=['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], inplace=True, axis=1)
df_win_counts = df_regular_season.groupby(['Season', 'WTeamID']).count().reset_index()
df_win_counts = df_win_counts.rename(columns = {'LTeamID': 'RWinCount', 'WTeamID': 'TeamID'})

In [4]:
# collectin loss counts for each team per son
df_loss_counts = df_regular_season.groupby(['Season', 'LTeamID']).count().reset_index()
df_loss_counts = df_loss_counts.rename(columns = {'WTeamID': 'RLossCount', 'LTeamID': 'TeamID'})

In [5]:
# merge win/loss counts and show regular season win percentage
df_regular_season_totals = pd.merge(left=df_win_counts, right=df_loss_counts, how='left', on=['Season', 'TeamID'])
df_regular_season_totals['RWinPerc'] = df_regular_season_totals.RWinCount / (df_regular_season_totals.RWinCount + df_regular_season_totals.RLossCount)
df_regular_season_totals.head()

Unnamed: 0,Season,TeamID,RWinCount,RLossCount,RWinPerc
0,1985,1102,5,19.0,0.208333
1,1985,1103,9,14.0,0.391304
2,1985,1104,21,9.0,0.7
3,1985,1106,10,14.0,0.416667
4,1985,1108,19,6.0,0.76


In [6]:
# drop unused labels
df_conference_games.drop(labels=['ConfAbbrev', 'DayNum'], inplace=True, axis=1)

In [7]:
# gather win/loss totals for conference tournaments and win %
df_c_win_counts = df_conference_games.groupby(['Season', 'WTeamID']).count().reset_index()
df_c_win_counts = df_c_win_counts.rename(columns = {'LTeamID': 'CWinCount', 'WTeamID': 'TeamID'})
df_c_loss_counts = df_conference_games.groupby(['Season', 'LTeamID']).count().reset_index()
df_c_loss_counts = df_c_loss_counts.rename(columns = {'WTeamID': 'CLossCount', 'LTeamID': 'TeamID'})
df_conference_tournament_totals = pd.merge(left=df_c_win_counts, right=df_c_loss_counts, how='left', on=['Season', 'TeamID'])
df_conference_tournament_totals = df_conference_tournament_totals.fillna(value={'CWinCount': 0, 'CLossCount': 0})
df_conference_tournament_totals['CWinPerc'] = df_conference_tournament_totals.CWinCount / (df_conference_tournament_totals.CWinCount + df_conference_tournament_totals.CLossCount)
df_conference_tournament_totals.head()

Unnamed: 0,Season,TeamID,CWinCount,CLossCount,CWinPerc
0,2001,1104,1,1.0,0.5
1,2001,1106,3,0.0,1.0
2,2001,1108,2,1.0,0.666667
3,2001,1111,1,1.0,0.5
4,2001,1114,1,1.0,0.5


In [8]:
# combine regular season and conference tournament win/loss
df_win_totals = pd.merge(left=df_regular_season_totals, right=df_conference_tournament_totals, how='left', on=['Season', 'TeamID'])
df_win_totals.tail(10)

Unnamed: 0,Season,TeamID,RWinCount,RLossCount,RWinPerc,CWinCount,CLossCount,CWinPerc
11217,2019,1457,15,12.0,0.555556,,,
11218,2019,1458,23,10.0,0.69697,1.0,1.0,0.5
11219,2019,1459,26,4.0,0.866667,3.0,0.0,1.0
11220,2019,1460,19,13.0,0.59375,2.0,1.0,0.666667
11221,2019,1461,7,24.0,0.225806,,,
11222,2019,1462,18,15.0,0.545455,1.0,1.0,0.5
11223,2019,1463,21,7.0,0.75,2.0,0.0,1.0
11224,2019,1464,10,20.0,0.333333,,,
11225,2019,1465,12,14.0,0.461538,,,
11226,2019,1466,7,22.0,0.241379,,,


In [9]:
# convert seed into int
df_seeds['Seed_Int'] = df_seeds['Seed'].str[1:3]
df_seeds['Seed_Int'] = df_seeds['Seed_Int'].apply(pd.to_numeric)
# Drop unused labels
df_seeds.drop(labels=['Seed'], inplace=True, axis=1)
df_tour.drop(labels=['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], inplace=True, axis=1)
# Rename to winning team
df_winseeds = df_seeds.rename(columns={'TeamID': 'WTeamID', 'Seed_Int': 'WSeed'})
# Rename lossing teams
df_losseeds = df_seeds.rename(columns={'TeamID': 'LTeamID', 'Seed_Int': 'LSeed'})
# Lets make a table with teams and their seeds

In [10]:
df_seeds.head()

Unnamed: 0,Season,TeamID,Seed_Int
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


In [11]:
df_tour.head()

Unnamed: 0,Season,WTeamID,LTeamID
0,1985,1116,1234
1,1985,1120,1345
2,1985,1207,1250
3,1985,1229,1425
4,1985,1242,1325


In [12]:
df_winseeds.head()

Unnamed: 0,Season,WTeamID,WSeed
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


In [13]:
df_losseeds.head()

Unnamed: 0,Season,LTeamID,LSeed
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


In [14]:
df_dummy = pd.merge(left=df_tour, right=df_winseeds, how='left', on=['Season', 'WTeamID'])
df_concat = pd.merge(left=df_dummy, right=df_losseeds, how='left', on=['Season', 'LTeamID'])
df_concat['SeedDiff'] = df_concat.WSeed - df_concat.LSeed

In [15]:
df_concat.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,SeedDiff
0,1985,1116,1234,9,8,1
1,1985,1120,1345,11,6,5
2,1985,1207,1250,1,16,-15
3,1985,1229,1425,9,8,1
4,1985,1242,1325,3,14,-11


In [16]:
df_team_master = pd.merge(left=df_team_rpis, right=df_teams, how="left", on=["TeamName"])
df_team_master.dropna()
df_team_master.drop(labels=['FirstD1Season', 'LastD1Season'], inplace=True, axis=1)
df_team_master.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Season,TeamName,RPI,TeamID
0,0,0,2000,Duke,0.67,1181.0
1,1,1,2000,Kansas,0.657,1242.0
2,2,2,2000,Virginia,0.655,1438.0
3,3,3,2000,North Carolina,0.653,1314.0
4,4,4,2000,Houston,0.648,1222.0


In [17]:
df_seeds.head()

Unnamed: 0,Season,TeamID,Seed_Int
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


In [18]:
# add seeds
df_team_master = pd.merge(left=df_team_master, right=df_seeds, how="left", on=["Season", "TeamID"])
# drop teams that weren't in the tournament
# df_team_master['TeamID'] = pd.to_numeric(df_team_master['TeamID'], downcast='integer')
df_team_master.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Season,TeamName,RPI,TeamID,Seed_Int
0,0,0,2000,Duke,0.67,1181.0,1.0
1,1,1,2000,Kansas,0.657,1242.0,8.0
2,2,2,2000,Virginia,0.655,1438.0,
3,3,3,2000,North Carolina,0.653,1314.0,8.0
4,4,4,2000,Houston,0.648,1222.0,


In [19]:
# drop teams that didn't participate in the tournament
df_team_master = df_team_master.dropna()

In [20]:
df_team_master = df_team_master.rename(columns={'Seed_Int': 'Seed'})
df_team_master.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Season,TeamName,RPI,TeamID,Seed
0,0,0,2000,Duke,0.67,1181.0,1.0
1,1,1,2000,Kansas,0.657,1242.0,8.0
3,3,3,2000,North Carolina,0.653,1314.0,8.0
5,5,5,2000,Michigan St,0.647,1277.0,1.0
6,6,6,2000,Tennessee,0.645,1397.0,4.0


In [21]:
# add win/loss totals
df_team_master = pd.merge(left=df_team_master, right=df_win_totals, how='left', on=["Season", "TeamID"])
df_team_master.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Season,TeamName,RPI,TeamID,Seed,RWinCount,RLossCount,RWinPerc,CWinCount,CLossCount,CWinPerc
0,0,0,2000,Duke,0.67,1181.0,1.0,27,4.0,0.870968,,,
1,1,1,2000,Kansas,0.657,1242.0,8.0,22,9.0,0.709677,,,
2,3,3,2000,North Carolina,0.653,1314.0,8.0,18,13.0,0.580645,,,
3,5,5,2000,Michigan St,0.647,1277.0,1.0,26,7.0,0.787879,,,
4,6,6,2000,Tennessee,0.645,1397.0,4.0,23,6.0,0.793103,,,


In [22]:
# top_40 = df_team_rpis.head(40)
top_40 = df_team_rpis.groupby("Season").head(40)
top_40 = pd.merge(left=top_40, right=df_teams, how="left", on=["TeamName"])

In [23]:
df_regular_season_wins_again = pd.read_csv(data_dir + 'RegularSeasonCompactResults.csv')
teams_hash = {}

#     {
#         '1985': [
#             '112': [
#                 {day: 23, outcome: "win", top40: True}
#             ]
#         ]
#     }

for index, row in df_regular_season_wins_again.iterrows():
    # grab relevant data
    season = row.Season
    winning_team = row.WTeamID
    losing_team = row.LTeamID
    day = row.DayNum
    # initialize the data struct
    if teams_hash.get(season) is None: teams_hash[season] = {}
    if teams_hash[season].get(winning_team) is None: teams_hash[season][winning_team] = []
    if teams_hash[season].get(losing_team) is None: teams_hash[season][losing_team] = []
    top_40_win = len(top_40.loc[(top_40['Season'] == season) & (top_40['TeamID'] == losing_team)]) != 0
    top_40_loss = len(top_40.loc[(top_40['Season'] == season) & (top_40['TeamID'] == winning_team)]) != 0
    teams_hash[season][winning_team].append({ "day": day, "outcome": "win", "top40": top_40_win })
    teams_hash[season][losing_team].append({ "day": day, "outcome": "loss", "top40": top_40_loss })
    
# season, teamid, wins, losses, win_perc
columns = ["Season", "TeamID", "Last10WinCount", "Last10LossCount", "Last10WinPerc"]
rows = []
for season, teams in teams_hash.items():
    for team, games in teams.items():
        # games are already ordered, just grab the last 10
        last_ten_games = games[-10:]
        wins = list(filter(lambda game: game["outcome"] == "win", last_ten_games))
        losses = list(filter(lambda game: game["outcome"] == "loss", last_ten_games))
        win_perc = len(wins) / (len(wins) + len(losses))
        rows.append([season, team, len(wins), len(losses), win_perc])
df_last_ten_games = pd.DataFrame(rows, columns=columns)
df_last_ten_games.head()
            
    

Unnamed: 0,Season,TeamID,Last10WinCount,Last10LossCount,Last10WinPerc
0,1985,1228,7,3,0.7
1,1985,1328,9,1,0.9
2,1985,1106,6,4,0.6
3,1985,1354,4,6,0.4
4,1985,1112,7,3,0.7


In [24]:
df_team_master = pd.merge(left=df_team_master, right=df_last_ten_games, how="left", on=["Season", "TeamID"])

In [25]:
# season, teamid, top40wins, top40losses, win_perc
columns = ["Season", "TeamID", "Top40Wins", "Top40LOsses", "Top40WinPerc"]
rows = []
for season, teams in teams_hash.items():
    for team, games in teams.items():
        wins = 0
        losses = 0
        for game in games:
            if (game["top40"] == True) and (game["outcome"] == "win"):
                wins += 1
            elif (game["top40"] == True) and (game["outcome"] == "loss"):
                losses += 1
        win_perc = 0
        if (wins + losses) != 0:
            win_perc = wins / (wins + losses)
        rows.append([season, team, wins, losses, win_perc])
df_top_40_games = pd.DataFrame(rows, columns=columns)                

In [26]:
df_team_master = pd.merge(left=df_team_master, right=df_top_40_games, how='left', on=['Season', 'TeamID'])
df_team_master.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Season,TeamName,RPI,TeamID,Seed,RWinCount,RLossCount,RWinPerc,CWinCount,CLossCount,CWinPerc,Last10WinCount,Last10LossCount,Last10WinPerc,Top40Wins,Top40LOsses,Top40WinPerc
0,0,0,2000,Duke,0.67,1181.0,1.0,27,4.0,0.870968,,,,9,1,0.9,9,1,0.9
1,1,1,2000,Kansas,0.657,1242.0,8.0,22,9.0,0.709677,,,,6,4,0.6,5,3,0.625
2,3,3,2000,North Carolina,0.653,1314.0,8.0,18,13.0,0.580645,,,,5,5,0.5,4,9,0.307692
3,5,5,2000,Michigan St,0.647,1277.0,1.0,26,7.0,0.787879,,,,9,1,0.9,7,2,0.777778
4,6,6,2000,Tennessee,0.645,1397.0,4.0,23,6.0,0.793103,,,,6,4,0.6,4,1,0.8


In [27]:
df_team_master.drop(labels=['Unnamed: 0', 'Unnamed: 0.1'], inplace=True, axis=1)
df_team_master.head()

Unnamed: 0,Season,TeamName,RPI,TeamID,Seed,RWinCount,RLossCount,RWinPerc,CWinCount,CLossCount,CWinPerc,Last10WinCount,Last10LossCount,Last10WinPerc,Top40Wins,Top40LOsses,Top40WinPerc
0,2000,Duke,0.67,1181.0,1.0,27,4.0,0.870968,,,,9,1,0.9,9,1,0.9
1,2000,Kansas,0.657,1242.0,8.0,22,9.0,0.709677,,,,6,4,0.6,5,3,0.625
2,2000,North Carolina,0.653,1314.0,8.0,18,13.0,0.580645,,,,5,5,0.5,4,9,0.307692
3,2000,Michigan St,0.647,1277.0,1.0,26,7.0,0.787879,,,,9,1,0.9,7,2,0.777778
4,2000,Tennessee,0.645,1397.0,4.0,23,6.0,0.793103,,,,6,4,0.6,4,1,0.8


In [28]:
df_cleansed_master = df_team_master.drop(labels=["RWinCount", "RLossCount", "CWinCount", "CLossCount", "CWinPerc", "Last10WinCount", "Last10LossCount", "Top40Wins", "Top40LOsses" ], inplace=False, axis=1)
df_cleansed_master.head()

Unnamed: 0,Season,TeamName,RPI,TeamID,Seed,RWinPerc,Last10WinPerc,Top40WinPerc
0,2000,Duke,0.67,1181.0,1.0,0.870968,0.9,0.9
1,2000,Kansas,0.657,1242.0,8.0,0.709677,0.6,0.625
2,2000,North Carolina,0.653,1314.0,8.0,0.580645,0.5,0.307692
3,2000,Michigan St,0.647,1277.0,1.0,0.787879,0.9,0.777778
4,2000,Tennessee,0.645,1397.0,4.0,0.793103,0.6,0.8


In [29]:
df_tour = df_tour.loc[df_tour['Season'] >= 2000]

In [44]:
df_dummy = df_cleansed_master.rename(columns = { 'TeamID': 'WTeamID', 'RPI': 'FirstRPI', 'Seed': 'FirstSeed', 'RWinPerc': 'FirstRegWinPerc', 'Last10WinPerc': 'FirstLast10WinPerc', 'Top40WinPerc': 'FirstTop40WinPerc' })
df_tournament_games = pd.merge(left=df_tour, right=df_dummy, how="left", on=["Season", "WTeamID"])
df_tournament_games.drop(labels=["TeamName"], inplace=True, axis=1)

In [45]:
df_tournament_games.head()

Unnamed: 0,Season,WTeamID,LTeamID,FirstRPI,FirstSeed,FirstRegWinPerc,FirstLast10WinPerc,FirstTop40WinPerc
0,2000,1112,1238,0.529,1.0,0.8125,0.8,0.857143
1,2000,1120,1166,0.619,7.0,0.709677,0.4,0.625
2,2000,1211,1257,0.644,10.0,0.75,0.7,0.666667
3,2000,1235,1148,0.6,2.0,0.866667,0.9,0.75
4,2000,1246,1382,0.643,5.0,0.7,0.6,0.555556


In [46]:
df_dummy_2 = df_cleansed_master.rename(columns = { 'TeamID': 'LTeamID', 'RPI': 'SecondRPI', 'Seed': 'SecondSeed', 'RWinPerc': 'SecondRegWinPerc', 'Last10WinPerc': 'SecondLast10WinPerc', 'Top40WinPerc': 'SecondTop40WinPerc' })
df_tournament_games = pd.merge(left=df_tournament_games, right=df_dummy_2, how="left", on=["Season", "LTeamID"])
# df_tournament_games.drop(labels=["TeamName"], inplace=True, axis=1)

In [47]:
df_tournament_games.head()

Unnamed: 0,Season,WTeamID,LTeamID,FirstRPI,FirstSeed,FirstRegWinPerc,FirstLast10WinPerc,FirstTop40WinPerc,TeamName,SecondRPI,SecondSeed,SecondRegWinPerc,SecondLast10WinPerc,SecondTop40WinPerc
0,2000,1112,1238,0.529,1.0,0.8125,0.8,0.857143,Jackson St,0.41,16.0,0.516129,0.8,0.0
1,2000,1120,1166,0.619,7.0,0.709677,0.4,0.625,Creighton,0.562,10.0,0.71875,0.8,0.0
2,2000,1211,1257,0.644,10.0,0.75,0.7,0.666667,Louisville,0.593,7.0,0.62069,0.8,0.5
3,2000,1235,1148,0.6,2.0,0.866667,0.9,0.75,Central Conn,0.434,15.0,0.833333,0.7,0.0
4,2000,1246,1382,0.643,5.0,0.7,0.6,0.555556,St Bonaventure,0.526,12.0,0.7,0.7,0.5


In [48]:
df_tournament_games['RPIDifference'] = df_tournament_games['FirstRPI'] - df_tournament_games['SecondRPI']
df_tournament_games['SeedDifference'] = df_tournament_games['FirstSeed'] - df_tournament_games['SecondSeed']
df_tournament_games['RegWinPercDifference'] = df_tournament_games['FirstRegWinPerc'] - df_tournament_games['SecondRegWinPerc']
df_tournament_games['Last10WinPercDifference'] = df_tournament_games['FirstLast10WinPerc'] - df_tournament_games['SecondLast10WinPerc']
df_tournament_games['Top40WinPercDifference'] = df_tournament_games['FirstTop40WinPerc'] - df_tournament_games['SecondTop40WinPerc']

In [49]:
df_tournament_games.head()

Unnamed: 0,Season,WTeamID,LTeamID,FirstRPI,FirstSeed,FirstRegWinPerc,FirstLast10WinPerc,FirstTop40WinPerc,TeamName,SecondRPI,SecondSeed,SecondRegWinPerc,SecondLast10WinPerc,SecondTop40WinPerc,RPIDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference
0,2000,1112,1238,0.529,1.0,0.8125,0.8,0.857143,Jackson St,0.41,16.0,0.516129,0.8,0.0,0.119,-15.0,0.296371,0.0,0.857143
1,2000,1120,1166,0.619,7.0,0.709677,0.4,0.625,Creighton,0.562,10.0,0.71875,0.8,0.0,0.057,-3.0,-0.009073,-0.4,0.625
2,2000,1211,1257,0.644,10.0,0.75,0.7,0.666667,Louisville,0.593,7.0,0.62069,0.8,0.5,0.051,3.0,0.12931,-0.1,0.166667
3,2000,1235,1148,0.6,2.0,0.866667,0.9,0.75,Central Conn,0.434,15.0,0.833333,0.7,0.0,0.166,-13.0,0.033333,0.2,0.75
4,2000,1246,1382,0.643,5.0,0.7,0.6,0.555556,St Bonaventure,0.526,12.0,0.7,0.7,0.5,0.117,-7.0,0.0,-0.1,0.055556


In [50]:
df_tournament_games.drop(labels=["FirstRPI", "SecondRPI", "FirstSeed", "SecondSeed", "FirstRegWinPerc", "SecondRegWinPerc", "FirstLast10WinPerc", "SecondLast10WinPerc", "FirstTop40WinPerc", "SecondTop40WinPerc"], inplace=True, axis=1)

In [51]:
df_tournament_games.head()

Unnamed: 0,Season,WTeamID,LTeamID,TeamName,RPIDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference
0,2000,1112,1238,Jackson St,0.119,-15.0,0.296371,0.0,0.857143
1,2000,1120,1166,Creighton,0.057,-3.0,-0.009073,-0.4,0.625
2,2000,1211,1257,Louisville,0.051,3.0,0.12931,-0.1,0.166667
3,2000,1235,1148,Central Conn,0.166,-13.0,0.033333,0.2,0.75
4,2000,1246,1382,St Bonaventure,0.117,-7.0,0.0,-0.1,0.055556


In [52]:
df_tournament_games.drop(labels=["Season", "TeamName", "WTeamID", "LTeamID"], inplace=True, axis=1)

In [53]:
df_tournament_games.head()

Unnamed: 0,RPIDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference
0,0.119,-15.0,0.296371,0.0,0.857143
1,0.057,-3.0,-0.009073,-0.4,0.625
2,0.051,3.0,0.12931,-0.1,0.166667
3,0.166,-13.0,0.033333,0.2,0.75
4,0.117,-7.0,0.0,-0.1,0.055556


In [54]:
df_tournament_games["win"] = 1

In [55]:
df_tournament_games.head()

Unnamed: 0,RPIDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference,win
0,0.119,-15.0,0.296371,0.0,0.857143,1
1,0.057,-3.0,-0.009073,-0.4,0.625,1
2,0.051,3.0,0.12931,-0.1,0.166667,1
3,0.166,-13.0,0.033333,0.2,0.75,1
4,0.117,-7.0,0.0,-0.1,0.055556,1


In [58]:
df_tournament_games_flipped = df_tournament_games
df_tournament_games_flipped['RPIDifference'] = -1 * df_tournament_games_flipped['RPIDifference']
df_tournament_games_flipped['SeedDifference'] = -1 * df_tournament_games_flipped['SeedDifference']
df_tournament_games_flipped['RegWinPercDifference'] = -1 * df_tournament_games_flipped['RegWinPercDifference']
df_tournament_games_flipped['Last10WinPercDifference'] = -1 * df_tournament_games_flipped['Last10WinPercDifference']
df_tournament_games_flipped['Top40WinPercDifference'] = -1 * df_tournament_games_flipped['Top40WinPercDifference']
df_tournament_games_flipped['win'] = 0

In [59]:
df_tournament_games_flipped.head()

Unnamed: 0,RPIDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference,win
0,0.119,-15.0,0.296371,0.0,0.857143,0
1,0.057,-3.0,-0.009073,-0.4,0.625,0
2,0.051,3.0,0.12931,-0.1,0.166667,0
3,0.166,-13.0,0.033333,0.2,0.75,0
4,0.117,-7.0,0.0,-0.1,0.055556,0


In [62]:
frames = [df_tournament_games, df_tournament_games_flipped]
df_historical_games = pd.concat(frames)

In [65]:
df_tournament_games.head()

Unnamed: 0,RPIDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference,win
0,0.119,-15.0,0.296371,0.0,0.857143,0
1,0.057,-3.0,-0.009073,-0.4,0.625,0
2,0.051,3.0,0.12931,-0.1,0.166667,0
3,0.166,-13.0,0.033333,0.2,0.75,0
4,0.117,-7.0,0.0,-0.1,0.055556,0
