In [47]:
import pandas as pd
import math
from random import * 
# Load the raw inputs used by this notebook.
# Kaggle tables: tournament seeds, per-game regular-season box scores,
# season metadata, and the TeamID/TeamName table.
seeds = pd.read_csv("../csvs/kaggle/predictive/NCAATourneySeeds.csv")
reg_season = pd.read_csv("../csvs/kaggle/predictive/RegularSeasonDetailedResults.csv")
regions = pd.read_csv("../csvs/kaggle/predictive/Seasons.csv")
team_names = pd.read_csv("../csvs/kaggle/predictive/Teams.csv")

# Per-season outcome files, decoded as latin-1.
outcomes_14 = pd.read_csv("../csvs/1314/1314_outcomes.csv", encoding='latin-1')
outcomes_15 = pd.read_csv("../csvs/1415/1415_outcomes.csv", encoding='latin-1')
outcomes_16 = pd.read_csv("../csvs/1516/1516_outcomes.csv", encoding='latin-1')
outcomes_17 = pd.read_csv("../csvs/1617/1617_outcomes.csv", encoding='latin-1')

# NOTE: "../csvs/kaggle/regular_season_stats.csv" is intentionally not loaded
# into `regular_season` here. That variable is rebuilt from `reg_season`
# further down (via pd.concat) before it is ever read, so the old read was dead
# code and only made the cell fail when the exported file did not exist yet.

# Raise the notebook's row display limit to 1755 rows.
pd.set_option('display.max_rows', 1755)

# Restrict each Kaggle table to seasons after 2013 (i.e. 2014 onwards).
seeds = seeds.loc[seeds['Season'] > 2013]
regions = regions.loc[regions['Season'] > 2013]
reg_season = reg_season.loc[reg_season['Season'] > 2013]



# reg_season has one row per game, so a team's season totals are split between
# the games it won (grouped by WTeamID) and the games it lost (grouped by
# LTeamID). Total each side separately here; the two halves are combined later.
#
# GroupBy.sum(numeric_only=True) replaces agg(sum): handing the builtin `sum`
# to agg is deprecated in current pandas, and without numeric_only any
# non-numeric column would be "summed" as well instead of being left out.
reg_season_win = reg_season.groupby(['WTeamID', 'Season'], as_index=False).sum(numeric_only=True)
reg_season_lose = reg_season.groupby(['LTeamID', 'Season'], as_index=False).sum(numeric_only=True)


# Count wins and losses per (team, season): every game row contributes one win
# to its WTeamID and one loss to its LTeamID.
team_wins = reg_season.groupby(['WTeamID', 'Season']).WTeamID.count()
team_losses = reg_season.groupby(['LTeamID', 'Season']).LTeamID.count()

# A team that went undefeated has no entry in team_losses, and a winless team
# has no entry in team_wins. These gaps used to be patched by inserting zeros
# at hand-computed list positions for:
#   undefeated: TeamID 1246 in 2015, TeamID 1455 in 2014
#   winless:    TeamID 1212 in 2015, TeamID 1363 in 2015
# Those positions are only valid for one exact snapshot of the data. Instead,
# align both counts on the union of (TeamID, Season) keys and fill the missing
# side with 0, which works for any input.
_record = pd.concat(
    [
        team_wins.rename_axis(['TeamID', 'Season']),
        team_losses.rename_axis(['TeamID', 'Season']),
    ],
    axis=1,
    keys=['W', 'L'],
).fillna(0).astype(int).sort_index()

# Create plain lists for wins and losses, ordered by (TeamID, Season). This is
# the same order a sorted groupby(['TeamID', 'Season']) produces, so the lists
# line up row-for-row with the combined per-team table they are inserted into.
xwin = _record['W'].tolist()
xloss = _record['L'].tolist()

# Bring both aggregates into one team-centric schema so they can be stacked:
# the grouped team's own stats keep the bare stat name, the other side's stats
# get an 'O' (opponent) prefix, and the grouping id becomes 'TeamID'.

# Games won: W* columns are the team's own stats, L* columns the opponent's.
win_column_map = {
    'WTeamID': 'TeamID',
    'WFGM': 'FGM', 'WFGA': 'FGA', 'WFGM3': 'FGM3', 'WFGA3': 'FGA3',
    'WFTM': 'FTM', 'WFTA': 'FTA', 'WOR': 'OR', 'WDR': 'DR',
    'WAst': 'Ast', 'WTO': 'TO', 'WStl': 'Stl', 'WBlk': 'Blk', 'WPF': 'PF',
    'LFGM': 'OFGM', 'LFGA': 'OFGA', 'LFGM3': 'OFGM3', 'LFGA3': 'OFGA3',
    'LFTM': 'OFTM', 'LFTA': 'OFTA', 'LOR': 'OOR', 'LDR': 'ODR',
    'LAst': 'OAst', 'LTO': 'OTO', 'LStl': 'OStl', 'LBlk': 'OBlk', 'LPF': 'OPF',
}
# Day number, scores, overtime count and the other side's id are not kept.
reg_season_win = (
    reg_season_win
    .rename(columns=win_column_map)
    .drop(['DayNum', 'WScore', 'LTeamID', 'LScore', 'NumOT'], axis=1)
)

# Games lost: the mirror image, L* columns are own stats and W* the opponent's.
lose_column_map = {
    'LTeamID': 'TeamID',
    'LFGM': 'FGM', 'LFGA': 'FGA', 'LFGM3': 'FGM3', 'LFGA3': 'FGA3',
    'LFTM': 'FTM', 'LFTA': 'FTA', 'LOR': 'OR', 'LDR': 'DR',
    'LAst': 'Ast', 'LTO': 'TO', 'LStl': 'Stl', 'LBlk': 'Blk', 'LPF': 'PF',
    'WFGM': 'OFGM', 'WFGA': 'OFGA', 'WFGM3': 'OFGM3', 'WFGA3': 'OFGA3',
    'WFTM': 'OFTM', 'WFTA': 'OFTA', 'WOR': 'OOR', 'WDR': 'ODR',
    'WAst': 'OAst', 'WTO': 'OTO', 'WStl': 'OStl', 'WBlk': 'OBlk', 'WPF': 'OPF',
}
reg_season_lose = (
    reg_season_lose
    .rename(columns=lose_column_map)
    .drop(['DayNum', 'WScore', 'WTeamID', 'LScore', 'NumOT'], axis=1)
)

# Stack the per-team totals from games won on top of those from games lost.
regular_season = pd.concat([reg_season_win, reg_season_lose])

# Put the columns back in reg_season_win's order in case concat changed it.
# DataFrame.reindex_axis was deprecated in pandas 0.21 and removed in 1.0;
# reindex(columns=...) is the documented replacement and gives the same result.
regular_season = regular_season.reindex(columns=reg_season_win.columns)

# A team-season can now appear twice (once from its wins, once from its
# losses); add the rows together to get full-season totals. GroupBy.sum is used
# instead of agg(sum) because passing the builtin `sum` is deprecated.
regular_season = regular_season.groupby(['TeamID', 'Season'], as_index=False).sum(numeric_only=True)

# Create the list of team names, one per row of regular_season.
# The TeamID -> TeamName mapping is built once instead of filtering team_names
# with a boolean mask for every row. drop_duplicates keeps the first row per
# TeamID, matching the `.values[0]` (first match) lookup used before.
_name_by_id = (
    team_names.drop_duplicates('TeamID')
    .set_index('TeamID')['TeamName']
    .to_dict()
)
# A TeamID that is absent from team_names raises KeyError here
# (the mask-based lookup raised IndexError in that case).
names = [_name_by_id[team_id] for team_id in regular_season.TeamID]

# Add the team name as the first column, then wins and losses right after
# the TeamID / Season columns.
regular_season.insert(0, 'TeamName', names)
regular_season.insert(3, 'W', xwin)
regular_season.insert(4, 'L', xloss)

# Games played goes in front of W and L.
games_played = regular_season['W'] + regular_season['L']
regular_season.insert(3, 'G', games_played)

# Shooting percentages (made * 100 / attempted), each inserted directly after
# its made/attempted pair. Positions already account for the earlier inserts.
for position, label, made, attempted in (
    (8, 'FG%', 'FGM', 'FGA'),
    (11, 'FG3%', 'FGM3', 'FGA3'),
    (14, 'FT%', 'FTM', 'FTA'),
):
    percentage = regular_season[made] * 100 / regular_season[attempted]
    regular_season.insert(position, label, percentage)


# Sanity check: print the length of each helper list (wins, losses, names);
# all three should match the number of rows in regular_season.
for column_values in (xwin, xloss, names):
    print(len(column_values))

# Preview the first rows of the finished data frame (shown as the cell output).
regular_season.head()
# regular_season.to_csv("regular_season_stats.csv", encoding='utf-8', index=False)



1755
1755
1755


Unnamed: 0,TeamName,TeamID,Season,G,W,L,FGM,FGA,FG%,FGM3,...,OFGA3,OFTM,OFTA,OOR,ODR,OAst,OTO,OStl,OBlk,OPF
0,Abilene Chr,1101,2014,21,2,19,427,1053,40.550807,140,...,340,385,542,218,506,327,255,147,105,392
1,Abilene Chr,1101,2015,28,7,21,600,1482,40.48583,203,...,468,437,636,281,725,362,377,164,119,463
2,Abilene Chr,1101,2016,27,9,18,643,1456,44.162088,185,...,409,478,674,232,681,347,363,138,89,531
3,Abilene Chr,1101,2017,25,9,16,611,1333,45.836459,177,...,490,397,595,266,626,340,370,151,79,431
4,Abilene Chr,1101,2018,27,12,15,689,1525,45.180328,175,...,533,449,633,268,676,325,418,176,79,477
