In [247]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import itertools
from sklearn import cross_validation
from sklearn import preprocessing

np.random.seed(15)

#read in datasets
teams = pd.read_csv('Teams.csv')
seasons = pd.read_csv('Seasons.csv')
tourneyCompactResults = pd.read_csv('TourneyCompactResults.csv')
tourneyDetailedResults = pd.read_csv('TourneyDetailedResults.csv')
tourneySeeds = pd.read_csv('TourneySeeds.csv')
tourneySlots = pd.read_csv('TourneySlots.csv')
regularSeasonCompactResults = pd.read_csv('RegularSeasonCompactResults.csv')
regularSeasonDetailedResults = pd.read_csv('RegularSeasonDetailedResults.csv')

#turn all datasets into DataFrames
teams_df = pd.DataFrame(teams)
seasons_df = pd.DataFrame(seasons)
tourneyCompact_df = pd.DataFrame(tourneyCompactResults)
tourneyDetailed_df = pd.DataFrame(tourneyDetailedResults)
tourneySeeds_df = pd.DataFrame(tourneySeeds)
tourneySlots_df = pd.DataFrame(tourneySlots)
regSeasonCompact_df = pd.DataFrame(regularSeasonCompactResults)
regSeasonDetailed_df = pd.DataFrame(regularSeasonDetailedResults)

In [248]:
tourneySlots_df.head()

Unnamed: 0,Season,Slot,Strongseed,Weakseed
0,1985,R1W1,W01,W16
1,1985,R1W2,W02,W15
2,1985,R1W3,W03,W14
3,1985,R1W4,W04,W13
4,1985,R1W5,W05,W12


In [249]:
#team dictionary to use with all other datasets to replace team ID with team name
team_dict = teams_df.set_index('Team_Id')['Team_Name'].to_dict()
team_dict

{1101: 'Abilene Chr',
 1102: 'Air Force',
 1103: 'Akron',
 1104: 'Alabama',
 1105: 'Alabama A&M',
 1106: 'Alabama St',
 1107: 'Albany NY',
 1108: 'Alcorn St',
 1109: 'Alliant Intl',
 1110: 'American Univ',
 1111: 'Appalachian St',
 1112: 'Arizona',
 1113: 'Arizona St',
 1114: 'Ark Little Rock',
 1115: 'Ark Pine Bluff',
 1116: 'Arkansas',
 1117: 'Arkansas St',
 1118: 'Armstrong St',
 1119: 'Army',
 1120: 'Auburn',
 1121: 'Augusta',
 1122: 'Austin Peay',
 1123: 'Ball St',
 1124: 'Baylor',
 1125: 'Belmont',
 1126: 'Bethune-Cookman',
 1127: 'Binghamton',
 1128: 'Birmingham So',
 1129: 'Boise St',
 1130: 'Boston College',
 1131: 'Boston Univ',
 1132: 'Bowling Green',
 1133: 'Bradley',
 1134: 'Brooklyn',
 1135: 'Brown',
 1136: 'Bryant',
 1137: 'Bucknell',
 1138: 'Buffalo',
 1139: 'Butler',
 1140: 'BYU',
 1141: 'C Michigan',
 1142: 'Cal Poly SLO',
 1143: 'California',
 1144: 'Campbell',
 1145: 'Canisius',
 1146: 'Cent Arkansas',
 1147: 'Centenary',
 1148: 'Central Conn',
 1149: 'Charleston So

In [250]:
### The following data frames use the team_dict to map all
### of the team id numbers to the actual team name

In [251]:
#Tourney Seeds
tourneySeeds_df['Team'] = tourneySeeds_df['Team'].map(team_dict)
tourneySeeds_df

Unnamed: 0,Season,Seed,Team
0,1985,W01,Georgetown
1,1985,W02,Georgia Tech
2,1985,W03,Illinois
3,1985,W04,Loyola-Chicago
4,1985,W05,SMU
5,1985,W06,Georgia
6,1985,W07,Syracuse
7,1985,W08,Temple
8,1985,W09,Virginia Tech
9,1985,W10,DePaul


In [252]:
#Tourney Compact
tourneyCompact_df['Wteam'] = tourneyCompact_df['Wteam'].map(team_dict) #winning teams
tourneyCompact_df['Lteam'] = tourneyCompact_df['Lteam'].map(team_dict) #losing teams
tourneyCompact_df.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot
0,1985,136,Arkansas,63,Iowa,54,N,0
1,1985,136,Auburn,59,Purdue,58,N,0
2,1985,136,Georgetown,68,Lehigh,43,N,0
3,1985,136,Illinois St,58,USC,55,N,0
4,1985,136,Kansas,49,Ohio,38,N,0


In [253]:
#Tourney Detailed
tourneyDetailed_df['Wteam'] = tourneyDetailed_df['Wteam'].map(team_dict) #winning teams
tourneyDetailed_df['Lteam'] = tourneyDetailed_df['Lteam'].map(team_dict) #losing teams
tourneyDetailed_df.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,134,UNC Asheville,92,TX Southern,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,Arizona,80,Vermont,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,Arizona St,84,Memphis,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,C Michigan,79,Creighton,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,California,76,NC State,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19


In [254]:
#Regular Season Compact
regSeasonCompact_df['Wteam'] = regSeasonCompact_df['Wteam'].map(team_dict) #winning teams
regSeasonCompact_df['Lteam'] = regSeasonCompact_df['Lteam'].map(team_dict) #losing teams
regSeasonCompact_df.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot
0,1985,20,Illinois,81,Oklahoma,64,N,0
1,1985,25,Alabama St,77,S Carolina St,70,H,0
2,1985,25,Arizona,63,Houston Bap,56,H,0
3,1985,25,Cornell,70,Utica,54,H,0
4,1985,25,F Dickinson,86,Wagner,74,H,0


In [255]:
#Regular Season Detailed
regSeasonDetailed_df['Wteam'] = regSeasonDetailed_df['Wteam'].map(team_dict) #winning teams
regSeasonDetailed_df['Lteam'] = regSeasonDetailed_df['Lteam'].map(team_dict) #losing teams
regSeasonDetailed_df.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,10,Alabama,68,Oklahoma,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,Memphis,70,Syracuse,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,Marquette,73,Villanova,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,N Illinois,56,Winthrop,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,Texas,77,Georgia,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [256]:

tourneySeeds_df.head()

Unnamed: 0,Season,Seed,Team
0,1985,W01,Georgetown
1,1985,W02,Georgia Tech
2,1985,W03,Illinois
3,1985,W04,Loyola-Chicago
4,1985,W05,SMU


In [263]:
# trying to create a dictionary that looks similiar to this:
# {(1985,W01):Georgetown, (1985,W02):Georgia Tech}

#seeds_dict = dict(zip(tourneySeeds_df.Season, tourneySeeds_df.Seed))
