# Loading initial packages

In [1]:
import re
import numpy
import pandas as pd
from pandas.stats.api import ols
from subprocess import check_output

# Reading in the data

In [2]:
TourneySeeds = pd.read_csv('data/TourneySeeds.csv')
SampleSubmission = pd.read_csv('data/SampleSubmission.csv')
Seasons = pd.read_csv('data/Seasons.csv')
Teams = pd.read_csv('data/Teams.csv')
TourneySlots = pd.read_csv('data/TourneySlots.csv')
TourneyDetailedResults = pd.read_csv('data/TourneyDetailedResults.csv')
TourneyCompactResults = pd.read_csv('data/TourneyCompactResults.csv')
team_dict = dict(zip(Teams['Team_Id'].values, Teams['Team_Name'].values))
RegularSeasonCompact = pd.read_csv('data/RegularSeasonCompactResults.csv')
TourneyDetailedResults['Wteam_name'] = TourneyDetailedResults['Wteam'].map(team_dict)
TourneyDetailedResults['Lteam_name'] = TourneyDetailedResults['Lteam'].map(team_dict)

# A quick look at the data

In [3]:
print(TourneySeeds.head(6))

   Season Seed  Team
0    1985  W01  1207
1    1985  W02  1210
2    1985  W03  1228
3    1985  W04  1260
4    1985  W05  1374
5    1985  W06  1208


In [4]:
print(TourneySlots.head(6))

   Season  Slot Strongseed Weakseed
0    1985  R1W1        W01      W16
1    1985  R1W2        W02      W15
2    1985  R1W3        W03      W14
3    1985  R1W4        W04      W13
4    1985  R1W5        W05      W12
5    1985  R1W6        W06      W11


In [5]:
print(SampleSubmission.head(6))

               Id  Pred
0  2016_1112_1114   0.5
1  2016_1112_1122   0.5
2  2016_1112_1124   0.5
3  2016_1112_1138   0.5
4  2016_1112_1139   0.5
5  2016_1112_1143   0.5


In [6]:
print(Seasons.head(6))

   Season     Dayzero Regionw    Regionx    Regiony    Regionz
0    1985  10/29/1984    East       West    Midwest  Southeast
1    1986  10/28/1985    East    Midwest  Southeast       West
2    1987  10/27/1986    East  Southeast    Midwest       West
3    1988   11/2/1987    East    Midwest  Southeast       West
4    1989  10/31/1988    East       West    Midwest  Southeast
5    1990  10/30/1989    East    Midwest  Southeast       West


In [7]:
print(Teams.head(6))

   Team_Id    Team_Name
0     1101  Abilene Chr
1     1102    Air Force
2     1103        Akron
3     1104      Alabama
4     1105  Alabama A&M
5     1106   Alabama St


In [8]:
print(TourneyDetailedResults.head(6))

   Season  Daynum  Wteam  Wscore  Lteam  Lscore Wloc  Numot  Wfgm  Wfga  \
0    2003     134   1421      92   1411      84    N      1    32    69   
1    2003     136   1112      80   1436      51    N      0    31    66   
2    2003     136   1113      84   1272      71    N      0    31    59   
3    2003     136   1141      79   1166      73    N      0    29    53   
4    2003     136   1143      76   1301      74    N      1    27    64   
5    2003     136   1163      58   1140      53    N      0    17    52   

      ...       Lfta  Lor  Ldr  Last  Lto  Lstl  Lblk  Lpf     Wteam_name  \
0     ...         31   17   28    16   15     5     0   22  UNC Asheville   
1     ...          7    8   26    12   17    10     3   15        Arizona   
2     ...         21   20   22    11   12     2     5   18     Arizona St   
3     ...         17   14   17    20   21     6     6   21     C Michigan   
4     ...         20   10   26    16   14     5     8   19     California   
5     ...   

In [9]:
print(TourneyCompactResults.head(6))

   Season  Daynum  Wteam  Wscore  Lteam  Lscore Wloc  Numot
0    1985     136   1116      63   1234      54    N      0
1    1985     136   1120      59   1345      58    N      0
2    1985     136   1207      68   1250      43    N      0
3    1985     136   1229      58   1425      55    N      0
4    1985     136   1242      49   1325      38    N      0
5    1985     136   1246      66   1449      58    N      0


# TrueSkill

## Add the package

In [10]:
from trueskill import Rating, quality_1vs1, rate_1vs1

## Data Cleaning
- Remove all games before 2008 (three point line was introduced)

In [11]:
RegPost2008 = RegularSeasonCompact[RegularSeasonCompact['Season'] >= 2008]
print(RegPost2008.head(6))

       Season  Daynum  Wteam  Wscore  Lteam  Lscore Wloc  Numot
97710    2008       0   1272     102   1404      71    H      0
97711    2008       0   1350      44   1263      42    N      0
97712    2008       1   1205      69   1105      55    N      0
97713    2008       1   1246      67   1146      40    H      0
97714    2008       1   1272      80   1350      63    H      0
97715    2008       1   1404      66   1263      62    N      0


## Apply TrueSkill to season data
- For each season, calculate the TrueSkill of each team in the season

In [13]:
# initialize the environment
env = TrueSkill()

# (EXAMPLE) Seed-Based model

## Extract seeds for each team

In [1]:
TourneySeeds['SeedNum'] = TourneySeeds['Seed'].apply(lambda x: re.sub("[A-z+a-z]","",x,flags=re.IGNORECASE))
print(TourneySeeds.tail(10))

NameError: name 'TourneySeeds' is not defined

In [15]:
game_to_predict = pd.concat([SampleSubmission['Id'],SampleSubmission['Id'].str.split('_', expand=True)], axis=1)
game_to_predict.rename(columns={0: 'season', 1: 'team1',2: 'team2'}, inplace=True)
game_to_predict['season'] = pd.to_numeric(game_to_predict['season'])
game_to_predict['team1'] = pd.to_numeric(game_to_predict['team1'])
game_to_predict['team2'] = pd.to_numeric(game_to_predict['team2'])
TourneySeeds['Season'] = pd.to_numeric(TourneySeeds['Season'])
TourneySeeds['Team'] = pd.to_numeric(TourneySeeds['Team'])
TourneySeeds['SeedNum'] = pd.to_numeric(TourneySeeds['SeedNum'])
game_to_predict = pd.merge(game_to_predict,TourneySeeds[['Season','Team','SeedNum']].rename(columns={'Season': 'season', 'Team': 'team1','SeedNum':'TeamSeed1'}),how='left',on=['season','team1'])
game_to_predict = pd.merge(game_to_predict,TourneySeeds[['Season','Team','SeedNum']].rename(columns={'Season': 'season', 'Team': 'team2','SeedNum':'TeamSeed2'}),how='left',on=['season','team2'])
print(game_to_predict.head(10))

               Id  season  team1  team2  TeamSeed1  TeamSeed2
0  2016_1112_1114    2016   1112   1114          6         12
1  2016_1112_1122    2016   1112   1122          6         16
2  2016_1112_1124    2016   1112   1124          6          5
3  2016_1112_1138    2016   1112   1138          6         14
4  2016_1112_1139    2016   1112   1139          6          9
5  2016_1112_1143    2016   1112   1143          6          4
6  2016_1112_1151    2016   1112   1151          6         12
7  2016_1112_1153    2016   1112   1153          6          9
8  2016_1112_1160    2016   1112   1160          6          8
9  2016_1112_1163    2016   1112   1163          6          9


## Join results to seeds

In [16]:
compact_results = pd.merge(TourneyCompactResults, TourneySeeds[['Season','Team','SeedNum']].rename(columns={'Team': 'Wteam','SeedNum':'WSeedNum'}), how='left', on=['Season','Wteam'])
compact_results = pd.merge(compact_results, TourneySeeds[['Season','Team','SeedNum']].rename(columns={'Team': 'Lteam','SeedNum':'LSeedNum'}), how='left', on=['Season','Lteam'])
print(compact_results.head(6))

   Season  Daynum  Wteam  Wscore  Lteam  Lscore Wloc  Numot  WSeedNum  \
0    1985     136   1116      63   1234      54    N      0         9   
1    1985     136   1120      59   1345      58    N      0        11   
2    1985     136   1207      68   1250      43    N      0         1   
3    1985     136   1229      58   1425      55    N      0         9   
4    1985     136   1242      49   1325      38    N      0         3   
5    1985     136   1246      66   1449      58    N      0        12   

   LSeedNum  
0         8  
1         6  
2        16  
3         8  
4        14  
5         5  


## Fix wins

In [17]:
set1 = compact_results[['WSeedNum','LSeedNum']].rename(columns={'WSeedNum': 'Team1Seed','LSeedNum':'Team2Seed'})
set1['Team1Win'] = 1
set2 = compact_results[['LSeedNum','WSeedNum']].rename(columns={'LSeedNum': 'Team1Seed','WSeedNum':'Team2Seed'})
set2['Team1Win'] = 0
full_set = pd.concat([set1,set2],ignore_index=True)
full_set['Team1Seed'] = pd.to_numeric(full_set['Team1Seed'])
full_set['Team2Seed'] = pd.to_numeric(full_set['Team2Seed'])
full_set['Team1Win'] = pd.to_numeric(full_set['Team1Win'])

print(full_set.head(6))

   Team1Seed  Team2Seed  Team1Win
0          9          8         1
1         11          6         1
2          1         16         1
3          9          8         1
4          3         14         1
5         12          5         1


## Linear model

In [18]:
linmodel=ols(y=full_set['Team1Win'],x=full_set['Team2Seed']-full_set['Team1Seed'])
print(linmodel)

  exec(code_obj, self.user_global_ns, self.user_ns)


ImportError: No module named 'scikits'

In [None]:
game_to_predict['Pred'] = linmodel.predict(x=game_to_predict['TeamSeed2']-game_to_predict['TeamSeed1'])
game_to_predict[['Id','Pred']].to_csv('seed_submission.csv',index=False)