In [116]:
# Load main packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import getpass # For identifying user/directory structure
import networkx as nx
import re
from sklearn import linear_model
%matplotlib inline

# Load local files
import sys # For reading files in other directories
# Per-user locations: (helper-script directory, Kaggle data directory).
_user_paths = {
    'rockc_000': ('C:/Users/rockc_000/Documents/GitHub/GithubSandbox/Python/NCAA Scripts',
                  'C:/Users/rockc_000/Documents/Personal Files/Kaggle/NCAA'),
    'josh': ('/home/josh/Documents/Github/GithubSandbox/Python/NCAA Scripts',
             '/home/josh/Documents/Personal/Kaggle/NCAA'),
}
_current_user = getpass.getuser()
if _current_user in _user_paths:
    # NOTE: `dir` shadows the builtin of the same name; the name is kept
    # because the data-loading cell builds its file paths from it.
    _scripts_path, dir = _user_paths[_current_user]
    # Put the helper scripts first on the module search path.
    sys.path.insert(0, _scripts_path)
from make_submission import make_submission

In [2]:
# Load every competition CSV from the data directory into its own DataFrame.
def _read_table(file_suffix):
    """Return the CSV found at `dir + file_suffix` as a DataFrame."""
    return pd.read_csv(dir + file_suffix)


seasons = _read_table('/Seasons.csv')
teams = _read_table('/Teams.csv')
regular_compact_results = _read_table('/RegularSeasonCompactResults.csv')
regular_detailed_results = _read_table('/RegularSeasonDetailedResults.csv')
tourney_compact_results = _read_table('/TourneyCompactResults.csv')
tourney_detailed_results = _read_table('/TourneyDetailedResults.csv')
tourney_seed = _read_table('/TourneySeeds.csv')
tourney_slots = _read_table('/TourneySlots.csv')
sample_submission = _read_table('/SampleSubmission.csv')

In [3]:
# Preview the first `nrow` rows of every dataset together with its shape.
nrow = 4
_previews = (
    ('seasons:\n', seasons),
    ('teams:\n', teams),
    ('regular_compact_results:\n', regular_compact_results),
    ('regular_detailed_results:\n', regular_detailed_results),
    ('tourney_compact_results:\n', tourney_compact_results),
    ('tourney_detailed_results:\n', tourney_detailed_results),
    ('tourney_seed:\n', tourney_seed),
    ('tourney_slots:\n', tourney_slots),
    ('sample_submission:\n', sample_submission),
)
for _label, _frame in _previews:
    print(_label, _frame.head(nrow), '\nDimension: ', _frame.shape)

seasons:
    Season     Dayzero Regionw    Regionx    Regiony    Regionz
0    1985  10/29/1984    East       West    Midwest  Southeast
1    1986  10/28/1985    East    Midwest  Southeast       West
2    1987  10/27/1986    East  Southeast    Midwest       West
3    1988  11/02/1987    East    Midwest  Southeast       West 
Dimension:  (32, 6)
teams:
    Team_Id    Team_Name
0     1101  Abilene Chr
1     1102    Air Force
2     1103        Akron
3     1104      Alabama 
Dimension:  (364, 2)
regular_compact_results:
    Season  Daynum  Wteam  Wscore  Lteam  Lscore Wloc  Numot
0    1985      20   1228      81   1328      64    N      0
1    1985      25   1106      77   1354      70    H      0
2    1985      25   1112      63   1223      56    H      0
3    1985      25   1165      70   1432      54    H      0 
Dimension:  (139920, 8)
regular_detailed_results:
    Season  Daynum  Wteam  Wscore  Lteam  Lscore Wloc  Numot  Wfgm  Wfga ...   \
0    2003      10   1104      68   1328      6

## PageRank algorithm on historical wins

In [6]:
# Rank the teams of every season with PageRank on a "who lost to whom" graph.
# Each season gets its own directed graph with one edge loser -> winner, so
# rank flows towards teams that win.  The per-season results are collected in
# a list and concatenated once, which avoids both a dummy seed row and the
# quadratic cost of growing a DataFrame inside the loop.
season_scores = []
for season in seasons['Season']:
    # .loc with a list of column labels replaces the removed .ix indexer.
    season_games = regular_compact_results.loc[
        regular_compact_results['Season'] == season, ['Wteam', 'Lteam']]

    G = nx.DiGraph()
    # Edges are added in game order, loser first.  A DiGraph stores at most
    # one edge per ordered pair, so repeated match-ups do not add weight.
    G.add_edges_from(zip(season_games['Lteam'].tolist(),
                         season_games['Wteam'].tolist()))

    # Calculate the page-rank using the networkx package
    scores = nx.pagerank(G)

    # One row per team, indexed by team id as before.
    ranked_teams = list(scores)
    season_scores.append(pd.DataFrame(
        {'Score': [scores[team] for team in ranked_teams],
         'Season': season,
         'Team': ranked_teams},
        index=ranked_teams, columns=['Score', 'Season', 'Team']))

if season_scores:
    pagerank_score = pd.concat(season_scores)
else:
    # No seasons at all: keep the expected columns on an empty frame.
    pagerank_score = pd.DataFrame(columns=['Score', 'Season', 'Team'])

## Aggregated Statistics

In [63]:
# Number of regular-season wins for every (Season, team) pair.
# Group keys and column selections are lists: tuples are treated as a single
# key by current pandas and no longer select several columns.
win_count = regular_compact_results.groupby(['Season', 'Wteam'])['Daynum'].count()


def _season_team_points(results, team_col, column_names):
    """Sum points and count games per (Season, team) for one side of the result.

    Parameters
    ----------
    results : DataFrame with 'Season', 'Lscore', 'Wscore' and `team_col`.
    team_col : column holding the team id to group on ('Wteam' or 'Lteam').
    column_names : mapping that renames 'Lscore'/'Wscore' to the
        points-for / points-against labels from this side's point of view.

    Returns
    -------
    DataFrame indexed by (Season, Team) with the two renamed point totals
    (in Lscore, Wscore order) followed by 'Game_cnt'.
    """
    grouped = results.groupby(['Season', team_col])
    totals = grouped[['Lscore', 'Wscore']].sum().rename(columns=column_names)
    # Group size equals the number of games the team played on this side.
    totals['Game_cnt'] = grouped.size()
    totals.index.names = ['Season', 'Team']
    return totals


# Aggregate the scores for the winning team and losing team
# as well as the number of games played.
# The 'Points_againt' spelling is kept as-is so existing references to the
# column keep working.
win_pts = _season_team_points(
    regular_compact_results, 'Wteam',
    {'Wscore': 'Points_for', 'Lscore': 'Points_againt'})
lose_pts = _season_team_points(
    regular_compact_results, 'Lteam',
    {'Lscore': 'Points_for', 'Wscore': 'Points_againt'})

# Stack both sides with a common column order and add them up per team;
# a team that only won or only lost in a season contributes a single row.
_total_cols = ['Game_cnt', 'Points_againt', 'Points_for']
total_pts = pd.concat([win_pts[_total_cols], lose_pts[_total_cols]])
total_pts = total_pts.groupby(level=['Season', 'Team']).sum()

In [67]:
# Spot check: show the first combined row, then the win-side and loss-side
# rows for season 1985, team 1102.
print(total_pts.iloc[0, :])
for _side_pts in (win_pts, lose_pts):
    print(_side_pts.loc[1985].loc[1102])

Game_cnt           24
Points_againt    1653
Points_for       1514
Name: (1985, 1102), dtype: int64
Points_againt    305
Points_for       355
Game_cnt           5
Name: 1102, dtype: int64
Points_for       1159
Points_againt    1348
Game_cnt           19
Name: 1102, dtype: int64


In [128]:
# Mean box-score statistics per (Season, team) over the games that team won.
# W* columns are the winner's own numbers, L* columns are its opponent's.
# Lists (not tuples) are used for the group keys and the column selection,
# because current pandas reads a tuple as one single key.
_win_side_cols = ['Wor', 'Wdr', 'Wast', 'Wto', 'Wstl', 'Wblk', 'Wpf',
                  'Lor', 'Ldr', 'Last', 'Lto', 'Lstl', 'Lblk', 'Lpf']
detail_win = regular_detailed_results.groupby(['Season', 'Wteam'])
detail_win = detail_win[_win_side_cols].mean()
def col_rename(col_name):
    """Translate a winner/loser column prefix into a for/against prefix.

    A leading 'W' becomes 'for_' and a leading 'L' becomes 'against_'.
    Only the first character is inspected, so capital W or L letters later
    in a name are left untouched; any other name is returned unchanged.

    Parameters
    ----------
    col_name : str
        Column label such as 'Wor' or 'Last'.

    Returns
    -------
    str
        The relabelled column name.
    """
    if col_name.startswith('W'):
        return 'for_' + col_name[1:]
    if col_name.startswith('L'):
        return 'against_' + col_name[1:]
    return col_name
# Relabel the winner-side averages as for_/against_ statistics.  Reassigning
# instead of renaming in place keeps the cell safe to re-run.
detail_win = detail_win.rename(columns=col_rename)

# Mean box-score statistics per (Season, team) over the games that team lost.
# Lists (not tuples) are required for the group keys and column selection.
detail_lose = regular_detailed_results.groupby(['Season', 'Lteam'])
detail_lose = detail_lose[['Lor', 'Ldr', 'Last', 'Lto', 'Lstl', 'Lblk', 'Lpf',
                           'Wor', 'Wdr', 'Wast', 'Wto', 'Wstl', 'Wblk', 'Wpf']].mean()

# TODO: detail_lose still carries the raw L*/W* column names, and the two
# frames have not been combined into a single `detail_data` table yet.
# col_rename maps W -> for_, which is the winner's point of view; the losing
# side needs the opposite mapping before the frames can be merged.
detail_win

Unnamed: 0_level_0,Unnamed: 1_level_0,for_or,for_dr,for_ast,for_to,for_stl,for_blk,for_pf,against_or,against_dr,against_ast,against_to,against_stl,against_blk,against_pf
Season,Wteam,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2003,1102,3.833333,19.333333,16.916667,11.083333,7.333333,2.833333,16.083333,10.583333,18.000000,7.666667,14.166667,5.916667,0.916667,20.250000
2003,1103,9.384615,21.461538,17.692308,12.538462,7.307692,1.846154,20.461538,13.384615,20.076923,13.692308,16.000000,6.461538,2.461538,25.076923
2003,1104,13.529412,26.411765,14.000000,13.058824,7.235294,4.176471,16.470588,11.235294,22.882353,10.823529,15.117647,5.176471,2.529412,20.235294
2003,1105,14.571429,25.857143,15.857143,18.000000,11.285714,2.000000,19.428571,13.142857,23.571429,12.428571,22.000000,7.142857,4.000000,20.000000
2003,1106,12.769231,28.000000,13.000000,17.692308,9.384615,3.769231,18.384615,11.461538,19.615385,9.230769,15.923077,8.692308,2.461538,18.153846
2003,1107,8.714286,25.000000,11.285714,13.000000,6.000000,2.857143,16.428571,11.285714,24.142857,15.000000,14.428571,5.142857,2.285714,18.285714
2003,1108,11.714286,24.785714,16.142857,16.500000,8.785714,4.857143,18.928571,14.285714,22.000000,12.142857,17.142857,6.000000,2.285714,19.857143
2003,1110,9.750000,24.000000,16.250000,12.875000,7.250000,1.687500,16.937500,9.750000,21.375000,10.937500,16.375000,5.062500,2.687500,18.687500
2003,1111,12.125000,28.437500,16.687500,16.812500,10.500000,5.500000,16.875000,13.937500,23.937500,15.500000,19.187500,9.437500,3.562500,21.562500
2003,1112,14.840000,28.160000,18.200000,14.800000,8.440000,4.240000,17.480000,13.000000,22.560000,15.240000,16.800000,5.840000,2.120000,22.320000
