In [99]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

In [100]:
# Base url, and a helper that returns the url for a given year
base_url = 'http://kenpom.com/index.php'

# Season that is requested from the bare index page, i.e. without a `?y=` query
current_year = 2020


def url_year(x):
    """
    Returns the ratings url for the year `x`

    The current year maps to the bare `base_url`; every other year maps to
    `base_url` with a `?y=<year>` query string appended.
    """
    # The original one-liner placed the conditional inside the format tuple,
    # so the current year produced '<base_url>?y=<base_url>' instead of
    # '<base_url>'. Branch first, then format.
    if x == current_year:
        return base_url
    return '%s?y=%s' % (base_url, x)


# Years on kenpom's site (could also scrape this and
# set as a list if you want to be more dynamic)
years = range(2002, 2020)

In [101]:
# Create a method that parses a given year and spits out a raw dataframe
def import_raw_year(year):
    """
    Imports raw data from a ken pom year into a dataframe

    Fetches the page for `year` (via `url_year`), extracts the table whose id
    is `ratings-table`, strips its <thead> elements and parses what is left
    with pandas.

    Parameters
    ----------
    year : value accepted by `url_year`
        Season to download.

    Returns
    -------
    pandas.DataFrame
        The first table parsed from the ratings markup, with an extra
        `year` column holding the `year` argument.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no table with id `ratings-table`.
    """
    # Local import so this cell does not depend on an edit to the imports cell
    from io import StringIO

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status surfaces HTTP errors here instead of as a confusing
    # "table not found" further down.
    response = requests.get(url_year(year), timeout=30)
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'id': 'ratings-table'})
    if table is None:
        raise ValueError('No ratings-table found for year %s' % year)

    # Weird issue w/ <thead> in the html
    # Prevents us from just using pd.read_html
    # Drop every <thead> from the table so the remaining rows can be put
    # straight into a dataframe
    for thead in table.find_all('thead'):
        thead.decompose()

    # Wrap the markup in StringIO: passing literal html strings to
    # pd.read_html is deprecated in recent pandas releases.
    df = pd.read_html(StringIO(str(table)))[0]
    df['year'] = year
    return df

In [102]:
# Import all the years into a singular dataframe
# Scrape every year first and concatenate once: calling pd.concat inside the
# loop re-copies the accumulated frame on every iteration, and the previous
# version only worked because pd.concat silently skips the initial None.
yearly_frames = [import_raw_year(x) for x in years]
df = pd.concat(yearly_frames, axis=0)

In [103]:
# Column rename based off of original website
# Every rated stat is followed by its "<stat> Rank" companion column, so the
# header list is assembled from the stat names rather than typed out in full.
leading_columns = ['Rank', 'Team', 'Conference', 'W-L', 'Pyth']
ranked_stats = ['AdjustO', 'AdjustD', 'AdjustT', 'Luck',
                'SOS Pyth', 'SOS OppO', 'SOS OppD', 'NCSOS Pyth']

renamed_columns = list(leading_columns)
for stat in ranked_stats:
    renamed_columns.extend([stat, stat + ' Rank'])
renamed_columns.append('Year')

df.columns = renamed_columns

In [104]:
# Returns true if given value is a number and a valid seed number (1-16)
def valid_seed(x):
    """
    Returns True when `x`, ignoring surrounding whitespace, is a whole number
    between 1 and 16 inclusive; False otherwise.
    """
    text = str(x).strip()
    return text.isdecimal() and 0 < int(text) <= 16


# Parse out seed/team
# The seed, when present, occupies the last two characters of the Team text.
# Work out once per row whether those characters form a valid seed.
team_suffix = df['Team'].str[-2:]
has_seed = team_suffix.apply(valid_seed)

# Seed is kept as text (e.g. '1', '16'); rows without a seed get NaN
df['Seed'] = team_suffix.str.strip().where(has_seed, np.nan)

# Strip the seed off the team name. The rstrip matters for two-digit seeds:
# slicing two characters off "Name 16" leaves "Name " with a trailing space,
# which would later break exact matching on the Team column.
df['Team'] = df['Team'].str[:-2].str.rstrip().where(has_seed, df['Team'])

In [105]:
# Split W-L column into wins and losses
# Wins is the text before the first dash, Losses the text after the last dash
df['Wins'] = df['W-L'].apply(lambda record: int(record.partition('-')[0]))
df['Losses'] = df['W-L'].apply(lambda record: int(record.rpartition('-')[2]))

# The combined column is redundant once it has been split
df.drop(columns='W-L', inplace=True)

In [106]:
# Reorder columns: identifying fields first, then the ratings in site order
df = df.loc[:, [
    'Year', 'Rank', 'Team', 'Conference', 'Wins', 'Losses', 'Seed', 'Pyth',
    'AdjustO', 'AdjustO Rank',
    'AdjustD', 'AdjustD Rank',
    'AdjustT', 'AdjustT Rank',
    'Luck', 'Luck Rank',
    'SOS Pyth', 'SOS Pyth Rank',
    'SOS OppO', 'SOS OppO Rank',
    'SOS OppD', 'SOS OppD Rank',
    'NCSOS Pyth', 'NCSOS Pyth Rank',
]]

In [107]:
# Persist the cleaned ratings table; index=False leaves the row index out of the file
df.to_csv('kenpom.csv', index=False)

### Fields
- AdjEM = AdjO - AdjD
- Luck
- SoS (conference) AdjEM = OppO - OppD
- NCSOS (non-conference)

In [108]:
# Sanity check: print the (rows, columns) shape, then preview the first rows
print(df.shape)
df.head()

(6157, 24)


Unnamed: 0,Year,Rank,Team,Conference,Wins,Losses,Seed,Pyth,AdjustO,AdjustO Rank,...,Luck,Luck Rank,SOS Pyth,SOS Pyth Rank,SOS OppO,SOS OppO Rank,SOS OppD,SOS OppD Rank,NCSOS Pyth,NCSOS Pyth Rank
0,2002,1,Duke,ACC,31,4,1,34.19,121.0,1,...,-0.027,223,9.87,18,109.3,13,99.5,31,6.66,34
1,2002,2,Cincinnati,CUSA,31,4,1,30.19,118.1,7,...,0.002,165,6.58,57,106.5,66,100.0,44,3.48,80
2,2002,3,Maryland,ACC,32,4,1,29.25,119.2,4,...,0.025,104,9.88,16,109.4,11,99.5,32,1.62,120
3,2002,4,Kansas,B12,33,4,1,28.99,118.7,5,...,0.022,109,10.67,10,110.5,4,99.9,40,8.32,23
4,2002,5,Oklahoma,B12,31,5,2,26.04,114.9,20,...,0.043,69,8.77,26,109.2,15,100.4,62,-0.44,169


### Goal:
My goal is to take just the fields listed under the **Fields** heading and weight each one appropriately to produce a score for every team. In a matchup, the team with the higher score advances. This heuristic should mirror the outcomes of previous tournaments.

I believe I can take the input fields of two teams and train a classifier on whether Team 1 beats Team 2.

In [3]:
# Reload the ratings table from kenpom.csv (the file written by the scraping cells above)
df = pd.read_csv('kenpom.csv')

In [4]:
# Reduced dataset: the year/team identifiers plus the four rating fields
columns = ['Year', 'Team', 'Pyth', 'Luck', 'SOS Pyth', 'NCSOS Pyth']
small_df = df.loc[:, columns]
small_df.head()

Unnamed: 0,Year,Team,Pyth,Luck,SOS Pyth,NCSOS Pyth
0,2002,Duke,34.19,-0.027,9.87,6.66
1,2002,Cincinnati,30.19,0.002,6.58,3.48
2,2002,Maryland,29.25,0.025,9.88,1.62
3,2002,Kansas,28.99,0.022,10.67,8.32
4,2002,Oklahoma,26.04,0.043,8.77,-0.44


## Combine Historical Data (2002-2018)

In [98]:
# Load the tournament results, which are stored as two CSV files covering different year ranges
# NOTE(review): df1/df2 are not referenced again in the cells visible here — confirm where they get combined
df1 = pd.read_csv('tournament_results/2002_2012.csv')
df2 = pd.read_csv('tournament_results/2013_2018.csv')

### Create classifier dataframes

In [94]:
# Isolate matchup data
# NOTE(review): `winner_2002` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel — confirm how it is derived.
team1 = winner_2002['Team1_Name'][0]
team2 = winner_2002['Team2_Name'][0]
winner = winner_2002['Winner'][0]

year = 2002
# Rating fields used as features (this rebinds the `columns` list defined earlier)
columns = ['Pyth', 'Luck', 'SOS Pyth', 'NCSOS Pyth']


def _team_features(ratings, team, season, fields, prefix):
    """
    Returns a one-row dataframe with the `fields` of `team` in `season`

    Columns are renamed to '<prefix>_<field>' with spaces replaced by
    underscores (e.g. 'SOS Pyth' -> 'Team1_SOS_Pyth').

    Raises ValueError unless exactly one row matches, so a misspelled or
    missing team cannot silently yield an empty or NaN-filled feature vector.
    """
    is_match = (ratings['Team'] == team) & (ratings['Year'] == season)
    rows = ratings[is_match][fields].reset_index(drop=True)
    if len(rows) != 1:
        raise ValueError('Expected exactly one row for %r in %s, found %d'
                         % (team, season, len(rows)))
    renamed = {field: '%s_%s' % (prefix, field.replace(' ', '_'))
               for field in fields}
    return rows.rename(columns=renamed)


team1_data = _team_features(small_df, team1, year, columns, 'Team1')
team2_data = _team_features(small_df, team2, year, columns, 'Team2')

# Merge Data: place both teams' features side by side in a single row
feature_vector = pd.concat([team1_data, team2_data],
                           axis=1)

# Label if Team1 won (1) or not (0)
feature_vector['outcome'] = 1 if winner == 'Team1' else 0

In [95]:
# Display the feature vector assembled for the single example matchup
feature_vector

Unnamed: 0,Team1_Pyth,Team1_Luck,Team1_SOS_Pyth,Team1_NCSOS_Pyth,Team2_Pyth,Team2_Luck,Team2_SOS_Pyth,Team2_NCSOS_Pyth,outcome
0,-1.21,-0.07,-4.14,-0.42,-8.71,0.097,-14.53,2.6,1


Notes: Once we have a dataset full of vectors like the example above, we can fit classifier models to the data and use cross-validation (CV) to find the best model. That model can then be used to predict the outcomes of the remaining games.