Script to scrape data from KenPom's website

Created: 12/15/2016

In [135]:
# Load packages for this script
import cookielib
import os
import re
import time
import urllib2

import html5lib
import mechanize
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [136]:
def remove_seed(name):
    """
    Strip a trailing all-digit token from a team name.

    The name is cut at its last single space; when the text after that
    space consists only of digits, everything before the space is
    returned. Otherwise the name is returned unchanged.
    """
    # rpartition yields ('', '', name) when there is no space at all, so a
    # name that is nothing but digits collapses to the empty string.
    head, _, last_token = name.rpartition(" ")
    return head if last_token.isdigit() else name

In [137]:
def create_column_title(columns, year):
    """
    Build flat column titles from the header rows of a scraped table.

    Parameters
    ----------
    columns : sequence of row objects
        Each row must support ``find_all('th')``; the returned cells must
        expose ``.text``, ``has_attr('colspan')`` and ``cell['colspan']``.
        Only the first two rows are inspected.
    year : any
        Unused; kept so existing callers do not need to change.

    Returns
    -------
    list of str
        One title per data column. A header cell with ``colspan == 2``
        contributes two titles: its own text and the same text + 'Rank'.
        When the second row has header cells, each title is the text of
        the covering first-row cell concatenated with the second-row text.
        An empty ``columns`` yields an empty list.
    """
    def span_of(cell):
        # A missing colspan attribute means the cell covers one column.
        return int(cell['colspan']) if cell.has_attr('colspan') else 1

    if not columns:
        return []

    column_title = []
    first_line = columns[0].find_all('th')
    # Treat a lone header row the same as a second row without <th> cells
    # instead of failing on columns[1].
    second_line = columns[1].find_all("th") if len(columns) > 1 else []

    if not second_line:
        # Single-level header: titles come straight from the first row.
        for x in first_line:
            column_title.append(x.text)
            if span_of(x) == 2:
                column_title.append(x.text + 'Rank')
        return column_title

    # Two-level header: walk both rows in step, consuming second-row cells
    # until they cover the columns spanned so far by the first row.
    count_first = 0   # columns covered by first-row cells so far
    count_second = 0  # columns covered by second-row cells so far
    count = 0         # index of the next second-row cell
    for x in first_line:
        count_first += span_of(x)
        while count_second < count_first:
            y = second_line[count]
            y_colspan = span_of(y)
            count_second += y_colspan
            column_title.append(x.text + y.text)
            if y_colspan == 2:
                column_title.append(x.text + y.text + 'Rank')
            count += 1
    return column_title

# Create a method that parses a given year and spits out a raw dataframe
def import_raw_year(year, s):
    """
    Import one season of a KenPom page into a raw dataframe.

    The URL is produced by the module-level ``url_year`` callable, which
    must be defined before this function is called.

    Parameters
    ----------
    year : value accepted by ``int()``
        Season to fetch; also stored in the 'Year' column.
    s : session object
        Must provide ``get(url, headers=...)`` returning a response with a
        ``.text`` attribute (a ``requests.Session`` in this notebook).

    Returns
    -------
    pandas.DataFrame
        The table with id 'ratings-table', with column titles from
        ``create_column_title``, rows lacking a 'Team' value removed, a
        'Year' column added, and trailing numeric tokens stripped from
        'Team' via ``remove_seed``.

    Raises
    ------
    ValueError
        If the page contains no table with id 'ratings-table'.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0'}
    f = s.get(url_year(year), headers=headers)
    # NOTE(review): no parser is named, so bs4 picks whichever is installed;
    # pin one once it is confirmed which parser the existing output relied on.
    soup = BeautifulSoup(f.text)
    table_html = soup.find_all('table', {'id': 'ratings-table'})
    if not table_html:
        # Fail with a clear message instead of an IndexError on table_html[0].
        raise ValueError("No table with id 'ratings-table' found for year %s" % year)

    table = table_html[0]
    thead = table.find_all('thead')
    # Take the header rows from the ratings table itself so an unrelated
    # table earlier in the page cannot supply the column titles.
    columns = table.find_all('tr')

    # Remove the <thead> markup before handing the table to pandas; the
    # titles are rebuilt separately by create_column_title. Always pass a
    # string to read_html, even when there is no <thead> to strip.
    table_markup = str(table)
    for x in thead:
        table_markup = table_markup.replace(str(x), '')

    # Parsing and formatting
    df = pd.read_html(table_markup)[0]
    df.columns = create_column_title(columns, year)
    # Keep rows that have a team name; copy so the assignments below act on
    # an independent frame rather than a slice (avoids SettingWithCopyWarning).
    df = df[df['Team'].notnull()].copy()
    df['Year'] = int(year)
    df['Team'] = df['Team'].apply(remove_seed)

    return df

# KenPom pages to scrape; each accepts the season as a `?y=<year>` query.
base_urls = ['http://kenpom.com/index.php',
             'http://kenpom.com/summary.php',
             'http://kenpom.com/stats.php',
             'http://kenpom.com/pointdist.php',
             'http://kenpom.com/height.php',
             'http://kenpom.com/teamstats.php']

# Accumulates the data of every page, joined on team / season / conference.
df_final = None

# Login to website before scraping.
# Credentials are read from the environment so they are never stored in the
# notebook; a KeyError is raised if KENPOM_EMAIL or KENPOM_PASSWORD is unset.
payload = {'email': os.environ['KENPOM_EMAIL'],
           'password': os.environ['KENPOM_PASSWORD'],
           'submit': 'Login!'}
with requests.Session() as s:
    p = s.post('http://kenpom.com/handlers/login_handler.php', data=payload)
    # Stop on an HTTP-level login failure rather than scraping without a
    # valid session.
    p.raise_for_status()

    for base_url in base_urls:
        # import_raw_year() looks up `url_year` as a global, so it has to be
        # rebound here for each page before any season is fetched.
        url_year = lambda x: '%s?y=%s' % (base_url, str(x))

        print("Scraping page " + base_url)

        # Season ranges differ per page.
        # NOTE(review): the pointdist range stops at 2007 while the others
        # run through 2017 - confirm this is intended.
        if base_url == 'http://kenpom.com/height.php':
            years = range(2007, 2018)
        elif base_url == 'http://kenpom.com/pointdist.php':
            years = range(2003, 2008)
        else:
            years = range(2002, 2018)

        # Stack the seasons of this page with outer merges on shared columns.
        df = None
        for x in years:
            df_year = import_raw_year(x, s)
            df = df_year if df is None else pd.merge(df, df_year, how='outer')
            # Pause between requests to avoid hammering the site.
            time.sleep(3)

        if df_final is None:
            df_final = df
        else:
            df_final = pd.merge(df_final, df, how='outer', on=['Team','Year','Conf'])


Scraping page http://kenpom.com/index.php
Scraping page http://kenpom.com/summary.php
Scraping page http://kenpom.com/stats.php
Scraping page http://kenpom.com/pointdist.php
Scraping page http://kenpom.com/height.php
Scraping page http://kenpom.com/teamstats.php


In [139]:
# Split W-L column into wins and losses.
# Rows whose 'Team' value is the literal string 'Team' are dropped first.
# .copy() makes the result an independent frame, so the column assignments
# below do not write into a slice of the merged data (SettingWithCopyWarning).
df_final = df_final[df_final['Team'] != 'Team'].copy()
win_loss = df_final['W-L'].apply(lambda x: str(x).split('-'))
df_final['Wins'] = win_loss.apply(lambda parts: int(parts[0]))
df_final['Losses'] = win_loss.apply(lambda parts: int(parts[1]))
# Non-in-place drop: rebinding keeps each step explicit and chainable.
df_final = df_final.drop('W-L', axis=1)
# NOTE(review): dropna(axis=1) removes every column that has any missing
# value; columns from pages scraped for only a subset of seasons would be
# NaN for the other seasons after the outer merges - confirm this is intended.
df_final = df_final.dropna(axis=1)

In [141]:
# Write the cleaned table to a CSV file in the working directory
# (the dataframe index is written as well, as the default for to_csv).
output_path = 'kenpom_team_ratings.csv'
df_final.to_csv(output_path)