In [23]:
import warnings
# import os
import numpy as np
import pandas as pd
# import time
# import datetime
# import re

from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
# from scipy import stats

# Blanket-suppress every warning from here on (library deprecation notices included).
warnings.filterwarnings('ignore')


In [24]:
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Forwarded to `requests.get`. Without a timeout a stalled server
        can block the call indefinitely; a timeout surfaces as a
        `RequestException` and is handled like any other request failure.

    Returns
    -------
    bytes or None
        The raw response body (`resp.content`) when `is_good_response`
        accepts the response, otherwise None. Request failures are
        reported through `log_error` and also produce None.
    """
    try:
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response counts as HTML when its status code is 200 and its
    Content-Type header contains 'html' (case-insensitive). A missing or
    empty Content-Type header yields False.
    """
    # Use .get(): indexing the header directly raises KeyError when the
    # server omits Content-Type, and a None check after .lower() can never
    # trigger.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """Report an error by printing the string form of `e` to standard output."""
    message = str(e)
    print(message)

In [25]:
# Fetch the page with an empty `y` query value and flatten every <td> cell to text.
url = "https://kenpom.com/index.php?y="
raw_html = simple_get(url)
soup = BeautifulSoup(raw_html, 'html5lib')
dta = [cell.text for cell in soup.select("td")]

# Peek at the first ten cell values.
dta[:10]

['1', 'Kansas', 'B12', '26-3', '+30.81', '115.6', '8', '84.8', '1', '67.3']

In [26]:
# Scrape one ratings table per season and store each as a raw, all-string DataFrame.
lvars = 21  # number of <td> cells that make up one table row
years = range(2000,2021)
url = "https://kenpom.com/index.php?y="
season_dict = {}

# NOTE(review): the displayed rows for year 2000 further down match the
# current-season page fetched without a year, so the site may be serving the
# current season for years it has no archive for -- confirm the first archived
# season and adjust `years` if so.
for y in years:
    # 2020 is requested without the "?y=" query string; other years append the year.
    nurl = url + str(y) if y != 2020 else url[:-3]
    raw_html = simple_get(nurl)
    if raw_html is None:
        # simple_get returns None on request errors and non-HTML responses;
        # fail here with context instead of deep inside the HTML parser.
        raise RuntimeError('No HTML returned for season {0} ({1})'.format(y, nurl))
    soup = BeautifulSoup(raw_html, 'html5lib')
    dta = [s.text for s in soup.select('td')]
    ldta = len(dta)
    if ldta % lvars:
        raise ValueError(
            'Season {0}: {1} table cells is not a multiple of {2} columns'.format(y, ldta, lvars)
        )
    # -1 lets numpy derive the row count, avoiding float division + int() truncation.
    season_dict[y] = pd.DataFrame(np.array(dta).reshape(-1, lvars))



In [27]:
# Season keys in insertion order, split into the last key and everything before it.
years = list(season_dict)
start, yearsub = years[-1], years[:-1]


In [28]:
# Stack the per-season tables (last season first, then the rest in reverse order),
# tagging every row with its season. `assign` returns a copy, so the frames kept in
# `season_dict` are not modified, and one `concat` call avoids re-copying the
# growing frame once per season.
kenpom = pd.concat(
    [season_dict[y].assign(year=y) for y in [start] + list(reversed(yearsub))]
)

# Name the 21 scraped columns plus the appended `year` column.
kenpom.columns = ['rank','school','conference','w_l','adjem','adjo','adjoR',
                  'adjd','adjdR','adjt','adjtR','luck','luckR','adjems','adjemsR',
                  'oppos','opposR','oppds','oppdsR','adjemn','adjemnR','year']



In [29]:
# Columns that were scraped as text but hold numbers, paired with their target dtype.
meta = [['rank','int'],['adjem','float'],['adjo','float'],['adjoR','int'],
        ['adjd','float'],['adjdR','int'],['adjt','float'],['adjtR','int'],
        ['luck','float'],['luckR','int'],['adjems','float'],['adjemsR','int'],
        ['oppos','float'],['opposR','int'],['oppds','float'],['oppdsR','int'],
        ['adjemn','float'],['adjemnR','int']
       ]
# Cast all listed columns in one call via a column -> dtype mapping.
kenpom = kenpom.astype({column: dtype for column, dtype in meta})

# `ascending` must be a real boolean: the string 'True' only worked by being
# truthy, and newer pandas releases reject non-bool values for this argument.
kenpom = kenpom.sort_values(['year','rank'], ascending=True).reset_index(drop=True)


In [30]:
# Split each "W-L" record once, then derive wins, losses and the win percentage.
w_l_parts = [record.split("-") for record in kenpom.w_l]
kenpom['wins'] = [int(parts[0]) for parts in w_l_parts]
kenpom['losses'] = [int(parts[1]) for parts in w_l_parts]
kenpom['wpct'] = kenpom['wins'] / (kenpom['wins'] + kenpom['losses'])

In [31]:
def _split_seed(school):
    """Split a trailing all-digit token off `school`.

    Returns `(name, seed)`: for 'Virginia 1' that is ('Virginia', 1). When the
    last whitespace-separated token is not all digits (or the string is blank),
    returns the input unchanged together with NaN.
    """
    tokens = school.split()
    if tokens and tokens[-1].isdigit():
        return " ".join(tokens[:-1]), int(tokens[-1])
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return school, np.nan


# Parse each school string once; this also avoids looking seeds back up by index label.
_name_and_seed = [_split_seed(s) for s in kenpom.school]
kenpom['seed'] = [seed for _, seed in _name_and_seed]
kenpom['name'] = [name for name, _ in _name_and_seed]

In [32]:
# Final column layout: identifiers and record first, then the rating columns.
colOrder = [
    'year', 'rank', 'school', 'name', 'conference',
    'wins', 'losses', 'wpct', 'seed', 'w_l',
    'adjem', 'adjo', 'adjoR', 'adjd', 'adjdR', 'adjt', 'adjtR',
    'luck', 'luckR', 'adjems', 'adjemsR',
    'oppos', 'opposR', 'oppds', 'oppdsR', 'adjemn', 'adjemnR',
]

kenpom = kenpom.loc[:, colOrder]


In [33]:
# Earliest season present in the combined frame.
kenpom['year'].min()

2000

In [34]:
# Latest season present in the combined frame.
kenpom['year'].max()

2020

In [35]:
# (rows, columns) of the combined frame.
kenpom.shape

(7216, 27)

In [36]:
# Preview the first rows after sorting by year and rank.
kenpom.head()

Unnamed: 0,year,rank,school,name,conference,wins,losses,wpct,seed,w_l,...,luck,luckR,adjems,adjemsR,oppos,opposR,oppds,oppdsR,adjemn,adjemnR
0,2000,1,Kansas,Kansas,B12,26,3,0.896552,,26-3,...,0.031,101,12.74,1,107.3,20,94.6,1,9.84,11
1,2000,2,Gonzaga,Gonzaga,WCC,29,2,0.935484,,29-2,...,0.045,68,1.71,117,102.9,130,101.2,104,-1.85,243
2,2000,3,Baylor,Baylor,B12,25,3,0.892857,,25-3,...,0.004,180,9.37,33,106.1,43,96.7,17,1.21,141
3,2000,4,San Diego St.,San Diego St.,MWC,28,1,0.965517,,28-1,...,0.01,158,2.62,107,104.9,82,102.3,146,-1.36,229
4,2000,5,Duke,Duke,ACC,23,6,0.793103,,23-6,...,-0.013,222,7.59,46,106.0,44,98.5,53,2.72,85


In [37]:
# Per-column non-null counts and dtypes, to confirm the numeric casts applied.
kenpom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7216 entries, 0 to 7215
Data columns (total 27 columns):
year          7216 non-null int64
rank          7216 non-null int64
school        7216 non-null object
name          7216 non-null object
conference    7216 non-null object
wins          7216 non-null int64
losses        7216 non-null int64
wpct          7216 non-null float64
seed          1652 non-null float64
w_l           7216 non-null object
adjem         7216 non-null float64
adjo          7216 non-null float64
adjoR         7216 non-null int64
adjd          7216 non-null float64
adjdR         7216 non-null int64
adjt          7216 non-null float64
adjtR         7216 non-null int64
luck          7216 non-null float64
luckR         7216 non-null int64
adjems        7216 non-null float64
adjemsR       7216 non-null int64
oppos         7216 non-null float64
opposR        7216 non-null int64
oppds         7216 non-null float64
oppdsR        7216 non-null int64
adjemn        7216 

In [None]:
# Sanity check: inspect the top rows of a few individual seasons.

In [38]:
# Top rows of the 2019 season.
kenpom[kenpom['year'] == 2019].head()

Unnamed: 0,year,rank,school,name,conference,wins,losses,wpct,seed,w_l,...,luck,luckR,adjems,adjemsR,oppos,opposR,oppds,oppdsR,adjemn,adjemnR
6510,2019,1,Virginia 1,Virginia,ACC,35,3,0.921053,1.0,35-3,...,0.05,62,11.18,22,109.2,34,98.1,14,-3.24,255
6511,2019,2,Gonzaga 1,Gonzaga,WCC,33,4,0.891892,1.0,33-4,...,-0.001,180,4.46,75,106.9,69,102.5,87,1.87,105
6512,2019,3,Michigan St. 2,Michigan St.,B10,32,7,0.820513,2.0,32-7,...,0.001,177,13.67,2,110.6,7,96.9,1,3.24,82
6513,2019,4,Duke 1,Duke,ACC,32,6,0.842105,1.0,32-6,...,0.018,134,12.85,7,110.7,4,97.8,11,5.08,40
6514,2019,5,Texas Tech 3,Texas Tech,B12,31,7,0.815789,3.0,31-7,...,0.004,171,11.18,21,109.8,19,98.7,22,-5.39,307


In [39]:
# Top rows of the 2010 season.
kenpom[kenpom['year'] == 2010].head()

Unnamed: 0,year,rank,school,name,conference,wins,losses,wpct,seed,w_l,...,luck,luckR,adjems,adjemsR,oppos,opposR,oppds,oppdsR,adjemn,adjemnR
3371,2010,1,Duke 1,Duke,ACC,35,5,0.875,1.0,35-5,...,0.009,145,11.53,4,107.9,19,96.4,1,2.5,94
3372,2010,2,Kansas 1,Kansas,B12,33,3,0.916667,1.0,33-3,...,0.038,83,9.53,16,108.2,16,98.7,32,1.13,125
3373,2010,3,Syracuse 1,Syracuse,BE,30,5,0.857143,1.0,30-5,...,-0.008,191,8.62,36,108.5,10,99.8,67,-2.96,246
3374,2010,4,Kentucky 1,Kentucky,SEC,35,3,0.921053,1.0,35-3,...,0.06,43,7.06,55,106.5,50,99.5,57,-1.28,194
3375,2010,5,West Virginia 2,West Virginia,BE,31,7,0.815789,2.0,31-7,...,0.057,45,12.58,1,110.0,1,97.5,7,5.45,50


In [40]:
# Persist every season except 2020; the row index is left out of the file.
kept_seasons = kenpom[kenpom['year'] != 2020]
kept_seasons.to_csv('kenpom.csv', index=False)