In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time
import json
from collections import namedtuple
import csv
import glob
from time import strptime

In [2]:
def scrape_table(year, week, soup, headers_=False):
    """Turn the first <table> of a parsed rankings page into row lists.

    The table's first <tr> is skipped. The next <tr> is read as the header
    row (text of its <td> cells); every later <tr> is read as a data row
    (text of its <li> items plus the text of the element with class
    'previous-games').

    Args:
        year: Value stored in the first column of every data row.
        week: Value stored in the second column of every data row.
        soup: Parsed page exposing ``.table.find_all('tr')``
            (presumably a BeautifulSoup document - confirm with caller).
        headers_: When truthy, return ``(headers, rows)`` instead of ``rows``.

    Returns:
        A list of rows ``[year, week, position, <li texts...>, games]``, where
        ``position`` is the 1-based index of the data row, or a
        ``(headers, rows)`` tuple when ``headers_`` is truthy.
    """
    rows = []
    for position, row in enumerate(soup.table.find_all('tr')[1:]):
        if position == 0:
            # First remaining row holds the column titles.
            column_titles = [cell.text for cell in row.find_all('td')]
            headers = ["YEAR", "WEEK"] + column_titles + ["GAMES"]
            continue
        schedule = row.find(class_='previous-games').text
        # Swap non-breaking spaces for plain ones and encode to UTF-8.
        schedule = schedule.replace(u'\xa0', u' ').encode('utf-8', 'replace')
        stats = [item.text for item in row.find_all('li')]
        # Keep only what follows the last "This Week:" marker (the whole
        # string when the marker is absent).
        upcoming = schedule.split("This Week:")[-1]
        rows.append([year, week, position] + stats + [upcoming])
    if not headers_:
        return rows
    return headers, rows

In [3]:
def scrape_AP_Top_25_ESPN(year, path, nice):
    week = 1
    first = True
    while True:
        if nice:
            time.sleep(1)
        url = 'http://espn.go.com/mens-college-basketball/rankings/_/poll/1/year/'+str(year)+'/week/'+str(week)+'/seasontype/2'
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        if int(soup.h1.text.split(" ")[0]) != year:
            break
        if soup.find("tr", class_ = "oddrow").text == 'No rankings available.':
            year -= 1
            week = 1
            continue
        print '\rScraping Week:', week,"; Year:", year,
        if first:
            headers, data = scrape_table(year, week, soup, True)
            first = False
        else:
            data = data + scrape_table(year, week, soup)
        week +=1
    print 'Finished!'
    print 'Saving to csv:', path+'NCAA_AP_TOP_25.csv',
    with open(path+'NCAA_AP_TOP_25.csv', 'wb') as f:
        w = csv.writer(f)
        w.writerow(headers)
        w.writerows(data)
    print 'Finished!'
    return year

In [4]:
def helper(t):
    """Extract ``(team, score)`` from the last line of a cell's <p> text.

    Args:
        t: Object exposing ``.p.get_text()`` (presumably a BeautifulSoup
            tag for one bracket cell - confirm with caller).

    Returns:
        A ``(team, score)`` tuple of strings: ``score`` is the final
        whitespace-separated token of the last text line and ``team`` is the
        preceding tokens joined with single spaces.
    """
    last_line = t.p.get_text().split('\n')[-1]
    tokens = last_line.split()
    score = tokens[-1]
    team = ' '.join(tokens[:-1])
    return team, score

In [5]:
def foldResults(results):
    """Merge consecutive result rows into one row per pair.

    Rows are paired up counting from the end of the list, so when the list
    has an odd length its first row is left out. Each merged row is the first
    row of the pair followed by the second row minus its first two items.

    Args:
        results: List of row lists. It is emptied as a side effect.

    Returns:
        A new list of merged rows, in the same order as the input pairs.
    """
    # Pairing from the end means an odd-length list skips its first entry.
    offset = len(results) % 2
    leading = results[offset::2]
    trailing = results[offset + 1::2]
    folded = [head + tail[2:] for head, tail in zip(leading, trailing)]
    # The input list is consumed entirely, including any unpaired row.
    del results[:]
    return folded

In [6]:
def scrapeRegion(table, region):
    """Collect the game results of one regional bracket table.

    Round 1 entries come from the elements with class 'align_right' (their
    text is recorded as the team's seed) together with the next <td>, which
    holds team and score. Rounds 2-4 come from the elements whose ``rowspan``
    is 2, 5 and 11 respectively; their seed is looked up by team name.

    Args:
        table: Bracket table to search (presumably a BeautifulSoup tag).
        region: Label stored in the first column of every row.

    Returns:
        A tuple ``(games, seeds)``: ``games`` is the per-team rows folded into
        one row per pair by ``foldResults``; ``seeds`` maps team name to the
        seed text.
    """
    seeds = {}
    entries = []
    # Round 1: seed cell followed by the team/score cell.
    for seed_cell in table.find_all(class_="align_right"):
        team, score = helper(seed_cell.findNext("td"))
        seeds[team] = seed_cell.text
        entries.append([region, 1, seeds[team], team, score])
    # Later rounds are identified by the rowspan of their cells.
    for round_no, span in enumerate([2, 5, 11], 2):
        for cell in table.find_all(rowspan=span):
            team, score = helper(cell)
            entries.append([region, round_no, seeds[team], team, score])
    return foldResults(entries), seeds

In [7]:
def final_four(tag, rowspan_=None):
    """Predicate that selects <td> tags by their ``rowspan`` attribute.

    Intended for use as a filter function (``scrapeBracket`` passes it to
    ``find_all`` through a lambda).

    Args:
        tag: Object with ``name`` and a dict-like ``attrs``.
        rowspan_: Required rowspan. A falsy value (None or 0) selects <td>
            tags that carry no attributes at all.

    Returns:
        True when ``tag`` is a <td> that satisfies the rowspan rule,
        otherwise False.
    """
    if tag.name != 'td':
        return False
    if not rowspan_:
        return len(tag.attrs) == 0
    # .get() yields None when the attribute is missing, and None never equals
    # a string. This replaces a bare ``except:`` that also hid unrelated
    # errors (and KeyboardInterrupt).
    return tag.attrs.get('rowspan') == str(rowspan_)

In [8]:
def scrapeBracket(soup, year):
    """Scrape every game of one tournament bracket page.

    All tables with class 'bracket' except the last are handled as regional
    brackets by ``scrapeRegion``; their region names are read from successive
    <h2> elements, starting with the one after the page's first <h2>. The
    last table is handled separately: <td> cells without attributes are
    labelled 'Final_Four' and cells with rowspan 2 are labelled
    'Championship'.

    Args:
        soup: Parsed bracket page (presumably a BeautifulSoup document).
        year: Value prepended to every returned row.

    Returns:
        A list of rows ``[year, region, round, seed, team, score,
        <seed, team, score of the paired row>]``.
    """
    seeds = {}
    regional_games = []
    heading = soup.h2.findNext("h2")
    bracket_tables = soup.find_all("table", class_='bracket')
    for table in bracket_tables[:-1]:
        games, region_seeds = scrapeRegion(table, heading.text)
        heading = heading.findNext("h2")
        seeds.update(region_seeds)
        regional_games.extend(games)

    national_entries = []
    for span, round_label in ((0, 'Final_Four'), (2, 'Championship')):
        def matches(tag, span=span):
            return final_four(tag, span)
        for cell in bracket_tables[-1].find_all(matches):
            team, score = helper(cell)
            # Seeds were only recorded while scraping the regional tables.
            national_entries.append(['Final_Four', round_label, seeds[team], team, score])

    all_games = regional_games + foldResults(national_entries)
    return [[year] + game for game in all_games]

In [11]:
def main(inityear=2016, nice=True, path = ''):
    year = scrape_AP_Top_25_ESPN(inityear, path, nice)
    bracket = []
    for y in range(year+1, inityear):
        print 'Scraping Year:', y,
        url = 'http://www.sports-reference.com/cbb/postseason/'+str(y)+'-ncaa.html'
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        bracket = bracket + scrapeBracket(soup, y)
    print 'Finished!'
    print 'Saving to csv:', path+'NCAA_BRACKET.csv',
    with open(path+'NCAA_BRACKET.csv', 'wb') as f:
        w = csv.writer(f)
        w.writerow(['YEAR', 'REGION', 'ROUND', 'SEED', 'TEAM', 'SCORE', 'O_SEED', 'O_NAME', 'O_SCORE'])
        w.writerows(bracket)
    print 'Finished!'

In [12]:
# Run the full scrape starting from 2016 with request throttling disabled.
main(inityear=2016, nice=False)

Scraping Week: 18 ; Year: 2002 Finished!
Saving to csv: NCAA_AP_TOP_25.csv Finished!
Scraping Year: 2002 Scraping Year: 2003 Scraping Year: 2004 Scraping Year: 2005 Scraping Year: 2006 Scraping Year: 2007 Scraping Year: 2008 Scraping Year: 2009 Scraping Year: 2010 Scraping Year: 2011 Scraping Year: 2012 Scraping Year: 2013 Scraping Year: 2014 Scraping Year: 2015 Finished!
Saving to csv: NCAA_BRACKET.csv Finished!
