In [2]:
import numpy as np
import sklearn as sk
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
from sqlite_api import *

## Scraping Roster Data

CREATE TABLE RosterSeason 
    (Player VARCHAR(40), 
        Team VARCHAR(3),
        Position VARCHAR(2),
        String INTEGER)

In [11]:
def scrape_roster(team, season):
    archive_nums = {2016: '188', 2015: '176', 2014: '164', 2013: '151', 2012: '136',
                    2011: '124', 2010: '112', 2009: '41', 2008: '30', 2007: '18'}
    
    roster = dict()
    
    # build url and request html
    url = 'http://www.ourlads.com/nfldepthcharts/archive/' + archive_nums[season] + '/' + team
    r  = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})
    contents = BeautifulSoup(r.text, 'lxml')
    
    df = pd.DataFrame(columns=['Player', 'Position', 'Team', 'Season', 'StartingRank'])
    
    table = contents.find('tbody')
    white_rows = table.find_all('tr', {'class': 'row-dc-wht'})
    for row in white_rows:
        pos = row.find('td').text
        if pos in ['QB', 'RB', 'LWR', 'RWR', 'TE', 'PK']:
            if pos in ['LWR', 'RWR']:
                pos = 'WR'
            if pos  == 'PK':
                pos = 'K'
    
            starters = list()
            cols = row.findAll('td')
            for idx in range(len(cols)):
                if idx > 1 and idx % 2 == 0:
                    if cols[idx].text != "":
                        starters.append(extract_name(cols[idx].text))
                    else:
                        break
            
            for idx in range(len(starters)):
                df = df.append({'Player': starters[idx], 'Position': pos, 'Team': team, 'Season': season, 
                                'StartingRank': int(idx+1)}, ignore_index=True)
    
    grey_rows = table.find_all('tr', {'class': 'row-dc-grey'})
    for row in grey_rows:
        pos = row.find('td').text
        if pos in ['QB', 'RB', 'LWR', 'SWR', 'RWR', 'TE', 'PK']:
            if pos in ['LWR', 'SWR', 'RWR']:
                pos = 'WR'
            if pos  == 'PK':
                pos = 'K'
    
            starters = list()
            cols = row.findAll('td')
            for idx in range(len(cols)):
                if idx > 1 and idx % 2 == 0:
                    if cols[idx].text != "":
                        starters.append(extract_name(cols[idx].text))
                    else:
                        break
            
            for idx in range(len(starters)):
                df = df.append({'Player': starters[idx], 'Position': pos, 'Team': team, 'Season': season, 
                                'StartingRank': int(idx+1)}, ignore_index=True)
    
    return df
    
def extract_name(string):
    if string == "":
        return ""
    
    if ', ' in string:
        split = string.split(', ')
    else:
        split = string.split(' ')

    first = split[1].split(' ')[0]
    if first.isupper():
        first = first.title()
    last = split[0]
    if last.isupper():
        last = last.title()
    
    return first + ' ' + last
    
    
scrape_roster('ARZ', 2016)

Unnamed: 0,Player,Position,Team,Season,StartingRank
0,Larry Fitzgerald,WR,ARZ,2016.0,1.0
1,Jaron Brown,WR,ARZ,2016.0,2.0
2,Jermaine Gresham,TE,ARZ,2016.0,1.0
3,David Johnson,RB,ARZ,2016.0,1.0
4,Chris Johnson,RB,ARZ,2016.0,2.0
5,Andre Ellington,RB,ARZ,2016.0,3.0
6,Stepfan Taylor,RB,ARZ,2016.0,4.0
7,Michael Floyd,WR,ARZ,2016.0,1.0
8,John Brown,WR,ARZ,2016.0,2.0
9,Jj Nelson,WR,ARZ,2016.0,3.0


In [33]:
conn = sqlite3.connect('fantasy.db')
c = conn.cursor()

In [98]:
c.execute('''CREATE TABLE Roster
                (Player VARCHAR(30),
                Team VARCHAR(3),
                Position CHARACTER(2),
                Season CHARACTER(4),
                Rank INT,
                PRIMARY KEY (Player, Team, Season, Position))''')

<sqlite3.Cursor at 0x94635e0>

In [15]:
teams = {'Arizona' : 'ARZ',
                'Atlanta' : 'ATL',
                'Baltimore': 'BAL',
                'Buffalo': 'BUF',
                'Carolina': 'CAR',
                'Chicago': 'CHI',
                'Cincinnati': 'CIN',
                'Cleveland': 'CLE',
                'Dallas': 'DAL',
                'Denver': 'DEN',
                'Detroit': 'DET',
                'Green Bay': 'GB',
                'Houston': 'HOU',
                'Indianapolis': 'IND',
                'Jacksonville': 'JAX',
                'Kansas City': 'KC',
                'Miami': 'MIA',
                'Minnesota': 'MIN',
                'New England': 'NE',
                'New Orleans': 'NO',
                'NY Giants': 'NYG',
                'NY Jets': 'NYJ',
                'Oakland': 'OAK',
                'Philadelphia': 'PHI',
                'Pittsburgh': 'PIT',
                'LA Chargers': 'LAC',
                'Seattle': 'SEA',
                'San Francisco': 'SF',
                'LA Rams': 'LAR',
                'Tampa Bay': 'TB',
                'Tennessee': 'TEN',
                'Washington': 'WAS'}

In [34]:
for season in range(2007, 2017):
    print('Scraping season '+str(season)+'...')
    for key in teams:
        team = teams[key]
        
        df = scrape_roster(team, season)
        for tup in df.iterrows():
            row = tup[1]
            try:
                to_insert = (row['Player'], row['Team'], row['Position'], 
                             str(int(row['Season'])), int(row['StartingRank']))
                c.execute('INSERT INTO Roster VALUES (?,?,?,?,?)', to_insert)
            except sqlite3.IntegrityError:
                j = 0
                #print('Unique constraint failed with ' + str(to_insert))

Scraping season 2007...
Scraping season 2008...
Scraping season 2009...
Scraping season 2010...
Scraping season 2011...
Scraping season 2012...
Scraping season 2013...
Scraping season 2014...
Scraping season 2015...
Scraping season 2016...


In [36]:
conn.commit()

In [9]:
cn = 0

for row in c.execute('SELECT * FROM Roster WHERE Position = \'K\''):
    cn += 1
    print(row)
    if cn == 50:
        break

('Morten Andersen', 'ATL', 'K', '2007', 1)
('Sebastian Janikowski', 'OAK', 'K', '2007', 1)
('Mason Crosby', 'GB', 'K', '2007', 1)
('Matt Stover', 'BAL', 'K', '2007', 1)
('Rhys Lloyd', 'BAL', 'K', '2007', 2)
('Rian Lindell', 'BUF', 'K', '2007', 1)
('John Kasay', 'CAR', 'K', '2007', 1)
('Adam Vinatieri', 'IND', 'K', '2007', 1)
('David Akers', 'PHI', 'K', '2007', 1)
('Shaun Suisham', 'WAS', 'K', '2007', 1)
('Jay Feely', 'MIA', 'K', '2007', 1)
('Lawrence Tynes', 'NYG', 'K', '2007', 1)
('Nick Folk', 'DAL', 'K', '2007', 1)
('Jason Hanson', 'DET', 'K', '2007', 1)
('Josh Brown', 'SEA', 'K', '2007', 1)
('Nate Kaeding', 'LAC', 'K', '2007', 1)
('Jeff Wilkins', 'LAR', 'K', '2007', 1)
('Joe Nedney', 'SF', 'K', '2007', 1)
('Phil Dawson', 'CLE', 'K', '2007', 1)
('Olindo Mare', 'NO', 'K', '2007', 1)
('Bironas 2', 'TEN', 'K', '2007', 1)
('Jason Elam', 'DEN', 'K', '2007', 1)
('Robbie Gould', 'CHI', 'K', '2007', 1)
('Shayne Graham', 'CIN', 'K', '2007', 1)
('Stephen Gostkowski', 'NE', 'K', '2007', 1)
('Da

In [21]:
c.execute('UPDATE Roster SET Team = \'ARI\' WHERE Team = \'ARZ\'')
conn.commit()

In [37]:
conn.close()