In [1]:
import numpy as np
import sklearn as sk
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3

In [2]:
def scrape(season, position):
    """Download one season/position stats table from thehuddle.com as a DataFrame.

    Parameters
    ----------
    season : int or str
        Season year. It is interpolated into the URL and stored, as a string,
        in the 'Season' column.
    position : str
        Position code sent as the ``pos`` query parameter (e.g. 'QB') and
        stored in the 'Position' column.

    Returns
    -------
    pandas.DataFrame
        One row per table row; every value is the stripped text of the
        corresponding ``<td>``. Columns are the 15 stat labels followed by
        'Position' and 'Season', i.e. the column order of the PlayerSeason
        table, which the positional INSERT in ``update_players`` relies on.
        An empty table yields an empty frame with those same columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the returned page contains no ``<tbody>`` element.
    """
    # column names for the scraped part of the dataframe
    labels = ['Player', 'Team', 'Plays', 'Fantasy', 'Games', 'FP/G', 'Num Runs',
              'Run Yrds', 'Run TDs', 'Pass Attempts', 'Complete', 'Pass Yrds', 'Pass TDs', 'Fumbles', 'Interceptions']

    # the table is read as a flat list of cells, so its width must equal len(labels)
    num_cols = len(labels)

    # explicit final column order (previously an accident of append's sorted column union)
    columns = labels + ['Position', 'Season']

    # build url and request html
    url = 'http://thehuddle.com/stats/'+str(season)+'/plays_std.php?pos=' + position
    r = requests.get(url, headers={'User-agent': 'Mozilla/5.0'}, timeout=30)
    r.raise_for_status()  # fail loudly instead of parsing an error page
    contents = BeautifulSoup(r.text, 'lxml')

    table = contents.find("tbody")
    if table is None:
        raise ValueError('no <tbody> found at ' + url)
    entries = table.find_all("td")

    # number of complete rows in the table; trailing partial rows are ignored
    num_rows = len(entries) // num_cols

    # collect plain dicts and build the frame once: DataFrame.append was
    # quadratic in a loop and no longer exists in pandas >= 2.0
    records = []
    for rowN in range(num_rows):
        cells = entries[rowN*num_cols:(rowN + 1)*num_cols]
        cur_row = {label: cell.text.strip() for label, cell in zip(labels, cells)}
        cur_row['Position'] = position
        cur_row['Season'] = str(season)
        records.append(cur_row)

    return pd.DataFrame(records, columns=columns)
    
# Scrape the 2010 quarterback table once; the frame is displayed in a later cell.
qb2010 = scrape(season=2010, position='QB')

In [3]:
# Open (or create) the on-disk database that holds one row per player/season/position/team.
conn = sqlite3.connect('players.db')

c = conn.cursor()

# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises
# sqlite3.OperationalError as soon as players.db already contains the table.
# Column order matters: rows are inserted positionally by update_players.
# The UNIQUE constraint is what rejects a player-season that was already loaded.
c.execute('''CREATE TABLE IF NOT EXISTS PlayerSeason
             (Player VARCHAR(40), 
             Team VARCHAR(3), 
             Plays SMALLINT, 
             Fantasy SMALLINT, 
             Games SMALLINT,
             FPG DECIMAL(5,2),
             NumRuns SMALLINT,
             RunYrds SMALLINT,
             RunTDs SMALLINT,
             PassAttempts SMALLINT, 
             Complete SMALLINT, 
             PassYrds SMALLINT, 
             PassTDs SMALLINT, 
             Fumbles SMALLINT, 
             Interceptions SMALLINT,
             Position CHARACTER(2),
             Season CHARACTER(4),
             UNIQUE (Player, Season, Position, Team))''')

<sqlite3.Cursor at 0x600c7a0>

In [4]:
def update_players(conn, df):
    """Insert every row of ``df`` into the PlayerSeason table, skipping duplicates.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open connection to a database that contains the PlayerSeason table.
    df : pandas.DataFrame
        Rows to insert. Values are bound positionally, so the frame must have
        the table's 17 columns in the table's order. The 'Player' and 'Season'
        columns are also read by name for the duplicate message.

    Returns
    -------
    None

    Raises
    ------
    sqlite3.Error
        For any database failure other than a constraint violation (for
        example a wrong number of columns). The rows inserted by this call
        are rolled back in that case.
    """
    insert_sql = 'INSERT INTO PlayerSeason VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'

    # The connection context manager commits once on success and rolls back
    # if an unexpected exception escapes the loop.
    with conn:
        c = conn.cursor()
        # itertuples yields plain Python scalars; rows taken with df.iloc[...] can
        # carry numpy integers, which sqlite3 refuses to bind.
        for ndx, row in enumerate(df.itertuples(index=False, name=None)):
            try:
                c.execute(insert_sql, row)
            except sqlite3.IntegrityError:
                # Only a constraint violation means "already loaded"; every other
                # error now propagates instead of being misreported as a duplicate.
                print(str(df.iloc[ndx]['Player'])+' '+str(df.iloc[ndx]['Season'])+' already in table')

In [6]:
# positions that have their own stats page
pos = ['QB', 'WR', 'RB', 'TE']

conn = sqlite3.connect('players.db')

# try/finally releases the connection even when a scrape or an insert fails
# part-way through the 44 season/position pages
try:
    # loop through all seasons available (2006 through 2016 inclusive)
    for year in range(2006, 2017):
        # loop through all positions
        for cur_pos in pos:
            print(str(year)+'  '+cur_pos)
            update_players(conn, scrape(year, cur_pos))
finally:
    conn.close()

2006  QB
2006  WR
2006  RB
2006  TE
2007  QB
2007  WR
2007  RB
2007  TE
2008  QB
2008  WR
2008  RB
2008  TE
2009  QB
2009  WR
2009  RB
2009  TE
2010  QB
2010  WR
2010  RB
2010  TE
2011  QB
2011  WR
2011  RB
2011  TE
2012  QB
2012  WR
2012  RB
2012  TE
2013  QB
2013  WR
2013  RB
2013  TE
2014  QB
2014  WR
2014  RB
2014  TE
2015  QB
2015  WR
2015  RB
2015  TE
2016  QB
2016  WR
2016  RB
2016  TE


In [21]:
# NOTE(review): the loading cell already ends with conn.close(); this repeated
# close looks like a leftover from interactive use - confirm it can be dropped.
conn.close()

In [11]:
# Display the 2010 quarterback frame returned by scrape(2010, 'QB').
qb2010

Unnamed: 0,Player,Team,Plays,Fantasy,Games,FP/G,Num Runs,Run Yrds,Run TDs,Pass Attempts,Complete,Pass Yrds,Pass TDs,Fumbles,Interceptions,Position,Season
0,Peyton Manning,IND,697,369,16,23.05,18,18,0,679,450,4700,33,1,17,QB,2010
1,Aaron Rodgers,GB,539,368,15,24.51,64,356,4,475,312,3922,28,1,11,QB,2010
2,Drew Brees,NO,677,363,16,22.67,18,-3,0,658,448,4620,33,2,22,QB,2010
3,Philip Rivers,SD,570,361,16,22.54,29,52,0,541,357,4710,30,4,13,QB,2010
4,Michael Vick,PHI,472,356,12,29.71,100,676,9,372,233,3018,21,3,6,QB,2010
5,Tom Brady,NE,523,348,16,21.75,31,30,1,492,324,3900,36,1,4,QB,2010
6,Eli Manning,NYG,571,331,16,20.69,32,70,0,539,339,4002,31,5,25,QB,2010
7,Matt Schaub,HOU,596,317,16,19.83,22,28,0,574,365,4370,24,3,12,QB,2010
8,Matt Ryan,ATL,617,309,16,19.34,46,122,0,571,357,3705,28,3,9,QB,2010
9,Josh Freeman,TB,542,309,16,19.31,68,364,0,474,291,3451,25,3,6,QB,2010
