In [1]:
import numpy as np
import sklearn as sk
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
from sqlite_api import *

In [2]:
def scrape(season, position):
    """Scrape one season of offensive-player stats from thehuddle.com.

    Downloads the standard-scoring stats page for the given season and
    position, reads every <td> cell inside the first <tbody>, and regroups
    the flat list of cells into rows of len(labels) cells each.

    Parameters
    ----------
    season : int or str
        Season year; interpolated into the URL and stored (as a string) in
        the 'Season' column.
    position : str
        Position code sent as the 'pos' query parameter (e.g. 'QB') and
        stored in the 'Position' column.

    Returns
    -------
    pandas.DataFrame
        One row per table row. Columns are the stat labels followed by
        'Position' and 'Season', which is the column order of the
        PlayerSeason table that update_players fills positionally. Every
        value is the stripped cell text (str).

    Raises
    ------
    ValueError
        If the downloaded page contains no <tbody> element.
    """
    # column names for the dataframe to be created
    labels = ['Player', 'Team', 'Plays', 'Fantasy', 'Games', 'FPG', 'NumRuns',
              'RushYrds', 'RushTDs', 'PassAttempts', 'Complete', 'PassYrds', 'PassTDs', 'Fumbles', 'Interceptions']

    # build url and request html
    url = 'http://thehuddle.com/stats/' + str(season) + '/plays_std.php?pos=' + position
    r = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})
    contents = BeautifulSoup(r.text, 'lxml')

    table = contents.find("tbody")
    if table is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError('no <tbody> found at ' + url)
    entries = table.find_all("td")

    # The table is read as a flat list of cells; the column count is derived
    # from the labels so the two can never drift apart.
    num_cols = len(labels)
    num_rows = len(entries) // num_cols

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called per row.
    records = []
    for rowN in range(num_rows):
        cells = entries[rowN * num_cols:(rowN + 1) * num_cols]
        cur_row = {label: cell.text.strip() for label, cell in zip(labels, cells)}
        cur_row['Position'] = position
        cur_row['Season'] = str(season)
        records.append(cur_row)

    # Explicit column order: stat labels, then Position, then Season.
    return pd.DataFrame(records, columns=labels + ['Position', 'Season'])
    
# Trial run of the scraper on the 2010 quarterbacks; qb2010 is not referenced
# again in the cells visible here.
qb2010 = scrape(2010, 'QB')

In [3]:
def scrape_kicker(season, position='PK'):
    """Scrape one season of kicker stats from thehuddle.com.

    Downloads the standard-scoring stats page for the given season and
    position, reads every <td> cell inside the first <tbody>, and regroups
    the flat list of cells into rows of len(labels) cells each.

    Parameters
    ----------
    season : int or str
        Season year; interpolated into the URL and stored (as a string) in
        the 'Season' column.
    position : str, optional
        Position code sent as the 'pos' query parameter and stored in the
        'Position' column. Defaults to 'PK'.

    Returns
    -------
    pandas.DataFrame
        One row per table row. Columns are the stat labels followed by
        'Position' and 'Season', which is the column order of the
        KickerSeason table that update_kickers fills positionally. Every
        value is the stripped cell text (str).

    Raises
    ------
    ValueError
        If the downloaded page contains no <tbody> element.
    """
    # column names for the dataframe to be created
    labels = ['Player', 'Team', 'Fantasy', 'Games', 'FPG', 'FGoals',
              'FGoalsMissed', 'ExtraPoints', 'ExtraPointsMissed']

    # build url and request html
    url = 'http://thehuddle.com/stats/' + str(season) + '/plays_std.php?pos=' + position
    r = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})
    contents = BeautifulSoup(r.text, 'lxml')

    table = contents.find("tbody")
    if table is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError('no <tbody> found at ' + url)
    entries = table.find_all("td")

    # number of columns / rows in the flat list of table cells
    num_cols = len(labels)
    num_rows = len(entries) // num_cols

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called per row.
    records = []
    for rowN in range(num_rows):
        cells = entries[rowN * num_cols:(rowN + 1) * num_cols]
        cur_row = {label: cell.text.strip() for label, cell in zip(labels, cells)}
        cur_row['Position'] = position
        cur_row['Season'] = str(season)
        records.append(cur_row)

    # Explicit column order: stat labels, then Position, then Season.
    return pd.DataFrame(records, columns=labels + ['Position', 'Season'])

In [4]:
def scrape_defense(season, position='DF'):
    """Scrape one season of team-defense stats from thehuddle.com.

    Downloads the standard-scoring stats page for the given season and
    position, reads every <td> cell inside the first <tbody>, and regroups
    the flat list of cells into rows of len(labels) cells each.

    Parameters
    ----------
    season : int or str
        Season year; interpolated into the URL and stored (as a string) in
        the 'Season' column.
    position : str, optional
        Position code sent as the 'pos' query parameter and stored in the
        'Position' column. Defaults to 'DF'.

    Returns
    -------
    pandas.DataFrame
        One row per table row. Columns are the stat labels followed by
        'Position' and 'Season', which is the column order of the
        DefenseSeason table that update_defense fills positionally. Every
        value is the stripped cell text (str).

    Raises
    ------
    ValueError
        If the downloaded page contains no <tbody> element.
    """
    # column names for the dataframe to be created
    labels = ['City', 'Team', 'Fantasy', 'Games', 'FPG', 'Sacks', 'FRecoveries',
              'Interceptions', 'TDs', 'Safeties', 'RushYrdsAllowed', 'PassYrdsAllowed', 'TotalYrdsAllowed']

    # build url and request html
    url = 'http://thehuddle.com/stats/' + str(season) + '/plays_std.php?pos=' + position
    r = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})
    contents = BeautifulSoup(r.text, 'lxml')

    table = contents.find("tbody")
    if table is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError('no <tbody> found at ' + url)
    entries = table.find_all("td")

    # number of columns / rows in the flat list of table cells
    num_cols = len(labels)
    num_rows = len(entries) // num_cols

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called per row.
    records = []
    for rowN in range(num_rows):
        cells = entries[rowN * num_cols:(rowN + 1) * num_cols]
        cur_row = {label: cell.text.strip() for label, cell in zip(labels, cells)}
        cur_row['Position'] = position
        cur_row['Season'] = str(season)
        records.append(cur_row)

    # Explicit column order: stat labels, then Position, then Season.
    return pd.DataFrame(records, columns=labels + ['Position', 'Season'])

In [5]:
# Open a connection to the SQLite database file 'players.db'.
conn = sqlite3.connect('players.db')

# Cursor on that connection; the CREATE TABLE cells below execute through it.
c = conn.cursor()

In [6]:
# Per-season stats table for the offensive positions.
# IF NOT EXISTS keeps this cell re-runnable: players.db persists on disk, so a
# plain CREATE TABLE fails with "table PlayerSeason already exists" on any
# second run of the notebook.
# The column order must stay in sync with the DataFrame built by scrape(),
# because update_players inserts rows positionally.
c.execute('''CREATE TABLE IF NOT EXISTS PlayerSeason
             (Player VARCHAR(40), 
             Team VARCHAR(3), 
             Plays SMALLINT, 
             Fantasy SMALLINT, 
             Games SMALLINT,
             FPG DECIMAL(5,2),
             NumRuns SMALLINT,
             RushYrds SMALLINT,
             RushTDs SMALLINT,
             PassAttempts SMALLINT, 
             Complete SMALLINT, 
             PassYrds SMALLINT, 
             PassTDs SMALLINT, 
             Fumbles SMALLINT, 
             Interceptions SMALLINT,
             Position CHARACTER(2),
             Season CHARACTER(4),
             UNIQUE (Player, Season, Position, Team))''')

<sqlite3.Cursor at 0x5f6cb20>

In [7]:
# Per-season stats table for kickers.
# IF NOT EXISTS keeps this cell re-runnable: players.db persists on disk, so a
# plain CREATE TABLE fails with "table KickerSeason already exists" on any
# second run of the notebook.
# The column order must stay in sync with the DataFrame built by
# scrape_kicker(), because update_kickers inserts rows positionally.
# NOTE(review): uniqueness is (Player, Season) only, whereas PlayerSeason also
# includes Team; a kicker listed for two teams in one season would be rejected
# as a duplicate. Confirm this is intended.
c.execute('''CREATE TABLE IF NOT EXISTS KickerSeason
            (Player VARCHAR(40),
            Team VARCHAR(3),
            Fantasy SMALLINT,
            Games SMALLINT,
            FPG DECIMAL(5,2),
            FGoals SMALLINT,
            FGoalsMissed SMALLINT,
            ExtraPoints SMALLINT,
            ExtraPointsMissed SMALLINT,
            Position CHARACTER(2),
            Season CHARACTER(4),
            UNIQUE (Player, Season))''')

<sqlite3.Cursor at 0x5f6cb20>

In [8]:
# Per-season stats table for team defenses.
# IF NOT EXISTS keeps this cell re-runnable: players.db persists on disk, so a
# plain CREATE TABLE fails with "table DefenseSeason already exists" on any
# second run of the notebook.
# The column order must stay in sync with the DataFrame built by
# scrape_defense(), because update_defense inserts rows positionally.
c.execute('''CREATE TABLE IF NOT EXISTS DefenseSeason
            (City VARCHAR(20),
            Team VARCHAR(3),
            Fantasy SMALLINT,
            Games SMALLINT,
            FPG DECIMAL(5,2),
            Sacks SMALLINT,
            FRecoveries SMALLINT,
            Interceptions SMALLINT,
            TDs SMALLINT,
            Safeties SMALLINT,
            RushYrdsAllowed SMALLINT,
            PassYrdsAllowed SMALLINT,
            TotalYrdsAllowed SMALLINT,
            Position CHARACTER(2),
            Season CHARACTER(4),
            UNIQUE (Team, Season))''')

<sqlite3.Cursor at 0x5f6cb20>

In [9]:
def update_players(conn, df):
    """Insert every row of df into the PlayerSeason table and commit.

    Values are bound positionally, so df must have exactly 17 columns in the
    same order as the PlayerSeason table definition.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open connection to the database that contains PlayerSeason.
    df : pandas.DataFrame
        Rows to insert; must include 'Player' and 'Season' columns, which are
        used in the duplicate message.

    Notes
    -----
    Only rows rejected with sqlite3.IntegrityError (for this table, a clash
    with its UNIQUE constraint) are reported and skipped. Any other failure,
    such as a missing table or a wrong number of values, propagates to the
    caller instead of being mislabelled as a duplicate.
    """
    c = conn.cursor()
    for _, row in df.iterrows():
        try:
            c.execute('INSERT INTO PlayerSeason VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', tuple(row))
        except sqlite3.IntegrityError:
            print(str(row['Player'])+' '+str(row['Season'])+' already in table')

    # One commit for the whole batch.
    conn.commit()

In [10]:
def update_kickers(conn, df):
    """Insert every row of df into the KickerSeason table and commit.

    Values are bound positionally, so df must have exactly 11 columns in the
    same order as the KickerSeason table definition.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open connection to the database that contains KickerSeason.
    df : pandas.DataFrame
        Rows to insert; must include 'Player' and 'Season' columns, which are
        used in the duplicate message.

    Notes
    -----
    Only rows rejected with sqlite3.IntegrityError (for this table, a clash
    with its UNIQUE constraint) are reported and skipped. Any other failure,
    such as a missing table or a wrong number of values, propagates to the
    caller instead of being mislabelled as a duplicate.
    """
    c = conn.cursor()
    for _, row in df.iterrows():
        try:
            c.execute('INSERT INTO KickerSeason VALUES (?,?,?,?,?,?,?,?,?,?,?)', tuple(row))
        except sqlite3.IntegrityError:
            print(str(row['Player'])+' '+str(row['Season'])+' already in table')

    # One commit for the whole batch.
    conn.commit()

In [11]:
def update_defense(conn, df):
    """Insert every row of df into the DefenseSeason table and commit.

    Values are bound positionally, so df must have exactly 15 columns in the
    same order as the DefenseSeason table definition.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open connection to the database that contains DefenseSeason.
    df : pandas.DataFrame
        Rows to insert; must include 'City' and 'Season' columns, which are
        used in the duplicate message.

    Notes
    -----
    Only rows rejected with sqlite3.IntegrityError (for this table, a clash
    with its UNIQUE constraint) are reported and skipped. Any other failure,
    such as a missing table or a wrong number of values, propagates to the
    caller instead of being mislabelled as a duplicate.
    """
    c = conn.cursor()
    for _, row in df.iterrows():
        try:
            c.execute('INSERT INTO DefenseSeason VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', tuple(row))
        except sqlite3.IntegrityError:
            print(str(row['City'])+' '+str(row['Season'])+' already in table')

    # One commit for the whole batch.
    conn.commit()

In [12]:
# Offensive positions whose rows are stored in the PlayerSeason table.
pos = ['QB', 'WR', 'RB', 'TE']

conn = sqlite3.connect('players.db')

# try/finally closes the connection even when a scrape or insert raises, so a
# failed run does not leave an open transaction holding a lock on players.db.
try:
    # loop through all seasons available (2006 through 2016 inclusive)
    for year in range(2006, 2017):
        # loop through all positions
        for cur_pos in pos:
            update_players(conn, scrape(year, cur_pos))
finally:
    conn.close()

In [13]:
conn = sqlite3.connect('players.db')

# try/finally closes the connection even when a scrape or insert raises, so a
# failed run does not leave an open transaction holding a lock on players.db.
try:
    # scrape and store the kickers of every season, 2006 through 2016 inclusive
    for year in range(2006, 2017):
        update_kickers(conn, scrape_kicker(year))
finally:
    conn.close()

In [14]:
conn = sqlite3.connect('players.db')

# try/finally closes the connection even when a scrape or insert raises, so a
# failed run does not leave an open transaction holding a lock on players.db.
try:
    # scrape and store the team defenses of every season, 2006 through 2016 inclusive
    for year in range(2006, 2017):
        update_defense(conn, scrape_defense(year))
finally:
    conn.close()

In [15]:
# Spot-check the loaded data: four DefenseSeason columns for seasons 2009 and 2010.
# select_to_df is not defined in this notebook; presumably it comes from the
# star import of sqlite_api and returns the query result as a DataFrame (verify there).
select_to_df('players.db', 'DefenseSeason', ['Team', 'Season', 'FPG', 'Sacks'], 'WHERE Season IN (2009, 2010)')

Unnamed: 0,Team,Season,FPG,Sacks
0,NO,2009,10.44,35.0
1,PHI,2009,10.00,44.0
2,GB,2009,8.94,37.0
3,SF,2009,8.00,44.0
4,DEN,2009,7.69,39.0
5,NYG,2009,7.12,37.0
6,BAL,2009,7.50,32.0
7,MIN,2009,7.25,48.0
8,CAR,2009,7.19,31.0
9,ARI,2009,7.06,43.0
