In [1]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import re
import sys
from lxml.html import parse
from urllib2 import urlopen
import matplotlib.pyplot as plt
%matplotlib inline
# Raise pandas' display limits: show up to 1000 columns and 250 rows
# before a rendered frame is truncated.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 250

In [2]:
# Add the parent directory to the import path before importing the two
# project modules below (presumably that is where they live -- confirm).
sys.path.append('../')
import read_player_stats
import training_data

In [2]:
# Parameters identifying the single stats page fetched in the cells below.
season, week = 2014, 1
page = 0
posid = 20  # PosID query value; presumably running backs -- confirm against the site

# Labels for the scraped columns, in the order the cells appear in each row.
column_names = [
    'Name', 'Team', 'Games',
    'RunAtt', 'RunYards', 'RunTD',
    'Targets', 'Rec', 'RecYards', 'RecTD',
    'FFP', 'FFPPG',
]

# Fill season, week, position id and page number into the query string.
url_fields = (season, week, posid, page)
url_str = 'http://fftoday.com/stats/playerstats.php?Season=%d&GameWeek=%d&PosID=%d&LeagueID=1&order_by=FFPts&sort_order=DESC&cur_page=%d' % url_fields
print(url_str)

http://fftoday.com/stats/playerstats.php?Season=2014&GameWeek=1&PosID=20&LeagueID=1&order_by=FFPts&sort_order=DESC&cur_page=0


In [3]:
#parse html and find the main data table
# Position of the player table among all <table> elements on the page.
MAIN_TABLE_INDEX = 10

response = urlopen(url_str)
try:
    parsed = parse(response)
finally:
    # Close the HTTP response even when parsing fails, so the connection
    # is not left open.
    response.close()

# NOTE: this rebinds `page` (the page number used to build url_str) to the
# parsed document root.
page = parsed.getroot()
tables = page.findall('.//table')
if len(tables) <= MAIN_TABLE_INDEX:
    # Same exception type as the bare index lookup, but with a message that
    # says what was expected and which URL was fetched.
    raise IndexError('expected more than %d <table> elements at %s, found %d'
                     % (MAIN_TABLE_INDEX, url_str, len(tables)))

#get the rows of the player table
main_table = tables[MAIN_TABLE_INDEX]
rows = main_table.findall('.//tr')

In [4]:
#dataframe we will return
# Gather the text of every <td> for each player row first, then build the
# frame in one step; inserting with .loc[i] = ... grows the frame one row at
# a time and re-allocates on every insert.
player_rows = [
    [cell.text_content() for cell in row.findall('.//td')]
    for row in rows[2:]  #rows[2] is first player entry
]

# Fail fast on a row whose cell count does not match the column labels,
# rather than letting the constructor pad short rows.
for row_number, cell_values in enumerate(player_rows):
    if len(cell_values) != len(column_names):
        raise ValueError('row %d has %d cells, expected %d'
                         % (row_number, len(cell_values), len(column_names)))

week_data_df = pd.DataFrame(player_rows, columns=column_names)

In [5]:
#clean up entries
#remove extra stuff from name cell: every character that is not a letter is stripped
raw_names = week_data_df['Name']
week_data_df['Name'] = raw_names.str.replace('[^a-z]', '', flags=re.IGNORECASE)

#remove thousands commas from whichever yardage columns this position has
for yards_col in ('RunYards', 'RecYards', 'PassYards'):
    if yards_col in column_names:
        week_data_df[yards_col] = week_data_df[yards_col].str.replace(',', '')

In [6]:
# Tag every row with the season and week it was scraped for.
week_data_df['Season'] = season
week_data_df['Week'] = week

# Everything except the two text columns is converted to float.
numeric_cols = week_data_df.drop(['Name', 'Team'], axis=1).columns
week_data_df[numeric_cols] = week_data_df[numeric_cols].astype(float)

In [7]:
# Remove the Team and Games columns. Only columns that are still present are
# dropped and the name is rebound to the result, so re-running this cell is a
# no-op instead of an error.
cols_to_drop = [col for col in ('Team', 'Games') if col in week_data_df.columns]
week_data_df = week_data_df.drop(cols_to_drop, axis=1)

In [8]:
# Display the cleaned single-week frame.
week_data_df

Unnamed: 0,Name,RunAtt,RunYards,RunTD,Targets,Rec,RecYards,RecTD,FFP,FFPPG,Season,Week
0,LeVeonBell,21,109,1,7,6,88,0,25.7,25.7,2014,1
1,MarshawnLynch,20,110,2,1,1,14,0,24.4,24.4,2014,1
2,DeMarcoMurray,22,118,1,4,3,25,0,20.3,20.3,2014,1
3,KnowshonMoreno,24,134,1,0,0,0,0,19.4,19.4,2014,1
4,MarkIngram,13,60,2,1,1,1,0,18.1,18.1,2014,1
5,MattForte,17,82,0,9,8,87,0,16.9,16.9,2014,1
6,ChrisIvory,10,102,1,0,0,0,0,16.2,16.2,2014,1
7,RashadJennings,16,46,1,5,4,50,0,15.6,15.6,2014,1
8,IsaiahCrowell,5,32,2,0,0,0,0,15.2,15.2,2014,1
9,ChrisJohnson,13,68,0,5,5,23,1,15.1,15.1,2014,1


In [9]:
#make complete season record
seasons = range(2004,2015)
weeks = range(1,18)
pages = [0,1]

# Read every (season, week, page) combination, then concatenate once.
# Appending inside the loop copies the whole accumulated frame on each
# iteration. The loop variables carry a `stat_` prefix so they do not
# overwrite the notebook-level `season`, `week` and `page` set earlier.
page_frames = [
    read_player_stats.read_stats(stat_season, week=stat_week, page=stat_page, pos='rb')
    for stat_season in seasons
    for stat_week in weeks
    for stat_page in pages
]
total_df = pd.concat(page_frames, ignore_index=True)

# Order rows by player, then chronologically.
# NOTE(review): `sort_index(by=...)` is the older spelling of
# `sort_values(by=...)`; switch when the pandas version in use requires it.
total_df.sort_index(by=['Name','Season', 'Week'], inplace=True)

In [10]:
# Rows named 'AdrianPeterson' whose team is anything other than MIN.
name_matches = total_df.Name == 'AdrianPeterson'
other_team = total_df.Team != 'MIN'
total_df[name_matches & other_team]

Unnamed: 0,Name,Team,Games,RunAtt,RunYards,RunTD,Targets,Rec,RecYards,RecTD,FFP,FFPPG,Season,Week
299,AdrianPeterson,CHI,1,0,0,0,0,0,0,0,0.0,0.0,2004,3
399,AdrianPeterson,CHI,1,0,0,0,0,0,0,0,0.0,0.0,2004,4
1038,AdrianPeterson,CHI,1,4,15,0,1,1,30,0,5.5,5.5,2004,11
1577,AdrianPeterson,CHI,1,2,4,0,1,1,0,0,1.4,1.4,2004,16
1955,AdrianPeterson,CHI,1,0,0,0,1,1,7,0,1.7,1.7,2005,3
2158,AdrianPeterson,CHI,1,2,19,0,0,0,0,0,1.9,1.9,2005,5
2257,AdrianPeterson,CHI,1,0,0,0,1,1,7,0,1.7,1.7,2005,6
2459,AdrianPeterson,CHI,1,4,8,0,1,1,1,0,1.9,1.9,2005,8
2517,AdrianPeterson,CHI,1,6,58,1,1,1,3,0,13.1,13.1,2005,9
2610,AdrianPeterson,CHI,1,24,120,1,1,0,0,0,18.0,18.0,2005,10


In [11]:
#fix duplicate player names
# 'AdrianPeterson' rows not on MIN are relabelled so the two names stay distinct.
duplicate_rows = (total_df.Name == 'AdrianPeterson') & (total_df.Team != 'MIN')
total_df.loc[duplicate_rows, 'Name'] = 'AdrianPeterson2'

In [12]:
# Preview the first rows only; the frame holds every season/week/page read
# above, so rendering it whole floods the notebook output.
total_df.head(10)

Unnamed: 0,Name,Team,Games,RunAtt,RunYards,RunTD,Targets,Rec,RecYards,RecTD,FFP,FFPPG,Season,Week
8574,AaronBrown,DET,1,1,9,0,0,0,0,0,0.9,0.9,2009,1
8667,AaronBrown,DET,1,4,10,0,2,1,3,0,2.3,2.3,2009,2
8752,AaronBrown,DET,1,5,6,0,1,1,9,0,2.5,2.5,2009,3
8851,AaronBrown,DET,1,1,3,0,1,1,14,0,2.7,2.7,2009,4
8974,AaronBrown,DET,1,0,0,0,0,0,0,0,0.0,0.0,2009,5
9061,AaronBrown,DET,1,2,13,0,0,0,0,0,1.3,1.3,2009,6
9232,AaronBrown,DET,1,2,15,0,4,2,13,0,4.8,4.8,2009,8
9339,AaronBrown,DET,1,4,27,0,0,0,0,0,2.7,2.7,2009,9
9526,AaronBrown,DET,1,0,0,0,1,1,26,1,9.6,9.6,2009,11
9685,AaronBrown,DET,1,1,5,0,1,0,0,0,0.5,0.5,2009,12


In [4]:
def _load_position_games(pos, pages):
    """Return training_data.make_total_game_data for one position.

    Every call covers seasons 2004-2015 and weeks 1-17; only the position
    code and the list of result pages differ between positions.
    """
    return training_data.make_total_game_data(
        seasons=range(2004,2016),
        weeks=range(1,18),
        pages=pages,
        pos=pos,
    )


# One page is requested for qb and te, two pages for rb and wr.
qb_games_df = _load_position_games('qb', [0])
rb_games_df = _load_position_games('rb', [0,1])
wr_games_df = _load_position_games('wr', [0,1])
te_games_df = _load_position_games('te', [0])

In [5]:
#fix duplicate player names
# 'AdrianPeterson' rows not on MIN are relabelled so the two names stay distinct.
duplicate_rb_rows = (rb_games_df.Name == 'AdrianPeterson') & (rb_games_df.Team != 'MIN')
rb_games_df.loc[duplicate_rb_rows, 'Name'] = 'AdrianPeterson2'

In [6]:
# Write each position's frame to its own CSV file under ../game_stats/.
csv_targets = [
    (qb_games_df, '../game_stats/qb_game_stats.csv'),
    (rb_games_df, '../game_stats/rb_game_stats.csv'),
    (wr_games_df, '../game_stats/wr_game_stats.csv'),
    (te_games_df, '../game_stats/te_game_stats.csv'),
]
for games_frame, csv_path in csv_targets:
    games_frame.to_csv(csv_path)

In [7]:
# Preview the first rows only rather than rendering the whole multi-season
# frame.
wr_games_df.head(10)

Unnamed: 0,Name,Team,Games,Targets,Rec,RecYards,RecTD,RunAtt,RunYards,RunTD,FFP,FFPPG,Season,Week
11927,AJGreen,CIN,1,4,1,41,1,0,0,0,10.6,10.6,2011,1
12006,AJGreen,CIN,1,14,10,124,1,0,0,0,23.4,23.4,2011,2
12160,AJGreen,CIN,1,5,4,29,0,0,0,0,4.9,4.9,2011,3
12217,AJGreen,CIN,1,10,4,118,0,1,6,0,14.4,14.4,2011,4
12312,AJGreen,CIN,1,8,5,90,1,0,0,0,17.5,17.5,2011,5
12410,AJGreen,CIN,1,7,5,51,1,0,0,0,13.6,13.6,2011,6
12612,AJGreen,CIN,1,10,4,63,1,0,0,0,14.3,14.3,2011,8
12720,AJGreen,CIN,1,7,7,83,0,0,0,0,11.8,11.8,2011,9
12829,AJGreen,CIN,1,2,1,36,1,1,7,0,10.8,10.8,2011,10
13022,AJGreen,CIN,1,4,3,110,0,0,0,0,12.5,12.5,2011,12
