In [2]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import os 
import os.path
import csv 
import time 
import numpy as np
import pandas as pd


In [3]:
"credit: http://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/"

class HTMLTableParser:
    """Download a page and convert each of its ``<table>`` elements to a DataFrame.

    Every DataFrame produced by this class uses the fixed column labels in
    ``COLUMN_NAMES``; header cells in the HTML are not consulted.
    """

    # Labels applied to every parsed table.
    COLUMN_NAMES = ['Date', 'Team', 'Acquired', 'Relinquished', 'Notes']
    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 30

    def parse_url(self, url):
        """Fetch ``url`` and parse every ``<table>`` found in the response.

        Parameters
        ----------
        url : str
            Address of the page to download.

        Returns
        -------
        list of tuple
            One ``(table_tag, DataFrame)`` pair per ``<table>`` element, in
            document order. The list is empty when the page has no tables.

        Raises
        ------
        requests.RequestException
            If the request times out, fails, or returns an HTTP error status.
        """
        # A timeout prevents a stalled connection from blocking the kernel forever.
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Surface 4xx/5xx responses instead of silently parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        return [(table, self.parse_html_table(table))
                for table in soup.find_all('table')]

    def parse_html_table(self, table):
        """Convert one table element into a DataFrame labelled with ``COLUMN_NAMES``.

        Only ``<tr>`` rows containing at least one ``<td>`` are used. Cell text
        is written left to right into the columns; a row with fewer cells than
        columns leaves the remaining columns as NaN. Columns whose values can
        all be converted to ``float`` are converted; the others are left as-is.
        The first collected row is then dropped (presumably it is the site's
        header row, which would have to be marked up with ``<td>`` -- confirm
        against the page source), so the returned index starts at 1.

        Parameters
        ----------
        table : object
            Anything exposing ``find_all('tr')`` whose rows expose
            ``find_all('td')`` and whose cells expose ``get_text()``
            (e.g. a BeautifulSoup ``<table>`` tag).

        Returns
        -------
        pandas.DataFrame
            The parsed rows. An empty DataFrame (with the usual columns) is
            returned when the table has no ``<td>`` rows.

        Raises
        ------
        IndexError
            If a row holds more cells than there are entries in
            ``COLUMN_NAMES`` (the positional write falls outside the frame).
        """
        # Collect the cells of every row that actually carries <td> data.
        data_rows = []
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) > 0:
                data_rows.append(cells)

        df = pd.DataFrame(columns=list(self.COLUMN_NAMES),
                          index=range(0, len(data_rows)))
        for row_marker, cells in enumerate(data_rows):
            for column_marker, cell in enumerate(cells):
                df.iat[row_marker, column_marker] = cell.get_text()

        # Convert to float if possible
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                # Non-numeric text column: keep the original strings.
                pass

        # Dropping the first row of an empty frame would raise IndexError
        # (df.index[0] does not exist), so only drop when there is a row.
        if len(df) > 0:
            df = df.drop(df.index[0])

        return df
    

In [74]:
#URL to scrape
url = "https://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=2009-01-01&EndDate=2018-02-04&ILChkBx=yes&submit=Search"

# Access the URL. The timeout (in seconds) keeps the cell from hanging
# indefinitely if the server stops responding.
response = requests.get(url, timeout=30)
# Raise on 4xx/5xx so an error page is not mistaken for search results.
response.raise_for_status()

# Access the HTML with the text property; show the first 100 characters as a sanity check
response.text[:100]


'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtm'

In [142]:
# Parse every <table> on the search-results page fetched from `url`.
hp = HTMLTableParser()
parsed_tables = hp.parse_url(url)

# Each entry is a (table tag, DataFrame) tuple; keep the DataFrame of the first table.
first_entry = parsed_tables[0]
table = first_entry[1]
table


Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
1,2018-01-02,Falcons,,• Andy Levitre,placed on IR with triceps injury
2,2018-01-02,Panthers,,• Chris Manhertz,placed on IR with ankle injury
3,2018-01-02,Cardinals,• Carson Palmer,,activated from IR
4,2018-01-03,Chiefs,,• Phillip Gaines,placed on IR with dislocated elbow
5,2018-01-03,Saints,,• Garrett Griffin,placed on IR with foot injury
6,2018-01-09,Vikings,,• Dylan Bradley,placed on practice squad IR with undisclosed ...
7,2018-01-10,Saints,,• Andrus Peat,placed on IR with fractured fibula in leg
8,2018-01-10,Saints,,• Tony McDaniel,placed on IR with leg injury
9,2018-01-13,Vikings,• Sam Bradford,,activated from IR
10,2018-01-17,Patriots,,• Jonathan Jones,placed on IR with ankle injury


In [146]:
# Display a copy of the table with every bullet character removed from its string cells.
# The pattern is a plain '•' (the original '\•' is an invalid string escape that
# Python warns about); with regex=True it still matches the bullet anywhere in a cell.
# The result is only displayed here, not assigned back to `table`.
table.replace('•', '', regex=True)

Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
1,2018-01-02,Falcons,,Andy Levitre,placed on IR with triceps injury
2,2018-01-02,Panthers,,Chris Manhertz,placed on IR with ankle injury
3,2018-01-02,Cardinals,Carson Palmer,,activated from IR
4,2018-01-03,Chiefs,,Phillip Gaines,placed on IR with dislocated elbow
5,2018-01-03,Saints,,Garrett Griffin,placed on IR with foot injury
6,2018-01-09,Vikings,,Dylan Bradley,placed on practice squad IR with undisclosed ...
7,2018-01-10,Saints,,Andrus Peat,placed on IR with fractured fibula in leg
8,2018-01-10,Saints,,Tony McDaniel,placed on IR with leg injury
9,2018-01-13,Vikings,Sam Bradford,,activated from IR
10,2018-01-17,Patriots,,Jonathan Jones,placed on IR with ankle injury


In [None]:
# Save the parsed table as a UTF-8 CSV file named "table", leaving out the index column.
# NOTE(review): the bullet-stripped frame shown in the previous cell was never assigned
# back to `table`, so the bullet characters are still present in what is written here.
table.to_csv(path_or_buf="table", index=False, encoding='utf-8')

In [147]:
#INJURIES scraper
hp = HTMLTableParser()

#base URL to scrape
#dates 01/01/2009-02/04/2018 (superbowl)
base_url = "https://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=2009-01-01&EndDate=2018-02-04&ILChkBx=yes&submit=Search"

# Number of result pages to request. The original note says the search had
# 388 pages; page indices 0..388 are requested, as before.
N_PAGES = 389

#create one CSV per search page table
for i in range(N_PAGES):
    # The first page is the bare search URL; later pages add an offset that
    # advances in steps of 25.
    url = base_url if i == 0 else base_url+'&start='+str(25*i)
    #create table (DataFrame of the first <table> on the page)
    file = hp.parse_url(url)[0][1]
    # strip the bullet characters from every string cell
    file = file.replace('•','',regex=True)
    #write to CSV
    file.to_csv('inj_table'+'_'+str(i), encoding='utf-8', index=False)

#combine CSVs
# "w" (not "a") so re-running the cell rebuilds the file instead of appending
# a second copy of every row. newline='' passes line endings through unchanged.
with open("injuries.csv", "w", encoding='utf-8', newline='') as fout:
    for num in range(N_PAGES):
        with open("inj_table_"+str(num), encoding='utf-8', newline='') as f:
            header = next(f, None)  # first line of each page file is the header
            if num == 0 and header is not None:
                fout.write(header)  # keep the header once, from the first file
            for line in f:
                fout.write(line)


In [153]:
#Player/Coach/Exec Movements Scraper

hp = HTMLTableParser()

#base URL to scrape
#dates 01/01/2009-02/04/2018 (superbowl)
base_url = "https://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=2009-01-01&EndDate=2018-02-04&PlayerMovementChkBx=yes&submit=Search"

# Number of result pages to request. The original note says the search had
# 2639 pages; page indices 0..2639 are requested, as before.
N_PAGES = 2640

#create one CSV per search page table
for i in range(N_PAGES):
    # The first page is the bare search URL; later pages add an offset that
    # advances in steps of 25.
    url = base_url if i == 0 else base_url+'&start='+str(25*i)
    #create table (DataFrame of the first <table> on the page)
    file = hp.parse_url(url)[0][1]
    # strip the bullet characters from every string cell; a plain '•' pattern
    # avoids the invalid '\•' string escape
    file = file.replace('•','',regex=True)
    #write to CSV
    file.to_csv('table'+'_'+str(i), encoding='utf-8', index=False)


In [154]:
#combine CSVs - for Player/Coach/Exec Movements

# Number of per-page files to merge: table_0 .. table_2639.
N_MOVEMENT_FILES = 2640

# "w" (not "a") so re-running the cell rebuilds the file instead of appending
# a second copy of every row. The page files were written as UTF-8, so they are
# read back as UTF-8; newline='' passes line endings through unchanged.
with open("pce_movements.csv", "w", encoding='utf-8', newline='') as fout:
    for num in range(N_MOVEMENT_FILES):
        with open("table_"+str(num), encoding='utf-8', newline='') as f:
            header = next(f, None)  # first line of each page file is the header
            if num == 0 and header is not None:
                fout.write(header)  # keep the header once, from the first file
            for line in f:
                fout.write(line)

In [5]:
#missed games due to injury

hp = HTMLTableParser()

#base URL to scrape
#dates 01/01/2009-02/04/2018 (superbowl)
base_url = "https://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=2009-01-01&EndDate=2018-02-04&InjuriesChkBx=yes&submit=Search"

# Number of result pages to request. The original note says the search had
# 285 pages; page indices 0..285 are requested, as before.
N_PAGES = 286

#create one CSV per search page table
for i in range(N_PAGES):
    # The first page is the bare search URL; later pages add an offset that
    # advances in steps of 25.
    url = base_url if i == 0 else base_url+'&start='+str(25*i)
    #create table (DataFrame of the first <table> on the page)
    file = hp.parse_url(url)[0][1]
    # strip the bullet characters from every string cell
    file = file.replace('•','',regex=True)
    #write to CSV
    file.to_csv('mg_i_table'+'_'+str(i), encoding='utf-8', index=False)

#combine CSVs
# "w" (not "a") so re-running the cell rebuilds the file instead of appending
# a second copy of every row. newline='' passes line endings through unchanged.
with open("missed_games_injury.csv", "w", encoding='utf-8', newline='') as fout:
    for num in range(N_PAGES):
        with open("mg_i_table_"+str(num), encoding='utf-8', newline='') as f:
            header = next(f, None)  # first line of each page file is the header
            if num == 0 and header is not None:
                fout.write(header)  # keep the header once, from the first file
            for line in f:
                fout.write(line)



In [6]:
#missed games due to personal reasons

hp = HTMLTableParser()

#base URL to scrape
#dates 01/01/2009-02/04/2018 (superbowl)
base_url = "http://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=2009-01-01&EndDate=2018-02-04&PersonalChkBx=yes&submit=Search"

#URL to scrape - only 1 page
url = base_url
#create table (DataFrame of the first <table> on the page)
file = hp.parse_url(url)[0][1]
# Strip the bullet characters from every string cell. A plain '•' pattern
# matches the same text as the original '\•' without the invalid string escape.
file = file.replace('•','',regex=True)
#write to CSV
file.to_csv('mg_pr_table', encoding='utf-8', index=False)


In [7]:
#disciplinary actions(suspensions, fines, etc.)

hp = HTMLTableParser()

#base URL to scrape
#dates 01/01/2009-02/04/2018 (superbowl)
base_url = "http://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=2009-01-01&EndDate=2018-02-04&DisciplinaryChkBx=yes&submit=Search"

# Number of result pages to request. The original note says the search had
# 58 pages; page indices 0..58 are requested, as before.
N_PAGES = 59

#create one CSV per search page table
for i in range(N_PAGES):
    # The first page is the bare search URL; later pages add an offset that
    # advances in steps of 25.
    url = base_url if i == 0 else base_url+'&start='+str(25*i)
    #create table (DataFrame of the first <table> on the page)
    file = hp.parse_url(url)[0][1]
    # strip the bullet characters from every string cell
    file = file.replace('•','',regex=True)
    #write to CSV
    file.to_csv('dis_table'+'_'+str(i), encoding='utf-8', index=False)

#combine CSVs
# Merge every file written above. The previous loop stopped at dis_table_57,
# so dis_table_58 was scraped but never merged.
# "w" (not "a") so re-running the cell rebuilds the file instead of appending
# a second copy of every row. newline='' passes line endings through unchanged.
with open("disciplinary.csv", "w", encoding='utf-8', newline='') as fout:
    for num in range(N_PAGES):
        with open("dis_table_"+str(num), encoding='utf-8', newline='') as f:
            header = next(f, None)  # first line of each page file is the header
            if num == 0 and header is not None:
                fout.write(header)  # keep the header once, from the first file
            for line in f:
                fout.write(line)

In [8]:
#legal/criminal incidents

hp = HTMLTableParser()

#base URL to scrape
#dates 01/01/2009-02/04/2018 (superbowl)
base_url = "http://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=2009-01-01&EndDate=2018-02-04&LegalChkBx=yes&submit=Search"

# Number of result pages to request. The original note says the search had
# 9 pages; page indices 0..9 are requested, as before.
N_PAGES = 10

#create one CSV per search page table
for i in range(N_PAGES):
    # The first page is the bare search URL; later pages add an offset that
    # advances in steps of 25.
    url = base_url if i == 0 else base_url+'&start='+str(25*i)
    #create table (DataFrame of the first <table> on the page)
    file = hp.parse_url(url)[0][1]
    # strip the bullet characters from every string cell
    file = file.replace('•','',regex=True)
    #write to CSV
    file.to_csv('leg_table'+'_'+str(i), encoding='utf-8', index=False)

#combine CSVs
# Merge every file written above. The previous loop stopped at leg_table_8,
# so leg_table_9 was scraped but never merged.
# "w" (not "a") so re-running the cell rebuilds the file instead of appending
# a second copy of every row. newline='' passes line endings through unchanged.
with open("legal.csv", "w", encoding='utf-8', newline='') as fout:
    for num in range(N_PAGES):
        with open("leg_table_"+str(num), encoding='utf-8', newline='') as f:
            header = next(f, None)  # first line of each page file is the header
            if num == 0 and header is not None:
                fout.write(header)  # keep the header once, from the first file
            for line in f:
                fout.write(line)