In [1]:
import pandas as pd 
import numpy as np
from matplotlib.ticker import FuncFormatter
import csv
import requests
from bs4 import BeautifulSoup as bs


In [2]:
url="https://www.spotrac.com/nfl/contracts/quarterback/" #Getting salary data by scraping with Beautiful Soup
page=requests.get(url)
soup=bs(page.content,"lxml")

In [3]:
salary=soup.find_all(class_='right xs-hide')
names=soup.find_all(class_='player')

In [4]:
del names[0] #Removing headers
del salary[0:3]
players=[]
salaries=[]

In [5]:
for n in names: #Building list of clean player names
    x=n.children
    next(x)
    name=next(x)
    players.append(name.text)

In [6]:
for i in range(len(salary)):  #Building list of clean salaries. We want average yearly salary so take every third figure
    if (i+3) % 3 == 0:
        salaries.append(salary[i].contents[0])

In [7]:
len(players)==len(salaries) #Test that lists are in correspondence

True

In [8]:
clean_salaries=[] #Get rid of dollars signs and commas and convert to integer
for s in salaries:
    sl=s.replace("$","")
    sal=sl.replace(",","")
    num=int(sal)
    clean_salaries.append(num)
    

In [9]:
salary_df=pd.DataFrame({"Player":players,"Salary":clean_salaries}) #Putting both lists into a dataframe
salary_df.head()

Unnamed: 0,Player,Salary
0,Matt Ryan,30000000
1,Jimmy Garoppolo,27500000
2,Matthew Stafford,27000000
3,Aaron Rodgers,33500000
4,Derek Carr,25000000


In [10]:
#Save dataframe for presentation notebook
salary_df.to_csv('Resources/Cleaned_Dataframes/salary_df.csv',columns=salary_df.columns,index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'Resources/Cleaned_Dataframes/salary_df.csv'

In [None]:
#Reading NFL Stats from downloaded CSV
nfl_stats = "Resources/NFL_Stats.csv"
nfl_stats_df = pd.read_csv(nfl_stats)
nfl_stats_df.head()

In [None]:
player_names=nfl_stats_df['Player'].tolist() #Clean asterisk from names
clean_names=[]
for name in player_names:
    clean=name.replace("*","")
    clean_names.append(clean)
nfl_stats_df['Player']=clean_names
nfl_stats_df.head()

In [None]:
# Clean Dataframe by renaming and deleting unnecssary columns

nfl_stats_df = nfl_stats_df.rename(columns={"Tm":"Team",
                                           "PB":"Pro Bowls",
                                           "Ht": "Height (in)"})
del nfl_stats_df["Wt"]
del nfl_stats_df["BMI"]
del nfl_stats_df["AV"]
del nfl_stats_df["AP1"]
del nfl_stats_df["Rk"]
del nfl_stats_df["Lg"]
del nfl_stats_df["Team"]

In [None]:
#Get Winning Percentage
nfl_stats_df ["W"].fillna(0, inplace=True)
nfl_stats_df ["L"].fillna(0, inplace=True)
nfl_stats_df ["T"].fillna(0, inplace=True)

nfl_stats_df["Win Record"] = (nfl_stats_df["W"]/(nfl_stats_df["W"] + nfl_stats_df["L"] + nfl_stats_df["T"]))

nfl_stats_df.head()

In [None]:
# Fill undrafted with a 99
nfl_stats_df["Draft"].fillna(99, inplace=True)
nfl_stats_df["Win Record"].fillna(0, inplace=True)
nfl_stats_df["Draft"] = nfl_stats_df["Draft"].astype(str)

In [None]:
#Dataframe of each Draft Round

undrafted = nfl_stats_df.loc[nfl_stats_df["Draft"] == "99.0"]
one = nfl_stats_df.loc[nfl_stats_df["Draft"] == "1.0"]
two = nfl_stats_df.loc[nfl_stats_df["Draft"] == "2.0"]
three = nfl_stats_df.loc[nfl_stats_df["Draft"] == "3.0"]
four = nfl_stats_df.loc[nfl_stats_df["Draft"] == "4.0"]
five = nfl_stats_df.loc[nfl_stats_df["Draft"] == "5.0"]
six = nfl_stats_df.loc[nfl_stats_df["Draft"] == "6.0"]
seven = nfl_stats_df.loc[nfl_stats_df["Draft"] == "7.0"]
eight = nfl_stats_df.loc[nfl_stats_df["Draft"] == "8.0"]
nine = nfl_stats_df.loc[nfl_stats_df["Draft"] == "9.0"]
ten = nfl_stats_df.loc[nfl_stats_df["Draft"] == "10.0"]
eleven = nfl_stats_df.loc[nfl_stats_df["Draft"] == "11.0"]
twelve = nfl_stats_df.loc[nfl_stats_df["Draft"] == "12.0"]
thirtheen = nfl_stats_df.loc[nfl_stats_df["Draft"] == "13.0"]
fourtheen = nfl_stats_df.loc[nfl_stats_df["Draft"] == "14.0"]

In [None]:
#Build Dataframe organized by Round and Average Passer Rating

ratings_by_draft = pd.DataFrame({"Undrafted":[undrafted["Rate"].mean()],
                                 "1st Round" : [one["Rate"].mean()],
                                 "2nd Round" : [two["Rate"].mean()],
                                 "3rd Round" : [three["Rate"].mean()],
                                 "4th Round" : [four["Rate"].mean()],
                                 "5th Round" : [five["Rate"].mean()],
                                 "6th Round" : [six["Rate"].mean()],
                                 "7th Round" : [seven["Rate"].mean()],
                                 "8th Round" : [eight["Rate"].mean()],
                                 "9th Round" : [nine["Rate"].mean()],
                                 "10th Round" : [ten["Rate"].mean()],
                                 "11th Round" : [eleven["Rate"].mean()],
                                 "12th Round" : [twelve["Rate"].mean()],
                                 "13th Round" : [thirtheen["Rate"].mean()],
                                 "14th Round" : [fourtheen["Rate"].mean()]})

ratings_by_draft = ratings_by_draft.T
ratings_by_draft.reset_index(level=0, inplace = True)
ratings_by_draft = ratings_by_draft.rename(columns={"index": "Draft Round",
                                                   "0": "Passer Rating"})

In [None]:
#Save dataframes for presentation notebook
nfl_stats_df.to_csv("Resources/Cleaned_Dataframes/nfl_stats_df.csv",columns=nfl_stats_df.columns,index=False)
ratings_by_draft.to_csv("Resources/Cleaned_Dataframes/ratings_by_draft.csv",columns=ratings_by_draft.columns,index=False)

In [None]:
reduced_nfl_stats_df = nfl_stats_df.loc[:,["Player","Draft", "Rate", "Pro Bowls", "Win Record"]]
grpby_draft = reduced_nfl_stats_df.groupby("Draft", as_index=False)
average_rate = pd.DataFrame(grpby_draft ["Rate"].mean())
average_rate

In [None]:
percent_nfl_stats_df=reduced_nfl_stats_df.dropna()
percent_nfl_stats_df["Win Record"] = percent_nfl_stats_df["Win Record"]*100
percent_nfl_stats_df.head()


In [None]:
#Save dataframes for presentation notebook
percent_nfl_stats_df.to_csv("Resources/Cleaned_Dataframes/percent_nfl_stats_df.csv",columns=percent_nfl_stats_df.columns,index=False)
average_rate.to_csv("Resources/Cleaned_Dataframes/average_rate.csv",columns=average_rate.columns,index=False)

In [None]:
#Get and clean college football players and ratings
player=[]
rate=[]
for year in range(1982,2018):
    file="CFB/"+str(year)+".csv"
    with open(file,newline='') as f:
        reader=csv.reader(f)
        next(reader)
        next(reader)
        for row in reader:
            bad_name=row[1].replace("*","") #Cleaning player names
            split_names=bad_name.split("\\")
            clean_name=split_names[0]                          
            player.append(clean_name)
            rate.append(float(row[13]))

In [None]:
len(player)==len(rate) #Test

In [None]:
#Go backwards to get only last year of college for each player
players=[]
ratings=[]  
for i in range(len(player)-1,-1,-1): 
    if player[i] not in players:
        players.append(player[i])
        ratings.append(rate[i])

In [None]:
len(players)==len(ratings)#test

In [None]:
college=pd.DataFrame({"Player":players,"College Rating":ratings})
college.head()

In [None]:
names=[]
qbr=[]
wins=[]
loss=[]
pbs=[]
winper=[]
games=[]

In [None]:
with open("Resources/NFL_Stats.csv",newline="") as nf: #NFL data to be merged with college data
    read=csv.reader(nf)
    next(read)
    for r in read:
        names.append(r[1].replace("*","")) #Clean asterisk from names
        qbr.append(float(r[18]))
        try:
            wins.append(int(r[21]))
        except:
            wins.append("")
        try:
            loss.append(int(r[22]))
        except:
            loss.append("")
        try:
            per=round((int(r[21])/(int(r[21])+int(r[22])))*100,1)
            winper.append(per)         
        except:
            winper.append("")
        try:
            games.append(int(r[21])+int(r[22]))
        except:
            games.append("")
        pbs.append(int(r[25]))

In [None]:
len(names)==len(qbr)&len(wins)==(len(loss)&len(pbs)) #Test

In [None]:
nfltmp=pd.DataFrame({"Player":names,"NFL QBR":qbr,"Games":games,"Wins":wins,"Losses":loss,"Win Percentage":winper})
nfltmp.head()

In [None]:
#Save dataframes for presentation notebook
college.to_csv("Resources/Cleaned_Dataframes/college.csv",columns=college.columns,index=False)
nfltmp.to_csv("Resources/Cleaned_Dataframes/nfltmp.csv",columns=nfltmp.columns,index=False)

In [None]:
nfl_picks=pd.read_csv("Resources/QB Draft Stats.csv") #This CSV has more granular draft data, although fewer players
nfl_picks.head()

In [None]:
picks=pd.DataFrame({"Player":nfl_picks["Unnamed: 4"],"Pick":nfl_picks["Unnamed: 3"]})
picks=picks.drop(0)
picks.head()

In [None]:
pick_stats=pd.merge(picks,nfl_stats_df,on="Player")
pick_stats.head()

In [None]:
#Save dataframe for presentation notebook
pick_stats.to_csv("Resources/Cleaned_Dataframes/pick_stats.csv",columns=pick_stats.columns,index=False)
