In [1]:
import pandas as pd 
import numpy as np
from matplotlib.ticker import FuncFormatter
import csv
import requests
from bs4 import BeautifulSoup as bs
import json
from pprint import pprint
from config import api_key 
from player_key import player_key

In [2]:
url="https://www.spotrac.com/nfl/contracts/quarterback/" #Getting salary data by scraping with Beautiful Soup
page=requests.get(url)
soup=bs(page.content,"lxml")

In [3]:
salary=soup.find_all(class_='right xs-hide')
names=soup.find_all(class_='player')

In [4]:
del names[0] #Removing headers
del salary[0:3]
players=[]
salaries=[]

In [5]:
for n in names: #Building list of clean player names
    x=n.children
    next(x)
    name=next(x)
    players.append(name.text)

In [6]:
for i in range(len(salary)):  #Building list of clean salaries. We want average yearly salary so take every third figure
    if (i+3) % 3 == 0:
        salaries.append(salary[i].contents[0])

In [7]:
len(players)==len(salaries) #Test that lists are in correspondence

True

In [8]:
clean_salaries=[] #Get rid of dollars signs and commas and convert to integer
for s in salaries:
    sl=s.replace("$","")
    sal=sl.replace(",","")
    num=int(sal)
    clean_salaries.append(num)
    

In [9]:
salary_df=pd.DataFrame({"Player":players,"Salary":clean_salaries}) #Putting both lists into a dataframe
salary_df.head()

Unnamed: 0,Player,Salary
0,Matt Ryan,30000000
1,Jimmy Garoppolo,27500000
2,Matthew Stafford,27000000
3,Aaron Rodgers,33500000
4,Derek Carr,25000000


In [10]:
#Save dataframe for presentation notebook
salary_df.to_csv('Resources/Cleaned_Dataframes/salary_df.csv',columns=salary_df.columns,index=False)

In [11]:
#Reading NFL Stats from downloaded CSV
nfl_stats = "Resources/NFL_Stats.csv"
nfl_stats_df = pd.read_csv(nfl_stats)
nfl_stats_df.head()

Unnamed: 0,Rk,Player,From,To,Draft,Tm,Lg,Ht,Wt,BMI,...,Rate,Y/A,Y/G,W,L,T,Yrs,PB,AP1,AV
0,1,Peyton Manning,1998,2015,1.0,TOT,NFL,77,230,27.3,...,96.5,7.67,270.5,186.0,79.0,0.0,17,14,7,271
1,2,Tom Brady,2000,2018,6.0,NWE,NFL,76,225,27.4,...,97.6,7.51,261.5,196.0,55.0,0.0,19,13,3,255
2,3,Drew Brees,2001,2018,2.0,TOT,NFL,72,209,28.3,...,96.7,7.58,282.9,142.0,106.0,0.0,18,11,1,239
3,4,Brett Favre*,1991,2010,2.0,TOT,NFL,74,225,28.9,...,86.0,7.06,237.9,186.0,112.0,0.0,20,11,3,255
4,5,John Elway*,1983,1998,1.0,DEN,NFL,75,215,26.9,...,79.9,7.1,220.0,148.0,82.0,1.0,16,9,0,203


In [12]:
player_names=nfl_stats_df['Player'].tolist() #Clean asterisk from names
clean_names=[]
for name in player_names:
    clean=name.replace("*","")
    clean_names.append(clean)
nfl_stats_df['Player']=clean_names
nfl_stats_df.head()

Unnamed: 0,Rk,Player,From,To,Draft,Tm,Lg,Ht,Wt,BMI,...,Rate,Y/A,Y/G,W,L,T,Yrs,PB,AP1,AV
0,1,Peyton Manning,1998,2015,1.0,TOT,NFL,77,230,27.3,...,96.5,7.67,270.5,186.0,79.0,0.0,17,14,7,271
1,2,Tom Brady,2000,2018,6.0,NWE,NFL,76,225,27.4,...,97.6,7.51,261.5,196.0,55.0,0.0,19,13,3,255
2,3,Drew Brees,2001,2018,2.0,TOT,NFL,72,209,28.3,...,96.7,7.58,282.9,142.0,106.0,0.0,18,11,1,239
3,4,Brett Favre,1991,2010,2.0,TOT,NFL,74,225,28.9,...,86.0,7.06,237.9,186.0,112.0,0.0,20,11,3,255
4,5,John Elway,1983,1998,1.0,DEN,NFL,75,215,26.9,...,79.9,7.1,220.0,148.0,82.0,1.0,16,9,0,203


In [13]:
# Clean Dataframe by renaming and deleting unnecssary columns

nfl_stats_df = nfl_stats_df.rename(columns={"Tm":"Team",
                                           "PB":"Pro Bowls",
                                           "Ht": "Height (in)"})
del nfl_stats_df["Wt"]
del nfl_stats_df["BMI"]
del nfl_stats_df["AV"]
del nfl_stats_df["AP1"]
del nfl_stats_df["Rk"]
del nfl_stats_df["Lg"]
del nfl_stats_df["Team"]

In [14]:
#Get Winning Percentage
nfl_stats_df ["W"].fillna(0, inplace=True)
nfl_stats_df ["L"].fillna(0, inplace=True)
nfl_stats_df ["T"].fillna(0, inplace=True)

nfl_stats_df["Win Record"] = (nfl_stats_df["W"]/(nfl_stats_df["W"] + nfl_stats_df["L"] + nfl_stats_df["T"]))

nfl_stats_df.head()

Unnamed: 0,Player,From,To,Draft,Height (in),G,GS,Cmp,Att,Cmp%,...,Int,Rate,Y/A,Y/G,W,L,T,Yrs,Pro Bowls,Win Record
0,Peyton Manning,1998,2015,1.0,77,266,265,6125,9380,65.3,...,251,96.5,7.67,270.5,186.0,79.0,0.0,17,14,0.701887
1,Tom Brady,2000,2018,6.0,76,253,251,5629,8805,63.93,...,160,97.6,7.51,261.5,196.0,55.0,0.0,19,13,0.780876
2,Drew Brees,2001,2018,2.0,72,249,248,6222,9294,66.95,...,228,96.7,7.58,282.9,142.0,106.0,0.0,18,11,0.572581
3,Brett Favre,1991,2010,2.0,74,302,298,6300,10169,61.95,...,336,86.0,7.06,237.9,186.0,112.0,0.0,20,11,0.624161
4,John Elway,1983,1998,1.0,75,234,231,4123,7250,56.87,...,226,79.9,7.1,220.0,148.0,82.0,1.0,16,9,0.640693


In [15]:
# Fill undrafted with 14
nfl_stats_df["Draft"].fillna(14, inplace=True)
nfl_stats_df["Win Record"].fillna(0, inplace=True)
nfl_stats_df["Draft"] = nfl_stats_df["Draft"].astype(str)

In [16]:
#Reduce the effect of outliers by eliminating players who played in fewer than five games in their careers
nfl_stats_df=nfl_stats_df[nfl_stats_df["GS"]>=5]

#Lance Moore was an NFL receiver who threw one pass on a trick play. As he's not a real QB he shouldn't be in the dataset.
nfl_stats_df=nfl_stats_df[nfl_stats_df['Att']>1]

In [17]:
#Dataframe of each Draft Round

undrafted = nfl_stats_df.loc[nfl_stats_df["Draft"] == "15.0"]
one = nfl_stats_df.loc[nfl_stats_df["Draft"] == "1.0"]
two = nfl_stats_df.loc[nfl_stats_df["Draft"] == "2.0"]
three = nfl_stats_df.loc[nfl_stats_df["Draft"] == "3.0"]
four = nfl_stats_df.loc[nfl_stats_df["Draft"] == "4.0"]
five = nfl_stats_df.loc[nfl_stats_df["Draft"] == "5.0"]
six = nfl_stats_df.loc[nfl_stats_df["Draft"] == "6.0"]
seven = nfl_stats_df.loc[nfl_stats_df["Draft"] == "7.0"]
eight = nfl_stats_df.loc[nfl_stats_df["Draft"] == "8.0"]
nine = nfl_stats_df.loc[nfl_stats_df["Draft"] == "9.0"]
ten = nfl_stats_df.loc[nfl_stats_df["Draft"] == "10.0"]
eleven = nfl_stats_df.loc[nfl_stats_df["Draft"] == "11.0"]
twelve = nfl_stats_df.loc[nfl_stats_df["Draft"] == "12.0"]
thirtheen = nfl_stats_df.loc[nfl_stats_df["Draft"] == "13.0"]
fourtheen = nfl_stats_df.loc[nfl_stats_df["Draft"] == "14.0"]

In [18]:
#Build Dataframe organized by Round and Average Passer Rating

ratings_by_draft = pd.DataFrame({"Undrafted":[undrafted["Rate"].mean()],
                                 "1st Round" : [one["Rate"].mean()],
                                 "2nd Round" : [two["Rate"].mean()],
                                 "3rd Round" : [three["Rate"].mean()],
                                 "4th Round" : [four["Rate"].mean()],
                                 "5th Round" : [five["Rate"].mean()],
                                 "6th Round" : [six["Rate"].mean()],
                                 "7th Round" : [seven["Rate"].mean()],
                                 "8th Round" : [eight["Rate"].mean()],
                                 "9th Round" : [nine["Rate"].mean()],
                                 "10th Round" : [ten["Rate"].mean()],
                                 "11th Round" : [eleven["Rate"].mean()],
                                 "12th Round" : [twelve["Rate"].mean()],
                                 "13th Round" : [thirtheen["Rate"].mean()],
                                 "14th Round" : [fourtheen["Rate"].mean()]})

ratings_by_draft = ratings_by_draft.T
ratings_by_draft.reset_index(level=0, inplace = True)
ratings_by_draft = ratings_by_draft.rename(columns={"index": "Draft Round",
                                                   "0": "Passer Rating"})

In [19]:
#Save dataframes for presentation notebook
nfl_stats_df.to_csv("Resources/Cleaned_Dataframes/nfl_stats_df.csv",columns=nfl_stats_df.columns,index=False)
ratings_by_draft.to_csv("Resources/Cleaned_Dataframes/ratings_by_draft.csv",columns=ratings_by_draft.columns,index=False)

In [20]:
reduced_nfl_stats_df = nfl_stats_df.loc[:,["Player","Draft", "Rate", "Pro Bowls", "Win Record"]]
grpby_draft = reduced_nfl_stats_df.groupby("Draft", as_index=False)
average_rate = pd.DataFrame(grpby_draft ["Rate"].mean())
average_rate

Unnamed: 0,Draft,Rate
0,1.0,76.959375
1,10.0,66.225
2,11.0,73.25
3,12.0,70.45
4,13.0,79.1
5,14.0,75.569048
6,2.0,75.227273
7,3.0,74.3425
8,4.0,71.980645
9,5.0,68.16875


In [21]:
percent_nfl_stats_df=reduced_nfl_stats_df.dropna()
percent_nfl_stats_df["Win Record"] = percent_nfl_stats_df["Win Record"]*100
percent_nfl_stats_df.head()


Unnamed: 0,Player,Draft,Rate,Pro Bowls,Win Record
0,Peyton Manning,1.0,96.5,14,70.188679
1,Tom Brady,6.0,97.6,13,78.087649
2,Drew Brees,2.0,96.7,11,57.258065
3,Brett Favre,2.0,86.0,11,62.416107
4,John Elway,1.0,79.9,9,64.069264


In [22]:
#Save dataframes for presentation notebook
percent_nfl_stats_df.to_csv("Resources/Cleaned_Dataframes/percent_nfl_stats_df.csv",columns=percent_nfl_stats_df.columns,index=False)
average_rate.to_csv("Resources/Cleaned_Dataframes/average_rate.csv",columns=average_rate.columns,index=False)

In [23]:
#Get and clean college football players and ratings
player=[]
rate=[]
ya=[]
aya=[]
for year in range(1982,2018):
    file="CFB/"+str(year)+".csv"
    with open(file,newline='') as f:
        reader=csv.reader(f)
        next(reader)
        next(reader)
        for row in reader:
            bad_name=row[1].replace("*","") #Cleaning player names
            split_names=bad_name.split("\\")
            clean_name=split_names[0]                          
            player.append(clean_name)
            rate.append(float(row[13]))
            ya.append(float(row[9]))
            aya.append(float(row[10]))

In [24]:
len(player)==len(rate) #Test

True

In [25]:
#Go backwards to get only last year of college for each player
players=[]
ratings=[] 
yards_attempt=[]
yards_adjusted=[]
for i in range(len(player)-1,-1,-1): 
    if player[i] not in players:
        players.append(player[i])
        ratings.append(rate[i])
        yards_attempt.append(ya[i])
        yards_adjusted.append(aya[i])

In [26]:
len(players)&len(yards_attempt)==len(ratings)&len(yards_adjusted) #test

True

In [27]:
college=pd.DataFrame({"Player":players,"College Rating":ratings,"yards_attempt":yards_attempt,"yards_adjusted":yards_adjusted})
college.head()


Unnamed: 0,Player,College Rating,yards_attempt,yards_adjusted
0,Anthony Brown,103.5,5.3,4.6
1,Hasaan Klugh,96.1,5.5,4.1
2,Max Bortenschlager,109.1,5.6,5.5
3,Steven Williams,103.9,5.8,4.4
4,Peyton Bender,108.5,5.9,5.0


In [28]:
names=[]
qbr=[]
wins=[]
loss=[]
pbs=[]
winper=[]
games=[]

In [29]:
len(names)==len(qbr)&len(wins)==(len(loss)&len(pbs)) #Test

True

In [30]:
#Save dataframes for presentation notebook
college.to_csv("Resources/Cleaned_Dataframes/college.csv",columns=college.columns,index=False)

In [31]:
nfl_picks=pd.read_csv("Resources/QB Draft Stats.csv") #This CSV has more granular draft data, although fewer players
nfl_picks.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Rushing,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,Rk,Year,Rnd,Pick,,Pos,DrAge,Tm,From,To,...,Cmp,Att,Yds,TD,Int,Att,Yds,TD,College/Univ,
1,1,2018,1,1,Baker Mayfield,QB,23,CLE,2018,2018,...,,,,,,,,,Oklahoma,College Stats
2,2,2018,1,3,Sam Darnold,QB,21,NYJ,2018,2018,...,,,,,,,,,USC,College Stats
3,3,2018,1,7,Josh Allen,QB,22,BUF,2018,2018,...,,,,,,,,,Wyoming,College Stats
4,4,2018,1,10,Josh Rosen,QB,21,ARI,2018,2018,...,,,,,,,,,UCLA,College Stats


In [32]:
picks=pd.DataFrame({"Player":nfl_picks["Unnamed: 4"],"Pick":nfl_picks["Unnamed: 3"]})
picks=picks.drop(0)
picks.head()

Unnamed: 0,Player,Pick
1,Baker Mayfield,1
2,Sam Darnold,3
3,Josh Allen,7
4,Josh Rosen,10
5,Lamar Jackson,32


In [33]:
pick_stats=pd.merge(picks,nfl_stats_df,on="Player")
pick_stats.head()

Unnamed: 0,Player,Pick,From,To,Draft,Height (in),G,GS,Cmp,Att,...,Int,Rate,Y/A,Y/G,W,L,T,Yrs,Pro Bowls,Win Record
0,Mitch Trubisky,2,2017,2018,1.0,75,12,12,196,330,...,7,77.5,6.65,182.8,4.0,8.0,0.0,2,0,0.333333
1,Deshaun Watson,12,2017,2018,1.0,74,7,6,126,204,...,8,103.0,8.33,242.7,3.0,3.0,0.0,2,0,0.5
2,DeShone Kizer,52,2017,2018,2.0,76,15,15,255,476,...,22,60.5,6.08,192.9,0.0,15.0,0.0,2,0,0.0
3,C.J. Beathard,104,2017,2018,3.0,74,7,5,123,224,...,6,69.2,6.38,204.3,1.0,4.0,0.0,2,0,0.2
4,Jared Goff,1,2016,2017,1.0,76,22,22,408,682,...,14,89.4,7.17,222.4,11.0,11.0,0.0,2,1,0.5


In [34]:
#Save dataframe for presentation notebook
pick_stats.to_csv("Resources/Cleaned_Dataframes/pick_stats.csv",columns=pick_stats.columns,index=False)


In [35]:
url = "http://api.sportradar.us/ncaafb-t1/teams/OKL/2017/REG/statistics.json?api_key="
key_url = url + api_key

response = requests.get(key_url)
print(response.url)
data = response.json()

http://api.sportradar.us/ncaafb-t1/teams/OKL/2017/REG/statistics.json?api_key=74b3gmb4dspnhbcv4fv28n34


In [36]:
Mayfield = ["Baker Mayfield"]
mayfield_df = pd.DataFrame(Mayfield)
mayfield_df.columns = ["name"]
mayfield_df ['Draft'] = ''
mayfield_df ['height (inches)'] = ''
mayfield_df ['att'] = ''
mayfield_df ['comp'] = ''
mayfield_df ['yards'] = ''
mayfield_df ['TD'] = ''
mayfield_df ['INT'] = ''
mayfield_df ['yards'] = ''
mayfield_df ['QBR'] = ''
mayfield_df ['Wins'] = ''
mayfield_df ['Loss'] = ''
mayfield_df ['W/L Record'] = ''

In [38]:
base_url = "http://api.sportradar.us/ncaafb-t1/teams/FBS/2017/REG/standings.json?api_key="
standing_url = base_url + api_key
standing = requests.get(standing_url)
standing_data = standing.json()

In [39]:
player_url = "http://api.sportradar.us/nfl-ot1/players/30198d30-9769-4e10-ac86-b4c91d940802/profile.json?api_key="
baker_url = player_url + player_key
baker_profile = requests.get(baker_url).json()

In [40]:
mayfield_df["att"] = data["players"][86]["statistics"]["passing"]["att"]
mayfield_df["Draft"] = baker_profile["draft"]["round"]
mayfield_df["height (inches)"] = baker_profile["height"]
mayfield_df["comp"] = data["players"][86]["statistics"]["passing"]["cmp"]
mayfield_df["yards"] = data["players"][86]["statistics"]["passing"]["yds"]
mayfield_df["TD"] = data["players"][86]["statistics"]["passing"]["td"]
mayfield_df["INT"] = data["players"][86]["statistics"]["passing"]["int"]
mayfield_df["Wins"] = standing_data["division"]["conferences"][2]["teams"][0]["overall"]["wins"]
mayfield_df["Loss"] = standing_data["division"]["conferences"][2]["teams"][0]["overall"]["losses"]
mayfield_df["W/L Record"] = standing_data["division"]["conferences"][2]["teams"][0]["overall"]["wpct"]

In [41]:
#QBR Formula (http://football.stassen.com/cgi-bin/calc-pe.pl?a=404&c=285&y=4627&t=43&i=6)
completion_variable = (((mayfield_df["comp"])/(mayfield_df["att"])*(100))-30)*.05
yd_per_attempt_var = ((mayfield_df["yards"])/(mayfield_df["att"])-3)*0.25
td_var = (20)*(mayfield_df["TD"])/(mayfield_df["att"] )
int_var = (2.375) - ((25)*(mayfield_df["INT"])/(mayfield_df["att"]))

qbr = ((completion_variable + yd_per_attempt_var + td_var + int_var)/6)*100
mayfield_df["QBR"] = qbr

In [42]:
mayfield_df

Unnamed: 0,name,Draft,height (inches),att,comp,yards,TD,INT,QBR,Wins,Loss,W/L Record
0,Baker Mayfield,1,73,404,285,4627,43,6,137.881601,12,2,0.857


In [43]:
mayfield_df.to_csv("Resources/Cleaned_Dataframes/mayfield_df.csv",columns=mayfield_df.columns,index=False)