# Project Overview

The dataset I am using comes from Kaggle. It has various QB stats from 1996-2016. I want to see if any of the variables describing QB performance have predictive power for total team points.  

https://www.kaggle.com/speckledpingu/nfl-qb-stats

# Importing Basic Libraries

In [41]:
#These are the libraries I typically use in my analysis so I find it easier to import them all at once
#If I need more libraries I will import them as needed

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
plt.style.use('fivethirtyeight')
%matplotlib inline

# Initial Review of the Data

In [42]:
#Reading in of the dataset

qb_analysis = pd.read_csv("QBStats_all.csv")

In [43]:
#The beginning structure of this dataset contains 13,188 rows and 14 columns worth of data

qb_analysis.shape

(13188, 14)

In [44]:
#Here I am checking what the data types are for my potential explanatory variables
#Looks like most of them are numeric (integers or floats)
#However, some like interceptions look like they need to be converted to an integer
#Year should also be converted to a string

qb_analysis.dtypes

qb              object
att              int64
cmp              int64
yds            float64
ypa            float64
td               int64
int             object
lg              object
sack           float64
loss           float64
rate           float64
game_points      int64
home_away       object
year             int64
dtype: object

In [45]:
#A brief look at the beginning of the dataset

qb_analysis.head()

Unnamed: 0,qb,att,cmp,yds,ypa,td,int,lg,sack,loss,rate,game_points,home_away,year
0,Boomer EsiasonB. Esiason,38,25,237.0,6.2,0,0,20,2.0,11.0,82.9,13,away,1996
1,Jim HarbaughJ. Harbaugh,25,16,196.0,7.8,2,1,35t,0.0,0.0,98.1,20,home,1996
2,Paul JustinP. Justin,8,5,53.0,6.6,0,0,30,1.0,11.0,81.8,20,home,1996
3,Jeff GeorgeJ. George,35,16,215.0,6.1,0,0,55,7.0,53.0,65.8,6,away,1996
4,Kerry CollinsK. Collins,31,17,198.0,6.4,2,0,30,4.0,12.0,95.9,29,home,1996


In [46]:
#A brief look at the end of the dataset

qb_analysis.tail()

Unnamed: 0,qb,att,cmp,yds,ypa,td,int,lg,sack,loss,rate,game_points,home_away,year
13183,Alex SmithA. Smith,28,21,264.0,9.4,2,1,42,1.0,2.0,112.8,37,away,2016
13184,Philip RiversP. Rivers,38,22,269.0,7.1,2,2,23,0.0,0.0,75.4,27,home,2016
13185,Russell WilsonR. Wilson,32,19,258.0,8.1,1,0,42,1.0,6.0,95.6,25,away,2016
13186,Trevone BoykinT. Boykin,6,4,42.0,7.0,0,0,19,1.0,5.0,86.8,25,away,2016
13187,Colin KaepernickC. Kaepernick,22,17,215.0,9.8,1,0,29,5.0,24.0,122.3,23,home,2016


# Glossary of Terms

1. QB: Quarterback name 
    
2. Att: Throwing attempts during the game
    
3. Cmp: Completions during the game
    
4. Yds: Yards thrown during the game
    
5. Ypa: Yards per attempt in the game
    
6. TD: Touchdowns thrown during the game
    
7. Int: Interceptions thrown during the game

8. Lg: Longest throw of the game
    
9. Sack: Sacks taken during the game
    
10. Loss: Loss of yards from the sack
    
11. Rate: QB rating during the game
    
12. Game_Points: Points scored by the quarterback's team in the game
    
13. Home_Away: Home or away game
    
14. Year: Year the game occurred in

In [47]:
#Here are some standard statistics from the dataset

qb_analysis.describe()

Unnamed: 0,att,cmp,yds,ypa,td,sack,loss,rate,game_points,year
count,13188.0,13188.0,13188.0,13188.0,13188.0,13171.0,13171.0,13171.0,13188.0,13188.0
mean,26.870488,16.121777,186.135118,6.882909,1.118138,1.863108,11.971225,80.242571,21.379436,2005.898923
std,13.583991,8.774988,105.629874,4.226307,1.119667,1.701091,11.970053,32.068021,10.663992,6.012892
min,0.0,-6.0,-11.0,-11.0,0.0,0.0,0.0,0.0,0.0,1996.0
25%,20.0,11.0,115.0,5.2,0.0,0.0,0.0,58.6,13.0,2001.0
50%,29.0,17.0,197.0,6.7,1.0,2.0,9.0,80.9,21.0,2006.0
75%,36.0,22.0,260.0,8.1,2.0,3.0,18.0,102.0,28.0,2011.0
max,69.0,58.0,527.0,81.0,7.0,12.0,91.0,158.3,62.0,2016.0


In [48]:
#Here are summary statistics for the non-numeric (object) columns: count, number of unique values, most frequent value and its frequency

qb_analysis.describe(include = 'object')

Unnamed: 0,qb,int,lg,home_away
count,13188,13188,13171,13188
unique,682,9,195,2
top,Peyton ManningP. Manning,0,0,home
freq,265,6384,738,6629


# Data Cleaning Round One

#### Checking for Null Values

In [49]:
#Thankfully there aren't any null values for game_points (the target variable)
#However, lg, sack, loss and rate each have 17 null values

qb_analysis.isna().sum() 

qb              0
att             0
cmp             0
yds             0
ypa             0
td              0
int             0
lg             17
sack           17
loss           17
rate           17
game_points     0
home_away       0
year            0
dtype: int64

In [50]:
#Only 17 of the 13,188 rows contain nulls (in lg, sack, loss and rate)
#Since that is such a small share of the data I am dropping those rows instead of filling them in
qb_analysis = qb_analysis.dropna()

In [51]:
qb_analysis.isna().sum()

qb             0
att            0
cmp            0
yds            0
ypa            0
td             0
int            0
lg             0
sack           0
loss           0
rate           0
game_points    0
home_away      0
year           0
dtype: int64

In [52]:
qb_analysis['lg'].value_counts()

0      738
24     349
27     323
25     316
26     304
23     298
22     297
30     290
28     285
29     283
20     270
21     269
31     245
35     233
33     229
36     226
19     223
32     216
18     212
34     207
37     200
41     195
38     191
17     185
40     177
42     176
39     169
46     155
16     143
47     142
      ... 
95t      4
5t       4
4t       4
77       4
81       3
12t      3
98t      3
84       3
8t       3
-3       3
-8       3
-4       2
82       2
9t       2
6t       2
94t      2
93t      2
83       2
80       2
95       1
-6       1
88       1
-11      1
11t      1
86       1
92       1
92t      1
96t      1
87       1
-7       1
Name: lg, Length: 195, dtype: int64

In [53]:
#Some lg values have a trailing 't' (for example '35t'), which is why the column was read in as an object
#Removing the 't' leaves just the number so the column can be converted to an integer later
qb_analysis['lg'] = qb_analysis['lg'].str.replace('t','')

In [54]:
qb_analysis['lg'].value_counts()

0      738
24     387
27     368
30     366
25     358
26     348
23     336
28     333
22     332
29     331
31     315
35     308
20     300
21     296
36     287
33     282
34     273
37     265
32     262
40     243
19     241
42     236
41     236
39     235
18     233
38     231
46     201
43     199
44     197
17     196
      ... 
78      28
79      25
2       22
81      20
1       19
84      16
82      16
87      15
83      13
86      12
85       9
88       7
90       7
91       6
-2       6
95       5
99       5
89       5
-1       5
-3       3
98       3
-8       3
92       2
94       2
93       2
-4       2
-7       1
-11      1
96       1
-6       1
Name: lg, Length: 107, dtype: int64

In [55]:
#Converting interceptions (int) and longest throw (lg) from objects to integers
#Year is converted to a string so it can be handled as a categorical variable instead of a number
qb_analysis = qb_analysis.astype({'int':int, 'year':str, 'lg':int})

In [56]:
qb_analysis.dtypes

qb              object
att              int64
cmp              int64
yds            float64
ypa            float64
td               int64
int              int32
lg               int32
sack           float64
loss           float64
rate           float64
game_points      int64
home_away       object
year            object
dtype: object

# First Pass Model

In [57]:
#For the first pass model I am only using the numeric columns
#qb, home_away and year are object (text) columns, so they are left out here
qb_analysis_numeric = qb_analysis.drop(['qb', 'home_away', 'year'], axis = 'columns')

In [58]:
#Separating the variables between X and y

X = qb_analysis_numeric.drop('game_points', axis='columns')
y = qb_analysis_numeric.loc[:, 'game_points']

In [59]:
#Setting up a training set and a test set
#test_size = .2 means 20% of the data is set aside for the test set. 80% of the data is used for the training set
#You could also use train_size if you wish
#I am using the 80/20 split based on conventional use

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=30)

In [60]:
#Setting up a linear regression model using the training set

from sklearn.linear_model import LinearRegression

lr = LinearRegression()

lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [61]:
#Scoring the model on the training set and test set.
#These are the R-squared values for the training set and test set. 

print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.4308130379375166
0.4164278914760636


#### First Impressions

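Using only the numeric QB statistics, the linear regression has an R-squared of about 0.43 on the training set and 0.42 on the test set. The two scores are close, so the model does not appear to be overfitting, but the QB stats alone explain less than half of the variation in game points.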

# Further Data Cleaning

In [62]:
qb_analysis['home_away'].value_counts()

home    6621
away    6550
Name: home_away, dtype: int64

In [64]:
qb_analysis['year'].value_counts()

2003    663
2009    654
1999    649
2004    648
2002    647
1998    642
2000    640
2007    639
2005    637
2008    626
2010    625
1997    624
1996    621
2006    620
2014    616
2011    614
2001    611
2016    610
2012    604
2015    592
2013    589
Name: year, dtype: int64

In [67]:
qb_analysis['year'].nunique()

21

In [65]:
qb_analysis['qb'].value_counts()

Peyton ManningP. Manning               265
Tom BradyT. Brady                      237
Brett FavreB. Favre                    235
Drew BreesD. Brees                     232
Eli ManningE. Manning                  199
Ben RoethlisbergerB. Roethlisberger    185
Kerry CollinsK. Collins                181
Philip RiversP. Rivers                 178
Matt HasselbeckM. Hasselbeck           177
Carson PalmerC. Palmer                 175
Donovan McNabbD. McNabb                165
Steve McNairS. McNair                  157
Drew BledsoeD. Bledsoe                 150
Mark BrunellM. Brunell                 146
Matt RyanM. Ryan                       142
Aaron RodgersA. Rodgers                141
Alex SmithA. Smith                     141
Jay CutlerJ. Cutler                    139
Jake PlummerJ. Plummer                 139
Jon KitnaJ. Kitna                      137
Joe FlaccoJ. Flacco                    137
Brad JohnsonB. Johnson                 136
Michael VickM. Vick                    134
Tony RomoT.

In [66]:
qb_analysis['qb'].nunique() 

673

In [68]:
#Here I am creating a function to convert the year played into four distinct eras
#1996-1999 will be "Nineties_Era"
#2000-2005 will be "Early_2000s"
#2006-2010 will be "Late_2000s"
#2011 and later will be "Early_2010s"

def era_Played(x):
    """Convert a season year into one of four era labels.

    Parameters
    ----------
    x : str or int
        The year played, either as a string ("1996") or a number (1996).
        A value that is already one of the four era labels is also accepted.

    Returns
    -------
    str
        "Nineties_Era" for 1996-1999, "Early_2000s" for 2000-2005,
        "Late_2000s" for 2006-2010 and "Early_2010s" for 2011 and later.
        An era label passed in is returned unchanged.

    Raises
    ------
    ValueError
        If x is neither an era label nor a year from 1996 onward.
    """
    eras = ("Nineties_Era", "Early_2000s", "Late_2000s", "Early_2010s")

    #A value that is already an era label is passed through untouched
    #Without this, applying the function to an already converted column would
    #send every row to the last bucket
    if isinstance(x, str) and x in eras:
        return x

    #Accept the year as a string or a number so the result does not depend on the column dtype
    try:
        year = int(x)
    except (TypeError, ValueError):
        raise ValueError("era_Played expected a year or an era label, got {!r}".format(x))

    if year < 1996:
        raise ValueError("era_Played only handles years from 1996 onward, got {!r}".format(x))
    elif year <= 1999:
        return "Nineties_Era"
    elif year <= 2005:
        return "Early_2000s"
    elif year <= 2010:
        return "Late_2000s"
    else:
        return "Early_2010s"


#Replacing each year with its era label
#This overwrites the year column in place, so the original years are no longer in qb_analysis after this cell
#Run this cell once per fresh load of the data, since a re-run passes era labels (not years) to era_Played
qb_analysis['year'] = qb_analysis['year'].apply(era_Played)
print(qb_analysis['year'])

0        Nineties_Era
1        Nineties_Era
2        Nineties_Era
3        Nineties_Era
4        Nineties_Era
5        Nineties_Era
6        Nineties_Era
7        Nineties_Era
8        Nineties_Era
9        Nineties_Era
10       Nineties_Era
11       Nineties_Era
12       Nineties_Era
13       Nineties_Era
14       Nineties_Era
15       Nineties_Era
16       Nineties_Era
17       Nineties_Era
18       Nineties_Era
19       Nineties_Era
20       Nineties_Era
21       Nineties_Era
22       Nineties_Era
23       Nineties_Era
24       Nineties_Era
25       Nineties_Era
26       Nineties_Era
27       Nineties_Era
28       Nineties_Era
29       Nineties_Era
             ...     
13158     Early_2010s
13159     Early_2010s
13160     Early_2010s
13161     Early_2010s
13162     Early_2010s
13163     Early_2010s
13164     Early_2010s
13165     Early_2010s
13166     Early_2010s
13167     Early_2010s
13168     Early_2010s
13169     Early_2010s
13170     Early_2010s
13171     Early_2010s
13172     

In [69]:
qb_analysis['year'].value_counts()

Early_2000s     3846
Early_2010s     3625
Late_2000s      3164
Nineties_Era    2536
Name: year, dtype: int64

In [71]:
#There are 673 different quarterbacks, so turning qb into dummy variables would add hundreds of columns
#I am dropping the qb column before building the next model
qb_analysis = qb_analysis.drop(['qb'], axis = 'columns')

In [72]:
qb_analysis.head()

Unnamed: 0,att,cmp,yds,ypa,td,int,lg,sack,loss,rate,game_points,home_away,year
0,38,25,237.0,6.2,0,0,20,2.0,11.0,82.9,13,away,Nineties_Era
1,25,16,196.0,7.8,2,1,35,0.0,0.0,98.1,20,home,Nineties_Era
2,8,5,53.0,6.6,0,0,30,1.0,11.0,81.8,20,home,Nineties_Era
3,35,16,215.0,6.1,0,0,55,7.0,53.0,65.8,6,away,Nineties_Era
4,31,17,198.0,6.4,2,0,30,4.0,12.0,95.9,29,home,Nineties_Era


In [73]:
#Converting home_away and year (which now holds the era) into dummy variables so they can be used in the regression
#drop_first=True leaves out one category per variable so the dummy columns are not perfectly collinear
qb_analysis = pd.get_dummies(qb_analysis, columns=['home_away', 'year'], drop_first=True)

# Model With Dummy Variables

In [74]:
#Separating the variables between X and y

X = qb_analysis.drop('game_points', axis='columns')
y = qb_analysis.loc[:, 'game_points']

In [75]:
#Setting up a training set and a test set
#test_size = .2 means 20% of the data is set aside for the test set. 80% of the data is used for the training set
#You could also use train_size if you wish
#I am using the 80/20 split based on conventional use

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=30)

In [76]:
#Setting up a linear regression model using the training set

from sklearn.linear_model import LinearRegression

lr = LinearRegression()

lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [77]:
#Scoring the model on the training set and test set.
#These are the R-squared values for the training set and test set. 

print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.4386400872589281
0.42292590252731177


#### First Impressions

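Adding the home/away and era dummy variables barely changes the fit: R-squared is about 0.44 on the training set and 0.42 on the test set, compared with 0.43 and 0.42 for the first pass model. These categorical variables add very little predictive power beyond the numeric QB stats.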

# Checking for Collinearity