# Survival Analysis for Running Backs

The csv file used in this project was extracted from the site ProFootballReference. It contains a large sample of NFL running backs, which serves as the population for a survival analysis of running backs based on several attributes.

## Importing the Required Libraries

In [10]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

## Minor Cleanup of the Dataset

In [11]:
# Load the running-back data extracted from ProFootballReference and preview
# the first few rows.
nfl = pd.read_csv("nflrb_data.csv")
nfl.head()

Unnamed: 0,Rk,Year,Rnd,Pick,Unnamed: 4,Pos,DrAge,Tm,From,To,...,G,GS,Att,Yds,TD,Rec,Yds.1,TD.1,College/Univ,Unnamed: 23
0,1,2016,1,4,Ezekiel Elliott\ElliEz00,RB,21.0,DAL,2016.0,2016.0,...,15.0,15.0,322.0,1631.0,15.0,32.0,363.0,1.0,Ohio St.,College Stats
1,2,2016,2,45,Derrick Henry\HenrDe00,RB,22.0,TEN,2016.0,2016.0,...,15.0,1.0,110.0,490.0,5.0,13.0,137.0,0.0,Alabama,College Stats
2,3,2016,3,73,Kenyan Drake\DrakKe00,RB,22.0,MIA,2016.0,2016.0,...,16.0,1.0,33.0,179.0,2.0,9.0,46.0,0.0,Alabama,College Stats
3,4,2016,3,90,C.J. Prosise\ProsC.00,RB,22.0,SEA,2016.0,2016.0,...,6.0,2.0,30.0,172.0,1.0,17.0,208.0,0.0,Notre Dame,College Stats
4,5,2016,4,119,Tyler Ervin\ErviTy00,RB,22.0,HOU,2016.0,2016.0,...,12.0,0.0,1.0,3.0,0.0,3.0,18.0,0.0,San Jose St.,College Stats


In [12]:
## getting rid of the two columns that are never going to be used: the "Rk"
## row counter and "Unnamed: 23" (which holds the text "College Stats" in the
## rows previewed above)
nfl.drop(["Rk","Unnamed: 23"], axis=1, inplace=True)
nfl.head()

Unnamed: 0,Year,Rnd,Pick,Unnamed: 4,Pos,DrAge,Tm,From,To,AP1,...,CarAV,G,GS,Att,Yds,TD,Rec,Yds.1,TD.1,College/Univ
0,2016,1,4,Ezekiel Elliott\ElliEz00,RB,21.0,DAL,2016.0,2016.0,1,...,16.0,15.0,15.0,322.0,1631.0,15.0,32.0,363.0,1.0,Ohio St.
1,2016,2,45,Derrick Henry\HenrDe00,RB,22.0,TEN,2016.0,2016.0,0,...,4.0,15.0,1.0,110.0,490.0,5.0,13.0,137.0,0.0,Alabama
2,2016,3,73,Kenyan Drake\DrakKe00,RB,22.0,MIA,2016.0,2016.0,0,...,2.0,16.0,1.0,33.0,179.0,2.0,9.0,46.0,0.0,Alabama
3,2016,3,90,C.J. Prosise\ProsC.00,RB,22.0,SEA,2016.0,2016.0,0,...,3.0,6.0,2.0,30.0,172.0,1.0,17.0,208.0,0.0,Notre Dame
4,2016,4,119,Tyler Ervin\ErviTy00,RB,22.0,HOU,2016.0,2016.0,0,...,0.0,12.0,0.0,1.0,3.0,0.0,3.0,18.0,0.0,San Jose St.


### Renaming the Columns

In [13]:
nfl.columns

Index(['Year', 'Rnd', 'Pick', 'Unnamed: 4', 'Pos', 'DrAge', 'Tm', 'From', 'To',
       'AP1', 'PB', 'St', 'CarAV', 'G', 'GS', 'Att', 'Yds', 'TD', 'Rec',
       'Yds.1', 'TD.1', 'College/Univ'],
      dtype='object')

In [14]:
# Only the player column arrived without a header ("Unnamed: 4"). Rename it by
# label instead of reassigning the whole column list positionally, so the other
# columns cannot be silently mislabeled if their order or count ever changes.
nfl = nfl.rename(columns={"Unnamed: 4": "Player"})

### Getting Rid of Some Missing Data

In [15]:
nfl.isnull().sum()

Year              0
Rnd               0
Pick              0
Player            0
Pos               0
DrAge           344
Tm                0
From            361
To              361
AP1               0
PB                0
St                0
CarAV           361
G               361
GS              361
Att             450
Yds             450
TD              450
Rec             488
Yds.1           488
TD.1            488
College/Univ      2
dtype: int64

In [16]:
print("Number of Observations:", nfl.shape[0])

Number of Observations: 1500


In [17]:
## keeping only the observations that have a "From" value; for the rest not
## enough info could be found
has_career_info = nfl["From"].notnull()
nfl = nfl[has_career_info]
print("Number of Observations:", len(nfl))

Number of Observations: 1139


In [18]:
## dropping the players whose "To" value is 2016, i.e. the ones that did not
## retire by 2016
## NOTE(review): for a survival analysis these look like right-censored
## observations -- confirm that discarding them is intended
still_active = nfl["To"] == 2016
nfl = nfl[~still_active]
print("Number of Observations:", len(nfl))

Number of Observations: 1037


In [19]:
nfl.isnull().sum()

Year              0
Rnd               0
Pick              0
Player            0
Pos               0
DrAge             0
Tm                0
From              0
To                0
AP1               0
PB                0
St                0
CarAV             0
G                 0
GS                0
Att              88
Yds              88
TD               88
Rec             127
Yds.1           127
TD.1            127
College/Univ      0
dtype: int64

## Adding New Data 

When the original csv file was extracted, it contained a column that had the player's name along with a snippet of the URL that was part of their ProFootballReference page. This function splits that column and returns a two-element list containing the player's name and the full URL of their ProFootballReference page; applying it to every row yields one such list per player.

In [22]:
baseurl = "http://www.pro-football-reference.com/players/"


def split_player(row):
    """Split a raw "Player" cell into the player's name and profile URL.

    The cell is split on the backslash: the first piece is the name and the
    second is the page code. The URL is ``baseurl``, the first character of
    the code, a slash, the code itself and ".htm".

    Returns a two-element list ``[player_name, full_url]``.
    """
    pieces = row["Player"].split("\\")
    name, code = pieces[0], pieces[1]
    url = "{}{}/{}.htm".format(baseurl, code[0], code)
    return [name, url]
a = nfl.apply(split_player,axis=1)

In [23]:
# pull the names and URLs out of the [name, url] lists held in `a`, convert
# them into numpy arrays and add them to the dataframe
player_names = [pair[0] for pair in a]
profile_urls = [pair[1] for pair in a]
nfl["Player"] = np.array(player_names)
nfl["PFR_URL"] = np.array(profile_urls)

In [24]:
nfl.head()

Unnamed: 0,Year,Rnd,Pick,Player,Pos,DrAge,Tm,From,To,AP1,...,G,GS,Att,Yds,TD,Rec,Yds.1,TD.1,College/Univ,PFR_URL
35,2015,5,138,David Cobb,RB,22.0,TEN,2015.0,2015.0,0,...,7.0,1.0,52.0,146.0,1.0,1.0,-2.0,0.0,Minnesota,http://www.pro-football-reference.com/players/...
37,2015,5,155,Karlos Williams,RB,22.0,BUF,2015.0,2015.0,0,...,11.0,3.0,93.0,517.0,7.0,11.0,96.0,2.0,Florida St.,http://www.pro-football-reference.com/players/...
40,2015,6,205,Josh Robinson,RB,23.0,IND,2015.0,2015.0,0,...,5.0,0.0,17.0,39.0,0.0,6.0,33.0,0.0,Mississippi St.,http://www.pro-football-reference.com/players/...
43,2015,7,231,Joey Iosefa,FB,24.0,TAM,2015.0,2015.0,0,...,2.0,0.0,15.0,51.0,0.0,,,,Hawaii,http://www.pro-football-reference.com/players/...
45,2014,2,54,Bishop Sankey,RB,21.0,TEN,2014.0,2015.0,0,...,29.0,12.0,199.0,762.0,3.0,32.0,272.0,1.0,Washington,http://www.pro-football-reference.com/players/...


Since the original csv file did not contain the players' height and weight, this function was created to take in a player's row, fetch the page at their ProFootballReference URL, and parse it with BeautifulSoup4 to find their height and weight. If the info cannot be found, both values are set to None so that the observation is treated as missing data.

In [56]:
def player_info(row):
    """Scrape a player's listed height and weight from their PFR page.

    Parameters:
        row: mapping with a "PFR_URL" entry holding the URL of the page.

    Returns:
        A ``(height, weight)`` tuple with the text of the first elements
        tagged ``itemprop="height"`` and ``itemprop="weight"``, or
        ``(None, None)`` when the page cannot be fetched or either element
        is absent.
    """
    try:
        # requests never times out by default; without a timeout a single
        # unresponsive request would stall the whole row-by-row scrape.
        response = requests.get(row["PFR_URL"], timeout=30)
    except requests.RequestException:
        # A network failure is treated like a page without the info, so one
        # bad request does not abort the scrape of every other player.
        return None, None
    parser = BeautifulSoup(response.content, 'html.parser')
    height_tag = parser.find(itemprop="height")
    weight_tag = parser.find(itemprop="weight")
    # Both values are required; if either is missing report both as missing.
    if height_tag is None or weight_tag is None:
        return None, None
    return height_tag.text, weight_tag.text

a = nfl.apply(player_info, axis=1)
print(a.head())

35    (5-11, 229lb)
37     (6-1, 225lb)
40     (5-9, 215lb)
43     (6-0, 245lb)
45    (5-10, 209lb)
dtype: object


In [None]:
# unpack the (height, weight) pairs held in `a` into two new columns
heights = [pair[0] for pair in a]
weights = [pair[1] for pair in a]
nfl["Height"] = np.array(heights)
nfl["Weight"] = np.array(weights)

In [66]:
## keeping only the observations where both the height and the weight could
## be parsed
has_height = nfl["Height"].notnull()
has_weight = nfl["Weight"].notnull()
nfl = nfl[has_height & has_weight]

In [68]:
## converting the height from a "feet-inches" string (e.g. "5-11") to an integer number of inches
def convert_height(row):
    """Return the row's "Height" text ("feet-inches", e.g. "5-11") in inches."""
    parts = row["Height"].split("-")
    feet = int(parts[0])
    inches = int(parts[1])
    return feet * 12 + inches
nfl["Height"] = nfl.apply(convert_height,axis=1)

In [74]:
## converting the weight from a string such as "229lb" to an integer
def convert_weight(row):
    """Return the row's "Weight" text (e.g. "229lb") as an integer.

    The leading run of digits is parsed, so the result does not depend on the
    weight having exactly three digits. Raises ValueError when the text does
    not start with a digit.
    """
    text = row["Weight"].strip()
    # Whatever remains after stripping the leading digits is the unit suffix.
    suffix = text.lstrip("0123456789")
    return int(text[:len(text) - len(suffix)])
nfl["Weight"] = nfl.apply(convert_weight, axis=1)

In [75]:
nfl.head()

Unnamed: 0,Year,Rnd,Pick,Player,Pos,DrAge,Tm,From,To,AP1,...,Att,Yds,TD,Rec,Yds.1,TD.1,College/Univ,PFR_URL,Height,Weight
35,2015,5,138,David Cobb,RB,22.0,TEN,2015.0,2015.0,0,...,52.0,146.0,1.0,1.0,-2.0,0.0,Minnesota,http://www.pro-football-reference.com/players/...,71,229
37,2015,5,155,Karlos Williams,RB,22.0,BUF,2015.0,2015.0,0,...,93.0,517.0,7.0,11.0,96.0,2.0,Florida St.,http://www.pro-football-reference.com/players/...,73,225
40,2015,6,205,Josh Robinson,RB,23.0,IND,2015.0,2015.0,0,...,17.0,39.0,0.0,6.0,33.0,0.0,Mississippi St.,http://www.pro-football-reference.com/players/...,69,215
43,2015,7,231,Joey Iosefa,FB,24.0,TAM,2015.0,2015.0,0,...,15.0,51.0,0.0,,,,Hawaii,http://www.pro-football-reference.com/players/...,72,245
45,2014,2,54,Bishop Sankey,RB,21.0,TEN,2014.0,2015.0,0,...,199.0,762.0,3.0,32.0,272.0,1.0,Washington,http://www.pro-football-reference.com/players/...,70,209


In [78]:
## Writing the cleaned and augmented dataframe out to a new csv file
nfl.to_csv("new_nflrb_data.csv")