# 03 - COMBINE SCRAPE, CLEANING, AND FEATURE ENGINEERING
- A scrape of an NFL Combine data website, which also has data from Pro Days
- Fills in missing values for metrics
- Exports data to a CSV for further manipulation in another notebook

# SCRAPE DATA

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Column and row display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)

# Float appearance, Pandas and NumPy
pd.set_option('display.float_format', '{:.2f}'.format)
np.set_printoptions(suppress=True, precision = 2)

# Notebook cell width display
from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 98% !important; }</style>"))

In [2]:
url = 'https://nflcombineresults.com/nflcombinedata.php?year=all&pos=WR&college='

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table with a class name 'sortable'
table = soup.find('table', {'class': 'sortable'})

# Initialize a list to store each row of the table
data = []

# Assuming the first row (<tr>) of the table contains the headers in <td> tags
header_row = table.find('tr')
headers = [header.text for header in header_row.find_all('td')]
data.append(headers)

# Extract the table rows, starting from the second row since the first row contains headers
for row in table.find_all('tr')[1:]:  # Skip the first row (headers)
    columns = row.find_all('td')
    if columns:
        data.append([col.text.strip() for col in columns])

combine_scrape = pd.DataFrame(data[1:], columns=data[0])
combine_scrape.head()

Unnamed: 0,Year,Name,College,POS,Height (in),Weight (lbs),Wonderlic,40 Yard,Bench Press,Vert Leap (in),Broad Jump (in),Shuttle,3Cone
0,2024,Javon Baker,Central Florida,WR,73.25,202,,4.54,,37.0,121.0,,
1,2024,Jermaine Burton,Alabama,WR,72.25,196,,4.45,,38.5,133.0,,
2,2024,Jalen Coker,Holy Cross (MA),WR,73.38,208,,4.57,12.0,42.5,128.0,,
3,2024,Keon Coleman,Florida State,WR,75.25,213,,4.61,,38.0,127.0,,
4,2024,Malachi Corley,Western Kentucky,WR,70.63,207,,4.56,,,,4.22,


In [13]:
# combine_scrape.to_csv('/mnt/c/Data_Science/Personal_Projects/nfl_wr_knn/combine_scrape.csv')
combine_scrape = pd.read_csv('/mnt/c/Data_Science/Personal_Projects/nfl_wr_knn/combine_data/combine_scrape.csv')

# CLEANING

In [14]:
combine = combine_scrape
combine.head()

Unnamed: 0.1,Unnamed: 0,Year,Name,College,POS,Height (in),Weight (lbs),Wonderlic,40 Yard,Bench Press,Vert Leap (in),Broad Jump (in),Shuttle,3Cone
0,0,2024.0,Javon Baker,Central Florida,WR,73.38,202.0,,4.54,,37.0,121.0,,
1,1,2024.0,Jermaine Burton,Alabama,WR,72.25,196.0,,4.45,,38.5,133.0,,
2,2,2024.0,Jalen Coker,Holy Cross (MA),WR,73.25,208.0,,4.57,,42.5,128.0,,
3,3,2024.0,Keon Coleman,Florida State,WR,74.63,213.0,,4.61,,38.0,127.0,,
4,4,2024.0,Malachi Corley,Western Kentucky,WR,70.63,215.0,,,,,,,


In [15]:
combine = combine.drop(columns = ['Unnamed: 0'])

In [16]:
# Remove blank row
combine = combine[combine['Year'] != '']

In [17]:
combine = combine.rename(columns={'Year': 'year', 'Name': 'name', "Height (in)": "height_in", 'Weight (lbs)': 'weight_lbs','40 Yard': '40'
                                  , 'Bench Press': 'bench', 'Vert Leap (in)': 'vertical', 'Broad Jump (in)': 'broad_jump'
                                  , 'Shuttle': 'shuttle', '3Cone': '3_cone'})

In [18]:
combine.head()

Unnamed: 0,year,name,College,POS,height_in,weight_lbs,Wonderlic,40,bench,vertical,broad_jump,shuttle,3_cone
0,2024.0,Javon Baker,Central Florida,WR,73.38,202.0,,4.54,,37.0,121.0,,
1,2024.0,Jermaine Burton,Alabama,WR,72.25,196.0,,4.45,,38.5,133.0,,
2,2024.0,Jalen Coker,Holy Cross (MA),WR,73.25,208.0,,4.57,,42.5,128.0,,
3,2024.0,Keon Coleman,Florida State,WR,74.63,213.0,,4.61,,38.0,127.0,,
4,2024.0,Malachi Corley,Western Kentucky,WR,70.63,215.0,,,,,,,


## Name de-dupe
- There are several groups of receivers with the same name.
- We'll look them up by Combine / Pro Day year to see who these are.

In [19]:
combine['name'].value_counts()

name
Mike Williams               3
Tony Jones                  3
Charles Johnson             2
John Brown                  2
Mike Thomas                 2
Anthony Johnson             2
Cedric Tillman              2
Reggie Davis                2
Marcus Davis                2
Chris Jones                 2
Keyarris Garrett            2
Steve Smith                 2
Jeff Graham                 2
Anthony Miller              2
Tutu Atwell                 2
Chris Harper                2
Marvin Harrison             2
Scott Miller                2
Cedrick Wilson              2
Terry Smith                 2
Charlie Jones               2
Shawn Mills                 1
Reggie Wayne                1
David Terrell               1
Kevin Kasper                1
Chris Taylor                1
Vinny Sutherland            1
Koren Robinson              1
Ken-Yon Rambo               1
Santana Moss                1
Nate Poole                  1
Bobby Newcombe              1
Quentin McCord              1
Quinc

Checking the names of duped receivers. Will delete most of these because:
- They played several decades ago, outside the football era I'm interested in OR...
- Their careers were very short and didn't have significant playing time.

In [20]:
combine = combine[~((combine['name'] == 'Mike Williams') & (combine['year'] == 2010))]
combine = combine[~((combine['name'] == 'Mike Williams') & (combine['year'] == 2005))]
combine[combine['name'] == 'Mike Williams']

Unnamed: 0,year,name,College,POS,height_in,weight_lbs,Wonderlic,40,bench,vertical,broad_jump,shuttle,3_cone
488,2017.0,Mike Williams,Clemson,WR,75.75,218.0,17.0,4.54,15.0,32.5,121.0,,


In [21]:
# combine[(combine['name'] == 'Tony Jones')]

combine = combine[~((combine['name'] == 'Tony Jones') & (combine['year'] == 2015))]
combine = combine[~((combine['name'] == 'Tony Jones') & (combine['year'] == 1990))]

In [22]:
# combine[(combine['name'] == 'Charles Johnson')]

combine = combine[~((combine['name'] == 'Charles Johnson') & (combine['year'] == 1994))]

In [23]:
combine.loc[(combine['name'] == 'Steve Smith') & (combine['year'] == 2001), 'name'] = 'Steve Smith Sr.'

In [24]:
# combine[(combine['name'] == 'John Brown')]

combine = combine[~((combine['name'] == 'John Brown') & (combine['year'] == 1992))]

In [25]:
# combine[(combine['name'] == 'Chris Harper')]

combine = combine[~((combine['name'] == 'Chris Harper') & (combine['year'] == 2013))]

In [26]:
# combine[(combine['name'] == 'Mike Thomas')]

combine = combine[~((combine['name'] == 'Mike Thomas') & (combine['year'] == 2009))]

In [27]:
# combine[(combine['name'] == 'Charlie Jones')]

combine = combine[~((combine['name'] == 'Charlie Jones') & (combine['year'] == 1996))]

In [28]:
# combine[(combine['name'] == 'Keyarris Garrett')]

combine = combine[~((combine['name'] == 'Keyarris Garrett') & (combine['year'] == 2015))]

In [29]:
# combine[(combine['name'] == 'Cedric Tillman')]

combine = combine[~((combine['name'] == 'Cedric Tillman') & (combine['year'] == 1992))]

In [30]:
# combine[(combine['name'] == 'Scott Miller')]

combine = combine[~((combine['name'] == 'Scott Miller') & (combine['year'] == 1991))]

In [31]:
# combine[(combine['name'] == 'Tutu Atwell')]

combine = combine[~((combine['name'] == 'Tutu Atwell') & (combine['year'] == 1998))]

In [32]:
# combine[(combine['name'] == 'Anthony Miller')]

combine = combine[~((combine['name'] == 'Anthony Miller') & (combine['year'] == 1988))]

In [33]:
# combine[(combine['name'] == 'Anthony Johnson')]

combine = combine[~((combine['name'] == 'Anthony Johnson') & (combine['year'] == 2015))]

In [34]:
# combine[(combine['name'] == 'Chris Jones')]

combine = combine[~((combine['name'] == 'Chris Jones') & (combine['year'] == 1995))]

In [35]:
# combine[(combine['name'] == 'Reggie Davis')]

combine = combine[~((combine['name'] == 'Reggie Davis') & (combine['year'] == 1989))]

In [36]:
# combine[(combine['name'] == 'Marcus Davis')]

combine = combine[~((combine['name'] == 'Marcus Davis') & (combine['year'] == 2013))]

In [37]:
# combine[(combine['name'] == 'Jeff Graham')]

combine = combine[~((combine['name'] == 'Jeff Graham') & (combine['year'] == 1989))]

In [38]:
# combine[(combine['name'] == 'Marvin Harrison')]

combine = combine[~((combine['name'] == 'Marvin Harrison') & (combine['year'] == 1996))]

In [39]:
# combine[(combine['name'] == 'Cedrick Wilson')]

combine = combine[~((combine['name'] == 'Cedrick Wilson') & (combine['year'] == 2001))]

In [40]:
# combine[(combine['name'] == 'Terry Smith')]

combine = combine[~((combine['name'] == 'Terry Smith') & (combine['year'] == 1992))]

## Drop irrelevant columns
- College should not be factored into the determination of playing style
- All of these players are receivers
- The Wonderlic exam is nearly irrelevant now, and it has rarely been administered to receivers

In [41]:
combine = combine.drop(columns = ['College', 'POS', 'Wonderlic'])

In [42]:
combine.head()

Unnamed: 0,year,name,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone
0,2024.0,Javon Baker,73.38,202.0,4.54,,37.0,121.0,,
1,2024.0,Jermaine Burton,72.25,196.0,4.45,,38.5,133.0,,
2,2024.0,Jalen Coker,73.25,208.0,4.57,,42.5,128.0,,
3,2024.0,Keon Coleman,74.63,213.0,4.61,,38.0,127.0,,
4,2024.0,Malachi Corley,70.63,215.0,,,,,,


## Convert data types

In [43]:
combine.dtypes

year          float64
name           object
height_in     float64
weight_lbs    float64
40            float64
bench         float64
vertical      float64
broad_jump    float64
shuttle       float64
3_cone        float64
dtype: object

In [44]:
combine.columns

Index(['year', 'name', 'height_in', 'weight_lbs', '40', 'bench', 'vertical',
       'broad_jump', 'shuttle', '3_cone'],
      dtype='object')

In [45]:
float_cols = ['height_in', '40', 'vertical', 'shuttle', '3_cone', 'broad_jump', 'bench', 'weight_lbs', 'year']

for col in float_cols:
    combine[col] = pd.to_numeric(combine[col], errors = 'coerce')

In [46]:
int_cols = ['broad_jump', 'bench', 'weight_lbs', 'year']

for col in int_cols:
    combine[col] = combine[col].astype('Int64')

In [47]:
combine.head()

Unnamed: 0,year,name,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone
0,2024,Javon Baker,73.38,202,4.54,,37.0,121.0,,
1,2024,Jermaine Burton,72.25,196,4.45,,38.5,133.0,,
2,2024,Jalen Coker,73.25,208,4.57,,42.5,128.0,,
3,2024,Keon Coleman,74.63,213,4.61,,38.0,127.0,,
4,2024,Malachi Corley,70.63,215,,,,,,


## Imputing missing values
- At every Combine and Pro Day, some players choose not to partake in some drills for several reasons:
  - The trait being tested by the drill is not very applicable to their position
  - The player is recovering from an injury
  - The player beleives he will not perform well in the drill, and thinks a poor performance in it will hurt his draft stock

### Bench press clean-up
- Good strength is a nice attribute for a WR to have, but it's not essential. Plenty of WRs have played at a high level in the NFL with underwhelming bench press test numbers.
- Many WRs choose not to partake in the bench press test because their playing styles, which are documented by game film, do not rely on upper body strength.
- Some WRs choose not to partake because they know they'll perform poorly.

In [48]:
combine.isnull().sum()

year             1
name             1
height_in        1
weight_lbs       1
40             147
bench         1016
vertical       204
broad_jump     221
shuttle        381
3_cone         746
dtype: int64

In [49]:
combine.sort_values(by = 'bench', ascending=False)

Unnamed: 0,year,name,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone
1098,2006.0,Brandon Williams,69.38,179.0,4.5,38.0,34.5,115.0,4.07,6.87
1007,2008.0,Taj Smith,72.38,187.0,4.64,35.0,35.0,119.0,4.5,7.15
1008,2008.0,Limas Sweed,75.88,215.0,4.56,35.0,35.0,128.0,4.33,7.14
136,2021.0,Jalen Camp,73.88,226.0,4.48,29.0,39.5,125.0,4.14,7.0
920,2009.0,Quan Cosby,68.63,196.0,4.5,28.0,34.5,,4.47,7.28
1082,2006.0,Brandon Marshall,76.5,229.0,4.54,28.0,37.0,120.0,4.09,7.05
925,2009.0,Brooks Foster,72.5,201.0,4.45,27.0,,,,
588,2015.0,Lemar Durant,73.63,231.0,4.59,27.0,37.0,117.0,4.56,7.18
293,2019.0,D.K. Metcalf,75.38,228.0,4.33,27.0,40.5,134.0,4.5,7.38
274,2019.0,N'Keal Harry,74.88,228.0,4.53,27.0,38.5,122.0,4.28,7.05


- The bench press numbers for Brandon Williams, Taj Smith, and Limas Sweed look wrong.
- The most reps of 225 performed at the Combine by a WR is 27. Jalen Camp did 29 reps at his Pro Day.
- After some research, Brandon Williams' number appears to be that of a defensive tackle who had the same name.
- The numbers for Smith and Sweed appear to be data population errors.
- I'll just delete these receivers, since they rarely played in the league and are outside the time window I'm analyzing.

In [50]:
combine = combine[(combine['name'] != 'Brandon Williams') & (combine['name'] != 'Taj Smith') & (combine['name'] != 'Limas Sweed')]

In [51]:
combine.sort_values(by = 'bench', ascending=False).head()

Unnamed: 0,year,name,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone
136,2021,Jalen Camp,73.88,226,4.48,29,39.5,125.0,4.14,7.0
920,2009,Quan Cosby,68.63,196,4.5,28,34.5,,4.47,7.28
1082,2006,Brandon Marshall,76.5,229,4.54,28,37.0,120.0,4.09,7.05
588,2015,Lemar Durant,73.63,231,4.59,27,37.0,117.0,4.56,7.18
274,2019,N'Keal Harry,74.88,228,4.53,27,38.5,122.0,4.28,7.05


In [52]:
combine['weight_lbs'].max(), combine['weight_lbs'].min()

(254, 144)

- We'll create "weight classes" in 10 lbs increments and find the mean value for each class.
- This is how we will fill missing values for 

In [53]:
min_weight = 140
max_weight = 260

for start_weight in range(min_weight, max_weight, 10):
    end_weight = start_weight + 10
    mean_bench = combine[(combine['weight_lbs'] > start_weight) & (combine['weight_lbs'] <= end_weight)]['bench'].mean()

    if pd.isna(mean_bench):
        mean_bench = 3
    else:
        mean_bench = round(mean_bench)
        
    print(f'{start_weight} to {end_weight}: {mean_bench}')

140 to 150: 3
150 to 160: 5
160 to 170: 9
170 to 180: 11
180 to 190: 12
190 to 200: 13
200 to 210: 14
210 to 220: 16
220 to 230: 17
230 to 240: 17
240 to 250: 20
250 to 260: 23


In [54]:
weight_ranges = [(140, 150, 3), (150, 160, 5), (160, 170, 9), (170, 180, 11),
                 (180, 190, 12), (190, 200, 13), (200, 210, 14), (210, 220, 16),
                 (220, 230, 17), (230, 240, 17), (240, 250, 20), (250, 260, 23)]

for start_weight, end_weight, mean_bench in weight_ranges:
    combine.loc[(combine['weight_lbs'] > start_weight) & (combine['weight_lbs'] <= end_weight) & (combine['bench'].isna()), 'bench'] = mean_bench

In [55]:
combine[combine['bench'].isnull()].shape

(1, 10)

In [56]:
combine['height_in'].isnull().sum()

1

In [57]:
combine['weight_lbs'].isnull().sum()

1

### Vertical clean-up

In [58]:
combine['vertical'].isnull().sum()

204

In [59]:
min_weight = 140
max_weight = 260

for start_weight in range(min_weight, max_weight, 10):
    end_weight = start_weight + 10
    mean_vertical = combine[(combine['weight_lbs'] > start_weight) & (combine['weight_lbs'] <= end_weight)]['vertical'].mean()

    if pd.isna(mean_vertical):
        mean_vertical = np.nan
    else:
        mean_vertical = round(mean_vertical * 2) / 2
        
    print(f'{start_weight} to {end_weight}: {mean_vertical}')

140 to 150: 32.5
150 to 160: 33.0
160 to 170: 34.0
170 to 180: 34.0
180 to 190: 34.5
190 to 200: 35.0
200 to 210: 35.0
210 to 220: 35.5
220 to 230: 35.0
230 to 240: 34.5
240 to 250: 34.5
250 to 260: 29.0


In [60]:
weight_ranges = [(140, 150, 32.5), (150, 160, 33), (160, 170, 34), (170, 180, 34),
                 (180, 190, 34.5), (190, 200, 35), (200, 210, 35), (210, 220, 35.5),
                 (220, 230, 35), (230, 240, 34.5), (240, 250, 34.5), (250, 260, 29)]

for start_weight, end_weight, mean_vertical in weight_ranges:
    combine.loc[(combine['weight_lbs'] > start_weight) & (combine['weight_lbs'] <= end_weight) & (combine['vertical'].isna()), 'vertical'] = mean_vertical

In [61]:
combine['vertical'].isnull().sum()

1

### Broad jump clean-up

In [62]:
combine['broad_jump'].isnull().sum()

221

In [63]:
min_weight = 140
max_weight = 260

for start_weight in range(min_weight, max_weight, 10):
    end_weight = start_weight + 10
    mean_broad = combine[(combine['weight_lbs'] > start_weight) & (combine['weight_lbs'] <= end_weight)]['broad_jump'].mean()

    if pd.isna(mean_broad):
        mean_broad = np.nan
    else:
        mean_broad = round(mean_broad)
        
    print(f'{start_weight} to {end_weight}: {mean_broad}')

140 to 150: 114
150 to 160: 116
160 to 170: 118
170 to 180: 118
180 to 190: 119
190 to 200: 120
200 to 210: 121
210 to 220: 121
220 to 230: 121
230 to 240: 120
240 to 250: 126
250 to 260: 118


In [64]:
weight_ranges = [(140, 150, 114), (150, 160, 116), (160, 170, 118), (170, 180, 118),
                 (180, 190, 119), (190, 200, 120), (200, 210, 121), (210, 220, 121),
                 (220, 230, 121), (230, 240, 120), (240, 250, 126), (250, 260, 118)]

for start_weight, end_weight, mean_broad in weight_ranges:
    combine.loc[(combine['weight_lbs'] > start_weight) & (combine['weight_lbs'] <= end_weight) & (combine['broad_jump'].isna()), 'broad_jump'] = mean_broad

### 40-yard dash clean-up
- For the 40, we'll use the median because some weight classes have exceptional performances that distort what is expected of the weight class

- Having run the KNN previously and seeing that Marquise "Hollywood" Brown, who's known for his speed, was not classified as a speedster, I looked up his 40-yard dash time from his Oklahoma Pro Day.
- His 40 time was not in the data from this scrape, so we'll manually enter it.

In [65]:
combine.loc[combine['name'] == 'Marquise Brown', '40'] = 4.27

In [66]:
combine['40'].isnull().sum()

146

In [67]:
min_weight = 140
max_weight = 260

for start_weight in range(min_weight, max_weight, 10):
    end_weight = start_weight + 10
    median_40 = combine[(combine['weight_lbs'] > start_weight) & (combine['weight_lbs'] <= end_weight)]['40'].median()
        
    print(f'{start_weight} to {end_weight}: {median_40}')

140 to 150: 4.44
150 to 160: 4.5
160 to 170: 4.52
170 to 180: 4.55
180 to 190: 4.535
190 to 200: 4.54
200 to 210: 4.54
210 to 220: 4.55
220 to 230: 4.55
230 to 240: 4.59
240 to 250: 4.65
250 to 260: 4.7


In [68]:
weight_ranges = [(140, 150, 4.44), (150, 160, 4.5), (160, 170, 4.53), (170, 180, 4.55),
                 (180, 190, 4.54), (190, 200, 4.54), (200, 210, 4.54), (210, 220, 4.55),
                 (220, 230, 4.55), (230, 240, 4.59), (240, 250, 4.65), (250, 260, 4.7)]

for start_weight, end_weight, median_40 in weight_ranges:
    combine.loc[(combine['weight_lbs'] > start_weight) & (combine['weight_lbs'] <= end_weight) & (combine['40'].isna()), '40'] = median_40

### Shuttle drill clean-up

In [69]:
combine['shuttle'].isnull().sum()

381

In [70]:
min_weight = 140
max_weight = 260

for start_weight in range(min_weight, max_weight, 10):
    end_weight = start_weight + 10
    mean_shuttle = combine[(combine['weight_lbs'] > start_weight) & (combine['weight_lbs'] <= end_weight)]['shuttle'].mean()
        
    print(f'{start_weight} to {end_weight}: {mean_shuttle}')

140 to 150: 4.3500000000000005
150 to 160: 4.198695652173913
160 to 170: 4.217368421052632
170 to 180: 4.218535031847133
180 to 190: 4.226
190 to 200: 4.239716088328076
200 to 210: 4.266645161290323
210 to 220: 4.281490384615385
220 to 230: 4.260250000000001
230 to 240: 4.370454545454546
240 to 250: 4.165
250 to 260: 4.14


In [71]:
weight_ranges = [(140, 150, 4.35), (150, 160, 4.2), (160, 170, 4.22), (170, 180, 4.22),
                 (180, 190, 4.23), (190, 200, 4.24), (200, 210, 4.27), (210, 220, 4.28),
                 (220, 230, 4.26), (230, 240, 4.37), (240, 250, 4.17), (250, 260, 4.14)]

for start_weight, end_weight, mean_shuttle in weight_ranges:
    combine.loc[(combine['weight_lbs'] > start_weight) & (combine['weight_lbs'] <= end_weight) & (combine['shuttle'].isna()), 'shuttle'] = mean_shuttle

### 3-cone clean-up

In [72]:
combine['3_cone'].isnull().sum()

746

In [73]:
min_weight = 140
max_weight = 260

for start_weight in range(min_weight, max_weight, 10):
    end_weight = start_weight + 10
    mean_3_cone = combine[(combine['weight_lbs'] > start_weight) & (combine['weight_lbs'] <= end_weight)]['3_cone'].mean()
        
    print(f'{start_weight} to {end_weight}: {mean_3_cone}')

140 to 150: 6.88
150 to 160: 6.88111111111111
160 to 170: 6.953157894736843
170 to 180: 6.992133333333334
180 to 190: 6.99273631840796
190 to 200: 7.002926829268293
200 to 210: 7.02109022556391
210 to 220: 7.047539267015706
220 to 230: 7.043947368421053
230 to 240: 7.080526315789474
240 to 250: 7.46
250 to 260: 6.9


In [74]:
combine[combine['weight_lbs'] > 250]

Unnamed: 0,year,name,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone
412,2017,Billy Brown,75.38,254,4.7,23,29.0,118,4.14,6.9


In [75]:
weight_ranges = [(140, 150, 6.88), (150, 160, 6.88), (160, 170, 6.95), (170, 180, 6.99),
                 (180, 190, 6.99), (190, 200, 7), (200, 210, 7.02), (210, 220, 7.05),
                 (220, 230, 7.04), (230, 240, 7.08), (240, 250, 7.46), (250, 260, 7.6)]

for start_weight, end_weight, mean_3_cone in weight_ranges:
    combine.loc[(combine['weight_lbs'] > start_weight) & (combine['weight_lbs'] <= end_weight) & (combine['3_cone'].isna()), '3_cone'] = mean_3_cone

# CHECKING DATA

In [76]:
combine.isnull().sum()

year          1
name          1
height_in     1
weight_lbs    1
40            1
bench         1
vertical      1
broad_jump    1
shuttle       1
3_cone        1
dtype: int64

No missing values.

In [77]:
combine.head(20)

Unnamed: 0,year,name,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone
0,2024,Javon Baker,73.38,202,4.54,14,37.0,121,4.27,7.02
1,2024,Jermaine Burton,72.25,196,4.45,13,38.5,133,4.24,7.0
2,2024,Jalen Coker,73.25,208,4.57,14,42.5,128,4.27,7.02
3,2024,Keon Coleman,74.63,213,4.61,16,38.0,127,4.28,7.05
4,2024,Malachi Corley,70.63,215,4.55,16,35.5,121,4.28,7.05
5,2024,Jacob Cowing,68.38,168,4.38,9,36.0,119,4.32,7.02
6,2024,Ryan Flournoy,73.13,202,4.44,19,39.5,132,4.27,7.02
7,2024,Troy Franklin,74.25,176,4.41,11,39.0,124,4.31,6.9
8,2024,Anthony Gould,68.0,174,4.39,11,39.5,129,4.16,6.99
9,2024,Lideatrick Griffin,69.88,181,4.43,12,35.5,124,4.35,7.0


### Calvin Johnson
- It looks like Calvin Johnson's missing values were filled in correctly.

In [78]:
# combine_scrape[combine_scrape['Name'].str.contains('Calvin')]
combine_scrape[combine_scrape['Name'] == 'Calvin Johnson']

Unnamed: 0.1,Unnamed: 0,Year,Name,College,POS,Height (in),Weight (lbs),Wonderlic,40 Yard,Bench Press,Vert Leap (in),Broad Jump (in),Shuttle,3Cone
1032,1032,2007.0,Calvin Johnson,Georgia Tech,WR,77.0,239.0,,4.38,,42.5,139.0,,


In [79]:
combine[combine['name'] == 'Calvin Johnson']
# combine[combine['name'].str.contains('Calvin')]

Unnamed: 0,year,name,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone
1032,2007,Calvin Johnson,77.0,239,4.38,17,42.5,139,4.37,7.08


In [80]:
combine[combine['name'] == 'Marquise Brown']

Unnamed: 0,year,name,height_in,weight_lbs,40,bench,vertical,broad_jump,shuttle,3_cone
248,2019,Marquise Brown,69.38,166,4.27,9,34.0,118,4.22,6.95


In [81]:
combine.to_csv('../combine_data/combine.csv', index=False)

In [82]:
pwd

'/mnt/c/Data_Science/Personal_Projects/nfl_wr_knn/final'