# Clean Data

### Imports

In [106]:
import numpy as np
import pandas as pd


## Clean NFL Combine Data

In [107]:
# Load the raw NFL combine results. A forward-slash path resolves on Windows,
# macOS and Linux alike.
combine_file = 'data/nfl_combine_1987_2020.csv'
df_raw_combine = pd.read_csv(combine_file)

# Build the cleaned frame as a new object so df_raw_combine keeps the data
# exactly as it was read from disk.
combine_cols_to_drop = ['Unnamed: 0', 'Wonderlic']
df_combine = (
    df_raw_combine
    .drop(columns=combine_cols_to_drop)
    .rename(columns=str.lower)
    .rename(columns={'college': 'school'})
)
print(df_combine.shape)

# Drop years prior to 2000 (no draft data).
# Expressed as "not (year < 2000)" so rows with a missing year are retained;
# .copy() gives an independent frame that later cells can assign into safely.
df_combine = df_combine[~(df_combine['year'] < 2000)].copy()
print('Cleaned combine size: ', df_combine.shape)


(12808, 15)
Cleaned combine size:  (8465, 15)


## Clean NFL Draft Data

In [108]:
# Load the draft history. A forward-slash path resolves on Windows, macOS and
# Linux alike.
draft_file = 'data/espn_draft_history_2000_2021_cleaned.csv'
df_raw_draft = pd.read_csv(draft_file)

# Lower-case the column names on a new frame so df_raw_draft stays as loaded.
df_draft = df_raw_draft.rename(columns=str.lower)

print("First year of draft data: ", df_draft['year'].min())
df_draft.columns


First year of draft data:  2000


Index(['year', 'round', 'pk(ovr)', 'team', 'name', 'position', 'school'], dtype='object')

In [109]:
# How many combine rows are recorded for each position?
pos_counts = df_combine['pos'].value_counts()
pos_counts

WR     1095
CB      864
RB      700
DE      657
DT      647
OT      618
OLB     586
OG      528
TE      471
QB      470
ILB     350
FS      347
SS      298
C       253
FB      175
P       108
K        92
LB       91
S        53
LS       22
EDG      22
OL       10
DL        4
DB        3
NT        1
Name: pos, dtype: int64

In [110]:
# Are there duplicated names?
# value_counts() already orders by descending frequency; its `sort` argument
# is a boolean flag, not a direction, so no argument is needed here.
df_combine['name'].value_counts().head(10)


Brandon Williams    5
Chris Brown         5
Brian Allen         4
Mike Williams       4
Chris Jones         4
Michael Bennett     3
Josh Harris         3
Travis Wilson       3
Steve Smith         3
Andre Smith         3
Name: name, dtype: int64

In [111]:
# Do college names match in both datasets?
def _tag_schools(school_names, source):
    """Return one row per distinct school name, labelled with its source dataset."""
    return pd.DataFrame({'school': school_names.unique(), 'source': source})


def _compare_schools(draft_side, combine_side):
    """Outer-join the two school lists; a NaN source marks a name missing from that side."""
    return draft_side.merge(combine_side, on='school', how='outer',
                            suffixes=['_draft', '_combine']).sort_values(by='school')


draft_school = _tag_schools(df_draft['school'], 'draft')
combine_school = _tag_schools(df_combine['school'], 'combine')
print(type(combine_school))
print(combine_school.head())

schools = _compare_schools(draft_school, combine_school)

# List all cases with mismatches
na_mask = schools.isna().any(axis=1)
print(schools[na_mask])

# So we see that the 'combine' dataset frequently has the state appended to the school name
# Ex: "Abilene Christian (TX)". Remove these from school names, with the exception of
# "Miami (OH)".

# regex=False is required: as a regex, the parentheses would form a group and the
# pattern would look for "Miami OH", so the literal "Miami (OH)" would never match.
df_combine['school'] = df_combine['school'].str.replace('Miami (OH)', 'Miami - OH',
                                                        regex=False)

# Also consume the whitespace in front of the parenthetical and strip the result,
# otherwise "Jackson State (MS)" becomes "Jackson State " and still fails to match.
regex_replace_parens = r'\s*\([^)]*[a-zA-Z][^)]*\)'
df_combine['school'] = (
    df_combine['school']
    .str.replace(regex_replace_parens, '', regex=True)
    .str.strip()
)

# Rebuild the combine-side list from the cleaned column; reusing the list built
# before cleaning would make this second comparison identical to the first.
combine_school_clean = _tag_schools(df_combine['school'], 'combine')
schools2 = _compare_schools(draft_school, combine_school_clean)

na_mask = schools2.isna().any(axis=1)
schools2[na_mask]

<class 'pandas.core.frame.DataFrame'>
               school   source
0      Boston College  combine
1          Texas Tech  combine
2  Jackson State (MS)  combine
3      South Carolina  combine
4         Wake Forest  combine
                       school source_draft source_combine
202         Abilene Christian        draft            NaN
381    Abilene Christian (TX)          NaN        combine
445          Adams State (CO)          NaN        combine
345        Alabama-Birmingham          NaN        combine
200                    Albany        draft            NaN
..                        ...          ...            ...
225             Winston-Salem        draft            NaN
348  Winston-Salem State (NC)          NaN        combine
280          Youngstown State        draft            NaN
325     Youngstown State (OH)          NaN        combine
103                       NaN        draft        combine

[315 rows x 3 columns]


  df_combine['school'] = df_combine['school'].str.replace('Miami (OH)', 'Miami - OH')


## Standardize player names between datasets
Player names in the "Draft" dataset carry generational suffixes ("Jr.", "II", "III", "IV"), but these suffixes are NOT present in the "Combine" dataset.

Standardize player names between datasets by removing these values from the "Draft" dataset.

In [113]:
# Strip a trailing generational suffix (Jr., II, III, IV) from draft names.
# The pattern also consumes the optional comma and the whitespace in front of
# the suffix, so "John Smith Jr." becomes "John Smith" rather than "John Smith "
# (a trailing space would prevent the name from matching the combine data).
# Requiring whitespace before the suffix avoids clipping a surname that merely
# ends in those letters.
regex_suffixes_to_remove = r',?\s+(?:Jr\.?|III|II|IV)\s*$'
df_draft['name'] = (
    df_draft['name']
    .str.replace(regex_suffixes_to_remove, '', regex=True)
    .str.strip()
)

# df_draft['name'].to_clipboard()



## Merge the Draft and NFL Combine datasets

In [115]:
# Left join: keep every combine row and attach the draft columns wherever the
# player name, school and year all agree.
merge_keys = ['name', 'school', 'year']
df_merged = pd.merge(df_combine, df_draft, how='left', on=merge_keys)

df_merged.head(10)

# df_merged.to_clipboard()

## Investigate merged data

