# Clean Data

### Imports

In [None]:
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# personal module scripts
import clean_data


## Clean NFL Combine Data

In [None]:
combine_file = r'data\nfl_combine_1987_2020.csv'

df_raw_combine = pd.read_csv(combine_file)

df_raw_combine.head()

# Keep raw data import for reference - build copy to modify
df_combine = df_raw_combine

# Drop dummy '0' column and Wonderlic scores data
combine_cols_to_drop = ['Unnamed: 0', 'Wonderlic']
df_combine.drop(columns=combine_cols_to_drop, inplace=True)

# Clean column headers
df_combine.columns = df_combine.columns.str.lower()
df_combine.rename(columns={'college':'school'}, inplace=True)

# Clean school names
df_combine['school'] = df_combine['school'].str.strip()

# Drop years prior to 2000 (no draft data)
print(df_combine.shape)
df_combine.drop(df_combine[df_combine['year']<2000].index, inplace=True)
print('Cleaned combine size: ', df_combine.shape)
df_combine.head()

## Import position mapping data
The "combine" dataset maps players to very specific positions (ie, "Free Safety" or "Outside Linebacker").

Map these granular positions to more standard positions. Also classify each position as "Offense" or "Defense", and indicate if the position is a "Skill" position or on the line of scrimmage.

In [None]:
df_positions = pd.read_csv('data/position_mapping.csv')
df_positions.head()

### Merge the draft and position mapping datasets

In [None]:
print('Granular position counts in combine dataset:')
print(df_combine['pos'].value_counts().head())

df_combine = df_combine.merge(df_positions,
                          how='left',
                          on='pos')

print('\nPosition Group counts after merging with position map:')
print(df_combine['pos_group'].value_counts())

df_combine.head()

#### Visualize combine performance distributions by position

In [None]:
positions = df_combine['pos_group'].unique()

positions_to_drop = ['SN', 'K']   # Long snappers and kickers/punters

positions = [pos for pos in positions if pos not in positions_to_drop]
print(positions)

# Sort positions by L(ine)/S(kill)
sort_dict = pd.Series(df_positions['line_or_skill'].values,index=df_positions['pos_group']).to_dict()
print(sort_dict)
positions.sort(key=lambda x: sort_dict[x])
print(positions)

print('Unique Positions: ', len(positions))
print(df_combine.columns)
stat_columns = ['height (in)', 'weight (lbs)',
       'hand size (in)', 'arm length (in)', '40 yard', 'bench press',
       'vert leap (in)', 'broad jump (in)', 'shuttle', '3cone', '60yd shuttle']
num_stats = len(stat_columns)

fig, axes = plt.subplots(len(positions), num_stats,
                         sharex=False,
                         sharey=True,
                         figsize=(25,25))

fig.suptitle('NFL Combine Statistics - Distribution by Position (2000-2020)', fontsize=30)
fig.supxlabel('Measurement', fontsize=30)
fig.supylabel('Position', fontsize=30)

fig.tight_layout(rect=[0.03, 0.03, 1, .95])

# Loop over axes and data
for row, pos in enumerate(positions):
    x_positions = df_combine[df_combine['pos_group']==pos]

    for col, stat in enumerate(stat_columns):
        # Get axis
        ax = axes[row,col]
        x = x_positions[stat]
        ax.hist(x,
                range=[df_combine[stat].min(),df_combine[stat].max()],
                alpha=.5, bins=10)
        # Set Y label once per row
        if col==0:
            ax.set_ylabel(pos, fontsize='xx-large')

        # Set X label above first row and below last row
        if row == 0:
            ax.set_title(stat, fontsize='xx-large')
        if row == len(positions) - 1:
            ax.set_xlabel(stat, fontsize='xx-large')

fig.show()
fig.savefig('images/stats_by_position.png', format='png')


## Import and clean NFL Draft Data

In [None]:
draft_file = r'data\espn_draft_history_2000_2021_cleaned.csv'
df_raw_draft = pd.read_csv(draft_file)

# Keep raw data import for reference - build copy to modify
df_draft = df_raw_draft

# Clean column headers
df_draft.columns = df_draft.columns.str.lower()

# Clean school names
df_draft['school'] = df_draft['school'].str.strip()
df_draft.head()

### Are there duplicated names?

In [None]:
df_combine['name'].value_counts(sort='descending').head(10)

### Answer: Yes

So we cannot simply join the 2 datasets on player 'name' columns. Need to also join on
college and year.


## Do college names match in both datasets?

In [None]:
draft_school = pd.DataFrame(df_draft['school'].unique()).rename(columns={0:'school'})
draft_school['source'] = 'draft'
combine_school = pd.DataFrame(df_combine['school'].unique()).rename(columns={0:'school'})
combine_school['source'] = 'combine'
print(type(combine_school))
print(combine_school.head())

schools = draft_school.merge(combine_school, on='school', how='outer',
                             suffixes=['_draft', '_combine']).sort_values(by='school')

# List all cases with mismatches
na_mask = schools.isna().any(axis=1)
schools[na_mask].head(10)

### So we see that the 'combine' dataset frequently has the state appended to the school name;
Ex: "Abilene Christian (TX)". Remove these from school names, with the exception of "Miami (OH).

In [None]:
df_combine['school'] = df_combine['school'].str.replace('Miami (OH)', 'Miami - OH')

print(df_combine['school'].head())
regex_replace_parens = r'\([^)]*[a-zA-Z][^)]*\)'
df_combine['school'] = df_combine['school'].str.replace(regex_replace_parens,
                                                        '', regex=True)

df_combine['school'].head()

## Standardize player names between datasets
Player names in the "Draft" dataset include suffixes including "Jr., II, III, IV", but these are NOT included in the "combine" dataset.

Standardize player names between datasets by removing these values from the "Draft" dataset.

In [None]:
regex_suffixes_to_remove = r'Jr\.$|III$|IIII$|IV$|, Jr.$'
df_draft['name'] = df_draft['name'].str.replace(regex_suffixes_to_remove,
                                                '', regex=True)

### Merge the Draft and NFL Combine datasets

In [None]:
df_merged = df_combine.merge(df_draft, how='left',
                             on=['name', 'school', 'year'])

df_merged.head()

# df_merged.to_clipboard()

## Investigate merged data

In [None]:
df_merged.info()


## Data Cleaning:
* Very few 60-yard shuttle records; drop column
* Drop all undrafted players - will only focus on drafted players
* Drop kickers, long snappers, QBs and Fullbacks (too few, draft status not driven by stats)

In [None]:
# Drop 60yd shuttle (too few data points), duplicative columns related to
# player position, and things like year and team name
merged_cols_to_drop = ['year', 'name', 'school', 'pos',
                       '60yd shuttle',
                       'pk(ovr)', 'team', 'position']

try:
    df_merged.drop(columns=merged_cols_to_drop, inplace=True)
except:
    print('Issue dropping columns')


# overwite blank 'round' values with '8' (will indicate undrafted)
# df_merged['round'].fillna(8, inplace=True)
df_merged.dropna(inplace=True)

# df_merged['round'].loc[(df_merged['round']>=1) & (df_merged['round']<4)] = 1
# df_merged['round'].loc[(df_merged['round']>=4) & (df_merged['round']<8)] = 2
# df_merged['round'].fillna(3, inplace=True)
# print(df_merged['round'].value_counts())

print('\n Remaining Columns')
print(df_merged.columns)

positions_to_drop = ['SN', 'K', 'QB', 'FB']
positions_mask = \
    df_merged[df_merged['pos_group'].isin(positions_to_drop)].index

print(positions_mask)

print(df_merged.shape)
df_merged.drop(positions_mask, inplace=True)
print(df_merged.head())


### Drop players with sparse combine statistics

In [None]:
metrics_cols = ['height (in)', 'weight (lbs)', 'hand size (in)', 'arm length (in)',
       '40 yard', 'bench press', 'vert leap (in)', 'broad jump (in)',
       'shuttle', '3cone']

# See count of records by number of missing metrics values
# 10 total metrics values, drop if they don't have at least 8
print('\n Missing metrics per row')
print(df_merged[metrics_cols].isna().sum(axis=1).value_counts())
df_merged.dropna(axis=0, thresh=7,
                 subset=metrics_cols, inplace=True)
print('\nRemaining missing metrics by row')
print(df_merged[metrics_cols].isna().sum(axis=1).value_counts())

df_merged.head(10)

In [None]:
df_merged.columns


## Impute missing values based on average of players with same position


In [None]:
# print(df_merged.head(10))

df_merged = clean_data.group_imputer(
    df=df_merged,
    grouping_col='pos_group',
    cols_to_impute=metrics_cols)

# print('\n')
# print(df_merged.head(10))

## Drop either "line" or "skill" players from dataset

In [None]:
drop_skill_or_line = 'S'    # 'S' for skill or 'L' for line
drop_index = df_merged[df_merged['line_or_skill']==drop_skill_or_line].index
df_for_models = df_merged.drop(drop_index)
df_for_models.head(5)

## Begin Modeling


### Imports for modeling

In [None]:
from sklearn.model_selection import train_test_split

# To visualize pipeline models
from sklearn import set_config
from sklearn.utils import estimator_html_repr


# encoders
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

#metrics
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#pipeline
from sklearn.pipeline import make_pipeline

# machine learning
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

# Boosted Models
# Use this one if you have an M1 chip.
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Permutation Importance
from sklearn.inspection import permutation_importance

# for displaying images and html
from IPython.display import Image
from IPython.core.display import HTML

Setup sklearn tools to visualize pipelines

In [None]:
set_config(display='diagram')

### Split data

In [None]:
target = 'round'
X = df_for_models.drop(columns=target)
y = df_for_models[target]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=.2,
                                                    random_state=21)

print(f'X_train shape: {X_train.shape} X_test shape: {X_test.shape}')

## Baseline Accuracy

In [None]:
baseline = y.value_counts(normalize=True).max()
baseline


## Build Decision Tree Classifier Model/Pipeline

In [None]:
# Base Model
model_dt = make_pipeline(
    OrdinalEncoder(),
    DecisionTreeClassifier(random_state=42)
)

model_dt.fit(X_train, y_train)

print('Decision Tree Training Accuracy', model_dt.score(X_train, y_train))
print('Decision Tree Validation Accuracy', model_dt.score(X_test, y_test))

conf_matrix = confusion_matrix(y_true=y_test, y_pred=model_dt.predict(X_test))
display = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
                                 display_labels=model_dt.classes_)
display.plot()

# Export HTML of pipeline model
with open('pipe_html/model_dt.html', 'w') as f:
    f.write(estimator_html_repr(model_dt))

model_dt


## Build Tuned Random Forest Model/Pipeline

In [None]:
# Base Model
model_rf = make_pipeline(
    OrdinalEncoder(),
    RandomForestClassifier(random_state=42)
)

# Parameter distributions for hyperparameter tuning
# Note double underscores __ in keys below
param_distributions = {
    'randomforestclassifier__max_depth': range(3,50,5),
    'randomforestclassifier__n_estimators': range(10,2000, 10),
    'randomforestclassifier__bootstrap': [True, False],
    'randomforestclassifier__warm_start': [True, False]
}

tuned_rf = RandomizedSearchCV(
    model_rf,
    param_distributions=param_distributions,
    n_iter=25,
    cv=5,
    verbose=1,
    n_jobs=6
)

model_rf.fit(X_train, y_train)
tuned_rf.fit(X_train, y_train)

# Export HTML of pipeline model
with open('pipe_html/tuned_rf.html', 'w') as f:
    f.write(estimator_html_repr(tuned_rf))

conf_matrix = confusion_matrix(y_true=y_test, y_pred=tuned_rf.predict(X_test))
display = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
                                 display_labels=tuned_rf.classes_)
display.plot()

### Evaluate Tuned Random Forest Model/Pipeline

In [None]:
print('Tuned RF training best score: ', tuned_rf.best_score_)
print('Tuned RF best parameters: ', tuned_rf.best_params_)
print('Tuned RF test score: ', tuned_rf.score(X_test, y_test))

### Get Permutation importances for model

In [None]:
importances = model_rf.named_steps['randomforestclassifier'].feature_importances_

gini_imp = pd.DataFrame(data=importances, index=X_test.columns, columns=['gini_impurity']).sort_values(by='gini_impurity')

gini_imp.tail(10).plot(kind='barh');



## Build XGBoost Model/Pipeline

In [None]:
model_xgb = make_pipeline(
    OrdinalEncoder(),
    XGBClassifier(
        loss='deviance',
        # learning_rate=0.1,
        n_estimators=500,
        subsample=1,
        max_depth=4,
        random_state=42
    )
)

model_xgb.fit(X_train, y_train)

# Export HTML of pipeline model
with open('pipe_html/model_xgb.html', 'w') as f:
    f.write(estimator_html_repr(model_xgb))

model_xgb

conf_matrix = confusion_matrix(y_true=y_test, y_pred=model_xgb.predict(X_test))
display = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
                                 display_labels=model_xgb.classes_)
display.plot()

### Evaluate XGBoost Model/Pipeline

In [None]:
print('XGBoost Forest Training Accuracy', model_xgb.score(X_train, y_train))
print('XGBoost Forest Validation Accuracy', model_xgb.score(X_test, y_test))


## Build Ridge Classifier Model/Pipeline

In [None]:
model_ridge = make_pipeline(
    OneHotEncoder(),
    RidgeClassifierCV(cv=5,
                      alphas=[.1,1.0,10.0])
)

model_ridge.fit(X_train, y_train)

conf_matrix = confusion_matrix(y_true=y_test, y_pred=model_ridge.predict(X_test))
display = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
                                 display_labels=model_ridge.classes_)
display.plot()

### Evaluate Ridge Classifier Model/Pipeline

In [None]:
print('Ridge Classifier Training Accuracy', model_ridge.score(X_train, y_train))
print('Ridge Classifier Validation Accuracy', model_ridge.score(X_test, y_test))


### Build Tuned Ridge Classifier Model/Pipeline

In [40]:
import random

# Base Model
model_ridgecv = make_pipeline(
    OneHotEncoder(),
    RidgeClassifierCV()
)

# Parameter distributions for hyperparameter tuning
# Note double underscores __ in keys below
param_distributions = {
    'ridgeclassifiercv__cv': range(2, 10),
    'ridgeclassifiercv__fit_intercept': [True, False],
    'ridgeclassifiercv__class_weight': ['balanced', None]
}

tuned_ridge = RandomizedSearchCV(
    model_ridgecv,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    verbose=1,
    n_jobs=6
)

model_ridgecv.fit(X_train, y_train)
tuned_ridge.fit(X_train, y_train)

print('Tuned Ridge Classifier Training Accuracy', tuned_ridge.score(X_train, y_train))
print('Tuned Ridge Classifier Validation Accuracy', tuned_ridge.score(X_test, y_test))

tuned_ridge

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Tuned Ridge Classifier Training Accuracy 0.25036603221083453
Tuned Ridge Classifier Validation Accuracy 0.24561403508771928
