# Modeling

### Import Relevant Libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
warnings.filterwarnings("ignore", category = FutureWarning)
warnings.filterwarnings("ignore", category = UserWarning)

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer, snowball
from nltk.corpus import stopwords
from nltk import FreqDist
from wordcloud import WordCloud
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.util import ngrams
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder 
from nltk.util import ngrams

nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from imblearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import plot_confusion_matrix, classification_report, r2_score, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

import time

import streamlit as st

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\capta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Read in Processed Data

In [2]:
df = pd.read_parquet('./Data/processed_reports.parquet')

In [3]:
df.reset_index(inplace=True)

In [4]:
df.drop(['index'],axis=1,inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Data columns (total 99 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   player                       4209 non-null   object 
 1   report                       4209 non-null   object 
 2   year                         4209 non-null   float64
 3   weight_x                     3986 non-null   float64
 4   height_x                     3986 non-null   float64
 5   pos_rk                       3871 non-null   float64
 6   ovr_rk                       3292 non-null   float64
 7   grade                        3877 non-null   float64
 8   age                          3897 non-null   float64
 9   Round                        4209 non-null   Int64  
 10  Pick                         4209 non-null   Int64  
 11  target                       4209 non-null   Int64  
 12  processed_report             4209 non-null   object 
 13  reportlen         

In [6]:
df['target'] = df.Round

In [7]:
df.target[df.target >= 6] = 0

In [8]:
df.target[df.target == 1] = 10

In [9]:
df.target[(df.target == 4) | (df.target == 5)] = 1

In [10]:
df.target[df.target == 3] = 2

In [11]:
df.target[df.target == 10] = 3

In [12]:
cleaned_df = df.drop(['report','processed_report','joined_report','weight_y','height_y'],axis=1) 

In [13]:
cleaned_df = cleaned_df.add_suffix('_col')

In [14]:
grades22 = pd.read_excel('./Data/addgrades.xlsx')

In [15]:
cleaned_df.pos_rk_col[cleaned_df.year_col == 2022] = grades22.pos_rk_col
cleaned_df.ovr_rk_col[cleaned_df.year_col == 2022] = grades22.ovr_rk_col
cleaned_df.grade_col[cleaned_df.year_col == 2022] = grades22.grade_col

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df.pos_rk_col[cleaned_df.year_col == 2022] = grades22.pos_rk_col
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df.ovr_rk_col[cleaned_df.year_col == 2022] = grades22.ovr_rk_col
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df.grade_col[cleaned_df.year_col == 2022] = grades22.grade_col


In [16]:
cleaned_df

Unnamed: 0,player_col,year_col,weight_x_col,height_x_col,pos_rk_col,ovr_rk_col,grade_col,age_col,Round_col,Pick_col,...,tackles_loss_career_col,sacks_career_col,pass_defended_career_col,fumbles_forced_season_col,fumbles_forced_career_col,college_conference_col,speed_col,shuttle_agility_col,cone_agility_col,pos_col
0,Alex Smith,2005.0,217.0,76.125,2.0,3.0,98.0,21.0,1,1,...,,,,,,Mountain West Conference,88.187311,176.485482,136.815911,QB
1,Ronnie Brown,2005.0,233.0,72.250,1.0,4.0,98.0,23.0,1,2,...,,,,,,,120.996000,168.168738,130.199966,RB
2,Braylon Edwards,2005.0,211.0,74.875,1.0,1.0,99.0,22.0,1,3,...,,,,,,,107.614900,,,WR
3,Cedric Benson,2005.0,222.0,70.500,3.0,10.0,96.0,22.0,1,4,...,,,,,,,97.457455,,,RB
4,Carnell Williams,2005.0,217.0,70.875,2.0,6.0,97.0,,1,5,...,,,,,,,,,,RB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,Jeff Gunter,2022.0,,,18.0,198.0,55.0,23.0,7,252,...,38.5,17.0,4.0,2.0,9.0,Sun Belt Conference,105.744617,144.109521,137.671457,LB
4205,Trenton Gill,2022.0,,,,,,23.0,7,255,...,,,,,,Atlantic Coast Conference,,,,P
4206,Jesse Luketa,2022.0,,,11.0,167.0,59.0,23.0,7,256,...,11.5,0.5,6.0,,,Big Ten Conference,88.494256,,,LB
4207,Marquis Hayes,2022.0,,,10.0,139.0,64.0,23.0,7,257,...,,,,,,Big Twelve Conference,80.603451,,,OL


In [17]:
tf = TfidfVectorizer(ngram_range=(1,3),min_df=.062,max_df =.75)

In [18]:
cvec = CountVectorizer(ngram_range=(1,3),min_df=.062,max_df=.75)

In [19]:
X_cvec = cvec.fit_transform(df.joined_report)

In [20]:
cvec_df = pd.DataFrame(X_cvec.toarray(),columns=cvec.get_feature_names())

In [21]:
cvec_df

Unnamed: 0,2002,2003,2004,2005,2006,2016,abil,adequ,also,appear,...,two season,upsid,valu,versatil,well,work,yard,year,year starter,zone
0,1,2,1,2,0,0,0,0,3,0,...,0,0,0,0,0,0,4,3,0,0
1,1,0,1,1,0,0,0,0,1,0,...,0,0,0,1,0,1,3,0,0,0
2,1,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,1,0,0
3,0,1,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
4,2,2,1,0,0,0,1,0,3,0,...,0,0,0,0,0,0,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4205,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,1,0,0,0
4206,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4207,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [22]:
X = tf.fit_transform(df.joined_report)

In [23]:
report_df = pd.DataFrame(X.toarray(),columns=tf.get_feature_names())

In [24]:
report_df.columns

Index(['2002', '2003', '2004', '2005', '2006', '2016', 'abil', 'adequ', 'also',
       'appear',
       ...
       'two season', 'upsid', 'valu', 'versatil', 'well', 'work', 'yard',
       'year', 'year starter', 'zone'],
      dtype='object', length=198)

In [25]:
model_df = pd.concat([cleaned_df,report_df],axis=1)

In [26]:
cvec_model_df = pd.concat([cleaned_df,cvec_df],axis=1)

In [27]:
cvec_model_df.to_parquet('./Data/mockdraft.parquet')

In [28]:
model_df.to_parquet('./Data/mockdraft2.parquet')

In [29]:
model_df

Unnamed: 0,player_col,year_col,weight_x_col,height_x_col,pos_rk_col,ovr_rk_col,grade_col,age_col,Round_col,Pick_col,...,two season,upsid,valu,versatil,well,work,yard,year,year starter,zone
0,Alex Smith,2005.0,217.0,76.125,2.0,3.0,98.0,21.0,1,1,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.302746,0.197995,0.0,0.000000
1,Ronnie Brown,2005.0,233.0,72.250,1.0,4.0,98.0,23.0,1,2,...,0.0,0.0,0.0,0.128312,0.0,0.130273,0.338669,0.000000,0.0,0.000000
2,Braylon Edwards,2005.0,211.0,74.875,1.0,1.0,99.0,22.0,1,3,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.261304,0.113928,0.0,0.000000
3,Cedric Benson,2005.0,222.0,70.500,3.0,10.0,96.0,22.0,1,4,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.200609,0.000000,0.0,0.000000
4,Carnell Williams,2005.0,217.0,70.875,2.0,6.0,97.0,,1,5,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.282241,0.061528,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,Jeff Gunter,2022.0,,,18.0,198.0,55.0,23.0,7,252,...,0.0,0.0,0.0,0.199357,0.0,0.000000,0.000000,0.000000,0.0,0.000000
4205,Trenton Gill,2022.0,,,,,,23.0,7,255,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.370394,0.000000,0.0,0.000000
4206,Jesse Luketa,2022.0,,,11.0,167.0,59.0,23.0,7,256,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000
4207,Marquis Hayes,2022.0,,,10.0,139.0,64.0,23.0,7,257,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.237427


In [30]:
def get_results (model):

    # Fit GridSearch for model
    model.fit(X_train,y_train)

    # Get y_preds
    y_pred = model.predict(X_test)

    # Set metric scores with print labels
    r2 = f"r2 = {round(r2_score(y_test,y_pred),4)}"
    MSE = f"MSE = {round(mean_squared_error(y_test,y_pred),2)}"
    
    return r2, MSE

In [31]:
def graph_model_results(model):
    
    y_pred = model.predict(X_test)
    fig, ax = plt.subplots(figsize=(8,8))
    sns.regplot(x=y_pred,y=y_test,line_kws={"color": "red"})
    plt.xlabel('Predicted Pick')
    plt.ylabel('Pick')
    plt.title('Actual Pick vs. Model Predicted Pick')
    plt.xlim(-5,265)
    plt.ylim(-5,265)
    plt.annotate(f'r2 = {round(r2_score(y_test,y_pred),3)}',xy=(230,240));
    return

### Random Forest

In [32]:
year = 2022

X = cvec_model_df.drop(['Pick_col'],axis=1)
label_encoder = LabelEncoder()

X_train = X[X.year_col != year]
X_test = X[X.year_col == year]
y_train = label_encoder.fit_transform(cvec_model_df.Pick_col[cvec_model_df.year_col != year])
y_test = label_encoder.fit_transform(cvec_model_df.Pick_col[cvec_model_df.year_col == year])

player_name = X_test.player_col
player_year = X_test.year_col
player_round = X_test.Round_col
player_target = X_test.target_col
player_pos = X_test.pos_col

X_train.drop(['player_col','year_col','Round_col','target_col'],axis=1,inplace=True)
X_test.drop(['player_col','year_col','Round_col','target_col'],axis=1,inplace=True)

num_cols = X_train.select_dtypes(['Int64','float64'])
cat_cols = X_train.select_dtypes('object')

num_transformer = Pipeline(steps=[('ss',StandardScaler()),
                             ('impute',SimpleImputer(strategy='constant'))])

cat_transformer = Pipeline(steps=[('ohe',OneHotEncoder(drop='first',sparse=False,handle_unknown='ignore'))])

transformer = ColumnTransformer(transformers=[
    ('num',num_transformer,num_cols.columns),
    ('cat',cat_transformer,cat_cols.columns)
])

rfc_pipe = Pipeline([
    ('transformer',transformer),
    ('sample',None),
    ('forest',RandomForestRegressor())
])

rfc_pipe.fit(X_train, y_train)

grid = {
    'sample':[RandomOverSampler(random_state=42),SMOTE(random_state=42)],
    'forest__n_estimators':[102],
#     'forest__criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'forest__max_depth':[24],
    'forest__min_samples_split':[2],
    'forest__min_samples_leaf':[6]
}

cvec_forest = GridSearchCV(estimator=rfc_pipe,
                          param_grid=grid,
                          cv=5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(['player_col','year_col','Round_col','target_col'],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(['player_col','year_col','Round_col','target_col'],axis=1,inplace=True)


In [33]:
start_time = time.time()
print(get_results(cvec_forest))
print(round((time.time() - start_time)/60,1),'minutes')
# Set y_pred
y_pred_cvec_forest = cvec_forest.predict(X_test)

5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\capta\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\capta\anaconda3\lib\site-packages\imblearn\pipeline.py", line 293, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\capta\anaconda3\lib\site-packages\imblearn\pipeline.py", line 250, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "C:\Users\capta\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)


('r2 = 0.7329', 'MSE = 1106.97')
2.2 minutes


In [None]:
# Graph model results
graph_model_results(cvec_forest)

In [None]:
year = 2022

X = model_df.drop(['Pick_col'],axis=1)
label_encoder = LabelEncoder()

X_train = X[X.year_col != year]
X_test = X[X.year_col == year]
y_train = label_encoder.fit_transform(model_df.Pick_col[model_df.year_col != year])
y_test = label_encoder.fit_transform(model_df.Pick_col[model_df.year_col == year])

player_name = X_test.player_col
player_year = X_test.year_col
player_round = X_test.Round_col
player_target = X_test.target_col
player_pos = X_test.pos_col

X_train.drop(['player_col','year_col','Round_col','target_col'],axis=1,inplace=True)
X_test.drop(['player_col','year_col','Round_col','target_col'],axis=1,inplace=True)

num_cols = X_train.select_dtypes(['Int64','float64'])
cat_cols = X_train.select_dtypes('object')

num_transformer = Pipeline(steps=[('ss',StandardScaler()),
                             ('impute',SimpleImputer(strategy='constant'))])

cat_transformer = Pipeline(steps=[('ohe',OneHotEncoder(drop='first',sparse=False,handle_unknown='ignore'))])

transformer = ColumnTransformer(transformers=[
    ('num',num_transformer,num_cols.columns),
    ('cat',cat_transformer,cat_cols.columns)
])

rfc_pipe = Pipeline([
    ('transformer',transformer),
    ('sample',None),
    ('forest',RandomForestRegressor())
])

rfc_pipe.fit(X_train, y_train)

grid = {
    'sample':[RandomOverSampler(random_state=42),SMOTE(random_state=42)],
    'forest__n_estimators':[102],
#     'forest__criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'forest__max_depth':[24],
    'forest__min_samples_split':[2],
    'forest__min_samples_leaf':[6]
}

forest = GridSearchCV(estimator=rfc_pipe,
                          param_grid=grid,
                          cv=5)

In [None]:
start_time = time.time()
print(get_results(forest))
print(round((time.time() - start_time)/60,1),'minutes')
# Set y_pred
y_pred_forest = forest.predict(X_test)

In [None]:
# Graph model results
graph_model_results(forest)

In [None]:
year = 2022

X = model_df.drop(['Pick_col'],axis=1)
label_encoder = LabelEncoder()

X_train = X[X.year_col != year]
X_test = X[X.year_col == year]
y_train = label_encoder.fit_transform(model_df.Pick_col[model_df.year_col != year])
y_test = label_encoder.fit_transform(model_df.Pick_col[model_df.year_col == year])

player_name = X_test.player_col
player_year = X_test.year_col
player_round = X_test.Round_col
player_target = X_test.target_col
player_pos = X_test.pos_col

# Drop unneeded or data leakage columns
X_train.drop(['player_col','year_col','Round_col','target_col'],axis=1,inplace=True)
X_test.drop(['player_col','year_col','Round_col','target_col'],axis=1,inplace=True)

# Get numerical and categorical columns to breakup 
num_cols = X_train.select_dtypes(['Int64','float64'])
cat_cols = X_train.select_dtypes('object')

# Numerical transformer pipeline
num_transformer = Pipeline(steps=[('ss',StandardScaler()),
                             ('impute',SimpleImputer(strategy='constant'))])

# Categorical transformer pipeline
cat_transformer = Pipeline(steps=[('ohe',OneHotEncoder(drop='first',sparse=False,handle_unknown='ignore'))])

# Column transformer of numerical and categorical transformers
transformer = ColumnTransformer(transformers=[
    ('num',num_transformer,num_cols.columns),
    ('cat',cat_transformer,cat_cols.columns)
])

# Pipeline for transformations, sampling and the model
pipe = Pipeline([
    ('transformer',transformer),
    ('sample',None),
    ('grad',GradientBoostingRegressor(random_state=42))
])

# Create Grid for GridSearch
grid = {
    'sample':[RandomOverSampler(random_state=42),SMOTE(random_state=42)],
    'grad__n_estimators':[94],
    'grad__learning_rate':[.05],
#     'grad__loss':['absolute_error','squared_error','huber','quantile'],
    'grad__min_samples_split':[2],
    'grad__min_samples_leaf':[1],
    'grad__max_depth':[4],
    'grad__min_impurity_decrease':[.31]
}

# GridSearch with Cross Validation of 5
grad = GridSearchCV(estimator=pipe,
                          param_grid=grid,
                          cv=5)

In [None]:
# Set start time
start_time = time.time()

# Call results function
print(get_results(grad))

# Return run time
print(round((time.time() - start_time)/60,1),'minutes')

# Set y_pred
y_pred_grad = grad.predict(X_test)

In [None]:
# Graph model results
graph_model_results(grad)

In [34]:
year = 2022

X = cvec_model_df.drop(['Pick_col'],axis=1)
label_encoder = LabelEncoder()

X_train = X[X.year_col != year]
X_test = X[X.year_col == year]
y_train = label_encoder.fit_transform(cvec_model_df.Pick_col[cvec_model_df.year_col != year])
y_test = label_encoder.fit_transform(cvec_model_df.Pick_col[cvec_model_df.year_col == year])

player_name = X_test.player_col
player_year = X_test.year_col
player_round = X_test.Round_col
player_target = X_test.target_col
player_pos = X_test.pos_col

# Drop unneeded or data leakage columns
X_train.drop(['player_col','year_col','Round_col','target_col'],axis=1,inplace=True)
X_test.drop(['player_col','year_col','Round_col','target_col'],axis=1,inplace=True)

# Get numerical and categorical columns to breakup 
num_cols = X_train.select_dtypes(['Int64','float64'])
cat_cols = X_train.select_dtypes('object')

# Numerical transformer pipeline
num_transformer = Pipeline(steps=[('ss',StandardScaler()),
                             ('impute',SimpleImputer(strategy='constant'))])

# Categorical transformer pipeline
cat_transformer = Pipeline(steps=[('ohe',OneHotEncoder(drop='first',sparse=False,handle_unknown='ignore'))])

# Column transformer of numerical and categorical transformers
transformer = ColumnTransformer(transformers=[
    ('num',num_transformer,num_cols.columns),
    ('cat',cat_transformer,cat_cols.columns)
])

# Pipeline for transformations, sampling and the model
pipe = Pipeline([
    ('transformer',transformer),
    ('sample',None),
    ('grad',GradientBoostingRegressor(random_state=42))
])

# Create Grid for GridSearch
grid = {
    'sample':[RandomOverSampler(random_state=42),SMOTE(random_state=42)],
    'grad__n_estimators':[94],
    'grad__learning_rate':[.05],
#     'grad__loss':['absolute_error','squared_error','huber','quantile'],
    'grad__min_samples_split':[2],
    'grad__min_samples_leaf':[1],
    'grad__max_depth':[4],
    'grad__min_impurity_decrease':[.31]
}

# GridSearch with Cross Validation of 5
cvec_grad = GridSearchCV(estimator=pipe,
                          param_grid=grid,
                          cv=5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(['player_col','year_col','Round_col','target_col'],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(['player_col','year_col','Round_col','target_col'],axis=1,inplace=True)


In [35]:
# Set start time
start_time = time.time()

# Call results function
print(get_results(cvec_grad))

# Return run time
print(round((time.time() - start_time)/60,1),'minutes')

# Set y_pred
y_pred_cvec_grad = cvec_grad.predict(X_test)

5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\capta\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\capta\anaconda3\lib\site-packages\imblearn\pipeline.py", line 293, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\capta\anaconda3\lib\site-packages\imblearn\pipeline.py", line 250, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "C:\Users\capta\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)


('r2 = 0.7458', 'MSE = 1053.28')
1.2 minutes


In [None]:
# Graph model results
graph_model_results(cvec_grad)

### Get Weights from the Best Model

In [None]:
# Get names of the OneHotEncoded columns
rfc_pipe[0].transformers_[1][1].named_steps['ohe'].get_feature_names()

In [None]:
# Get all other X column names
X_train.columns.to_list()

In [None]:
# Combine the X_train columns and the OneHotEncoded problems in the proper order
# EXTREMELY important as otherwise the feature names will be mismatched
feature_cols = ['weight_x_col',
 'height_x_col',
 'pos_rk_col',
 'ovr_rk_col',
 'grade_col',
 'age_col',
 'reportlen_col',
 'forty_yd_col',
 'vertical_col',
 'bench_reps_col',
 'broad_jump_col',
 'cone_col',
 'shuttle_col',
 'g_season_col',
 'def_int_season_col',
 'def_int_yds_season_col',
 'def_int_yds_per_int_season_col',
 'def_int_career_col',
 'def_int_yds_career_col',
 'def_int_yds_per_int_career_col',
 'pass_cmp_season_col',
 'pass_att_season_col',
 'pass_cmp_pct_season_col',
 'pass_yds_season_col',
 'pass_yds_per_att_season_col',
 'adj_pass_yds_per_att_season_col',
 'pass_td_season_col',
 'pass_int_season_col',
 'pass_rating_season_col',
 'pass_cmp_career_col',
 'pass_att_career_col',
 'pass_cmp_pct_career_col',
 'pass_yds_career_col',
 'pass_yds_per_att_career_col',
 'adj_pass_yds_per_att_career_col',
 'pass_td_career_col',
 'pass_int_career_col',
 'pass_rating_career_col',
 'rush_att_season_col',
 'rush_yds_season_col',
 'rush_yds_per_att_season_col',
 'rush_td_season_col',
 'rec_season_col',
 'rec_yds_season_col',
 'rec_yds_per_rec_season_col',
 'scrim_att_season_col',
 'scrim_yds_season_col',
 'scrim_yds_per_att_season_col',
 'scrim_td_season_col',
 'rush_att_career_col',
 'rush_yds_career_col',
 'rush_yds_per_att_career_col',
 'rush_td_career_col',
 'rec_career_col',
 'rec_yds_career_col',
 'rec_yds_per_rec_career_col',
 'rec_td_career_col',
 'scrim_att_career_col',
 'scrim_yds_career_col',
 'scrim_yds_per_att_career_col',
 'scrim_td_career_col',
 'fumbles_rec_yds_career_col',
 'fumbles_rec_td_career_col',
 'rec_td_season_col',
 'def_int_td_season_col',
 'def_int_td_career_col',
 'fumbles_rec_career_col',
 'fumbles_rec_yds_season_col',
 'fumbles_rec_td_season_col',
 'fumbles_rec_season_col',
 'tackles_solo_season_col',
 'tackles_assists_season_col',
 'tackles_total_season_col',
 'tackles_loss_season_col',
 'sacks_season_col',
 'pass_defended_season_col',
 'tackles_solo_career_col',
 'tackles_assists_career_col',
 'tackles_total_career_col',
 'tackles_loss_career_col',
 'sacks_career_col',
 'pass_defended_career_col',
 'fumbles_forced_season_col',
 'fumbles_forced_career_col',
 'x0_Atlantic Coast Conference', 'x0_Big East',
       'x0_Big Sky Conference', 'x0_Big South Conference',
       'x0_Big Ten Conference', 'x0_Big Twelve Conference',
       'x0_Central Intercollegiate Athletic Association',
       'x0_Colonial Athletic Association', 'x0_Conference USA',
       'x0_Great Lakes Intercollegiate Athletic Conference',
       'x0_Great Northwest Athletic Conference',
       'x0_Great West Football Conference', 'x0_Gulf South Conference',
       'x0_Independent', 'x0_Ivy League', 'x0_Lone Star Conference',
       'x0_Mid-America Intercollegiate Athletic Association',
       'x0_Mid-American Conference', 'x0_Mid-Eastern Athletic Conference',
       'x0_Minnesota Intercollegiate Athletic Conference',
       'x0_Missouri Valley Football Conference',
       'x0_Mountain West Conference', 'x0_Northeast Conference',
       'x0_Northern Sun Intercollegiate Conference',
       'x0_Ohio Athletic Conference', 'x0_Ohio Valley Conference',
       'x0_Pacific Twelve Conference', 'x0_Patriot League',
       'x0_Pennsylvania State Athletic Conference',
       'x0_Pioneer Football League', 'x0_South Atlantic Conference',
       'x0_Southeastern Conference', 'x0_Southern Conference',
       'x0_Southern Intercollegiate Athletic Conference',
       'x0_Southland Conference', 'x0_Southwestern Athletic Conference',
       'x0_Sun Belt Conference', 'x0_Western Athletic Conference',
       'x0_Wisconsin Intercollegiate Athletic Conference','x0_None','speed_col',
 'shuttle_agility_col',
 'cone_agility_col','x1_DB', 'x1_DE', 'x1_DL', 'x1_DT', 'x1_FB', 'x1_K', 'x1_LB',
       'x1_LS', 'x1_OL', 'x1_P', 'x1_QB', 'x1_RB', 'x1_S', 'x1_TE',
       'x1_WR','2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2016',
 'abil',
 'adequ',
 'also',
 'appear',
 'arm',
 'athlet',
 'athlet abil',
 'athletic',
 'averag',
 'back',
 'backup',
 'ball',
 'ball skill',
 'best',
 'better',
 'big',
 'block',
 'blocker',
 'bodi',
 'burst',
 'career',
 'catch',
 'class',
 'close',
 'colleg',
 'combin',
 'come',
 'concern',
 'consist',
 'contribut',
 'corner',
 'could',
 'coverag',
 'day',
 'defend',
 'defens',
 'develop',
 'draft',
 'durabl',
 'earli',
 'edg',
 'effect',
 'eight',
 'elit',
 'end',
 'end speed',
 'enough',
 'excel',
 'experi',
 'explos',
 'field',
 'final',
 'finish',
 'first',
 'fit',
 'five',
 'flash',
 'flash abil',
 'footbal',
 'forc',
 'four',
 'frame',
 'freshman',
 'fumbl',
 'game',
 'get',
 'good',
 'grade',
 'great',
 'guard',
 'hand',
 'high',
 'howev',
 'ideal',
 'improv',
 'inconsist',
 'injuri',
 'insid',
 'instinct',
 'intercept',
 'job',
 'junior',
 'lack',
 'lack ideal',
 'last',
 'late',
 'left',
 'length',
 'level',
 'like',
 'limit',
 'line',
 'lineback',
 'long',
 'loss',
 'lot',
 'make',
 'man',
 'miss',
 'move',
 'much',
 'natur',
 'need',
 'nfl',
 'offens',
 'one',
 'outsid',
 'outstand',
 'overal',
 'pass',
 'pass rusher',
 'physic',
 'pick',
 'play',
 'play game',
 'player',
 'point',
 'posit',
 'possess',
 'potenti',
 'power',
 'pro',
 'product',
 'project',
 'prospect',
 'quarterback',
 'quick',
 'rang',
 'receiv',
 'record',
 'redshirt',
 'return',
 'right',
 'round',
 'round pick',
 'rout',
 'rout runner',
 'run',
 'run defend',
 'runner',
 'rush',
 'rusher',
 'sack',
 'safeti',
 'scheme',
 'season',
 'second',
 'senior',
 'set',
 'short',
 'show',
 'six',
 'size',
 'skill',
 'special',
 'special team',
 'speed',
 'start',
 'start game',
 'starter',
 'still',
 'strength',
 'strong',
 'tackl',
 'tackl loss',
 'take',
 'team',
 'techniqu',
 'term',
 'three',
 'three season',
 'tight',
 'time',
 'top',
 'top end',
 'top end speed',
 'total',
 'touchdown',
 'tough',
 'true',
 'true freshman',
 'two',
 'two season',
 'upsid',
 'valu',
 'versatil',
 'well',
 'work',
 'yard',
 'year',
 'year starter',
 'zone']

In [None]:
weights = rfc_pipe[-1].feature_importances_

In [None]:
coefs = pd.DataFrame(zip(feature_cols,weights))

In [None]:
coefs.sort_values(by=1,ascending=False).head(20)

In [36]:
results = pd.concat([player_name.reset_index(),player_round.reset_index(),player_year.reset_index(),X_test.reset_index(),pd.Series(y_pred_cvec_grad),pd.Series(y_test)],axis=1)

In [37]:
mock_draft_year = results.sort_values(by=0)

In [38]:
results.sort_values(by=1)

Unnamed: 0,index,player_col,index.1,Round_col,index.2,year_col,index.3,weight_x_col,height_x_col,pos_rk_col,...,valu,versatil,well,work,yard,year,year starter,zone,0,1
0,3986,Travon Walker,3986,1,3986,2022.0,3986,,,2.0,...,0,1,0,0,0,0,0,0,10.781889,0
1,3987,Aidan Hutchinson,3987,1,3987,2022.0,3987,,,1.0,...,0,0,1,1,0,0,0,0,10.330430,1
2,3988,Derek Stingley,3988,1,3988,2022.0,3988,,,2.0,...,0,0,0,0,0,0,0,0,16.674849,2
3,3989,Sauce Gardner,3989,1,3989,2022.0,3989,,,1.0,...,0,0,0,0,1,0,0,0,9.523984,3
4,3990,Kayvon Thibodeaux,3990,1,3990,2022.0,3990,,,3.0,...,0,0,0,0,0,0,0,0,13.092921,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,4204,Jeff Gunter,4204,7,4204,2022.0,4204,,,18.0,...,0,1,0,0,0,0,0,0,153.643042,218
219,4205,Trenton Gill,4205,7,4205,2022.0,4205,,,,...,0,0,0,0,1,0,0,0,204.233916,219
220,4206,Jesse Luketa,4206,7,4206,2022.0,4206,,,11.0,...,0,0,0,0,0,0,0,0,156.505006,220
221,4207,Marquis Hayes,4207,7,4207,2022.0,4207,,,10.0,...,0,0,0,0,0,0,0,1,153.277911,221


In [39]:
mock_draft_year

Unnamed: 0,index,player_col,index.1,Round_col,index.2,year_col,index.3,weight_x_col,height_x_col,pos_rk_col,...,valu,versatil,well,work,yard,year,year starter,zone,0,1
5,3991,Ikem Ekwonu,3991,1,3991,2022.0,3991,,,1.0,...,0,0,0,0,0,0,0,0,9.469407,5
3,3989,Sauce Gardner,3989,1,3989,2022.0,3989,,,1.0,...,0,0,0,0,1,0,0,0,9.523984,3
6,3992,Evan Neal,3992,1,3992,2022.0,3992,,,2.0,...,0,0,0,0,0,0,0,1,9.901255,6
9,3995,Garrett Wilson,3995,1,3995,2022.0,3995,,,1.0,...,0,0,0,0,0,0,0,0,10.329920,9
1,3987,Aidan Hutchinson,3987,1,3987,2022.0,3987,,,1.0,...,0,0,1,1,0,0,0,0,10.330430,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,4136,Tyler Allgeier,4136,5,4136,2022.0,4136,,,26.0,...,0,0,0,0,0,0,0,0,197.154513,150
217,4203,Isiah Pacheco,4203,7,4203,2022.0,4203,,,27.0,...,0,0,0,0,2,1,0,0,197.835071,217
184,4170,Jordan Jackson,4170,6,4170,2022.0,4170,,,25.0,...,0,0,0,0,0,0,0,0,198.355806,184
219,4205,Trenton Gill,4205,7,4205,2022.0,4205,,,,...,0,0,0,0,1,0,0,0,204.233916,219


In [40]:
mock_draft_year['Actual_Pick'] = mock_draft_year[1] + 1

In [41]:
mock_draft_year

Unnamed: 0,index,player_col,index.1,Round_col,index.2,year_col,index.3,weight_x_col,height_x_col,pos_rk_col,...,versatil,well,work,yard,year,year starter,zone,0,1,Actual_Pick
5,3991,Ikem Ekwonu,3991,1,3991,2022.0,3991,,,1.0,...,0,0,0,0,0,0,0,9.469407,5,6
3,3989,Sauce Gardner,3989,1,3989,2022.0,3989,,,1.0,...,0,0,0,1,0,0,0,9.523984,3,4
6,3992,Evan Neal,3992,1,3992,2022.0,3992,,,2.0,...,0,0,0,0,0,0,1,9.901255,6,7
9,3995,Garrett Wilson,3995,1,3995,2022.0,3995,,,1.0,...,0,0,0,0,0,0,0,10.329920,9,10
1,3987,Aidan Hutchinson,3987,1,3987,2022.0,3987,,,1.0,...,0,1,1,0,0,0,0,10.330430,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,4136,Tyler Allgeier,4136,5,4136,2022.0,4136,,,26.0,...,0,0,0,0,0,0,0,197.154513,150,151
217,4203,Isiah Pacheco,4203,7,4203,2022.0,4203,,,27.0,...,0,0,0,2,1,0,0,197.835071,217,218
184,4170,Jordan Jackson,4170,6,4170,2022.0,4170,,,25.0,...,0,0,0,0,0,0,0,198.355806,184,185
219,4205,Trenton Gill,4205,7,4205,2022.0,4205,,,,...,0,0,0,1,0,0,0,204.233916,219,220


In [42]:
mock_draft_year['Diff'] = (mock_draft_year[0] - mock_draft_year.Actual_Pick).abs() 

In [43]:
mock = mock_draft_year[['player_col','Round_col', 'Actual_Pick',0,'Diff','ovr_rk_col']].reset_index()

In [46]:
mock[['player_col','Round_col', 'Actual_Pick',0,'Diff','ovr_rk_col']].head(150)

Unnamed: 0,player_col,Round_col,Actual_Pick,0,Diff,ovr_rk_col
0,Ikem Ekwonu,1,6,9.469407,3.469407,3.0
1,Sauce Gardner,1,4,9.523984,5.523984,5.0
2,Evan Neal,1,7,9.901255,2.901255,4.0
3,Garrett Wilson,1,10,10.32992,0.32992,6.0
4,Aidan Hutchinson,1,2,10.33043,8.33043,1.0
5,Travon Walker,1,1,10.781889,9.781889,2.0
6,Kyle Hamilton,1,14,12.544796,1.455204,9.0
7,Kayvon Thibodeaux,1,5,13.092921,8.092921,7.0
8,Charles Cross,1,9,13.430974,4.430974,10.0
9,Chris Olave,1,11,16.005296,5.005296,15.0


In [45]:
pd.set_option("display.max_rows", None)
mock_draft_year

Unnamed: 0,index,player_col,index.1,Round_col,index.2,year_col,index.3,weight_x_col,height_x_col,pos_rk_col,...,well,work,yard,year,year starter,zone,0,1,Actual_Pick,Diff
5,3991,Ikem Ekwonu,3991,1,3991,2022.0,3991,,,1.0,...,0,0,0,0,0,0,9.469407,5,6,3.469407
3,3989,Sauce Gardner,3989,1,3989,2022.0,3989,,,1.0,...,0,0,1,0,0,0,9.523984,3,4,5.523984
6,3992,Evan Neal,3992,1,3992,2022.0,3992,,,2.0,...,0,0,0,0,0,1,9.901255,6,7,2.901255
9,3995,Garrett Wilson,3995,1,3995,2022.0,3995,,,1.0,...,0,0,0,0,0,0,10.32992,9,10,0.32992
1,3987,Aidan Hutchinson,3987,1,3987,2022.0,3987,,,1.0,...,1,1,0,0,0,0,10.33043,1,2,8.33043
0,3986,Travon Walker,3986,1,3986,2022.0,3986,,,2.0,...,0,0,0,0,0,0,10.781889,0,1,9.781889
13,3999,Kyle Hamilton,3999,1,3999,2022.0,3999,,,1.0,...,0,0,1,0,0,1,12.544796,13,14,1.455204
4,3990,Kayvon Thibodeaux,3990,1,3990,2022.0,3990,,,3.0,...,0,0,0,0,0,0,13.092921,4,5,8.092921
8,3994,Charles Cross,3994,1,3994,2022.0,3994,,,3.0,...,0,0,0,1,1,1,13.430974,8,9,4.430974
10,3996,Chris Olave,3996,1,3996,2022.0,3996,,,4.0,...,0,0,0,0,0,0,16.005296,10,11,5.005296


In [None]:
results['difference'] = results[0] - results[1]
results['abs_difference'] = (results[0] - results[1]).abs()

In [None]:
results.difference

In [None]:
results.abs_difference.mean()

In [None]:
sns.histplot(results.abs_difference);

In [None]:
best = results[results.abs_difference <= 16]

In [None]:
best

In [None]:
worst = results[results.abs_difference >= 64]

In [None]:
best.Round_col.value_counts(normalize=True)

In [None]:
worst.Round_col.value_counts(normalize=True)

In [None]:
best_pos = best.pos_col.value_counts(normalize=True) - results.pos_col.value_counts(normalize=True)
worst_pos = worst.pos_col.value_counts(normalize=True) - results.pos_col.value_counts(normalize=True)

In [None]:
best_pos.sort_values(ascending=False)

In [None]:
worst_pos.sort_values(ascending=False)

In [None]:
results.pos_col.value_counts(normalize=True)

In [None]:
best.pos_col.value_counts(normalize=True)

In [None]:
worst.pos_col.value_counts(normalize=True)

In [None]:
sns.histplot(results[results.Round_col == 1].abs_difference);

In [None]:
results[results.Round_col == 1].abs_difference.mean()

In [None]:
results[results.Round_col == 2].abs_difference.mean()

In [None]:
results[results.Round_col == 3].abs_difference.mean()

In [None]:
results[results.Round_col == 4].abs_difference.mean()

In [None]:
results[results.Round_col == 5].abs_difference.mean()

In [None]:
results[results.Round_col == 6].abs_difference.mean()

In [None]:
results[results.Round_col == 7].abs_difference.mean()