In [72]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score,confusion_matrix, plot_confusion_matrix, plot_roc_curve, recall_score, f1_score, precision_score
import pandas as pd

In [7]:
# Read the modelling table from disk, then lift pandas' column-display cap
# so wide frames are shown without truncated columns.
df_all = pd.read_parquet(path='./Data/modeldraft1322')
pd.set_option("display.max_columns", None)

# Display the available column names.
df_all.columns

Index(['Rnd', 'Tm', 'Player', 'Pos', 'Age', 'To', 'College', 'tackles_solo',
       'tackles_assists', 'tackles_total', 'tackles_loss', 'sacks', 'def_int',
       'def_int_yds', 'def_int_yds_per_int', 'def_int_td', 'pass_defended',
       'fumbles_rec', 'fumbles_rec_yds', 'fumbles_rec_td', 'fumbles_forced',
       'rec', 'rec_yds', 'rec_yds_per_rec', 'rec_td', 'rush_att', 'rush_yds',
       'rush_yds_per_att', 'rush_td', 'scrim_att', 'scrim_yds',
       'scrim_yds_per_att', 'scrim_td', 'pass_cmp', 'pass_att', 'pass_cmp_pct',
       'pass_yds', 'pass_yds_per_att', 'adj_pass_yds_per_att', 'pass_td',
       'pass_int', 'pass_rating', 'height', 'weight', 'college_conference'],
      dtype='object')

In [11]:
# Build the modelling frame by removing columns that are not used as
# features in the model below.
df = df_all.drop(
    columns=['Tm', 'Player', 'To', 'College', 'height', 'weight'],
)

In [38]:
# Summarise dtypes and non-null counts per column to see how much imputation
# the numeric stat columns will need.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2558 entries, 0 to 2557
Data columns (total 39 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Rnd                   2558 non-null   float64
 1   Pos                   2558 non-null   object 
 2   Age                   2555 non-null   float64
 3   tackles_solo          1182 non-null   float64
 4   tackles_assists       1182 non-null   float64
 5   tackles_total         1182 non-null   float64
 6   tackles_loss          1182 non-null   float64
 7   sacks                 1182 non-null   float64
 8   def_int               1166 non-null   float64
 9   def_int_yds           1166 non-null   float64
 10  def_int_yds_per_int   784 non-null    float64
 11  def_int_td            1165 non-null   float64
 12  pass_defended         1166 non-null   float64
 13  fumbles_rec           1166 non-null   float64
 14  fumbles_rec_yds       295 non-null    float64
 15  fumbles_rec_td       

In [67]:
# Features are every column except the target; `Rnd` is the label.
X = df.drop(['Rnd'],axis=1)
y = df.Rnd

# Hold out a third of the rows for evaluation.
# NOTE(review): the metrics suggest an imbalanced target; consider passing
# `stratify=y` so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.33,random_state=42)

# Seed SMOTE like every other stochastic step so that the synthetic samples,
# and therefore the reported metrics, are reproducible across re-runs.
smote = SMOTE(random_state=42)

# Separate X_train into categorical and numerical dfs
X_train_cat = X_train.select_dtypes('object')
X_train_num = X_train.select_dtypes(['float64'])

# Categorical columns: one-hot encode, dropping the first level of each.
# NOTE(review): `sparse=False` is renamed `sparse_output=False` in
# scikit-learn >= 1.2; kept as-is to match the version this notebook imports.
cat_transformer = Pipeline(steps=[('ohe',OneHotEncoder(drop='first',sparse=False,handle_unknown='ignore'))])

# Numerical columns: fill missing stats with 0 (made explicit here; it is the
# default fill for numeric data with strategy='constant'), then scale to [0, 1].
num_transformer = Pipeline(steps=[('impute',SimpleImputer(strategy='constant',fill_value=0)),
                                  ('mms',MinMaxScaler())])

transformer = ColumnTransformer(transformers=[
    ('numerical', num_transformer, X_train_num.columns),
    ('categorical', cat_transformer, X_train_cat.columns)
])

# imblearn's Pipeline applies the SMOTE step during fit only, so the test set
# is transformed and predicted on without resampling.
pipe = Pipeline(steps=[
    ('trans',transformer),
    ('smote',smote),
    ('logr',LogisticRegression(random_state=42,solver='liblinear'))
])

pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)

# Hyper-parameter search sketch (disabled). Parameters of a pipeline step must
# be addressed as '<step name>__<parameter>', hence the 'logr__' prefix.
# grid = [{'logr__C': [1e-10,1e-5,1,1e5,1e10],
#        'logr__fit_intercept': [True, False]}]

# gridsearch = GridSearchCV(estimator=pipe,
#                           param_grid=grid,
#                           scoring='accuracy',
#                           cv=5)

# gridsearch.fit(X_train,y_train)

# gridsearch.score(X_test, y_test)



In [69]:
# F1 score of the fitted pipeline's predictions on the held-out test split.
f1_score(y_test,y_pred)

0.30617283950617286

In [70]:
# Overall accuracy on the held-out test split.
accuracy_score(y_test,y_pred)

0.6674556213017752

In [71]:
# Recall on the held-out test split.
recall_score(y_test,y_pred)

0.6526315789473685

In [73]:
# Precision on the held-out test split.
precision_score(y_test,y_pred)

0.2

#### Columns to get from Pro-Football-Reference / Sports-Reference
- Rnd
- Pick
- Tm
- Player
- Pos
- Age
- College/Univ
- Draft_Yr
- All college stats

#### Columns to get from NFL Python Package:
- position_group
- college_conference
- height
- weight

#### Columns to One Hot Encode 
- Pos or position_group
    - See which one works better
- college_conference

#### Columns to Transform
- Rnd needs to be converted to a binary target: 1 if the player was drafted in round 1 (target), 0 otherwise

#### Models to try:
- Logistic Regression
- Decision Tree
- Random Forest

