In [113]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
pd.set_option('display.max_rows', 60)

In [114]:
df = pd.read_csv('NFL_combine_data.csv')
df

Unnamed: 0,Player,Pos,Ht,Wt,Forty,Vertical,BenchReps,BroadJump,Cone,Shuttle,Year,Pfr_ID,AV,Team,Round,Pick
0,John Abraham,OLB,76,252,4.55,,,,,,2000,AbraJo00,26,New York Jets,1,13.0
1,Shaun Alexander,RB,72,218,4.58,,,,,,2000,AlexSh00,26,Seattle Seahawks,1,19.0
2,Darnell Alford,OT,76,334,5.56,25.0,23.0,94.0,8.48,4.98,2000,AlfoDa20,0,Kansas City Chiefs,6,188.0
3,Kyle Allamon,TE,74,253,4.97,29.0,,104.0,7.29,4.49,2000,,0,,,
4,Rashard Anderson,CB,74,206,4.55,34.0,,123.0,7.18,4.15,2000,AndeRa21,6,Carolina Panthers,1,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6213,Chris Worley,ILB,74,238,4.86,29.5,15.0,,,,2018,WorlCh00,0,,,
6214,Isaiah Wynn,G,75,313,,,,,,,2018,WynnIs00,0,,,
6215,Isaac Yiadom,CB,73,190,4.52,,8.0,120.0,,4.18,2018,YiadIs00,0,,,
6216,Kenny Young,ILB,73,236,4.60,36.0,23.0,117.0,7.38,4.48,2018,,0,,,


In [115]:
#Not drafted players marked as round 99
df['Round'] = df.Round.str.replace(' ','99', regex = True)

In [116]:
#Deleting columns that are not part of ML, also dropping rows with NaN cells
df = df.drop(['Pos', 'Year', 'Pfr_ID', 'AV', 'Team', 'Pick'], axis = 1)
df = df.dropna()

In [117]:
#setting the imput and output
X = df.iloc[:, 1:-1]
y = df.iloc[:, -1]

In [118]:
#changing y from string to int
y_numeric = pd.to_numeric(y, errors='coerce')

In [119]:
#making multiclass output to binary
y_binary = y_numeric.apply(lambda x: 1 if x in [1] else 0)
y_binary

2       0
7       0
9       0
11      0
12      0
       ..
6202    0
6204    0
6206    0
6216    0
6217    0
Name: Round, Length: 2885, dtype: int64

In [120]:
#Feeding the x, y, test size and random state
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

In [121]:
#changing input type to int
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = StandardScaler()
categorical_columns = list(set(X.columns) - set(numeric_features))
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
#Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_columns)
    ])

In [122]:
#Creating logistic regression
logistic_regression = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('classifier', LogisticRegression())])
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [123]:
logistic_regression.fit(X_train_resampled, y_train_resampled)

In [124]:
y_pred = logistic_regression.predict(X_test)

In [126]:
logreg_model = logistic_regression.named_steps['classifier']
numeric_features = logistic_regression.named_steps['preprocessor'].transformers_[0][2]

In [127]:
#take a name, a transformer and the feature it applies to
categorical_features = logistic_regression.named_steps['preprocessor'].transformers_[1][1]
categorical_features.fit(X_train[categorical_columns])

In [128]:
#find coefficient names and values
categorical_feature_names = categorical_features.get_feature_names_out()
feature_names = list(numeric_features) + list(categorical_feature_names)
coefficients = logreg_model.coef_.flatten()
intercept = logreg_model.intercept_

In [129]:
#print Intercept and Coefficient name,values
for feature_name, coefficient in zip(feature_names, coefficients):
    print(f"{feature_name}: {coefficient}")
print("Intercept:", intercept)

Ht: -0.4322215538256542
Wt: 3.089319609288117
Forty: -1.393364546059793
Vertical: 0.0996766405391254
BenchReps: -0.11578640705101298
BroadJump: 0.3776607572400046
Cone: -0.8813905919745274
Shuttle: -0.12003734315694542
Intercept: [-0.02168595]


In [131]:
#Classification Report
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report_result)

Accuracy: 0.66
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.65      0.78       535
           1       0.14      0.76      0.24        42

    accuracy                           0.66       577
   macro avg       0.56      0.70      0.51       577
weighted avg       0.91      0.66      0.74       577

