In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras import regularizers

# Read the preprocessed data
X_train = pd.read_csv('../data/processed/X_train.csv')
X_val = pd.read_csv('../data/processed/X_val.csv')
X_test = pd.read_csv('../data/processed/X_test.csv')
y_train = pd.read_csv('../data/processed/y_train.csv').iloc[:, 0]  # Convert to Series
y_val = pd.read_csv('../data/processed/y_val.csv').iloc[:, 0]      # Convert to Series
y_test = pd.read_csv('../data/processed/y_test.csv')   # No need to convert to Series

# Build a neural network model with L2 regularization and dropout
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=regularizers.l2(0.01)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01))  # Binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_accuracy)

# Make predictions on the test set
predictions = model.predict(X_test)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Accuracy: 0.9380395412445068


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Read the preprocessed data
X_train = pd.read_csv('../data/processed/X_train.csv')
X_val = pd.read_csv('../data/processed/X_val.csv')
X_test = pd.read_csv('../data/processed/X_test.csv')
y_train = pd.read_csv('../data/processed/y_train.csv').iloc[:, 0]  
y_val = pd.read_csv('../data/processed/y_val.csv').iloc[:, 0]
y_test = pd.read_csv('../data/processed/y_test.csv').iloc[:, 0]

# Apply PolynomialFeatures to the feature sets
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)
X_test_poly = poly.transform(X_test)

# Create a LogisticRegression model with regularization
model = LogisticRegression(max_iter=1000, penalty='l2')

# Define a range of hyperparameters to search over
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Use GridSearchCV to find the best hyperparameters using cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_poly, y_train)

# Get the best model with the selected hyperparameters
best_model = grid_search.best_estimator_

# Train the best model on the entire training data
best_model.fit(X_train_poly, y_train)

# Predict on the validation set
val_predictions = best_model.predict(X_val_poly)
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Predict on the test set
test_predictions = best_model.predict(X_test_poly)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


In [4]:
import pandas as pd
import numpy as np
test = pd.read_csv('../data/raw/test.csv')

In [5]:
test.head()

Unnamed: 0,team,conf,GP,Min_per,Ortg,usg,eFG,TS_per,ORB_per,DRB_per,...,ogbpm,dgbpm,oreb,dreb,treb,ast,stl,blk,pts,player_id
0,Morgan St.,MEAC,2,3.0,115.1,4.7,50.0,50.0,0.0,4.6,...,-2.46774,-2.27566,0.0,0.3333,0.3333,0.0,0.0,0.0,1.0,cf302b4d-84f7-4124-a25d-a75eed31978b
1,South Carolina St.,MEAC,11,17.6,61.1,18.6,34.7,35.18,2.5,15.7,...,-7.49472,-4.41253,0.2727,1.4545,1.7273,0.4545,0.1818,0.0,2.3636,f91837cd-4f49-4b70-963d-aeb82c6ce3da
2,Binghamton,AE,9,28.6,91.9,23.8,54.1,52.49,6.4,22.5,...,-2.92495,1.71789,1.3333,4.4444,5.7778,1.0,0.6667,1.8889,8.8889,53ec2a29-1e7d-4c6d-86d7-d60d02af8916
3,Illinois,B10,7,1.3,111.0,10.4,83.3,83.33,0.0,13.4,...,-0.767911,0.962469,0.0,0.2857,0.2857,0.0,0.0,0.0,0.7143,32402798-471c-4a54-8cb4-29cd95199014
4,Iowa St.,B12,23,78.5,103.1,21.5,54.0,56.12,3.6,10.2,...,2.89361,-1.019,1.0435,2.8696,3.913,1.1739,0.8261,0.087,14.3043,73b960f9-27b8-4431-9d23-a760e9bbc360


In [6]:
import pandas as pd

def process(df):
    columns_to_keep = ['GP', 'Min_per', 'Ortg', 'usg', 'eFG', 'TS_per', 'ORB_per', 'DRB_per', 'AST_per', 'TO_per', 'FTM', 'FTA', 'FT_per', 'twoPM', 'twoPA', 'twoP_per', 'TPM', 'TPA', 'TP_per', 'blk_per', 'stl_per', 'ftr', 'porpag', 'adjoe', 'pfr', 'year', 'type']
    df = df[columns_to_keep]
    return df

test = process(test)
test

Unnamed: 0,GP,Min_per,Ortg,usg,eFG,TS_per,ORB_per,DRB_per,AST_per,TO_per,...,TPA,TP_per,blk_per,stl_per,ftr,porpag,adjoe,pfr,year,type
0,2,3.0,115.1,4.7,50.0,50.00,0.0,4.6,0.0,0.0,...,3,0.333,0.0,0.0,0.0,0.261536,91.80970,3.5,2021,all
1,11,17.6,61.1,18.6,34.7,35.18,2.5,15.7,8.1,30.4,...,28,0.250,0.0,0.9,5.6,-1.110000,57.54910,2.8,2021,all
2,9,28.6,91.9,23.8,54.1,52.49,6.4,22.5,10.0,21.9,...,2,0.000,9.0,1.7,52.5,0.662985,93.67160,5.5,2021,all
3,7,1.3,111.0,10.4,83.3,83.33,0.0,13.4,0.0,33.6,...,1,1.000,0.0,0.0,0.0,0.135373,102.32400,5.0,2021,all
4,23,78.5,103.1,21.5,54.0,56.12,3.6,10.2,7.8,16.1,...,147,0.395,0.3,1.4,13.0,2.977030,111.42600,3.5,2021,all
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4965,2,0.3,60.5,19.3,0.0,0.00,0.0,0.0,51.1,0.0,...,0,0.000,0.0,20.3,0.0,-0.117667,62.64930,0.0,2021,all
4966,4,1.3,28.3,7.1,0.0,0.00,7.0,0.0,0.0,50.3,...,0,0.000,0.0,0.0,0.0,-0.859158,19.23020,0.0,2021,all
4967,1,0.1,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0,0.000,0.0,0.0,0.0,-0.326039,-8.70362,0.0,2021,all
4968,1,0.1,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0,0.000,0.0,0.0,0.0,-0.321884,-7.50633,0.0,2021,all


In [7]:
def process(df):
    df.drop('year', axis=1, inplace=True)
    return df
test = process(test)
test

Unnamed: 0,GP,Min_per,Ortg,usg,eFG,TS_per,ORB_per,DRB_per,AST_per,TO_per,...,TPM,TPA,TP_per,blk_per,stl_per,ftr,porpag,adjoe,pfr,type
0,2,3.0,115.1,4.7,50.0,50.00,0.0,4.6,0.0,0.0,...,1,3,0.333,0.0,0.0,0.0,0.261536,91.80970,3.5,all
1,11,17.6,61.1,18.6,34.7,35.18,2.5,15.7,8.1,30.4,...,7,28,0.250,0.0,0.9,5.6,-1.110000,57.54910,2.8,all
2,9,28.6,91.9,23.8,54.1,52.49,6.4,22.5,10.0,21.9,...,0,2,0.000,9.0,1.7,52.5,0.662985,93.67160,5.5,all
3,7,1.3,111.0,10.4,83.3,83.33,0.0,13.4,0.0,33.6,...,1,1,1.000,0.0,0.0,0.0,0.135373,102.32400,5.0,all
4,23,78.5,103.1,21.5,54.0,56.12,3.6,10.2,7.8,16.1,...,58,147,0.395,0.3,1.4,13.0,2.977030,111.42600,3.5,all
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4965,2,0.3,60.5,19.3,0.0,0.00,0.0,0.0,51.1,0.0,...,0,0,0.000,0.0,20.3,0.0,-0.117667,62.64930,0.0,all
4966,4,1.3,28.3,7.1,0.0,0.00,7.0,0.0,0.0,50.3,...,0,0,0.000,0.0,0.0,0.0,-0.859158,19.23020,0.0,all
4967,1,0.1,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0,0,0.000,0.0,0.0,0.0,-0.326039,-8.70362,0.0,all
4968,1,0.1,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0,0,0.000,0.0,0.0,0.0,-0.321884,-7.50633,0.0,all


In [8]:
import pandas as pd

encoded_df = pd.get_dummies(test, columns=['type'], prefix=['Type'])
test

Unnamed: 0,GP,Min_per,Ortg,usg,eFG,TS_per,ORB_per,DRB_per,AST_per,TO_per,...,TPM,TPA,TP_per,blk_per,stl_per,ftr,porpag,adjoe,pfr,type
0,2,3.0,115.1,4.7,50.0,50.00,0.0,4.6,0.0,0.0,...,1,3,0.333,0.0,0.0,0.0,0.261536,91.80970,3.5,all
1,11,17.6,61.1,18.6,34.7,35.18,2.5,15.7,8.1,30.4,...,7,28,0.250,0.0,0.9,5.6,-1.110000,57.54910,2.8,all
2,9,28.6,91.9,23.8,54.1,52.49,6.4,22.5,10.0,21.9,...,0,2,0.000,9.0,1.7,52.5,0.662985,93.67160,5.5,all
3,7,1.3,111.0,10.4,83.3,83.33,0.0,13.4,0.0,33.6,...,1,1,1.000,0.0,0.0,0.0,0.135373,102.32400,5.0,all
4,23,78.5,103.1,21.5,54.0,56.12,3.6,10.2,7.8,16.1,...,58,147,0.395,0.3,1.4,13.0,2.977030,111.42600,3.5,all
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4965,2,0.3,60.5,19.3,0.0,0.00,0.0,0.0,51.1,0.0,...,0,0,0.000,0.0,20.3,0.0,-0.117667,62.64930,0.0,all
4966,4,1.3,28.3,7.1,0.0,0.00,7.0,0.0,0.0,50.3,...,0,0,0.000,0.0,0.0,0.0,-0.859158,19.23020,0.0,all
4967,1,0.1,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0,0,0.000,0.0,0.0,0.0,-0.326039,-8.70362,0.0,all
4968,1,0.1,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0,0,0.000,0.0,0.0,0.0,-0.321884,-7.50633,0.0,all


In [9]:
import pandas as pd
from joblib import load
from sklearn.preprocessing import StandardScaler

# Load the saved model
loaded_model = load('../models/model_1.joblib')


# Load the saved StandardScaler object
loaded_scaler = load('../models/scaler.joblib')

# Transform the test data using the loaded StandardScaler
X_test_scaled = loaded_scaler.transform(encoded_df)

# Predict probabilities using the loaded_model
predicted_probabilities = loaded_model.predict_proba(X_test_scaled)

testing = pd.read_csv('../data/raw/test.csv')
threshold = 0.5  # Threshold to classify as drafted (class 1)

result_df['drafted'] = (predicted_probabilities[:, 1] >= threshold).astype(int)

with open('../data/predictions.csv', 'w') as f:
    f.write("player_id,drafted\n")  # Write the header

    # Write the player IDs and drafted predictions for each row
    for player_id, drafted_prob in zip(result_df['player_id'], result_df['drafted']):
        f.write(f"{player_id},{drafted_prob}\n")




NameError: name 'result_df' is not defined