In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV

In [3]:
df = pd.read_csv('../Data/final_ncaa_data.csv')

In [4]:
features = df.drop(columns=['Player', 'Year', 'NBA Adjusted 3P%', 'TS%', 'ORB%', 'ORtg', 'STL%', 'TO%'])
X = features
y = df['NBA Adjusted 3P%'] 

print(features)

         BLK%       DRB%       AST%   GP        FT%   3P%_NCAA    PORPAG  \
0    0.098020   8.840594  30.983168   67  73.039593  39.041210  3.997094   
1    2.200000  13.500000   6.800000   39  79.245283  44.654088  3.451450   
2    3.400000  19.300000  13.000000   38  42.222222  35.555556  2.862280   
3    0.697015   8.700000  26.320896  101  80.630261  42.382979  3.601087   
4    2.333333  15.713333   8.980000   46  82.516189  40.453515  3.718164   
..        ...        ...        ...  ...        ...        ...       ...   
554  9.800000  23.200000   4.400000   39  74.324324  47.619048  2.579370   
555  0.800000   9.500000  12.600000   37  69.117647  37.500000  1.840980   
556  3.300000  20.400000   5.800000   32  76.000000  29.411765  4.432070   
557  2.200000  16.400000  16.400000   20  79.591837  29.113924  0.986946   
558  5.800000  18.000000  15.000000   33  64.039409  33.802817  6.774160   

      Adj_O_Eff      Mid%      O_BPM     PTS_PG    3PT/100        USG   3P  \
0    121.

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
svm_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVR()) 
])


In [7]:
svm_param_grid = {
    'svm__kernel': ['linear', 'rbf'],  # Since the best kernel is linear, we only include 'linear' here
    'svm__C': [.005, .0051, .0052, .0053, .0054, .0055, .0056,.0057, .0058, .0059, .006 ],  # Adjusted range around the best value
    'svm__epsilon': [0.015,.016, .017, 0.018, .019, 0.02, .021, 0.022],  # Adjusted range around the best value
    'svm__gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],  # Since 'scale' was the best value for 'gamma'
    'svm__degree': [1,2,3, 4, 5],  # Since 'degree' was included in the best parameters
    # Add more hyperparameters as needed
}


In [8]:

# Perform grid search with cross-validation
grid_search = GridSearchCV(svm_pipe, svm_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 5280 candidates, totalling 26400 fits
[CV] END svm__C=0.005, svm__degree=1, svm__epsilon=0.015, svm__gamma=scale, svm__kernel=rbf; total time=   0.0s
[CV] END svm__C=0.005, svm__degree=1, svm__epsilon=0.015, svm__gamma=0.01, svm__kernel=rbf; total time=   0.0s
[CV] END svm__C=0.005, svm__degree=1, svm__epsilon=0.015, svm__gamma=0.001, svm__kernel=linear; total time=   0.0s
[CV] END svm__C=0.005, svm__degree=1, svm__epsilon=0.015, svm__gamma=0.001, svm__kernel=rbf; total time=   0.0s
[CV] END svm__C=0.005, svm__degree=1, svm__epsilon=0.015, svm__gamma=0.0001, svm__kernel=rbf; total time=   0.0s
[CV] END svm__C=0.005, svm__degree=1, svm__epsilon=0.015, svm__gamma=0.0001, svm__kernel=rbf; total time=   0.0s
[CV] END svm__C=0.005, svm__degree=1, svm__epsilon=0.016, svm__gamma=0.1, svm__kernel=rbf; total time=   0.0s
[CV] END svm__C=0.005, svm__degree=1, svm__epsilon=0.016, svm__gamma=0.1, svm__kernel=rbf; total time=   0.0s
[CV] END svm__C=0.005, svm__degree=1, 

In [9]:

# Get the best model and parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Fit the best model to the training data
best_model.fit(X_train, y_train)

# Predict on the training set
y_pred_train = best_model.predict(X_train)

# Evaluate performance on the training set
r2_train = r2_score(y_train, y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
mse_train = mean_squared_error(y_train, y_pred_train)

# Cross-validate
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
average_cv_rmse = np.sqrt(-cv_scores.mean())
average_cv_mse = -cv_scores.mean()
average_cv_r2 = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2').mean()

# Print results
print("Best Model:", best_model)
print("Best Parameters:", best_params)
print("Best Score (MSE):", best_score)
print("\nPerformance on Training Set:")
print("R-squared:", r2_train)
print("RMSE:", rmse_train)
print("MSE:", mse_train)
print("\nCross-Validation Scores:")
print("Average CV MSE Score:", average_cv_mse)
print("Average CV RMSE Score:", average_cv_rmse)
print("Average CV R-squared Score:", average_cv_r2)

Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('svm',
                 SVR(C=0.0054, degree=1, epsilon=0.017, kernel='linear'))])
Best Parameters: {'svm__C': 0.0054, 'svm__degree': 1, 'svm__epsilon': 0.017, 'svm__gamma': 'scale', 'svm__kernel': 'linear'}
Best Score (MSE): -0.005165379134781455

Performance on Training Set:
R-squared: 0.5791074446188262
RMSE: 0.06976354348879507
MSE: 0.0048669520001130015

Cross-Validation Scores:
Average CV MSE Score: 0.005165379134781455
Average CV RMSE Score: 0.07187057210556665
Average CV R-squared Score: 0.5398382027007538
