In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from IPython.display import HTML

In [16]:
# Read the raw event log (path is relative to the notebook's working directory).
data = pd.read_csv('data/events.csv')

# Keep rows where event_type is 1 and event_type2 is anything other than 15.
# Per the original note these codes mean "shot attempt" and "own goal".
is_attempt = data['event_type'].eq(1)
not_own_goal = data['event_type2'].ne(15)
shot_attempts = data.loc[is_attempt & not_own_goal]
print(f"{len(shot_attempts)} shot attempts")

228434 shot attempts


In [17]:
# Model inputs and the 0/1 label column.
features = ['location', 'bodypart', 'assist_method', 'situation', 'fast_break']
target = 'is_goal'
shot_data = shot_attempts[features + [target]]

X = shot_data[features]
y = shot_data[target]

# One-hot encode the categorical features. `get_dummies` only expands the
# columns named in `columns=` and passes every other column through unchanged,
# so `fast_break` is already part of the encoded frame and does not need to be
# copied over by hand.
categorical_cols = ['location', 'bodypart', 'assist_method', 'situation']
X_encoded = pd.get_dummies(X, columns=categorical_cols)
assert 'fast_break' in X_encoded.columns, "fast_break should pass through get_dummies"

# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=0)

### Model 3: SVM (Jayden)

In [18]:
# Hyperparameter grid for LinearSVC. Single-valued entries pin a setting for
# every candidate; multi-valued entries are searched exhaustively
# (2 * 2 * 4 * 2 * 2 = 64 candidates).
param_grid = {
    'penalty': ['l2'],
    'loss': ['hinge', 'squared_hinge'],
    'dual': ['auto'],
    'tol': [1e-4, 1e-3],
    'C': [0.1, 1.0, 10.0, 100.0],
    # NOTE: per the scikit-learn docs, 'crammer_singer' ignores `penalty`,
    # `loss` and `dual`, so its candidates repeat across the two `loss` values.
    'multi_class': ['ovr', 'crammer_singer'],
    'fit_intercept': [True, False],
    'class_weight': ['balanced'],
    'max_iter': [100000]
}

# Initialize the SVM classifier with a fixed seed. The dual solver shuffles
# the data, so without `random_state` repeated runs of this search can
# produce different scores. The seed matches the train/test split.
svm_clf = svm.LinearSVC(random_state=0)

# 5-fold cross-validated grid search, ranked by F1 on the positive class
grid_search = GridSearchCV(svm_clf, param_grid, cv=5, scoring='f1', verbose=1)

# Fit the model on the training data to search for best hyperparameters
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [41]:
# Tabulate the cross-validation results, highest mean score first
results = pd.DataFrame(grid_search.cv_results_).sort_values(
    by='mean_test_score', ascending=False
)

# Show the five best-scoring parameter combinations
display(HTML(results.loc[:, ['params', 'mean_test_score', 'std_test_score']].iloc[:5].to_html()))

# Keep the winning configuration and the estimator GridSearchCV refit with it
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_

# Score the refit estimator on the held-out test split
y_test_pred = best_estimator.predict(X_test)

# Overall accuracy, expressed as a percentage
test_accuracy = 100 * accuracy_score(y_test, y_test_pred)
print(f"Accuracy on test set: {test_accuracy:.2f}%")

# Per-class precision / recall / F1 on the test split
print("Classification Report:")
print(classification_report(y_test, y_test_pred))

Unnamed: 0,params,mean_test_score,std_test_score
37,"{'C': 10.0, 'class_weight': 'balanced', 'dual': 'auto', 'fit_intercept': True, 'loss': 'squared_hinge', 'max_iter': 100000, 'multi_class': 'ovr', 'penalty': 'l2', 'tol': 0.001}",0.362795,0.004166
28,"{'C': 1.0, 'class_weight': 'balanced', 'dual': 'auto', 'fit_intercept': False, 'loss': 'squared_hinge', 'max_iter': 100000, 'multi_class': 'ovr', 'penalty': 'l2', 'tol': 0.0001}",0.362795,0.004166
21,"{'C': 1.0, 'class_weight': 'balanced', 'dual': 'auto', 'fit_intercept': True, 'loss': 'squared_hinge', 'max_iter': 100000, 'multi_class': 'ovr', 'penalty': 'l2', 'tol': 0.001}",0.362795,0.004166
20,"{'C': 1.0, 'class_weight': 'balanced', 'dual': 'auto', 'fit_intercept': True, 'loss': 'squared_hinge', 'max_iter': 100000, 'multi_class': 'ovr', 'penalty': 'l2', 'tol': 0.0001}",0.362795,0.004166
44,"{'C': 10.0, 'class_weight': 'balanced', 'dual': 'auto', 'fit_intercept': False, 'loss': 'squared_hinge', 'max_iter': 100000, 'multi_class': 'ovr', 'penalty': 'l2', 'tol': 0.0001}",0.362795,0.004166


Accuracy on test set: 72.64%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.72      0.83     40952
           1       0.24      0.75      0.36      4735

    accuracy                           0.73     45687
   macro avg       0.60      0.74      0.59     45687
weighted avg       0.89      0.73      0.78     45687



### Model 4: Gradient Boosting (Jayden)

In [50]:
# Define the parameter grid for Gradient Boosting Classifier
# (3 * 3 * 3 * 3 = 81 candidates)
param_grid_gb = {
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate
    'min_samples_leaf': [1, 2, 4],  # Minimum samples in a leaf node
    'max_depth': [3, 5, 7],  # Maximum depth of each tree
    'max_features': [None, 'sqrt', 'log2']  # Number of features to consider at each split
}

# Initialize the Gradient Boosting classifier with a fixed seed. The
# 'sqrt' / 'log2' settings of `max_features` subsample features at random,
# so without `random_state` the search results are not reproducible.
# The seed matches the train/test split.
gb_clf = GradientBoostingClassifier(random_state=0)

# 5-fold cross-validated grid search, ranked by F1 on the positive class
grid_search_gb = GridSearchCV(gb_clf, param_grid_gb, cv=5, scoring='f1', verbose=1)

# Fit the model on the training data to search for best hyperparameters
grid_search_gb.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [51]:
# Tabulate the cross-validation results, highest mean score first
results_gb = pd.DataFrame(grid_search_gb.cv_results_).sort_values(
    by='mean_test_score', ascending=False
)

# Show the five best-scoring parameter combinations
display(HTML(results_gb.loc[:, ['params', 'mean_test_score', 'std_test_score']].iloc[:5].to_html()))

# Keep the winning configuration and the estimator GridSearchCV refit with it
best_estimator_gb = grid_search_gb.best_estimator_
best_params_gb = grid_search_gb.best_params_

# Score the refit estimator on the held-out test split
y_test_pred_gb = best_estimator_gb.predict(X_test)

# Overall accuracy, expressed as a percentage
test_accuracy_gb = 100 * accuracy_score(y_test, y_test_pred_gb)
print(f"Accuracy on test set: {test_accuracy_gb:.2f}%")

# Per-class precision / recall / F1 on the test split
print("Classification Report:")
print(classification_report(y_test, y_test_pred_gb))

Unnamed: 0,params,mean_test_score,std_test_score
70,"{'learning_rate': 0.2, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 2}",0.360036,0.003999
36,"{'learning_rate': 0.1, 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 1}",0.359579,0.00325
65,"{'learning_rate': 0.2, 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 4}",0.35899,0.002001
64,"{'learning_rate': 0.2, 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 2}",0.358924,0.002492
63,"{'learning_rate': 0.2, 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 1}",0.358623,0.002014


Accuracy on test set: 91.07%
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     40952
           1       0.70      0.24      0.36      4735

    accuracy                           0.91     45687
   macro avg       0.81      0.62      0.66     45687
weighted avg       0.90      0.91      0.89     45687

