In [2]:
# Import necessary libraries
import time
import warnings

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier





In [33]:
# Silence scikit-learn's ConvergenceWarning for the rest of the session.
# NOTE(review): this also hides genuine non-convergence of the solvers below.
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

In [17]:
# Step 1: Read the preprocessed basketball dataset into a DataFrame
# (the path is relative to the notebook's working directory)
basketball_data = pd.read_csv(filepath_or_buffer='Basketball_Data_Preprocessed.csv')




In [18]:
# Step 2: Build the basketball target, feature matrix, and train/test split
# Binary label: 1 for rows with at least one recorded injury, 0 otherwise
basketball_data['Injury_Binary'] = basketball_data['Injury Count'].ge(1).astype(int)

# Target first, then the predictors: everything except the raw injury count,
# the derived label, and the 'Player', 'Pos' and 'Tm' columns
y_basketball = basketball_data['Injury_Binary']
X_basketball = basketball_data.drop(
    columns=['Injury Count', 'Injury_Binary', 'Player', 'Pos', 'Tm']
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_basketball,
    y_basketball,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Prepend an intercept column to the basketball training predictors
X_train_b_with_const = sm.add_constant(X_train_b)

# Ordinary least squares on the 0/1 injury label (i.e. a linear probability model)
ols_model = sm.OLS(endog=y_train_b, exog=X_train_b_with_const).fit()

# Show the fitted coefficients together with their t-statistics, p-values,
# and confidence intervals
print(ols_model.summary())




                            OLS Regression Results                            
Dep. Variable:          Injury_Binary   R-squared:                       0.080
Model:                            OLS   Adj. R-squared:                  0.038
Method:                 Least Squares   F-statistic:                     1.882
Date:                Sun, 24 Nov 2024   Prob (F-statistic):            0.00822
Time:                        22:23:03   Log-Likelihood:                -353.62
No. Observations:                 520   AIC:                             755.2
Df Residuals:                     496   BIC:                             857.3
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7442      0.378      1.968      0.0

In [34]:
# Step 4: Logistic Regression (basketball)
# Fit on the basketball training split, then report ROC AUC, training time,
# and per-class metrics on the held-out split.
logistic_model = LogisticRegression(max_iter=1000)

# Time the training step only. perf_counter() is a monotonic, high-resolution
# clock meant for measuring short durations; time.time() follows the wall
# clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
logistic_model.fit(X_train_b, y_train_b)
end_time = time.perf_counter()

y_pred_logistic = logistic_model.predict(X_test_b)
# Second predict_proba column = probability of the positive class (label 1 here)
y_proba_logistic = logistic_model.predict_proba(X_test_b)[:, 1]
logistic_roc_auc = roc_auc_score(y_test_b, y_proba_logistic)

print(f"Logistic Regression ROC AUC: {logistic_roc_auc}")
print(f"Time Taken: {end_time - start_time} seconds")
print(classification_report(y_test_b, y_pred_logistic))


Logistic Regression ROC AUC: 0.6266947171575502
Time Taken: 0.14577007293701172 seconds
              precision    recall  f1-score   support

           0       0.62      0.50      0.55        62
           1       0.62      0.72      0.67        69

    accuracy                           0.62       131
   macro avg       0.62      0.61      0.61       131
weighted avg       0.62      0.62      0.61       131



In [35]:
# Step 5: Random Forest (basketball)
# Fit on the basketball training split, then report ROC AUC, training time,
# and per-class metrics on the held-out split. The seed fixes the forest's
# randomness so reruns give the same model.
random_forest_model = RandomForestClassifier(random_state=42)

# Time the training step only. perf_counter() is a monotonic, high-resolution
# clock meant for measuring short durations; time.time() follows the wall
# clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
random_forest_model.fit(X_train_b, y_train_b)
end_time = time.perf_counter()

y_pred_rf = random_forest_model.predict(X_test_b)
# Second predict_proba column = probability of the positive class (label 1 here)
y_proba_rf = random_forest_model.predict_proba(X_test_b)[:, 1]
rf_roc_auc = roc_auc_score(y_test_b, y_proba_rf)

print(f"Random Forest ROC AUC: {rf_roc_auc}")
print(f"Time Taken: {end_time - start_time} seconds")
print(classification_report(y_test_b, y_pred_rf))


Random Forest ROC AUC: 0.6697054698457223
Time Taken: 0.1495661735534668 seconds
              precision    recall  f1-score   support

           0       0.62      0.52      0.56        62
           1       0.62      0.71      0.66        69

    accuracy                           0.62       131
   macro avg       0.62      0.61      0.61       131
weighted avg       0.62      0.62      0.61       131



In [22]:
# Step 6: Support Vector Machine (basketball)
# Fit an SVC with its default kernel on the basketball training split, then
# report ROC AUC, training time, and per-class metrics on the held-out split.
# probability=True is required for predict_proba to be available.
svm_model = SVC(probability=True, random_state=42)

# Time the training step only. perf_counter() is a monotonic, high-resolution
# clock meant for measuring short durations; time.time() follows the wall
# clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
svm_model.fit(X_train_b, y_train_b)
end_time = time.perf_counter()

y_pred_svm = svm_model.predict(X_test_b)
# Second predict_proba column = probability of the positive class (label 1 here)
y_proba_svm = svm_model.predict_proba(X_test_b)[:, 1]
svm_roc_auc = roc_auc_score(y_test_b, y_proba_svm)

print(f"SVM ROC AUC: {svm_roc_auc}")
print(f"Time Taken: {end_time - start_time} seconds")
print(classification_report(y_test_b, y_pred_svm))


SVM ROC AUC: 0.6173445535296868
Time Taken: 0.09527802467346191 seconds
              precision    recall  f1-score   support

           0       0.60      0.47      0.53        62
           1       0.60      0.72      0.66        69

    accuracy                           0.60       131
   macro avg       0.60      0.60      0.59       131
weighted avg       0.60      0.60      0.60       131



In [36]:
# XGBoost (basketball)
# Fit a gradient-boosted classifier on the basketball training split, then
# report ROC AUC, training time, and per-class metrics on the held-out split.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)

# Time the training step only. perf_counter() is a monotonic, high-resolution
# clock meant for measuring short durations; time.time() follows the wall
# clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
xgb_model.fit(X_train_b, y_train_b)
end_time = time.perf_counter()

y_pred_xgb = xgb_model.predict(X_test_b)
# Second predict_proba column = probability of the positive class (label 1 here)
y_proba_xgb = xgb_model.predict_proba(X_test_b)[:, 1]
xgb_roc_auc = roc_auc_score(y_test_b, y_proba_xgb)

# Display results for XGBoost
print(f"XGBoost ROC AUC: {xgb_roc_auc}")
print(f"Time Taken: {end_time - start_time} seconds")
print(classification_report(y_test_b, y_pred_xgb))



XGBoost ROC AUC: 0.6977559607293128
Time Taken: 0.11744499206542969 seconds
              precision    recall  f1-score   support

           0       0.64      0.56      0.60        62
           1       0.64      0.71      0.68        69

    accuracy                           0.64       131
   macro avg       0.64      0.64      0.64       131
weighted avg       0.64      0.64      0.64       131



In [None]:
# Example preprocessing, packaged as a reusable helper.
# Defining the function runs nothing, so this cell is safe under
# "Restart Kernel -> Run All" even though it sits before the football data is
# loaded. The previous version referenced `football_data` before it existed and
# a placeholder 'target_column' that is not a real column.
def split_and_scale(data, target_column, test_size=0.2, random_state=42):
    """Split a DataFrame into train/test sets and standardize the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the predictors and the target.
    target_column : str
        Name of the column to predict; it is removed from the features.
    test_size : float, default 0.2
        Fraction of rows held out for testing (passed to train_test_split).
    random_state : int, default 42
        Seed for the split, so results are reproducible.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test, scaler). X_train and X_test are the
        arrays returned by the scaler; y_train and y_test keep the type
        returned by train_test_split; scaler is the fitted StandardScaler.

    Raises
    ------
    KeyError
        If target_column is not a column of data.

    Notes
    -----
    All remaining columns are handed to StandardScaler, so they are assumed to
    be numeric; drop or encode other columns before calling.
    """
    X = data.drop(columns=[target_column])
    y = data[target_column]

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Standardize the features. The scaler is fitted on the training rows only
    # and then applied to the test rows, so no test-set statistics leak into
    # training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, scaler


In [4]:
# Read the cleaned football dataset from disk
football_data = pd.read_csv(filepath_or_buffer='football_data_cleaned.csv')  # adjust the path if the file lives elsewhere

# Binary label: 1 for rows with at least one recorded injury, 0 otherwise
football_data['Injury_Binary'] = football_data['n_injuries'].gt(0).astype(int)

# Target first, then the predictors: everything except the raw injury count,
# the derived label, and the 'Unnamed: 0' column
y_football = football_data['Injury_Binary']
X_football = football_data.drop(
    columns=['n_injuries', 'Injury_Binary', 'Unnamed: 0']
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_football,
    y_football,
    test_size=0.2,
    random_state=42,
)



In [5]:

# Prepend an intercept column to the football training predictors
X_train_f_with_const = sm.add_constant(X_train_f)

# Ordinary least squares on the 0/1 injury label (i.e. a linear probability model)
ols_model_football = sm.OLS(endog=y_train_f, exog=X_train_f_with_const).fit()

# Show the fitted coefficients together with their t-statistics, p-values,
# and confidence intervals
print(ols_model_football.summary())


                            OLS Regression Results                            
Dep. Variable:          Injury_Binary   R-squared:                       0.277
Model:                            OLS   Adj. R-squared:                  0.259
Method:                 Least Squares   F-statistic:                     14.97
Date:                Mon, 25 Nov 2024   Prob (F-statistic):           2.60e-79
Time:                        18:03:41   Log-Likelihood:                -837.23
No. Observations:                1521   AIC:                             1752.
Df Residuals:                    1482   BIC:                             1960.
Df Model:                          38                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
age               

In [12]:
# Logistic Regression (football)
# Fit on the football training split, then report ROC AUC, training time,
# and per-class metrics on the held-out split.
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

# Time model fitting only, so "Time Taken" means the same thing as in the
# basketball cells (which time `fit` alone) rather than also including
# prediction and scoring. perf_counter() is a monotonic, high-resolution clock
# meant for measuring short durations, unlike the wall-clock time.time().
start_time = time.perf_counter()
log_reg_model.fit(X_train_f, y_train_f)
end_time = time.perf_counter()

y_pred_logistic = log_reg_model.predict(X_test_f)
# Second predict_proba column = probability of the positive class (label 1 here)
y_pred_proba_logistic = log_reg_model.predict_proba(X_test_f)[:, 1]
logistic_roc_auc = roc_auc_score(y_test_f, y_pred_proba_logistic)

print(f"Logistic Regression ROC AUC: {logistic_roc_auc:.2f}")
print(f"Time Taken: {end_time - start_time:.2f} seconds")
print(classification_report(y_test_f, y_pred_logistic))




Logistic Regression ROC AUC: 0.84
Time Taken: 0.04 seconds
              precision    recall  f1-score   support

           0       0.76      0.80      0.78       224
           1       0.69      0.65      0.67       157

    accuracy                           0.74       381
   macro avg       0.73      0.72      0.73       381
weighted avg       0.74      0.74      0.74       381



In [13]:
# Support Vector Machine (football)
# Fit a linear-kernel SVC on the football training split, then report ROC AUC,
# training time, and per-class metrics on the held-out split.
# probability=True is required for predict_proba to be available.
svm_model = SVC(kernel='linear', probability=True, random_state=42)

# Time model fitting only, so "Time Taken" means the same thing as in the
# basketball cells (which time `fit` alone) rather than also including
# prediction and scoring. perf_counter() is a monotonic, high-resolution clock
# meant for measuring short durations, unlike the wall-clock time.time().
start_time = time.perf_counter()
svm_model.fit(X_train_f, y_train_f)
end_time = time.perf_counter()

y_pred_svm = svm_model.predict(X_test_f)
# Second predict_proba column = probability of the positive class (label 1 here)
y_pred_proba_svm = svm_model.predict_proba(X_test_f)[:, 1]
svm_roc_auc = roc_auc_score(y_test_f, y_pred_proba_svm)

print(f"\nSVM ROC AUC: {svm_roc_auc:.2f}")
print(f"Time Taken: {end_time - start_time:.2f} seconds")
print(classification_report(y_test_f, y_pred_svm))



SVM ROC AUC: 0.84
Time Taken: 0.68 seconds
              precision    recall  f1-score   support

           0       0.77      0.80      0.79       224
           1       0.70      0.67      0.68       157

    accuracy                           0.75       381
   macro avg       0.74      0.73      0.74       381
weighted avg       0.74      0.75      0.74       381



In [14]:
# Random Forest (football)
# Fit a 100-tree forest on the football training split, then report ROC AUC,
# training time, and per-class metrics on the held-out split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Time model fitting only, so "Time Taken" means the same thing as in the
# basketball cells (which time `fit` alone) rather than also including
# prediction and scoring. perf_counter() is a monotonic, high-resolution clock
# meant for measuring short durations, unlike the wall-clock time.time().
start_time = time.perf_counter()
rf_model.fit(X_train_f, y_train_f)
end_time = time.perf_counter()

y_pred_rf = rf_model.predict(X_test_f)
# Second predict_proba column = probability of the positive class (label 1 here)
y_pred_proba_rf = rf_model.predict_proba(X_test_f)[:, 1]
rf_roc_auc = roc_auc_score(y_test_f, y_pred_proba_rf)

print(f"\nRandom Forest ROC AUC: {rf_roc_auc:.2f}")
print(f"Time Taken: {end_time - start_time:.2f} seconds")
print(classification_report(y_test_f, y_pred_rf))



Random Forest ROC AUC: 0.83
Time Taken: 0.38 seconds
              precision    recall  f1-score   support

           0       0.74      0.82      0.78       224
           1       0.70      0.60      0.64       157

    accuracy                           0.73       381
   macro avg       0.72      0.71      0.71       381
weighted avg       0.72      0.73      0.72       381



In [15]:
# XGBoost (football)
# Fit a gradient-boosted classifier on the football training split, then
# report ROC AUC, training time, and per-class metrics on the held-out split.
# `use_label_encoder` is intentionally not passed: it is deprecated, current
# XGBoost releases ignore it with a warning, and the basketball XGBoost cell
# already omits it.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)

# Time model fitting only, so "Time Taken" means the same thing as in the
# basketball cells (which time `fit` alone) rather than also including
# prediction and scoring. perf_counter() is a monotonic, high-resolution clock
# meant for measuring short durations, unlike the wall-clock time.time().
start_time = time.perf_counter()
xgb_model.fit(X_train_f, y_train_f)
end_time = time.perf_counter()

y_pred_xgb = xgb_model.predict(X_test_f)
# Second predict_proba column = probability of the positive class (label 1 here)
y_pred_proba_xgb = xgb_model.predict_proba(X_test_f)[:, 1]
xgb_roc_auc = roc_auc_score(y_test_f, y_pred_proba_xgb)

print(f"\nXGBoost ROC AUC: {xgb_roc_auc:.2f}")
print(f"Time Taken: {end_time - start_time:.2f} seconds")
print(classification_report(y_test_f, y_pred_xgb))




XGBoost ROC AUC: 0.81
Time Taken: 0.26 seconds
              precision    recall  f1-score   support

           0       0.74      0.78      0.76       224
           1       0.66      0.61      0.64       157

    accuracy                           0.71       381
   macro avg       0.70      0.70      0.70       381
weighted avg       0.71      0.71      0.71       381

