# Load Dataset


In [2]:
import pandas as pd

# Load the match records from the CSV file into a DataFrame and show it.
DATA_PATH = 'pickleball_matches.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,p1_rank,p2_rank,p1_win_rate,p2_win_rate,court_type,weather,match_duration_minutes,winner,p1_id,p2_id
0,194,133,0.23,0.43,Indoor,,45,Player 2,P1012,P1015
1,99,112,0.73,0.61,Outdoor,Windy,52,Player 1,P1002,P1020
2,119,174,0.61,0.78,Indoor,,63,Player 1,P1020,P1018
3,132,177,0.19,0.40,Outdoor,Windy,54,Player 2,P1009,P1016
4,67,92,0.48,0.73,Outdoor,Sunny,53,Player 2,P1005,P1002
...,...,...,...,...,...,...,...,...,...,...
1995,125,92,0.66,0.73,Outdoor,Cloudy,56,Player 2,P1010,P1002
1996,68,130,0.74,0.67,Outdoor,Windy,37,Player 1,P1014,P1003
1997,184,60,0.40,0.48,Outdoor,Cloudy,48,Player 2,P1016,P1005
1998,132,69,0.19,0.49,Outdoor,Cloudy,59,Player 2,P1009,P1007


# Data Preparation

## Data separation into X and y (y is the target)

In [3]:
from sklearn.preprocessing import LabelEncoder

# Turn the string labels in 'winner' into integer class codes.
# The fitted encoder is kept in `le` so the codes can be mapped back to the
# original label names later on.
le = LabelEncoder().fit(df['winner'])
y_encoded = le.transform(df['winner'])

# `y` is the name the rest of the notebook uses for the target vector.
y = y_encoded
display(y)

array([1, 0, 0, ..., 1, 1, 1])

In [4]:
# Feature matrix: every column except the target and the two player identifiers.
non_feature_columns = ['winner', 'p1_id', 'p2_id']
X = df.drop(columns=non_feature_columns)
X

Unnamed: 0,p1_rank,p2_rank,p1_win_rate,p2_win_rate,court_type,weather,match_duration_minutes
0,194,133,0.23,0.43,Indoor,,45
1,99,112,0.73,0.61,Outdoor,Windy,52
2,119,174,0.61,0.78,Indoor,,63
3,132,177,0.19,0.40,Outdoor,Windy,54
4,67,92,0.48,0.73,Outdoor,Sunny,53
...,...,...,...,...,...,...,...
1995,125,92,0.66,0.73,Outdoor,Cloudy,56
1996,68,130,0.74,0.67,Outdoor,Windy,37
1997,184,60,0.40,0.48,Outdoor,Cloudy,48
1998,132,69,0.19,0.49,Outdoor,Cloudy,59


In [5]:
# One-hot encode the two categorical features and swap them in for the originals.
# NOTE: this cell removes 'court_type' and 'weather' from X, so re-running it
# without first re-creating X raises a KeyError.
categorical_columns = ['court_type', 'weather']

# dummy_na=False: missing values get no indicator column of their own, so such
# rows are all-False across that feature's dummy columns.
X_categorical = X[categorical_columns]
X_categorical_encoded = pd.get_dummies(X_categorical, dummy_na=False)

# Remaining columns first, encoded columns appended on the right.
X = pd.concat(
    [X.drop(columns=categorical_columns), X_categorical_encoded],
    axis=1,
)

display(X.head())

Unnamed: 0,p1_rank,p2_rank,p1_win_rate,p2_win_rate,match_duration_minutes,court_type_Indoor,court_type_Outdoor,weather_Cloudy,weather_Sunny,weather_Windy
0,194,133,0.23,0.43,45,True,False,False,False,False
1,99,112,0.73,0.61,52,False,True,False,False,True
2,119,174,0.61,0.78,63,True,False,False,False,False
3,132,177,0.19,0.4,54,False,True,False,False,True
4,67,92,0.48,0.73,53,False,True,False,True,False


## Data Splitting


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [7]:
# Inspect the training feature matrix produced by the split
X_train

Unnamed: 0,p1_rank,p2_rank,p1_win_rate,p2_win_rate,match_duration_minutes,court_type_Indoor,court_type_Outdoor,weather_Cloudy,weather_Sunny,weather_Windy
1260,89,177,0.34,0.40,47,False,True,True,False,False
916,67,91,0.48,0.70,49,True,False,False,False,False
532,184,92,0.40,0.73,54,False,True,False,False,True
1159,98,40,0.70,0.58,46,True,False,False,False,False
1584,68,187,0.74,0.23,32,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
1879,194,112,0.23,0.61,48,True,False,False,False,False
1895,193,21,0.30,0.22,34,False,True,False,False,True
1859,76,118,0.49,0.66,47,True,False,False,False,False
792,68,130,0.74,0.67,60,False,True,False,False,True


In [8]:
# Inspect the held-out test feature matrix produced by the split
X_test

Unnamed: 0,p1_rank,p2_rank,p1_win_rate,p2_win_rate,match_duration_minutes,court_type_Indoor,court_type_Outdoor,weather_Cloudy,weather_Sunny,weather_Windy
1025,115,162,0.36,0.74,46,False,True,True,False,False
1208,119,130,0.61,0.67,60,False,True,False,True,False
1055,181,177,0.78,0.40,61,False,True,True,False,False
367,99,61,0.73,0.74,44,True,False,False,False,False
815,67,69,0.48,0.49,64,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
807,119,118,0.61,0.66,62,True,False,False,False,False
711,169,69,0.74,0.49,39,True,False,False,False,False
1541,119,118,0.61,0.66,71,False,True,False,True,False
1001,148,186,0.38,0.30,62,False,True,False,True,False


# Model Building

## Logistic Regression

In [51]:
from sklearn.linear_model import LogisticRegression

# Iteration cap passed to the solver (raised from the library default).
LR_MAX_ITER = 1000

# Fit a logistic regression on the training split.
lr = LogisticRegression(max_iter=LR_MAX_ITER)
lr.fit(X_train, y_train)

In [10]:
# Hard class predictions from the fitted logistic regression, for both splits.
y_lr_train_predict, y_lr_test_predict = (
    lr.predict(features) for features in (X_train, X_test)
)

In [11]:
# Predicted class codes for the training rows (logistic regression)
y_lr_train_predict

array([0, 0, 1, ..., 0, 0, 1])

In [12]:
# Predicted class codes for the test rows (logistic regression)
y_lr_test_predict

array([1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,

### Model Performance

In [46]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

# ROC AUC is computed from scores rather than hard labels, so take the predicted
# probability of the class in column 1 of predict_proba for each split.
y_lr_train_proba = lr.predict_proba(X_train)[:, 1]
y_lr_test_proba = lr.predict_proba(X_test)[:, 1]

# Training-split metrics
lr_train_cm = confusion_matrix(y_train, y_lr_train_predict)
lr_train_acc = accuracy_score(y_train, y_lr_train_predict)
lr_train_roc = roc_auc_score(y_train, y_lr_train_proba)

# Test-split metrics
lr_test_cm = confusion_matrix(y_test, y_lr_test_predict)
lr_test_acc = accuracy_score(y_test, y_lr_test_predict)
lr_test_roc = roc_auc_score(y_test, y_lr_test_proba)



#### Confusion Matrix

In [47]:
# Print the logistic regression confusion matrices, training split first.
for heading, matrix in (
    ("\nTrain Confusion Matrix:\n", lr_train_cm),
    ("\nTest Confusion Matrix:\n", lr_test_cm),
):
    print(heading, matrix)



Train Confusion Matrix:
 [[499 243]
 [234 624]]

Test Confusion Matrix:
 [[121  67]
 [ 66 146]]


#### Accuracy Score

In [48]:
# Print the logistic regression accuracy on each split, training split first.
for heading, score in (
    ("\nTrain Accuracy Score:\n", lr_train_acc),
    ("\nTest Accuracy Score:\n", lr_test_acc),
):
    print(heading, score)


Train Accuracy Score:
 0.701875

Test Accuracy Score:
 0.6675


#### ROC AUC

In [49]:
# Print the logistic regression ROC AUC on each split, training split first.
for heading, score in (
    ("\nTrain ROC AUC:\n", lr_train_roc),
    ("\nTest ROC AUC:\n", lr_test_roc),
):
    print(heading, score)


Train ROC AUC:
 0.7844435438775061

Test ROC AUC:
 0.7584805299076676


## Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split.
# The forest is built with random bootstrapping and random feature sub-sampling,
# so without a fixed random_state every re-run produces a different model and
# different metrics. The seed matches the one used for train_test_split.
rf = RandomForestClassifier(random_state=100)
rf.fit(X_train, y_train)

In [29]:
# Hard class predictions from the fitted random forest, for both splits.
y_rf_train_predict, y_rf_test_predict = [
    rf.predict(features) for features in (X_train, X_test)
]

In [30]:
# Predicted class codes for the training rows (random forest)
y_rf_train_predict

array([0, 0, 1, ..., 0, 0, 1])

In [31]:
# Predicted class codes for the test rows (random forest)
y_rf_test_predict

array([1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,

### Model Performance

In [50]:
# Probability of the class in column 1 of predict_proba, used as the ROC AUC score.
y_rf_train_proba = rf.predict_proba(X_train)[:, 1]
y_rf_test_proba = rf.predict_proba(X_test)[:, 1]

# Training-split metrics
y_rf_train_predict_cm = confusion_matrix(y_train, y_rf_train_predict)
y_rf_train_predict_acc = accuracy_score(y_train, y_rf_train_predict)
rf_train_roc = roc_auc_score(y_train, y_rf_train_proba)

# Test-split metrics
y_rf_test_predict_cm = confusion_matrix(y_test, y_rf_test_predict)
y_rf_test_predict_acc = accuracy_score(y_test, y_rf_test_predict)
rf_test_roc = roc_auc_score(y_test, y_rf_test_proba)

#### Confusion Matrix

In [35]:
# Print the random forest confusion matrices, training split first.
for heading, matrix in (
    ("\nTrain Confusion Matrix:\n", y_rf_train_predict_cm),
    ("\nTest Confusion Matrix:\n", y_rf_test_predict_cm),
):
    print(heading, matrix)


Train Confusion Matrix:
 [[736   6]
 [  4 854]]

Test Confusion Matrix:
 [[118  70]
 [ 69 143]]


#### ROC AUC

In [52]:
# Print the random forest ROC AUC on each split, training split first.
for heading, score in (
    ("\nTrain ROC AUC:\n", rf_train_roc),
    ("\nTest ROC AUC:\n", rf_test_roc),
):
    print(heading, score)


Train ROC AUC:
 0.999910466891599

Test ROC AUC:
 0.7088895022079486


#### Accuracy Score

In [37]:
# Print the random forest accuracy on each split, training split first.
for heading, score in (
    ("\nTrain Accuracy Score:\n", y_rf_train_predict_acc),
    ("\nTest Accuracy Score:\n", y_rf_test_predict_acc),
):
    print(heading, score)


Train Accuracy Score:
 0.99375

Test Accuracy Score:
 0.6525


## XGBoost

In [38]:
!pip install xgboost



In [39]:
import xgboost as xgb

## XGBoost Model

In [53]:
# Build a gradient-boosted classifier with a binary logistic objective,
# tracking log loss as its evaluation metric, and fit it on the training split.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
)
xgb_model.fit(X_train, y_train)

### Make Predictions

In [54]:
# Hard class predictions from the fitted XGBoost model, for both splits.
y_xgb_train_predict, y_xgb_test_predict = map(
    xgb_model.predict, (X_train, X_test)
)

### Model Performance

In [55]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score


# Probability of the class in column 1 of predict_proba, used as the ROC AUC score.
y_xgb_train_proba = xgb_model.predict_proba(X_train)[:, 1]
y_xgb_test_proba = xgb_model.predict_proba(X_test)[:, 1]

# Training-split metrics
xgb_train_cm = confusion_matrix(y_train, y_xgb_train_predict)
xgb_train_acc = accuracy_score(y_train, y_xgb_train_predict)
xgb_train_roc = roc_auc_score(y_train, y_xgb_train_proba)

# Test-split metrics
xgb_test_cm = confusion_matrix(y_test, y_xgb_test_predict)
xgb_test_acc = accuracy_score(y_test, y_xgb_test_predict)
xgb_test_roc = roc_auc_score(y_test, y_xgb_test_proba)

#### Confusion Matrix

In [56]:
# Print the XGBoost confusion matrices, training split first.
for heading, matrix in (
    ("\nXGBoost Train Confusion Matrix:\n", xgb_train_cm),
    ("\nXGBoost Test Confusion Matrix:\n", xgb_test_cm),
):
    print(heading, matrix)


XGBoost Train Confusion Matrix:
 [[715  27]
 [ 30 828]]

XGBoost Test Confusion Matrix:
 [[122  66]
 [ 71 141]]


#### Accuracy Score

In [57]:
# Print the XGBoost accuracy on each split, training split first.
for heading, score in (
    ("\nXGBoost Train Accuracy Score:\n", xgb_train_acc),
    ("\nXGBoost Test Accuracy Score:\n", xgb_test_acc),
):
    print(heading, score)


XGBoost Train Accuracy Score:
 0.964375

XGBoost Test Accuracy Score:
 0.6575


#### ROC AUC

In [58]:
# Print the XGBoost ROC AUC on each split, training split first.
for heading, score in (
    ("\nXGBoost Train ROC AUC:\n", xgb_train_roc),
    ("\nXGBoost Test ROC AUC:\n", xgb_test_roc),
):
    print(heading, score)


XGBoost Train ROC AUC:
 0.9953348538254199

XGBoost Test ROC AUC:
 0.7002709755118426


## Model Export

In [60]:
import joblib

# Persist the fitted logistic regression to disk with joblib.
# Only the estimator is written; the LabelEncoder (`le`) that maps class codes
# back to the original label names is not part of this file.
model_filename = 'logistic_regression_model.joblib'
joblib.dump(value=lr, filename=model_filename)

print(f"Logistic Regression model exported to '{model_filename}'")

Logistic Regression model exported to 'logistic_regression_model.joblib'


In [61]:
# Show which integer code the LabelEncoder assigned to each original label;
# a label's position in `le.classes_` is its code.
print("Original classes and their numerical mapping:")
for label_code, original_label in enumerate(le.classes_):
    print(f"Numerical Label {label_code}: {original_label}")

Original classes and their numerical mapping:
Numerical Label 0: Player 1
Numerical Label 1: Player 2


In [62]:
# Preview the indicator columns that one-hot encoding produced for
# court_type and weather in the feature frame X.
one_hot_columns = [
    'court_type_Indoor',
    'court_type_Outdoor',
    'weather_Cloudy',
    'weather_Sunny',
    'weather_Windy',
]
display(X[one_hot_columns].head())

Unnamed: 0,court_type_Indoor,court_type_Outdoor,weather_Cloudy,weather_Sunny,weather_Windy
0,True,False,False,False,False
1,False,True,False,False,True
2,True,False,False,False,False
3,False,True,False,False,True
4,False,True,False,True,False
