In [1]:
from pyspark.sql import SparkSession
import os
import pandas as pd
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, NaiveBayes
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import time

In [2]:
# Just for easier visualization
def show(df, n=None):
    """Return a Spark DataFrame as a pandas DataFrame for notebook display.

    ``toPandas()`` collects every row to the driver, so pass ``n`` to
    preview only the first rows of a large frame.

    Parameters
    ----------
    df : object exposing ``toPandas()`` (and ``limit()`` when ``n`` is given),
        e.g. a ``pyspark.sql.DataFrame``.
    n : int, optional
        Maximum number of rows to collect. ``None`` (the default) collects
        the whole frame, which is the original behaviour.

    Returns
    -------
    Whatever ``df.toPandas()`` returns (a ``pandas.DataFrame`` for Spark).
    """
    if n is not None:
        # Truncate on the Spark side so only ``n`` rows travel to the driver.
        df = df.limit(n)
    return df.toPandas()

In [3]:
# Build (or reuse) the notebook's Spark session, requesting 4 GB for both
# the executors and the driver.
spark = (
    SparkSession.builder
    .appName('tennis')
    .config('spark.executor.memory', '4g')
    .config('spark.driver.memory', '4g')
    .getOrCreate()
)

In [4]:
# Load the prepared dataset stored in Parquet format at the relative path 'final_df'.
final_df = spark.read.format('parquet').load('final_df')
# Last expression of the cell, so Jupyter renders the pandas table returned by show().
show(final_df)

Unnamed: 0,best_of,draw_size,p1_rank,p1_ht,p1_age,p2_rank,p2_ht,p2_age,p1_ace_avg,p1_df_avg,...,round_encoded,p1_name_encoded,p1_hand_encoded,p1_ioc_encoded,p1_entry_encoded,p2_name_encoded,p2_hand_encoded,p2_ioc_encoded,p2_entry_encoded,Winner
0,3,32,22,183,23.8,95,175,24.4,3.500000,3.000000,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",1
1,3,32,9,188,32.9,107,188,22.9,6.333333,3.000000,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)",1
2,3,32,59,178,25.5,54,198,22.6,3.333333,3.000000,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",1
3,3,32,43,183,22.8,109,190,21.8,6.461538,4.000000,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)",1
4,3,32,17,190,27.9,95,175,24.9,4.750000,1.083333,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73214,5,128,51,188,36.1,7,198,22.7,9.400000,2.200000,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",0
73215,5,128,30,188,28.5,1,185,33.6,1.666667,2.000000,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",0
73216,5,128,19,206,34.7,15,183,34.8,18.571429,1.857143,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",0
73217,5,128,48,188,29.1,11,183,32.3,6.666667,3.333333,...,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)",0


### DATA PREPARATION

In [5]:
# DATA PREPARATION PART

# Scalar input columns.
numeric_cols = [
    'best_of', 'draw_size',
    'p1_rank', 'p1_ht', 'p1_age',
    'p2_rank', 'p2_ht', 'p2_age',
    'p1_ace_avg', 'p1_df_avg', 'p1_1stW_avg', 'p1_2ndW_avg',
    'p2_ace_avg', 'p2_df_avg', 'p2_1stW_avg', 'p2_2ndW_avg',
]

# Vector-valued '*_encoded' columns already present in final_df.
encoded_cols = [
    'surface_encoded', 'tourney_level_encoded', 'tourney_name_encoded', 'round_encoded',
    'p1_name_encoded', 'p1_hand_encoded', 'p1_ioc_encoded', 'p1_entry_encoded',
    'p2_name_encoded', 'p2_hand_encoded', 'p2_ioc_encoded', 'p2_entry_encoded',
]

# Full feature list; the order fixes the layout of the assembled vector.
feature_cols = numeric_cols + encoded_cols

# Concatenates every feature column into a single "features" vector column.
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

# 80/20 train/test split; the fixed seed requests the same random split on every run.
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=123)

# Shared evaluator scoring predictions against the "Winner" label.
# No metricName is given, so the library default (areaUnderROC) applies.
evaluator = BinaryClassificationEvaluator(labelCol="Winner")




### Naive Bayes

In [None]:
# Create a Naive Bayes classifier and chain it after the feature assembler
nb = NaiveBayes(labelCol="Winner", featuresCol="features")
pipeline_nb = Pipeline(stages=[assembler, nb])

# Define Parameter Grid: candidate smoothing values to search over
param_grid_nb = ParamGridBuilder() \
    .addGrid(nb.smoothing, [0.0, 0.5, 1.0]) \
    .build()

# Accuracy evaluator for this model only. It gets its own name so that the
# shared `evaluator` from the data-preparation cell is NOT rebound here;
# otherwise the metric used by later cells would depend on whether this
# cell had been executed.
evaluator_nb = MulticlassClassificationEvaluator(labelCol="Winner", metricName="accuracy")

start_time = time.time()
# Cross validation + grid search (5 folds); seed fixed so fold assignment is reproducible
cv_nb = CrossValidator(estimator=pipeline_nb,
                       estimatorParamMaps=param_grid_nb,
                       evaluator=evaluator_nb,
                       numFolds=5,
                       seed=123)

# Fit on the training split, then score the selected model on the held-out test split
cv_model_nb = cv_nb.fit(train_df)
predictions_nb = cv_model_nb.transform(test_df)
evaluation_metric_nb = evaluator_nb.evaluate(predictions_nb)

# Elapsed wall-clock seconds for fit + predict + evaluate
print(time.time() - start_time)
print(f"Best Accuracy: {evaluation_metric_nb}")

### Random Forest

In [None]:

# Define Model (seeded so the forest's random sampling is reproducible)
rf = RandomForestClassifier(labelCol="Winner", featuresCol="features", seed=123)
pipeline = Pipeline(stages=[assembler, rf])

# Grid of 2 x 2 = 4 candidate settings: number of trees and maximum tree depth
param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [50, 100]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .build()

start_time = time.time()
# Cross validation + grid search; seed fixed so fold assignment is reproducible
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=5,  # Number of folds for cross-validation
                    seed=123)

# Fit the model with grid search on the training data
cv_model = cv.fit(train_df)

# Make predictions on the test set
predictions = cv_model.transform(test_df)

# Evaluate the model performance using the evaluator
evaluation_metric = evaluator.evaluate(predictions)

# Elapsed wall-clock seconds for fit + predict + evaluate
print(time.time() - start_time)
# The metric reported is whichever one `evaluator` is configured for
# (evaluator.getMetricName() tells which).
print(f"Evaluation Metric: {evaluation_metric}")

# Keep a handle on the best pipeline model selected by cross-validation
best_model = cv_model.bestModel

### Gradient-Boosted Trees

In [None]:
# Define Model: gradient-boosted trees with 10 boosting iterations,
# seeded so any random sampling inside training is reproducible
gbt = GBTClassifier(labelCol="Winner", featuresCol="features", maxIter=10, seed=123)
pipeline = Pipeline(stages=[assembler, gbt])

# Define Parameter Grid for GBT: only the maximum tree depth is tuned
param_grid_gbt = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [5, 10]) \
    .build()

start_time = time.time()
# Cross validation + grid search (5 folds); seed fixed so fold assignment is reproducible
cv_gbt = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=param_grid_gbt,
                        evaluator=evaluator,
                        numFolds=5,
                        seed=123)

# Fit the model
cv_model_gbt = cv_gbt.fit(train_df)

# Make predictions on the held-out test split
predictions_gbt = cv_model_gbt.transform(test_df)

# Evaluate the model with whichever metric `evaluator` is configured for
evaluation_metric_gbt = evaluator.evaluate(predictions_gbt)
# Elapsed wall-clock seconds for fit + predict + evaluate
print(time.time() - start_time)

# Print the evaluation metric
print(f"Evaluation Metric (GBT): {evaluation_metric_gbt}")