In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder \
                    .appName('tennis_ml') \
                    .getOrCreate()

In [3]:
data = spark.read.csv("singles/atp_matches_2020.csv", header=True, inferSchema=True)

In [6]:
from pyspark.sql.functions import col

null_columns = []
for column in data.columns:
    if data.where(col(column).isNull()).count() == data.count():
        null_columns.append(column)

if null_columns:
    data = data.drop(*null_columns)
    print("Columns with all NULL values removed:", null_columns)
else:
    print("No columns with all NULL values found.")

No columns with all NULL values found.


label 0 for player 1 winning, label 2 for player 2 winning

In [7]:
selected_columns = ['surface', 'winner_hand', 'winner_age', 'loser_hand', 'loser_age', 'winner_rank_points', 'loser_rank_points', 'loser_ht', 'winner_ht']
data = data.select(selected_columns)
data.show(1)

+-------+-----------+----------+----------+---------+------------------+-----------------+--------+---------+
|surface|winner_hand|winner_age|loser_hand|loser_age|winner_rank_points|loser_rank_points|loser_ht|winner_ht|
+-------+-----------+----------+----------+---------+------------------+-----------------+--------+---------+
|   Hard|          R|      32.6|         L|     33.5|              9055|             9985|     185|      188|
+-------+-----------+----------+----------+---------+------------------+-----------------+--------+---------+
only showing top 1 row



In [15]:
from pyspark.sql.functions import col, when, lit
        
data = data.withColumn("player_age_diff", col("winner_age") - col("loser_age"))
data = data.withColumn('player_rank_diff', col('winner_rank_points') - col('loser_rank_points'))
data = data.withColumn('player_ht_diff', col('winner_ht') - col('loser_ht'))
data = data.withColumn("hand_diff",
                       when((col("winner_hand") == "U") | (col("loser_hand") == "U"), 2)
                       .when(col("winner_hand") == col("loser_hand"), 0)
                       .otherwise(1))
data = data.withColumn('label', lit(1))
                       

engineered_columns = ['player_age_diff', 'player_rank_diff', 'player_ht_diff', 'hand_diff', 'label']
df = data.select(engineered_columns)
df.show(2)

+-------------------+----------------+--------------+---------+-----+
|    player_age_diff|player_rank_diff|player_ht_diff|hand_diff|label|
+-------------------+----------------+--------------+---------+-----+
|-0.8999999999999986|            -930|             3|        1|    1|
| 2.1999999999999993|            1084|             3|        0|    1|
+-------------------+----------------+--------------+---------+-----+
only showing top 2 rows



In [37]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib

# df = df.dropna()
# df = df.toPandas()
X = df.drop(["label"], axis=1)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# # Save trained model
# joblib.dump(knn_model, 'knn_model.joblib')

predict_test = knn_model.predict(X_test)
print(f"The accuracy score is {accuracy_score(y_test, predict_test)}")

The accuracy score is 1.0
