In [36]:
from pyspark.sql import SparkSession
import os
from pyspark.sql.functions import avg, col, count, lit, to_date, split, when, coalesce, round, unix_timestamp
from pyspark.sql.types import MapType, StringType
from pyspark.sql.window import Window
import pandas as pd

In [2]:
# Just for easier visualization
def show(df):
    pandas_df = df.toPandas()
    return pandas_df

In [3]:
spark = SparkSession.builder \
                    .appName('tennis') \
                    .getOrCreate()

In [4]:
singles=spark.read.option("header","true").option("inferSchema","true") \
.csv('singles')

In [5]:
from pyspark.sql.functions import col, count, round

singles = singles.withColumn('winner_entry', when(singles['winner_entry'].isNull(), "Standard").otherwise(singles['winner_entry']))
singles = singles.withColumn('loser_entry', when(singles['loser_entry'].isNull(), "Standard").otherwise(singles['loser_entry']))

null_counts = singles.select([round(((count(when(col(c).isNull(), c)) / singles.count()) * 100), 1).alias(c) for c in singles.columns])
show(null_counts)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,0.0,0.0,1.6,0.4,0.0,0.0,0.0,0.0,63.0,0.0,...,50.5,50.5,50.5,50.5,50.5,50.5,18.6,43.2,23.0,44.1


In [6]:
singles.printSchema()

root
 |-- tourney_id: string (nullable = true)
 |-- tourney_name: string (nullable = true)
 |-- surface: string (nullable = true)
 |-- draw_size: integer (nullable = true)
 |-- tourney_level: string (nullable = true)
 |-- tourney_date: integer (nullable = true)
 |-- match_num: integer (nullable = true)
 |-- winner_id: integer (nullable = true)
 |-- winner_seed: integer (nullable = true)
 |-- winner_entry: string (nullable = true)
 |-- winner_name: string (nullable = true)
 |-- winner_hand: string (nullable = true)
 |-- winner_ht: integer (nullable = true)
 |-- winner_ioc: string (nullable = true)
 |-- winner_age: double (nullable = true)
 |-- loser_id: integer (nullable = true)
 |-- loser_seed: integer (nullable = true)
 |-- loser_entry: string (nullable = true)
 |-- loser_name: string (nullable = true)
 |-- loser_hand: string (nullable = true)
 |-- loser_ht: integer (nullable = true)
 |-- loser_ioc: string (nullable = true)
 |-- loser_age: double (nullable = true)
 |-- score: string (

In [7]:
# Selecting relevant columns for feature engineering
selected_columns = ['surface',
                    'tourney_level',
                    'tourney_date',
                    'tourney_name',
                    'draw_size',
                    'round',
                    'best_of',
                    'winner_name',
                    'winner_hand',
                    'winner_rank',
                    'winner_ht',
                    'winner_ioc',
                    'winner_age',
                    'winner_entry',
                    'loser_name',
                    'loser_hand',
                    'loser_rank',
                    'loser_ht',
                    'loser_ioc',
                    'loser_age',
                    'loser_entry',
                    'w_ace',
                    'w_df',
                    'w_svpt',
                    'w_1stIn',
                    'w_1stWon',
                    'w_2ndWon',
                    'w_SvGms',
                    'l_ace',
                    'l_df',
                    'l_svpt',
                    'l_1stIn',
                    'l_1stWon',
                    'l_2ndWon',
                    'l_SvGms'
                   ]

# Filtering relevant columns
clean_df = singles.select(selected_columns)

# Kepping only Bo3 and Bo5 matches
clean_df = clean_df.filter(clean_df['best_of'] != 1)

# Removing Nulls
clean_df = clean_df.dropna()

# Changing date format 
clean_df = clean_df.withColumn('tourney_date', to_date(clean_df['tourney_date'].cast('string'), 'yyyyMMdd'))

show(clean_df)

Unnamed: 0,surface,tourney_level,tourney_date,tourney_name,draw_size,round,best_of,winner_name,winner_hand,winner_rank,...,w_1stWon,w_2ndWon,w_SvGms,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms
0,Hard,A,1994-01-03,Adelaide,32,R32,3,Thomas Muster,L,9,...,21,13,8,6,6,64,30,17,15,8
1,Hard,A,1994-01-03,Adelaide,32,R32,3,Brett Steven,R,43,...,29,8,9,0,4,55,34,21,6,8
2,Hard,A,1994-01-03,Adelaide,32,R32,3,Karel Novacek,R,17,...,31,10,9,1,5,63,35,24,12,9
3,Hard,A,1994-01-03,Adelaide,32,R32,3,Jamie Morgan,R,64,...,38,24,16,2,5,99,52,32,21,15
4,Hard,A,1994-01-03,Adelaide,32,R32,3,David Rikl,L,95,...,27,11,10,2,2,74,48,29,11,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92799,Hard,A,2020-10-19,Antwerp,32,R32,3,Daniel Evans,R,35,...,36,14,13,4,1,77,43,31,14,13
92800,Hard,A,2020-10-19,Antwerp,32,R32,3,Zizou Bergs,R,528,...,33,13,12,1,3,66,45,27,10,12
92801,Hard,A,2020-10-19,Antwerp,32,R32,3,Taylor Fritz,R,28,...,39,6,10,19,3,58,37,30,5,9
92802,Hard,A,2020-10-19,Antwerp,32,R32,3,Lloyd Harris,R,90,...,28,11,10,0,2,62,45,29,5,10


In [8]:
print(f'Number of rows: {clean_df.count()}')
print(f'Number of columns: {len(clean_df.columns)}')

Number of rows: 92804
Number of columns: 35


In [9]:
# Creating Label column. 1 if the first player won, 0 if the second player won.
clean_df = clean_df.withColumn("Winner", lit(1))

In [10]:
## Randomizing player 1 and 2 so that the label isn't always 1

# Dividing into two df randomly
split_1, split_2 = clean_df.randomSplit([0.5, 0.5], seed=10)

# Changing column order for split 2
split_2_order = ['surface',
                'tourney_level',
                'tourney_date',
                'tourney_name',
                'draw_size',
                'round',
                'best_of',
                'loser_name',
                'loser_hand',
                'loser_rank',
                'loser_ht',
                'loser_ioc',
                'loser_age',
                'loser_entry',
                'winner_name',
                'winner_hand',
                'winner_rank',
                'winner_ht',
                'winner_ioc',
                'winner_age',
                'winner_entry',
                'l_ace',
                'l_df',
                'l_svpt',
                'l_1stIn',
                'l_1stWon',
                'l_2ndWon',
                'l_SvGms',
                'w_ace',
                'w_df',
                'w_svpt',
                'w_1stIn',
                'w_1stWon',
                'w_2ndWon',
                'w_SvGms'
               ]
split_2 = split_2.select(split_2_order)

# Reverting the name change (at this point the names are wrong the winner_name will be the name of the loser actually, etc.
old_names = ['loser_name', 'loser_hand', 'loser_rank', 'loser_ht', 'loser_ioc', 'loser_age', 'loser_entry',
             'winner_name', 'winner_hand', 'winner_rank', 'winner_ht', 'winner_ioc', 'winner_age', 'winner_entry',
             'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms',
             'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms']

new_names = ['winner_name', 'winner_hand', 'winner_rank', 'winner_ht', 'winner_ioc', 'winner_age', 'winner_entry',
             'loser_name', 'loser_hand', 'loser_rank', 'loser_ht', 'loser_ioc', 'loser_age', 'loser_entry',
             'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms',
             'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms']

# Adjust label
split_2 = split_2.withColumn("Winner", lit(0))

# Merge them back together
clean_df = split_1.union(split_2)

In [11]:
# Changing column names from winner/loser to player 1/player 2
old_names = ['winner_name', 'winner_hand', 'winner_rank', 'winner_ht', 'winner_ioc', 'winner_age', 'winner_entry',
             'loser_name', 'loser_hand', 'loser_rank', 'loser_ht', 'loser_ioc', 'loser_age', 'loser_entry',
             'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms',
             'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms']

new_names = ['p1_name', 'p1_hand', 'p1_rank', 'p1_ht', 'p1_ioc', 'p1_age', 'p1_entry',
             'p2_name', 'p2_hand', 'p2_rank', 'p2_ht', 'p2_ioc', 'p2_age', 'p2_entry',
             'p1_ace', 'p1_df', 'p1_svpt', 'p1_1stIn', 'p1_1stWon', 'p1_2ndWon', 'p1_SvGms',
             'p2_ace', 'p2_df', 'p2_svpt', 'p2_1stIn', 'p2_1stWon', 'p2_2ndWon', 'p2_SvGms']

for old_name, new_name in zip(old_names, new_names):
    clean_df = clean_df.withColumnRenamed(old_name, new_name)

In [46]:
# From here it's still not working (trying to calculate the average of a stat for the past three months)

In [47]:
# Convert tourney_date to Unix timestamp
clean_df = clean_df.withColumn("tourney_date_unix", unix_timestamp(col("tourney_date")))

# Define window specification
windowSpec = Window.partitionBy("p1_name").orderBy("tourney_date_unix").rangeBetween(-90 * 86400, 0)

# Now you can use the window function with the converted tourney_date column
clean_df = clean_df.withColumn("avg_p1_ace", avg("p1_ace").over(windowSpec))

In [48]:
show(clean_df[['p1_ace', 'avg_p1_ace']])

Unnamed: 0,p1_ace,avg_p1_ace
0,3,3.00
1,3,3.00
2,3,3.00
3,8,4.25
4,2,5.00
...,...,...
92799,8,9.00
92800,5,5.00
92801,4,4.00
92802,2,2.00
