## How Player Performance Varies Between Surfaces

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
import matplotlib.pyplot as plt
from pandasql import sqldf
import sqlite3 
import datetime as dt
import plotly.express as px

Hypothesis: A stronger serve is less valuable on clay courts than it is on grass and hard courts.

In [2]:
# Load every row of the `cleaned_match_stats` table from the local SQLite database.
# NOTE(review): this absolute path only resolves on the original author's machine;
# consider moving it to a configurable constant before sharing the notebook.
con = sqlite3.connect('/Users/julianbombard/Desktop/Data Analytics Course/Capstone/database/ATP_Tour')
try:
    df = pd.read_sql_query('''Select *
                       From cleaned_match_stats''', con)
finally:
    # Always release the database handle, even when the query raises,
    # so a failed run does not leave an open connection in the kernel.
    con.close()

df.head()

Unnamed: 0,player_id,match_id,year,month,player_name,country,height,hand,tourney_name,tourney_level,...,bp_saved,bp_faced,rank,rank_points,won,first_sv_pct,first_sv_win_pct,second_sv_win_pct,total_sv_win_pct,bp_save_pct
0,105357,2015-339_1,2015,1,John Millman,AUS,183,R,Brisbane,A,...,1,1,153,328,1,0.545455,0.791667,0.7,0.75,1.0
1,103813,2015-339_2,2015,1,Jarkko Nieminen,FIN,185,L,Brisbane,A,...,4,7,73,689,1,0.641304,0.661017,0.515152,0.608696,0.571429
2,105902,2015-339_3,2015,1,James Duckworth,AUS,183,R,Brisbane,A,...,2,3,125,430,1,0.6,0.740741,0.611111,0.688889,0.666667
3,104871,2015-339_4,2015,1,Jeremy Chardy,FRA,188,R,Brisbane,A,...,0,0,31,1195,1,0.735849,0.794872,0.785714,0.792453,
4,105373,2015-339_5,2015,1,Martin Klizan,SVK,191,L,Brisbane,A,...,6,8,34,1094,1,0.607692,0.696203,0.529412,0.630769,0.75


In [3]:
# Aces as a share of `svpt` for each row (presumably service points played - confirm against the data dictionary).
df['ace_pct'] = df['ace'].div(df['svpt'])

In [4]:
# Build one row per (player, surface): win count, match count, and the mean of each serve metric.
serve_metric_cols = [
    'ace_pct',
    'first_sv_pct',
    'first_sv_win_pct',
    'second_sv_win_pct',
    'total_sv_win_pct',
    'bp_save_pct',
]
surface_aggs = {
    'wins': ('won', 'sum'),
    'total_matches_per_surface': ('won', 'count'),
    # Every serve metric is averaged over the player's matches on that surface.
    **{col: (col, 'mean') for col in serve_metric_cols},
}

player_surface = (
    df.groupby(['player_name', 'surface'])
    .agg(**surface_aggs)
    .reset_index()
)

# Share of matches won on each surface.
player_surface['win_pct'] = player_surface['wins'].div(player_surface['total_matches_per_surface'])
player_surface

Unnamed: 0,player_name,surface,wins,total_matches_per_surface,ace_pct,first_sv_pct,first_sv_win_pct,second_sv_win_pct,total_sv_win_pct,bp_save_pct,win_pct
0,Abedallah Shelbayh,Clay,1,4,0.035626,0.564362,0.655399,0.386274,0.539094,0.590774,0.250000
1,Abedallah Shelbayh,Grass,0,2,0.038305,0.570330,0.605695,0.464286,0.544898,0.500000,0.000000
2,Abedallah Shelbayh,Hard,2,8,0.078914,0.601158,0.690752,0.481069,0.602291,0.572754,0.250000
3,Adam Moundir,Clay,0,1,0.013514,0.513514,0.736842,0.500000,0.621622,0.600000,0.000000
4,Adam Neff,Hard,0,1,0.055556,0.569444,0.585366,0.387097,0.500000,0.636364,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
1489,Zizou Bergs,Clay,4,11,0.080889,0.656036,0.706469,0.480533,0.630250,0.609696,0.363636
1490,Zizou Bergs,Grass,1,4,0.041875,0.661122,0.743992,0.511272,0.666592,0.568750,0.250000
1491,Zizou Bergs,Hard,10,29,0.078042,0.618456,0.707025,0.519077,0.633158,0.586024,0.344828
1492,Zsombor Piros,Clay,1,3,0.045662,0.621411,0.609430,0.578985,0.599475,0.709524,0.333333


In [5]:
# Total matches per player across all surfaces, used later to filter out small samples.
total_matches = (
    player_surface.groupby('player_name')['total_matches_per_surface']
    .agg('sum')
    .reset_index()
    .rename(columns={'total_matches_per_surface': 'total_matches'})
)

# Drop any `total_matches` column left by a previous run of this cell before merging;
# otherwise a re-run yields `total_matches_x` / `total_matches_y` and the later
# `player_surface['total_matches']` filters raise a KeyError.
player_surface = (
    player_surface
    .drop(columns='total_matches', errors='ignore')
    # Each player appears once on the right-hand side; `validate` enforces that.
    .merge(total_matches, on='player_name', validate='many_to_one')
)
player_surface

Unnamed: 0,player_name,surface,wins,total_matches_per_surface,ace_pct,first_sv_pct,first_sv_win_pct,second_sv_win_pct,total_sv_win_pct,bp_save_pct,win_pct,total_matches
0,Abedallah Shelbayh,Clay,1,4,0.035626,0.564362,0.655399,0.386274,0.539094,0.590774,0.250000,14
1,Abedallah Shelbayh,Grass,0,2,0.038305,0.570330,0.605695,0.464286,0.544898,0.500000,0.000000,14
2,Abedallah Shelbayh,Hard,2,8,0.078914,0.601158,0.690752,0.481069,0.602291,0.572754,0.250000,14
3,Adam Moundir,Clay,0,1,0.013514,0.513514,0.736842,0.500000,0.621622,0.600000,0.000000,1
4,Adam Neff,Hard,0,1,0.055556,0.569444,0.585366,0.387097,0.500000,0.636364,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1489,Zizou Bergs,Clay,4,11,0.080889,0.656036,0.706469,0.480533,0.630250,0.609696,0.363636,44
1490,Zizou Bergs,Grass,1,4,0.041875,0.661122,0.743992,0.511272,0.666592,0.568750,0.250000,44
1491,Zizou Bergs,Hard,10,29,0.078042,0.618456,0.707025,0.519077,0.633158,0.586024,0.344828,44
1492,Zsombor Piros,Clay,1,3,0.045662,0.621411,0.609430,0.578985,0.599475,0.709524,0.333333,4


## Does hitting more aces lead to more wins?

In [16]:
# Keep only players with at least 150 matches in total (summed over all surfaces).
min_total_matches = 150
pxdata = player_surface.loc[player_surface['total_matches'] >= min_total_matches]

# One fixed colour per court surface.
color_map = dict(Clay="#BF5700", Grass="green", Hard="blue")

# Human-readable axis titles for the plotted columns.
axis_labels = {
    'ace_pct': 'Ace Percentage',
    'win_pct': 'Win Percentage',
}

# Ace percentage vs. win percentage, one point per (player, surface),
# with a single OLS trendline fitted over all surfaces together.
fig = px.scatter(
    pxdata,
    x='ace_pct',
    y='win_pct',
    hover_data='player_name',
    color='surface',
    color_discrete_map=color_map,
    trendline='ols',
    trendline_scope='overall',
    labels=axis_labels,
)

fig.show()

Overall, ace percentage contributes slightly to win percentage, which makes sense because aces are free points. However, many of the highest win percentages on clay belong to players with a low ace percentage. This supports the theory that a stronger serve is a less effective weapon on clay than it is on other surfaces.

## Does saving more break points lead to more wins?

In [15]:
# Keep only players with at least 150 matches in total (summed over all surfaces).
pxdata = player_surface[player_surface['total_matches'] >= 150]

# One fixed colour per court surface.
color_map = {
    "Clay": "#BF5700",
    "Grass": "green",
    "Hard": "blue"
}

# Break-point save percentage vs. win percentage, one point per (player, surface),
# with a single OLS trendline fitted over all surfaces together.
# The label key must match the plotted x column (`bp_save_pct`); the previous
# `ace_pct` key was a copy-paste leftover, so the x-axis showed the raw column name.
fig = px.scatter(pxdata, x = 'bp_save_pct', y = 'win_pct', hover_data = 'player_name', color = 'surface', color_discrete_map = color_map, trendline = 'ols', trendline_scope = 'overall', labels = {
    'bp_save_pct' : 'Break Point Save Percentage',
    'win_pct' : 'Win Percentage'
})

fig.show()

This chart suggests a much stronger correlation between saving break points and winning than we saw for ace percentage, particularly on grass. Again, the data is more scattered on clay, supporting the initial hypothesis.

## Who performs best on which surfaces?

In [14]:
# Restrict the chart to players with at least 300 matches in total (summed over all surfaces).
min_total_matches = 300
pxdata = player_surface.loc[player_surface['total_matches'] >= min_total_matches]

# One fixed colour per court surface.
color_map = dict(Clay="#BF5700", Grass="green", Hard="blue")

# Human-readable axis titles for the plotted columns.
axis_labels = {
    'player_name': 'Player',
    'win_pct': 'Win Percentage',
}

# Grouped bars: one cluster per player, one bar per surface.
fig = px.bar(
    pxdata,
    x='player_name',
    y='win_pct',
    color='surface',
    color_discrete_map=color_map,
    barmode='group',
    labels=axis_labels,
)

fig.show()