In [None]:
import re
import time
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt

from pathlib import Path

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

from io import StringIO

import qgrid

import seaborn as sns

In [None]:
### Main ########################################################################

# Address of the stats page that the rest of the notebook scrapes.
url = r'http://wtatennis.com/stats'
# Start a Chrome session through Selenium and load the page.
# `driver` is the live browser handle that the scraping cells below operate on.
driver = webdriver.Chrome()
driver.get(url)

In [None]:
# Wait for button "SHOW MORE" and "click it" to get all rows, rather than just the first 20

def get_full_stats(driver):
    """Click the "SHOW MORE" button so the page shows more than the first 20 rows.

    Waits up to 10 seconds for the element with class
    ``widget-footer__more-button`` to become clickable, then clicks it.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session that already has the stats page loaded.

    Raises
    ------
    TimeoutException
        If the button does not become clickable within the wait period.
    NoSuchElementException
        If Selenium reports that the element cannot be found.

    In both cases the error is printed and the browser session is closed
    before the exception is re-raised, so the notebook stops at this cell
    instead of failing later on a dead session.
    """
    klass = "widget-footer__more-button"
    try:
        # WebDriverWait.until() signals "never became clickable" with
        # TimeoutException, so that is the exception that must be handled here.
        button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, klass)))
        button.click()

    except (TimeoutException, NoSuchElementException) as error:
        print(error)
        driver.quit()
        raise

    else:
        print("Button clicked")

get_full_stats(driver)

In [None]:
# Ingest the stats table.
# The original note on this cell said the page had to be scrolled to the bottom
# by hand in the Selenium/Chromium window before all players were loaded.
# That scrolling is now automated inside get_table().

def _scroll_to_bottom(driver, pause=1.0, max_scrolls=50):
    """Scroll the window to the bottom until the document height stops growing.

    Assumes further rows are loaded in response to window scrolling (as the
    manual procedure implied) -- TODO confirm against the live page.
    `max_scrolls` bounds the loop so a page that keeps growing cannot hang
    the notebook.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # give newly requested rows time to render
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def get_table(driver, scroll_pause=1.0, max_scrolls=50):
    """Scrape the stats table from the loaded page into DataFrames.

    Waits up to 10 seconds for an element with class ``stats-list__table``,
    scrolls to the bottom of the page, then parses the table HTML with
    BeautifulSoup and ``pandas.read_html``.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session with the stats page loaded. It is always closed
        before this function returns, whether or not scraping succeeded.
    scroll_pause : float, optional
        Seconds to wait after each scroll step.
    max_scrolls : int, optional
        Upper bound on the number of scroll steps.

    Returns
    -------
    list of pandas.DataFrame or None
        The frames returned by ``pandas.read_html``, or ``None`` when the
        wait for the table failed (the error is printed).
    """
    klass = "stats-list__table"
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, klass)))
    except Exception as error:
        print(error)
        return None

    else:
        _scroll_to_bottom(driver, pause=scroll_pause, max_scrolls=max_scrolls)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        tbl = soup.find("table", {"class": klass})
        dfs = pd.read_html(StringIO(str(tbl)))
        return dfs

    finally:
        driver.quit()

dfs = get_table(driver)
# get_table() returns None when the table never appeared; fail here with a
# clear message instead of a TypeError on the indexing below.
if dfs is None:
    raise RuntimeError("Could not scrape the stats table; see the error printed above.")
df = dfs[0]
df = dfs[0]

In [None]:
# Quick sanity check: (rows, columns) of the scraped table.
df.shape

In [None]:
# Inspect the raw column labels that came out of the scraped table.
df.columns

In [None]:
# Keep every column label except the ones prefixed with 'Unnamed: '.
new_cols = [label for label in df.columns
            if not label.startswith('Unnamed: ')]
new_cols

In [None]:
# Restrict the frame to the selected columns and drop every row that has a
# missing value, then browse the result in an interactive grid.
dfNew = df[new_cols].dropna()
qgrid.show_grid(dfNew)

In [None]:
# Relative path of the pickle file used to save and reload the cleaned stats.
fileName = '../Data/clean_stats.pkl'

In [None]:
# Make sure the target directory exists first; on a fresh checkout the write
# would otherwise fail because the folder is missing.
Path(fileName).parent.mkdir(parents=True, exist_ok=True)
dfNew.to_pickle(fileName)

In [None]:
# Reload the cleaned stats from disk so the analysis below can run from the saved file.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
dfNew = pd.read_pickle(fileName)

In [None]:
# Browse the reloaded frame in an interactive grid.
qgrid.show_grid(dfNew)

In [None]:
# List the column labels of the cleaned frame (used to build the rename and ranking maps below).
dfNew.columns

In [None]:
# Shorten the long scraped header of the double-faults column to 'DoubleFaults'.
# Assigning the result (instead of inplace=True) avoids mutating a frame that
# earlier cells already displayed.
dfNew = dfNew.rename(columns={
    'DF Double Faults Two serving faults in a row in one point, causing the player serving to lose the point.': 'DoubleFaults',
})
dfNew

In [None]:
# Columns to rank, listed in the order their rank columns will be created.
_RANKED_STATS = [
    'Aces',
    'DoubleFaults',
    '1st Srv %',
    '1st Srv Pts % 1st Serve Points Won %',
    '2nd Srv % 2nd Serve Points %',
    'Srv Pts Won % Service Points Won %',
    'BP SVD % Break Points Saved %',
    'Srv Gm Won % Service Games Won %',
    '1st Rtn Pts % 1st Return Points Won %',
    '2nd Rtn Pts % 2nd Return Points Won %',
    'Rtn Gm Won % Return Games Won %',
    'BP CONV % Break Points Converted %',
    'Rtn Pts Won % Return Points Won %',
]

# Columns ranked in ascending order; every other column is ranked descending.
_ASCENDING_STATS = {'DoubleFaults'}

# Maps each column to its ranking direction: True means ascending, False means descending.
cols2rank = {stat: stat in _ASCENDING_STATS for stat in _RANKED_STATS}

In [None]:
# Add one 'Rank_<column>' column per entry of cols2rank, ranking in the
# direction stored for that column.
for col, ascending in cols2rank.items():
    dfNew['Rank_' + col] = dfNew[col].rank(ascending=ascending)

qgrid.show_grid(dfNew)

In [None]:
# 'Player' first, followed by every column whose label starts with 'Rank'.
rank_cols = ['Player'] + [label for label in dfNew.columns
                          if label.startswith('Rank')]
dfRank = dfNew[rank_cols]
qgrid.show_grid(dfRank)

In [None]:
# Build the plotting frame from a copy of dfNew: add the negated double-fault
# count and the aces-minus-double-faults difference, keep the first 32 rows,
# and order them by that difference (largest first).
dfBar = (
    dfNew.copy()
    .assign(
        NegDoubleFaults=lambda frame: -1.0 * frame['DoubleFaults'],
        ServeDiff=lambda frame: frame['Aces'] - frame['DoubleFaults'],
    )
    .iloc[0:32]
    .sort_values(by='ServeDiff', ascending=False)
)

# Two bar layers on the same axes: aces to the right (blue) and negated
# double faults to the left (red).
bar_plot = sns.barplot(x='Aces', y='Player', data=dfBar, color='blue')
bar_plot = sns.barplot(x='NegDoubleFaults', y='Player', data=dfBar, color='red')
bar_plot.set(
    xlabel="Aces (blue), Double Faults (red)",
    ylabel=None,
    title="Serving Effectiveness",
)

In [None]:
# Scatter of aces against double faults with a y = x reference line.
# The axes are created explicitly so `ax` is defined in this cell; previously
# it was only available if a later cell had already been run.
fig, ax = plt.subplots()
scatter_plot = sns.scatterplot(x='DoubleFaults', y='Aces', data=dfNew, ax=ax)
scatter_plot.set(xlabel='Double Faults', ylabel='Aces', title = "Serving Effectiveness")

lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
    np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
]

# now plot both limits against each other
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)

In [None]:
# Browse the rank-only frame in an interactive grid.
qgrid.show_grid(dfRank)

In [None]:
import plotly.graph_objects as go

# Radar chart of rank values for up to the first four rows with Rank < 10.
dfTmp = dfRank.loc[dfRank['Rank'] < 10]
# Plot every column except the first two and the last one
# (presumably 'Player' and 'Rank' lead the frame -- verify against dfRank.columns).
categories = dfTmp.columns[2:-1]

fig = go.Figure()

# Bound the loop by the number of matching rows so the cell does not raise
# IndexError when fewer than four rows pass the filter.
for i in range(min(4, len(dfTmp))):
    fig.add_trace(go.Scatterpolar(
        r=dfTmp.iloc[i, 2:-1],
        theta=categories,
        fill='toself',
        name=dfTmp.iloc[i, 0]
        ))

fig.show()

In [None]:
from pandas.plotting import parallel_coordinates

# Work on an independent copy of the rows with Rank < 11 so the name edit
# below does not touch dfRank.
dfTmp = dfRank.loc[dfRank['Rank'] < 11].copy()

# Shorten each player label with a regex substitution. The `.str` accessor is
# needed because Series.str.replace substitutes inside each string, whereas
# Series.replace matches whole cell values.
dfTmp['Player'] = dfTmp['Player'].str.replace(
    r'^.*([A-Z]\. .*)', r'\1', regex=True)

fig, ax = plt.subplots(figsize=(12,4))
parallel_coordinates(
    dfTmp,
    'Player',
    ax=ax,
    colormap='viridis',
    alpha=0.5,
)

# Rotate the column labels so they do not overlap, and anchor the legend at
# the upper-right corner of the axes.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.legend(bbox_to_anchor=(1.0, 1.0))