# data from: https://wtafiles.wtatennis.com/pdf/rankings/All_Career_Prize_Money.pdf

In [None]:
from platform import python_version
# Where the WTA career prize-money PDF is read from. The commented-out
# alternative pointed at a local copy: '../Data/All_Career_Prize_Money.pdf'.
data_source = 'https://wtafiles.wtatennis.com/pdf/rankings/All_Career_Prize_Money.pdf'

# Show which Python interpreter is running this notebook.
print(python_version())

In [None]:
from tika import parser # pip install tika

# Run the tika parser on data_source; %time is an IPython line magic that
# reports how long the statement takes.
%time raw = parser.from_file(data_source)
# Dump the 'content' entry of the parser result so the table layout can be
# inspected by eye before any parsing is attempted.
print(raw['content'])

In [None]:
# Debugging aids: uncomment to see what else the parser returned.
#print(raw.keys())
#print(raw['metadata'])
#print(raw['status'])
content = raw['content']
if content is None:
    # tika-python leaves 'content' as None when no text was extracted; fail
    # here with a readable message instead of an AttributeError on .split().
    raise ValueError(
        "Tika returned no text content; check data_source and raw['status']")
# One list entry per line of extracted text.
text = content.split('\n')
#print('\n'.join(text))

In [None]:
import re
# Keep only the lines shaped like "<digits> <non-digit text> <amount>", i.e.
# the rows that carry a player's earnings; everything else is dropped.
result = list(filter(re.compile(r'^\d+ [^0-9]+ [0-9,$]+').search, text))
# Report how many rows survived, then list them for a visual check.
print(len(result))
print('\n'.join(result))

In [None]:
# Whole data row: rank, the "LAST, FIRST [CCC]" text, then the amount. A '$'
# in front of the amount is tolerated because the line filter accepts it too.
_PLAYER_LINE = re.compile(r'^(\d+) ([^0-9]+) \$?([0-9,]+)$')
# "LAST, FIRST CCC" where CCC is a three-capital-letter country code.
_NAME_WITH_COUNTRY = re.compile(r"^([\w\s.'()-]+), ([\w.() -]+) ([A-Z]{3})$")
# "LAST, FIRST" with no country code; same last-name characters as above.
_NAME_ONLY = re.compile(r"^([\w\s.'()-]+), ([\w.() -]+)")


def parse_player_line(line):
    """Split one earnings line into its fields.

    Returns ``[rank, last_name, first_name, country, earnings]`` with rank
    and earnings left as the strings found in the text, and ``country`` set
    to ``None`` when no three-letter code is present.  Returns ``None`` when
    the line does not have the rank/name/amount shape at all.

    If the name part cannot be split into "LAST, FIRST", the raw text is
    printed and kept as the last name with an empty first name, so the
    row's rank and earnings are not lost and no other player's name is
    attached to it.
    """
    player_info = _PLAYER_LINE.search(line.strip())
    if player_info is None:
        return None
    rank, name_text, earnings = player_info.groups()
    # Stray padding would otherwise hide the trailing country code.
    name_text = name_text.strip()

    name_country = _NAME_WITH_COUNTRY.search(name_text)
    if name_country:
        last_name, first_name, country = name_country.groups()
        return [rank, last_name, first_name, country, earnings]

    name_only = _NAME_ONLY.search(name_text)
    if name_only:
        last_name, first_name = name_only.groups()
        return [rank, last_name, first_name, None, earnings]

    # Surface names the patterns cannot handle so they can be reviewed.
    print(name_text)
    return [rank, name_text, '', None, earnings]


player_earnings = []
# Not filled by any of the visible cells; kept in case other code expects it.
player_names = []

for line in result:
    row = parse_player_line(line)
    if row is None:
        # The line filter is looser than the full pattern, so some selected
        # lines are not player rows; report and skip them instead of crashing.
        print('Skipping unparsable line:', repr(line))
        continue
    player_earnings.append(row)

In [None]:
import numpy as np

def _wiki_link(first_name, last_name):
    """Build the English-Wikipedia URL for one player.

    The name parts are title-cased and every run of whitespace becomes a
    single underscore, so multi-word names yield a link without spaces.
    Parts that are not strings (for example ``None``) are ignored.

    Parameters: ``first_name`` and ``last_name`` as found in the table.
    Returns: the URL as a string.
    """
    parts = [part for part in (first_name, last_name) if isinstance(part, str)]
    words = ' '.join(parts).title().split()
    return 'https://en.wikipedia.org/wiki/' + '_'.join(words)


# Element-wise version for whole columns. otypes is given explicitly so the
# call also works on empty input, where numpy cannot infer the output type.
create_wiki_link = np.vectorize(_wiki_link, otypes=[str])

import pandas as pd
# One row per parsed player; every field arrives as text from the regexes.
df = pd.DataFrame(player_earnings, columns=['Rank', 'LastName', 'FirstName', 'Country', 'Earnings'])
# Rank is a digit string at this point; make it numeric so plots and sorts
# treat it as a number rather than a category.
df['Rank'] = df['Rank'].astype(int)
# Drop the thousands separators (a literal comma, not a regex) before converting.
df['Earnings'] = df['Earnings'].str.replace(',', '', regex=False).astype(float)
df['WikiLink'] = create_wiki_link(df['FirstName'], df['LastName'])
# Save the table to a spreadsheet without the index column.
df.to_excel('earnings.xlsx', index=False)
# Last expression of the cell: shows the first 50 rows in the notebook.
df.head(50)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.pylabtools import figsize

# Log-log scatter of earnings against rank for the first 50 rows of df.
figsize(12.5, 7)
scatter, ax = plt.subplots()
# To plot every row instead, pass data=df.
sns.scatterplot(data=df.head(50), x='Rank', y='Earnings', ax=ax)
ax.set_xscale('log')
ax.set_yscale('log')
# The axis limits are fixed by hand as a stopgap: the author was not happy
# with this and had no better way to get reasonable charts.
ax.set_xlim(1, 12000)
ax.set_ylim(10000000, 100000000)
plt.show()
# Author's note: this chart needs a lot more tweaking to be useful in general.

In [None]:
# Swap None entries for the placeholder '---' (for example the Country of a
# player parsed without a country code).
# NOTE(review): presumably done so those rows stay visible when grouping by
# Country in the following cells -- confirm.
df = df.replace({None:'---'})

In [None]:
# Total earnings per country.
df_sum = df.groupby('Country')['Earnings'].sum()
# A notebook only displays the last expression of a cell, so the ranking has
# to be printed explicitly or it is computed and thrown away.
print(df_sum.sort_values(ascending=False).head(50))
# Summary statistics of the per-country totals (displayed as the cell result).
df_sum.describe()

In [None]:
# Turn the per-country totals into a one-column DataFrame. The guard makes
# the cell safe to re-run: a DataFrame has no to_frame(), so converting a
# second time would raise AttributeError.
if isinstance(df_sum, pd.Series):
    df_sum = df_sum.to_frame()
type(df_sum)

In [None]:
# Display the per-country earnings totals as the cell result.
df_sum

In [None]:
# Box plot of individual players' earnings, one box per Country value in df.
box, ax = plt.subplots()
sns.boxplot(x='Country', y='Earnings', data=df, ax=ax)
plt.show()

In [None]:
# Bar chart of total earnings per country.
box, ax = plt.subplots()
# seaborn has no use_index option (that keyword belongs to pandas' .plot), so
# the Country index is turned into a regular column and named as x.
ax = sns.barplot(data=df_sum.reset_index(), x='Country', y='Earnings', ax=ax)
# Country codes are drawn vertically so neighbouring labels do not overlap.
ax.tick_params(axis='x', rotation=90)
plt.show()