In [1]:
# The %... is an IPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." constructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
# Let pandas use up to 500 characters per output line and show up to 100
# columns, and render DataFrames as HTML tables inside the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Seaborn defaults for every plot: white background with grid lines, and the
# "poster" context (scaled-up fonts and line widths).
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests

In [3]:
# we do this for every year and every surface... three sets of serve and return dicts per year, 
# containing info on each player

In [9]:
# Scrape the ATP stats listing pages into nested dictionaries:
#     stats_dict[stat type][surface][year] -> DataFrame with columns
#     'player' (URL slug, e.g. 'thomas-muster') and the stat's percentage.
stats_dict = {}
surfaces = ["clay","hard","grass"]
stat_type = ["1st-serve","1st-serve-points-won","2nd-serve-points-won","1st-serve-return-points-won","2nd-serve-return-points-won"]

STATS_URL = 'http://www.atpworldtour.com/en/stats/%(what)s/%(year)s/%(surface)s/all/'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the whole run


def _player_slug(href):
    """Return the player slug from a profile link.

    The slug is the path segment that follows "players/", e.g.
    ".../players/thomas-muster/..." gives "thomas-muster". Hyphens are kept
    on purpose: the rest of the notebook looks players up by this slug.
    (An earlier `store.replace('-', ' ')` call discarded its result, so the
    names were never actually de-hyphenated.)
    """
    return str(href).partition("players/")[2].partition("/")[0]


def _fetch_stat_table(stat, surface, year):
    """Download one stats listing page and return it as a DataFrame.

    Parameters
    ----------
    stat : str
        Stat name as used in the URL; also the name of the value column.
    surface : str
        Court surface as used in the URL.
    year : int
        Season to fetch.

    Returns
    -------
    pandas.DataFrame
        One row per listed player, columns ['player', stat].

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no stats table, or the number of player names does
        not match the number of stat rows.
    """
    url = STATS_URL % {"year": str(year), "surface": surface, "what": stat}
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find("table", attrs={"class": "stats-listing-table"})
    if table is None:
        raise ValueError("no stats-listing-table found at %s" % url)

    # Player names come from the profile link inside each name cell.
    players = [
        _player_slug(cell.find("a").get("href"))
        for cell in table.find_all("td", attrs={"class": "stats-listing-name"})
    ]
    # The percentage is read from the sixth cell of each listing row; strip
    # the trailing "%" before converting to int.
    percentages = [
        int(row.find_all("td")[5].text.replace("%", ""))
        for row in table.find_all("tr", attrs={"class": "stats-listing-row"})
    ]
    if len(players) != len(percentages):
        raise ValueError(
            "%d player names but %d stat rows at %s"
            % (len(players), len(percentages), url)
        )

    return pd.DataFrame({"player": players, stat: percentages},
                        columns=["player", stat])


for types in stat_type:
    stats_dict[types] = {}
    for surface in surfaces:
        print(surface)
        stats_dict[types][surface] = {}
        # Seasons 1992 through 2015 inclusive.
        for i in range(1992,2016):
            # Pause between requests so the site is not hammered.
            time.sleep(1)
            print(i)
            stats_dict[types][surface][i] = _fetch_stat_table(types, surface, i)

stats_dict

clay
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
hard
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
grass
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
clay
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
hard
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
grass
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
clay
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
hard
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
201

{'1st-serve': {'clay': {1992:                     player  1st-serve
   0            eduardo-masso         76
   1               bart-wuyts         74
   2         thierry-champion         73
   3           jonas-svensson         73
   4          tomas-carbonell         72
   5      alberto-berasategui         71
   6             jordi-arrese         70
   7         frederic-fontang         70
   8              goran-prpic         70
   9            thomas-muster         69
   10            franco-davin         68
   11          magnus-larsson         68
   12       claudio-pistolesi         68
   13            lars-jonsson         68
   14            andre-agassi         68
   15          lars-koslowski         68
   16       magnus-gustafsson         67
   17            roberto-azar         67
   18             jaime-yzaga         67
   19           michael-chang         67
   20           ronald-agenor         67
   21          gabriel-markus         66
   22      horacio-de-la-pena 

In [22]:
# Display the scraped tables for one stat; the value is a dict of
# surface -> year -> DataFrame.
# NOTE(review): the saved output under this cell shows a `first_perc` column,
# whereas the scraping cell names the value column after the stat type. This
# looks like stale output from an earlier version of the loop -- re-run to refresh.
stats_dict['1st-serve-points-won']

{'clay': {1992:                     player  first_perc
  0         goran-ivanisevic          80
  1              jim-courier          77
  2             pete-sampras          76
  3               guy-forget          76
  4              marc-rosset          75
  5            henri-leconte          75
  6             german-lopez          74
  7         rodolphe-gilbert          74
  8           cedric-pioline          74
  9         richard-krajicek          74
  10         andrei-medvedev          73
  11             luiz-mattar          73
  12           michael-stich          73
  13              ivan-lendl          73
  14           karel-novacek          72
  15          wayne-ferreira          72
  16            andres-gomez          72
  17        richard-fromberg          72
  18            carlos-costa          71
  19         bernd-karbacher          71
  20           stefan-edberg          71
  21            jakob-hlasek          71
  22      malivai-washington          71
  

In [10]:
# Display every table stored under the '1st-serve-points-won' key
# (a dict of surface -> year -> DataFrame, so the output is long).
stats_dict['1st-serve-points-won']

{'clay': {1992:                     player  1st-serve-points-won
  0         goran-ivanisevic                    80
  1              jim-courier                    77
  2             pete-sampras                    76
  3               guy-forget                    76
  4              marc-rosset                    75
  5            henri-leconte                    75
  6             german-lopez                    74
  7         rodolphe-gilbert                    74
  8           cedric-pioline                    74
  9         richard-krajicek                    74
  10         andrei-medvedev                    73
  11             luiz-mattar                    73
  12           michael-stich                    73
  13              ivan-lendl                    73
  14           karel-novacek                    72
  15          wayne-ferreira                    72
  16            andres-gomez                    72
  17        richard-fromberg                    72
  18            c

In [25]:
# Spot check: take the 1992 clay table for '1st-serve-points-won' and keep
# only the row(s) whose player slug is 'thomas-muster'.
x = stats_dict['1st-serve-points-won']['clay'][1992]
x = x[x['player'] == 'thomas-muster']
# Matching slug(s) as a NumPy array.
y = x['player'].values
print(x)

           player  first_perc
57  thomas-muster          65


In [11]:
# Save the scraped tables to stats_dict.p.
# cPickle only exists on Python 2; fall back to the standard pickle module
# when it is not available so this cell runs on either interpreter.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Pickle data is bytes, so the file is opened in binary mode; the with-block
# closes it even if dumping fails.
with open('stats_dict.p', 'wb') as fp:
    pickle.dump(stats_dict, fp)