In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
#needed for Jupyter Notebook, if want plots to show inline
%matplotlib inline 
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

In [2]:
import glob
import re
plt.style.available
plt.style.use('seaborn-colorblind')

In [3]:
from collections import Counter
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression

In [4]:
%whos

Variable     Type        Data/Info
----------------------------------
Counter      type        <class 'collections.Counter'>
KMeans       type        <class 'sklearn.cluster.k_means_.KMeans'>
glob         module      <module 'glob' from '/hom<...>3/lib/python3.6/glob.py'>
np           module      <module 'numpy' from '/ho<...>kages/numpy/__init__.py'>
pd           module      <module 'pandas' from '/h<...>ages/pandas/__init__.py'>
pdist        function    <function pdist at 0x7fb4badb5ea0>
plt          module      <module 'matplotlib.pyplo<...>es/matplotlib/pyplot.py'>
re           module      <module 're' from '/home/<...>da3/lib/python3.6/re.py'>
squareform   function    <function squareform at 0x7fb4badc82f0>


## Loading data

In [5]:
def read2000sMatches(dirname):
    allFiles = glob.glob(dirname + "/atp_matches_2" + "*.csv")
    matches = pd.DataFrame()
    list_ = list()
    for filen in allFiles:
        df = pd.read_csv(filen,
                         index_col=None,
                         header=0,
                         parse_dates=[0])
                         #date_parser=lambda t:parse(t))
        list_.append(df)
    matches = pd.concat(list_)
    return matches

def read1900sMatches(dirname):
    allFiles = glob.glob(dirname + "/atp_matches_1" + "*.csv")
    matches = pd.DataFrame()
    list_ = list()
    for filen in allFiles:
        df = pd.read_csv(filen,
                         index_col=None,
                         header=0,
                         parse_dates=[0])
                         #date_parser=lambda t:parse(t))
        list_.append(df)
    matches = pd.concat(list_)
    return matches

In [6]:
matches2000s=read2000sMatches('tennis/')
matches1900s=read1900sMatches('tennis/')
matches_SackmannAll = pd.concat([matches1900s,matches2000s])
matches_SackmannAll.shape # (167879, 49)

(167879, 49)

## Take only data with detailed stats

In [7]:
df_stats = matches_SackmannAll[~pd.isnull(matches_SackmannAll['w_ace'])]
df_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81281 entries, 15 to 2823
Data columns (total 49 columns):
tourney_id            81281 non-null object
tourney_name          81281 non-null object
surface               81281 non-null object
draw_size             81281 non-null int64
tourney_level         81281 non-null object
tourney_date          81281 non-null int64
match_num             81281 non-null int64
winner_id             81281 non-null int64
winner_seed           36056 non-null float64
winner_entry          10452 non-null object
winner_name           81281 non-null object
winner_hand           81275 non-null object
winner_ht             78590 non-null float64
winner_ioc            81281 non-null object
winner_age            81268 non-null float64
winner_rank           79430 non-null float64
winner_rank_points    79430 non-null float64
loser_id              81281 non-null int64
loser_seed            19455 non-null float64
loser_entry           17509 non-null object
loser_name

In [8]:
df_lim = df_stats[['surface','winner_rank','winner_ht','winner_age','loser_rank','loser_ht','loser_age']] 

In [10]:
df_win = df_stats[['tourney_id', 'match_num','tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date',  'score', 'best_of',
       'round', 'minutes','winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'winner_rank', 'winner_rank_points','w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms',
       'w_bpSaved', 'w_bpFaced']]
df_loss = df_stats[['tourney_id','match_num','tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date',  'score', 'best_of',
       'round', 'minutes','loser_id', 'loser_seed',
       'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
       'loser_age', 'loser_rank', 'loser_rank_points','l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced']]

### Make 'win' and 'loss' columns compatible

#### Situation before

In [11]:
df_win_loss = pd.concat([df_win,df_loss],axis=1)
# each row has basic info repeated
# each row has two 'target'

#### Making compatible

In [12]:
df_win.columns = [s.replace("winner_", "") for s in df_win.columns]
df_win.columns = [s.replace("w_", "") for s in df_win.columns]
df_loss.columns = [s.replace("loser_", "") for s in df_win.columns]
df_loss.columns = [s.replace("l_", "") for s in df_win.columns]

df_win['target']=1
df_loss['target']=0


Index(['tourney_id', 'match_num', 'tourney_name', 'surface', 'drasize',
       'tourney_level', 'tourney_date', 'score', 'best_of', 'round', 'minutes',
       'id', 'seed', 'entry', 'name', 'hand', 'ht', 'ioc', 'age', 'rank',
       'rank_points', 'ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon',
       'SvGms', 'bpSaved', 'bpFaced', 'target'],
      dtype='object')

Index(['tourney_id', 'match_num', 'tourney_name', 'surface', 'drasize',
       'tourney_level', 'tourney_date', 'score', 'best_of', 'round', 'minutes',
       'id', 'seed', 'entry', 'name', 'hand', 'ht', 'ioc', 'age', 'rank',
       'rank_points', 'ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon',
       'SvGms', 'bpSaved', 'bpFaced', 'target'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [13]:
df_win_loss = pd.concat([df_win,df_loss])


Unnamed: 0,tourney_id,match_num,tourney_name,surface,drasize,tourney_level,tourney_date,score,best_of,round,minutes,id,seed,entry,name,hand,ht,ioc,age,rank,rank_points,ace,df,svpt,1stIn,1stWon,2ndWon,SvGms,bpSaved,bpFaced,target
15,1992-237,1,Guaruja,Hard,32,A,19921026,6-4 6-4,3,R32,97.0,101063,1.0,,Jordi Arrese,R,175.0,ESP,28.158795,29.0,946.0,4.0,0.0,67.0,44.0,30.0,11.0,10.0,5.0,7.0,1
16,1992-237,2,Guaruja,Hard,32,A,19921026,6-2 6-4,3,R32,65.0,100652,,WC,Cassio Motta,R,180.0,BRA,32.676249,201.0,171.0,7.0,4.0,52.0,35.0,29.0,9.0,9.0,1.0,1.0,1
17,1992-237,3,Guaruja,Hard,32,A,19921026,6-1 6-4,3,R32,82.0,101594,,Q,Carsten Arriens,R,193.0,GER,23.542779,210.0,160.0,4.0,0.0,58.0,39.0,32.0,10.0,9.0,3.0,3.0,1
18,1992-237,4,Guaruja,Hard,32,A,19921026,7-6(4) 6-1,3,R32,95.0,101761,7.0,,Jaime Oncins,R,196.0,BRA,22.362765,52.0,650.0,5.0,7.0,73.0,44.0,35.0,11.0,10.0,3.0,5.0,1
19,1992-237,5,Guaruja,Hard,32,A,19921026,6-3 3-6 6-4,3,R32,123.0,101174,,WC,Fernando Roese,R,190.0,BRA,27.173169,187.0,185.0,4.0,4.0,79.0,47.0,33.0,11.0,14.0,4.0,10.0,1


In [14]:
%whos

Variable              Type         Data/Info
--------------------------------------------
Counter               type         <class 'collections.Counter'>
KMeans                type         <class 'sklearn.cluster.k_means_.KMeans'>
df_lim                DataFrame         surface  winner_rank<...>n[81281 rows x 7 columns]
df_loss               DataFrame         tourney_id  match_nu<...>[81281 rows x 31 columns]
df_stats              DataFrame         tourney_id  tourney_<...>[81281 rows x 49 columns]
df_win                DataFrame         tourney_id  match_nu<...>[81281 rows x 31 columns]
df_win_loss           DataFrame         tourney_id  match_nu<...>162562 rows x 31 columns]
glob                  module       <module 'glob' from '/hom<...>3/lib/python3.6/glob.py'>
matches1900s          DataFrame         tourney_id          <...>110457 rows x 49 columns]
matches2000s          DataFrame         tourney_id          <...>[57422 rows x 49 columns]
matches_SackmannAll   DataFrame         

In [15]:
del(matches1900s,matches2000s)

#### Fill missing values

In [17]:
## fill method of entry for most players with 'standard'; mostly missing values
df_win_loss['entry'] = df_win_loss['entry'].fillna('standard')

## better way might be max(33,rank) ## 33 is the minumum non-seeded value
##high positive value because negative associates with better rank
df_win_loss['seed'] = df_win_loss['seed'].fillna(9999)
df_win_loss['seed'] = df_win_loss['seed'].astype(int)

#rank, and rank_points have ~4000 missign out of 162000
df_win_loss[['rank','rank_points']] = df_win_loss.groupby('name')[['rank','rank_points']]\
                                        .transform(lambda x: x.fillna(x.mean()))
#rank, and rank_points have some more missing values: set to maximum ranking and minimum rank points
df_win_loss['rank'] = df_win_loss['rank']\
                                        .transform(lambda x: x.fillna(x.max()))
df_win_loss['rank_points'] = df_win_loss['rank_points']\
                                        .transform(lambda x: x.fillna(x.min()))


#### more filling of missing values

In [21]:
### for this 1st epicycle: don't want score,2 scores missing,, , 

In [22]:
## set 'hand' to U ('unknown') ~35 missing 'hand'
df_win_loss['hand'] = df_win_loss['hand'].fillna('U')

In [23]:
df_win_loss['hand'].value_counts() #R    138155 L     23661 U       746

R    138155
L     23661
U       746
Name: hand, dtype: int64

In [24]:
# set missing height, age  to average for all people ~7000 missing heights, ~100 missing age
df_win_loss[['age','ht']] = df_win_loss[['age','ht']]\
                                        .transform(lambda x: x.fillna(x.mean()))

In [25]:
## set missing minutes to average age of all ~5000 missing minutes
df_win_loss['minutes'] = df_win_loss['minutes']\
                                        .transform(lambda x: x.fillna(x.mean()))

In [26]:
(df_win_loss['age']).value_counts() #87 (most) occurrences of 23.7919
len(df_win_loss.loc[pd.isnull(df_win_loss['age']),'age']) # 0 good
(df_win_loss['ht']).value_counts() #25899 (most) occurrences of 185 (cm)
len(df_win_loss.loc[pd.isnull(df_win_loss['ht']),'ht']) # 0 good

0

## Dummying categorical variables

In [28]:
df_win_loss = pd.concat([df_win_loss,\
                         pd.get_dummies(df_win_loss['surface'])],axis=1)

* Hard      80028 
* Clay      54282 
* Grass     16366 
* Carpet    11736 
* None        150 

Name: surface, dtype: int64

In [29]:
df_wl_surf = df_win_loss.drop('surface', axis=1,inplace=True)
        

In [30]:
df_win_loss = pd.concat([df_win_loss,\
                         pd.get_dummies(df_win_loss['hand'])],axis=1)

In [31]:
df_win_loss.drop('hand', axis=1,inplace=True)

In [33]:
df_win_loss = pd.concat([df_win_loss,\
                         pd.get_dummies(df_win_loss['tourney_level'])],axis=1)

In [34]:
df_win_loss.drop('tourney_level', axis=1,inplace=True)

In [35]:
df_win_loss.columns

Index(['tourney_id', 'match_num', 'tourney_name', 'drasize', 'tourney_date',
       'score', 'best_of', 'round', 'minutes', 'id', 'seed', 'entry', 'name',
       'ht', 'ioc', 'age', 'rank', 'rank_points', 'ace', 'df', 'svpt', '1stIn',
       '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced', 'target', 'Carpet',
       'Clay', 'Grass', 'Hard', 'None', 'L', 'R', 'U', 'A', 'C', 'D', 'F', 'G',
       'M'],
      dtype='object')

In [40]:
df_win_loss.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162562 entries, 15 to 2823
Data columns (total 42 columns):
tourney_id      162562 non-null object
match_num       162562 non-null int64
tourney_name    162562 non-null object
drasize         162562 non-null int64
tourney_date    162562 non-null int64
score           162560 non-null object
best_of         162562 non-null int64
round           162562 non-null object
minutes         162562 non-null float64
id              162562 non-null int64
seed            162562 non-null int64
entry           162562 non-null object
name            162562 non-null object
ht              162562 non-null float64
ioc             162562 non-null object
age             162562 non-null float64
rank            162562 non-null float64
rank_points     162562 non-null float64
ace             162562 non-null float64
df              162562 non-null float64
svpt            162562 non-null float64
1stIn           162562 non-null float64
1stWon          162562 non-nu

In [39]:
df_pred_cols=df_win_loss[[ 
        'minutes', 
       'ht', 'age', 'rank_points', 'ace', 'df',
       'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced',
       'Carpet', 'Clay', 'Grass', 'Hard', 'None', 'L', 'R', 'U', 'A',
       'C', 'D', 'F', 'G', 'M']]

In [None]:
lgstc_reg_model=LogisticRegression()