In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
#needed for Jupyter Notebook, if want plots to show inline
%matplotlib inline 
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

In [3]:
import glob
import re
plt.style.available
plt.style.use('seaborn-colorblind')

In [4]:
from collections import Counter
from datetime import datetime, timedelta

In [5]:
%whos

Variable    Type      Data/Info
-------------------------------
Counter     type      <class 'collections.Counter'>
datetime    type      <class 'datetime.datetime'>
glob        module    <module 'glob' from '/hom<...>3/lib/python3.6/glob.py'>
np          module    <module 'numpy' from '/ho<...>kages/numpy/__init__.py'>
pd          module    <module 'pandas' from '/h<...>ages/pandas/__init__.py'>
plt         module    <module 'matplotlib.pyplo<...>es/matplotlib/pyplot.py'>
re          module    <module 're' from '/home/<...>da3/lib/python3.6/re.py'>
timedelta   type      <class 'datetime.timedelta'>


## Loading data

In [6]:
def read2000sMatches(dirname):
    allFiles = glob.glob(dirname + "/atp_matches_2" + "*.csv")
    matches = pd.DataFrame()
    list_ = list()
    for filen in allFiles:
        df = pd.read_csv(filen,
                         index_col=None,
                         header=0,
                         parse_dates=[0])
                         #date_parser=lambda t:parse(t))
        list_.append(df)
    matches = pd.concat(list_)
    return matches

def read1900sMatches(dirname):
    allFiles = glob.glob(dirname + "/atp_matches_1" + "*.csv")
    matches = pd.DataFrame()
    list_ = list()
    for filen in allFiles:
        df = pd.read_csv(filen,
                         index_col=None,
                         header=0,
                         parse_dates=[0])
                         #date_parser=lambda t:parse(t))
        list_.append(df)
    matches = pd.concat(list_)
    return matches

In [7]:
matches2000s=read2000sMatches('tennis/')
matches1900s=read1900sMatches('tennis/')
matches_SackmannAll = pd.concat([matches1900s,matches2000s])
matches_SackmannAll.shape # (167879, 49)

(167879, 49)

## Take only data with detailed stats

In [8]:
df_stats = matches_SackmannAll[~pd.isnull(matches_SackmannAll['w_ace'])]
#still has many columns that are mising data
# df_stats.info()

In [9]:
df_win = df_stats[['tourney_id', 'match_num','tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date',  'score', 'best_of',
       'round', 'minutes','winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'winner_rank', 'winner_rank_points','w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms',
       'w_bpSaved', 'w_bpFaced']]
df_loss = df_stats[['tourney_id','match_num','tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date',  'score', 'best_of',
       'round', 'minutes','loser_id', 'loser_seed',
       'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
       'loser_age', 'loser_rank', 'loser_rank_points','l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced']]

### Make 'win' and 'loss' columns compatible

#### Situation by poor concatenation (appending to the right NOT below), without changing column names

In [10]:
df_win_loss = pd.concat([df_win,df_loss],axis=1)
# each row has basic info repeated
# each row has two 'target'

#### Making compatible

In [11]:
df_win.columns = [s.replace("winner_", "") for s in df_win.columns]
df_win.columns = [s.replace("w_", "") for s in df_win.columns]
df_loss.columns = [s.replace("loser_", "") for s in df_win.columns]
df_loss.columns = [s.replace("l_", "") for s in df_win.columns]

df_win['target']=1
df_loss['target']=0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [12]:
df_win_loss = pd.concat([df_win,df_loss])


In [13]:
%whos

Variable              Type         Data/Info
--------------------------------------------
Counter               type         <class 'collections.Counter'>
datetime              type         <class 'datetime.datetime'>
df_loss               DataFrame         tourney_id  match_nu<...>[81281 rows x 31 columns]
df_stats              DataFrame         tourney_id  tourney_<...>[81281 rows x 49 columns]
df_win                DataFrame         tourney_id  match_nu<...>[81281 rows x 31 columns]
df_win_loss           DataFrame         tourney_id  match_nu<...>162562 rows x 31 columns]
glob                  module       <module 'glob' from '/hom<...>3/lib/python3.6/glob.py'>
matches1900s          DataFrame         tourney_id          <...>110457 rows x 49 columns]
matches2000s          DataFrame         tourney_id          <...>[57422 rows x 49 columns]
matches_SackmannAll   DataFrame         tourney_id          <...>167879 rows x 49 columns]
np                    module       <module 'numpy' fro

In [14]:
del(matches1900s,matches2000s)

#### Fill missing values

In [15]:
## fill method of entry for most players with 'standard'; mostly missing values
df_win_loss['entry'] = df_win_loss['entry'].fillna('standard')

## better way might be max(33,rank) ## 33 is the minumum non-seeded value
##high positive value because negative associates with better rank
df_win_loss['seed'] = df_win_loss['seed'].fillna(9999)
df_win_loss['seed'] = df_win_loss['seed'].astype(int)

#rank, and rank_points have ~4000 missign out of 162000
df_win_loss[['rank','rank_points']] = df_win_loss.groupby('name')[['rank','rank_points']]\
                                        .transform(lambda x: x.fillna(x.mean()))
#rank, and rank_points have some more missing values: set to maximum ranking and minimum rank points
df_win_loss['rank'] = df_win_loss['rank']\
                                        .transform(lambda x: x.fillna(x.max()))
df_win_loss['rank_points'] = df_win_loss['rank_points']\
                                        .transform(lambda x: x.fillna(x.min()))


#### more filling of missing values

In [16]:
### for this 1st epicycle: don't want score,2 scores missing,, , 

In [17]:
## set 'hand' to U ('unknown') ~35 missing 'hand'
df_win_loss['hand'] = df_win_loss['hand'].fillna('U')

In [18]:
df_win_loss['hand'].value_counts() #R    138155 L     23661 U       746

R    138155
L     23661
U       746
Name: hand, dtype: int64

In [19]:
# set missing height, age  to average for all people ~7000 missing heights, ~100 missing age
df_win_loss[['age','ht']] = df_win_loss[['age','ht']]\
                                        .transform(lambda x: x.fillna(x.mean()))

In [20]:
## set missing minutes to average age of all ~5000 missing minutes
df_win_loss['minutes'] = df_win_loss['minutes']\
                                        .transform(lambda x: x.fillna(x.mean()))

In [21]:
(df_win_loss['age']).value_counts() #87 (most) occurrences of 23.7919
len(df_win_loss.loc[pd.isnull(df_win_loss['age']),'age']) # 0 good
(df_win_loss['ht']).value_counts() #25899 (most) occurrences of 185 (cm)
len(df_win_loss.loc[pd.isnull(df_win_loss['ht']),'ht']) # 0 good

0

## Dummying categorical variables

In [22]:
df_win_loss = pd.concat([df_win_loss,\
                         pd.get_dummies(df_win_loss['surface'])],axis=1)

* Hard      80028 
* Clay      54282 
* Grass     16366 
* Carpet    11736 
* None        150 

Name: surface, dtype: int64

In [23]:
df_win_loss.drop('surface', axis=1,inplace=True)     

In [24]:
df_win_loss = pd.concat([df_win_loss,\
                         pd.get_dummies(df_win_loss['hand'])],axis=1)

In [25]:
df_win_loss.drop('hand', axis=1,inplace=True)

In [26]:
df_win_loss = pd.concat([df_win_loss,\
                         pd.get_dummies(df_win_loss['tourney_level'])],axis=1)

In [27]:
df_win_loss.drop('tourney_level', axis=1,inplace=True)

In [28]:
df_win_loss.columns

Index(['tourney_id', 'match_num', 'tourney_name', 'drasize', 'tourney_date',
       'score', 'best_of', 'round', 'minutes', 'id', 'seed', 'entry', 'name',
       'ht', 'ioc', 'age', 'rank', 'rank_points', 'ace', 'df', 'svpt', '1stIn',
       '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced', 'target', 'Carpet',
       'Clay', 'Grass', 'Hard', 'None', 'L', 'R', 'U', 'A', 'C', 'D', 'F', 'G',
       'M'],
      dtype='object')

In [29]:
#Other than score all values are filled, at 162562
df_win_loss.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162562 entries, 15 to 2823
Data columns (total 42 columns):
tourney_id      162562 non-null object
match_num       162562 non-null int64
tourney_name    162562 non-null object
drasize         162562 non-null int64
tourney_date    162562 non-null int64
score           162560 non-null object
best_of         162562 non-null int64
round           162562 non-null object
minutes         162562 non-null float64
id              162562 non-null int64
seed            162562 non-null int64
entry           162562 non-null object
name            162562 non-null object
ht              162562 non-null float64
ioc             162562 non-null object
age             162562 non-null float64
rank            162562 non-null float64
rank_points     162562 non-null float64
ace             162562 non-null float64
df              162562 non-null float64
svpt            162562 non-null float64
1stIn           162562 non-null float64
1stWon          162562 non-nu

In [30]:
df_pred_cols=df_win_loss[[ 
        'minutes', 
       'ht', 'age', 'rank_points', 'ace', 'df',
       'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced',
       'Carpet', 'Clay', 'Grass', 'Hard', 'None', 'L', 'R', 'U', 'A',
       'C', 'D', 'F', 'G', 'M']]

## Selecting the time subset of data

### Exploration

In [31]:
df_win_loss.columns


Index(['tourney_id', 'match_num', 'tourney_name', 'drasize', 'tourney_date',
       'score', 'best_of', 'round', 'minutes', 'id', 'seed', 'entry', 'name',
       'ht', 'ioc', 'age', 'rank', 'rank_points', 'ace', 'df', 'svpt', '1stIn',
       '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced', 'target', 'Carpet',
       'Clay', 'Grass', 'Hard', 'None', 'L', 'R', 'U', 'A', 'C', 'D', 'F', 'G',
       'M'],
      dtype='object')

In [32]:
tourney_date=df_win_loss['tourney_date'].values
type(np.max(tourney_date)) #20180521, np.int64
np.min(tourney_date) #19901231

19901231

In [33]:
# shows that data not easily converted to date
len(pd.to_datetime(df_win_loss['tourney_date']).unique()) #1078

1078

### Conversion of 'tourney_date' to datetime variable

In [34]:
## this conversion works, have to reassign becuase operation is not inplace
df_win_loss['tourney_date']= pd.to_datetime(df_win_loss['tourney_date'].apply(str))
type(df_win_loss['tourney_date']) #seriers, post-pend '.values':np.ndarray of datetimes
df_win_loss['tourney_date'].apply(lambda x: x.year).unique()
# 1992, 1991, 1993, 1996, 1999, 1994, 1995, 1997, 1998, 1990, 2004,
#        2008, 2007, 2015, 2012, 2003, 2002, 2000, 2001, 2013, 2009, 2017,
#        2014, 2010, 2006, 2005, 2011, 2016, 2018

# df_win_loss.groupby(df_win_loss['tourney_date'].apply(lambda x: x.year))['tourney_date']

array([1992, 1991, 1993, 1996, 1999, 1994, 1995, 1997, 1998, 1990, 2004,
       2008, 2007, 2015, 2012, 2003, 2002, 2000, 2001, 2013, 2009, 2017,
       2014, 2010, 2006, 2005, 2011, 2016, 2018])

In [35]:
df_win_loss.index = df_win_loss['tourney_date']
df_win_loss.set_index(np.arange(1,len(df_win_loss)+1),inplace=True)

In [36]:
# datetime.now() #5 hours ahead of Central Standard

In [37]:
one_year = timedelta(days=365)
# print(s[date_index - one_day]

### Get a duration of a year and limit dataframe to year before stipulated date

In [38]:
date = pd.datetime(2010,11,23)
one_year=timedelta(days=370) 
df_one_year = df_win_loss[(df_win_loss.tourney_date<date) & (df_win_loss.tourney_date> date-one_year)]
df_one_year.shape #5432,42; 'days=500', shape is 7544,42

(5462, 42)

In [None]:
# Checking which years the data of resultant dataframe is from
df_one_year['tourney_date'].apply(lambda x: x.year).unique()

### Conclsion: get a one-year basis dataframe 

## Get player data for the last  year

#### One method, Masking: Checking Roger Federer

In [None]:
#Works but gives same value, replated for all columsn of data frame
df_win_loss[df_win_loss['name']=='Roger Federer'].count()
#Works and give a single value as I want
df_win_loss.loc[df_win_loss['name']=='Roger Federer', 'name'].count() #1339
#Seems there are other Roger's, checked a few lines down:
df_win_loss.loc[df_win_loss['name'].str.contains('Roger'), 'name'].count() #1652
#But only one Federer, number matches that of 'Roger Federer':
df_win_loss.loc[df_win_loss['name'].str.contains('Federer'), 'name'].count() #1339
#Gets expect result of 313 Roger's not named Federer (1652-1339)
df_win_loss.loc[(df_win_loss['name'].str.contains('Roger'))&(df_win_loss['name']!='Roger Federer'), 'name'].count() 

#### Other method, Grouping: Checking Novak and Nadal

In [51]:
player1='Rafael Nadal'
player2='Novak Djokovic'
df_win_loss.groupby('name').get_group(player1) 
df_win_loss.groupby('name').get_group(player2)

Unnamed: 0,tourney_id,match_num,tourney_name,drasize,tourney_date,score,best_of,round,minutes,id,seed,entry,name,ht,ioc,age,rank,rank_points,ace,df,svpt,1stIn,1stWon,2ndWon,SvGms,bpSaved,bpFaced,target,Carpet,Clay,Grass,Hard,None,L,R,U,A,C,D,F,G,M
30299,2004-773,14,Bucharest,32,2004-09-13,2-6 6-4 6-4,3,R32,146.0,104925,9999,Q,Novak Djokovic,188.0,SRB,17.314168,272.0,128.0,4.0,2.0,91.0,60.0,39.0,19.0,14.0,2.0,5.0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0
32806,2008-96,24,Beijing Olympics,64,2008-08-11,6-4 6-4,3,R64,72.0,104925,3,standard,Novak Djokovic,188.0,SRB,21.223819,3.0,4905.0,6.0,1.0,51.0,38.0,28.0,7.0,10.0,1.0,3.0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0
32826,2008-96,44,Beijing Olympics,64,2008-08-11,6-4 6-2,3,R32,74.0,104925,3,standard,Novak Djokovic,188.0,SRB,21.223819,3.0,4905.0,5.0,2.0,43.0,31.0,29.0,5.0,9.0,0.0,1.0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0
32836,2008-96,54,Beijing Olympics,64,2008-08-11,7-6(3) 6-3,3,R16,97.0,104925,3,standard,Novak Djokovic,188.0,SRB,21.223819,3.0,4905.0,4.0,0.0,54.0,40.0,35.0,9.0,10.0,0.0,0.0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0
32841,2008-96,59,Beijing Olympics,64,2008-08-11,4-6 6-1 6-4,3,QF,142.0,104925,3,standard,Novak Djokovic,188.0,SRB,21.223819,3.0,4905.0,4.0,1.0,100.0,69.0,48.0,15.0,14.0,5.0,7.0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0
33175,2008-1536,21,Madrid Masters,48,2008-10-12,6-7(8) 7-6(6) 3-1 RET,3,R32,140.0,104925,3,standard,Novak Djokovic,188.0,SRB,21.396304,3.0,4725.0,10.0,2.0,90.0,58.0,44.0,22.0,14.0,1.0,2.0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1
33248,2008-540,17,Wimbledon,128,2008-06-23,7-5 2-6 6-3 6-0,5,R128,127.0,104925,3,standard,Novak Djokovic,188.0,SRB,21.089665,3.0,5360.0,13.0,5.0,94.0,67.0,53.0,15.0,18.0,3.0,5.0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0
33421,2008-422,36,Cincinnati Masters,56,2008-07-28,7-6(2) 7-6(2),3,R32,114.0,104925,3,standard,Novak Djokovic,188.0,SRB,21.185489,3.0,5055.0,9.0,5.0,97.0,54.0,45.0,21.0,12.0,10.0,12.0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1
33431,2008-422,46,Cincinnati Masters,56,2008-07-28,6-1 6-2,3,R16,59.0,104925,3,standard,Novak Djokovic,188.0,SRB,21.185489,3.0,5055.0,8.0,1.0,41.0,23.0,19.0,11.0,8.0,0.0,1.0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1
33436,2008-422,51,Cincinnati Masters,56,2008-07-28,6-3 6-4,3,QF,94.0,104925,3,standard,Novak Djokovic,188.0,SRB,21.185489,3.0,5055.0,6.0,1.0,64.0,43.0,33.0,12.0,10.0,3.0,3.0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1


In [42]:
df_win_loss.loc[(df_win_loss['name'].str.contains('Roger'))&(df_win_loss['name']!='Roger Federer'), 'name'].tail(100)

136131       Rogerio Dutra Silva
136183       Rogerio Dutra Silva
136318       Rogerio Dutra Silva
136408       Rogerio Dutra Silva
136630       Rogerio Dutra Silva
136741       Rogerio Dutra Silva
136823       Rogerio Dutra Silva
136984       Rogerio Dutra Silva
137111       Rogerio Dutra Silva
137289       Rogerio Dutra Silva
137333       Rogerio Dutra Silva
137515       Rogerio Dutra Silva
137638       Rogerio Dutra Silva
137754       Rogerio Dutra Silva
137868       Rogerio Dutra Silva
137941       Rogerio Dutra Silva
138040       Rogerio Dutra Silva
138109       Rogerio Dutra Silva
138268       Rogerio Dutra Silva
138769    Edouard Roger Vasselin
138795    Edouard Roger Vasselin
138911    Edouard Roger Vasselin
139111       Rogerio Dutra Silva
139229    Edouard Roger Vasselin
139257    Edouard Roger Vasselin
139343    Edouard Roger Vasselin
139493    Edouard Roger Vasselin
139569    Edouard Roger Vasselin
139616    Edouard Roger Vasselin
139732    Edouard Roger Vasselin
139781    

In [48]:
player1='Rafael Nadal'
player2='Novac Djokovic'
#Works but not what I want, all players where they appear more than twice
# df_one_year.groupby('name').filter(lambda x: len(x)>2) 
df_one_year.groupby('name').get_group(player1) 

Unnamed: 0,tourney_id,match_num,tourney_name,drasize,tourney_date,score,best_of,round,minutes,id,seed,entry,name,ht,ioc,age,rank,rank_points,ace,df,svpt,1stIn,1stWon,2ndWon,SvGms,bpSaved,bpFaced,target,Carpet,Clay,Grass,Hard,None,L,R,U,A,C,D,F,G,M
60064,2010-1536,40,Madrid Masters,56,2010-05-09,6-4 6-3,3,R32,87.0,104745,2,standard,Rafael Nadal,185.0,ESP,23.934292,3.0,6480.0,0.0,2.0,68.0,52.0,37.0,9.0,9.0,1.0,1.0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1
60072,2010-1536,48,Madrid Masters,56,2010-05-09,7-5 6-4,3,R16,87.0,104745,2,standard,Rafael Nadal,185.0,ESP,23.934292,3.0,6480.0,1.0,0.0,52.0,34.0,29.0,15.0,11.0,0.0,0.0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1
60076,2010-1536,52,Madrid Masters,56,2010-05-09,6-1 6-3,3,QF,78.0,104745,2,standard,Rafael Nadal,185.0,ESP,23.934292,3.0,6480.0,1.0,1.0,41.0,30.0,27.0,6.0,8.0,2.0,2.0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1
60078,2010-1536,54,Madrid Masters,56,2010-05-09,4-6 6-2 6-2,3,SF,136.0,104745,2,standard,Rafael Nadal,185.0,ESP,23.934292,3.0,6480.0,1.0,1.0,70.0,45.0,35.0,9.0,13.0,3.0,6.0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1
60079,2010-1536,55,Madrid Masters,56,2010-05-09,6-4 7-6(5),3,F,131.0,104745,2,standard,Rafael Nadal,185.0,ESP,23.934292,3.0,6480.0,2.0,2.0,81.0,59.0,32.0,14.0,11.0,8.0,11.0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1
60197,2010-540,64,Wimbledon,128,2010-06-21,6-2 6-4 6-4,5,R128,125.0,104745,2,standard,Rafael Nadal,185.0,ESP,24.049281,1.0,8745.0,8.0,3.0,78.0,54.0,44.0,13.0,14.0,2.0,3.0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0
60228,2010-540,96,Wimbledon,128,2010-06-21,5-7 6-2 3-6 6-0 6-3,5,R64,142.0,104745,2,standard,Rafael Nadal,185.0,ESP,24.049281,1.0,8745.0,5.0,2.0,107.0,73.0,62.0,23.0,22.0,0.0,2.0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0
60244,2010-540,112,Wimbledon,128,2010-06-21,6-4 4-6 6-7(5) 6-2 6-3,5,R32,225.0,104745,2,standard,Rafael Nadal,185.0,ESP,24.049281,1.0,8745.0,12.0,1.0,142.0,96.0,77.0,28.0,25.0,1.0,2.0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0
60252,2010-540,120,Wimbledon,128,2010-06-21,6-4 6-2 6-2,5,R16,121.0,104745,2,standard,Rafael Nadal,185.0,ESP,24.049281,1.0,8745.0,8.0,1.0,68.0,44.0,38.0,16.0,13.0,2.0,2.0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0
60256,2010-540,124,Wimbledon,128,2010-06-21,3-6 6-3 7-6(4) 6-1,5,QF,163.0,104745,2,standard,Rafael Nadal,185.0,ESP,24.049281,1.0,8745.0,12.0,3.0,126.0,85.0,59.0,25.0,19.0,4.0,7.0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0


In [40]:
df_pred_cols=df_one_year[[ 
        'minutes', 
       'ht', 'age', 'rank_points', 'ace', 'df',
       'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced',
       'Carpet', 'Clay', 'Grass', 'Hard', 'None', 'L', 'R', 'U', 'A',
       'C', 'D', 'F', 'G', 'M']]

# _ASIDE_: Cool python/pandas stuff

In [None]:
# shift...looks like for datetime...not sure what quatity it shifts by (a 'year'?)
df.groupby([df['Date'].dt.month,df['Date'].dt.day])['Value'].shift()

In [None]:
# last() method to get data from last <period of time, #days, #weeks etc>
df_win_loss.index = df_win_loss['tourney_date']
df_win_loss.last('1y')

In [None]:
##
import datetime
one_day = timedelta(days=1)
print(s[date_index - one_day]