# 0.0 Imports

In [36]:
import inflection
import pandas            as pd 
import numpy             as np
import matplotlib.pyplot as plt  
import seaborn           as sns         

from sklearn.model_selection import train_test_split
from sklearn.linear_model    import LinearRegression

from IPython.core.display import display, HTML

## 0.1 Helper Functions

In [29]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:75% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

In [30]:
# Apply the plotting and pandas display configuration defined in jupyter_settings.
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## 0.2 Load Data

In [33]:
# Read the tennis statistics CSV into a DataFrame.
df = pd.read_csv(
    'data/tennis_stats.csv',
    low_memory=False,
)

In [34]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Player,Year,FirstServe,FirstServePointsWon,FirstServeReturnPointsWon,SecondServePointsWon,SecondServeReturnPointsWon,Aces,BreakPointsConverted,BreakPointsFaced,BreakPointsOpportunities,BreakPointsSaved,DoubleFaults,ReturnGamesPlayed,ReturnGamesWon,ReturnPointsWon,ServiceGamesPlayed,ServiceGamesWon,TotalPointsWon,TotalServicePointsWon,Wins,Losses,Winnings,Ranking
0,Pedro Sousa,2016,0.88,0.5,0.38,0.5,0.39,0,0.14,7,7,0.43,2,9,0.11,0.38,8,0.5,0.43,0.5,1,2,39820,119
1,Roman Safiullin,2017,0.84,0.62,0.26,0.33,0.07,7,0.0,7,0,0.57,1,9,0.0,0.2,9,0.67,0.41,0.57,0,1,17334,381
2,Pedro Sousa,2017,0.83,0.6,0.28,0.53,0.44,2,0.38,10,8,0.4,1,19,0.16,0.34,17,0.65,0.45,0.59,4,1,109827,119
3,Rogerio Dutra Silva,2010,0.83,0.64,0.34,0.59,0.33,2,0.33,5,6,0.4,0,14,0.14,0.34,15,0.8,0.49,0.63,0,0,9761,125
4,Daniel Gimeno-Traver,2017,0.81,0.54,0.0,0.33,0.33,1,0.0,2,0,0.5,2,3,0.0,0.2,2,0.5,0.35,0.5,0,1,32879,272


# 1.0 Describe Data

In [35]:
# List the column names as read from the CSV (CamelCase at this point).
df.columns

Index(['Player', 'Year', 'FirstServe', 'FirstServePointsWon',
       'FirstServeReturnPointsWon', 'SecondServePointsWon',
       'SecondServeReturnPointsWon', 'Aces', 'BreakPointsConverted',
       'BreakPointsFaced', 'BreakPointsOpportunities', 'BreakPointsSaved',
       'DoubleFaults', 'ReturnGamesPlayed', 'ReturnGamesWon',
       'ReturnPointsWon', 'ServiceGamesPlayed', 'ServiceGamesWon',
       'TotalPointsWon', 'TotalServicePointsWon', 'Wins', 'Losses', 'Winnings',
       'Ranking'],
      dtype='object')

## 1.1 Rename Columns

In [37]:
# Take the current names from the frame itself rather than a hand-copied
# list, so the mapping cannot drift out of sync with the CSV header.
cols_old = list(df.columns)

# inflection.underscore turns CamelCase into snake_case
# (e.g. 'FirstServePointsWon' -> 'first_serve_points_won').
snakecase = inflection.underscore

cols_new = [snakecase(col) for col in cols_old]

# Re-running this cell is safe: names that are already snake_case are unchanged.
df.columns = cols_new

In [38]:
# Confirm the column names are now snake_case.
df.columns

Index(['player', 'year', 'first_serve', 'first_serve_points_won',
       'first_serve_return_points_won', 'second_serve_points_won',
       'second_serve_return_points_won', 'aces', 'break_points_converted',
       'break_points_faced', 'break_points_opportunities',
       'break_points_saved', 'double_faults', 'return_games_played',
       'return_games_won', 'return_points_won', 'service_games_played',
       'service_games_won', 'total_points_won', 'total_service_points_won',
       'wins', 'losses', 'winnings', 'ranking'],
      dtype='object')

## 1.2 Data Dimensions

In [40]:
# df.shape is (rows, columns); unpack it once and report both.
n_rows, n_cols = df.shape
print( 'Number of Rows: {}'.format( n_rows ) )
print( 'Number of Cols: {}'.format( n_cols ) )

Number of Rows: 1721
Number of Cols: 24


## 1.3 Data Types 

In [41]:
# Show the dtype pandas inferred for each column.
df.dtypes

player                             object
year                                int64
first_serve                       float64
first_serve_points_won            float64
first_serve_return_points_won     float64
second_serve_points_won           float64
second_serve_return_points_won    float64
aces                                int64
break_points_converted            float64
break_points_faced                  int64
break_points_opportunities          int64
break_points_saved                float64
double_faults                       int64
return_games_played                 int64
return_games_won                  float64
return_points_won                 float64
service_games_played                int64
service_games_won                 float64
total_points_won                  float64
total_service_points_won          float64
wins                                int64
losses                              int64
winnings                            int64
ranking                           