In [31]:
# imports
import numpy as np 
import pandas as pd
import matplotlib as plt
import seaborn as sns
import requests
import json
from bs4 import BeautifulSoup

In [32]:
# Load the raw CSV into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv('data/pgaTourData.csv')

In [33]:
# Preview the first five rows to get a feel for the columns and value formats.
df.head(n=5)

Unnamed: 0,Player Name,Rounds,Fairway Percentage,Year,Avg Distance,gir,Average Putts,Average Scrambling,Average Score,Points,Wins,Top 10,Average SG Putts,Average SG Total,SG:OTT,SG:APR,SG:ARG,Money
0,Henrik Stenson,60.0,75.19,2018,291.5,73.51,29.93,60.67,69.617,868,,5.0,-0.207,1.153,0.427,0.96,-0.027,"$2,680,487"
1,Ryan Armour,109.0,73.58,2018,283.5,68.22,29.31,60.13,70.758,1006,1.0,3.0,-0.058,0.337,-0.012,0.213,0.194,"$2,485,203"
2,Chez Reavie,93.0,72.24,2018,286.5,68.67,29.12,62.27,70.432,1020,,3.0,0.192,0.674,0.183,0.437,-0.137,"$2,700,018"
3,Ryan Moore,78.0,71.94,2018,289.2,68.8,29.17,64.16,70.015,795,,5.0,-0.271,0.941,0.406,0.532,0.273,"$1,986,608"
4,Brian Stuard,103.0,71.44,2018,278.9,67.12,29.11,59.23,71.038,421,,3.0,0.164,0.062,-0.227,0.099,0.026,"$1,089,763"


#### Although the dataset looks fairly clean, it still contains some null values that we want to get rid of. In addition, we only want to keep the data for the years 2017 and 2018. Moreover, for greater simplicity, I'm going to rename most headers to replace the spaces between words with '_'.
#### So... let's do some cleaning!!

In [34]:
# Summarise each column's dtype and non-null count to spot missing values and mistyped columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2312 entries, 0 to 2311
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Player Name         2312 non-null   object 
 1   Rounds              1678 non-null   float64
 2   Fairway Percentage  1678 non-null   float64
 3   Year                2312 non-null   int64  
 4   Avg Distance        1678 non-null   float64
 5   gir                 1678 non-null   float64
 6   Average Putts       1678 non-null   float64
 7   Average Scrambling  1678 non-null   float64
 8   Average Score       1678 non-null   float64
 9   Points              2296 non-null   object 
 10  Wins                293 non-null    float64
 11  Top 10              1458 non-null   float64
 12  Average SG Putts    1678 non-null   float64
 13  Average SG Total    1678 non-null   float64
 14  SG:OTT              1678 non-null   float64
 15  SG:APR              1678 non-null   float64
 16  SG:ARG

In [35]:
# Count the missing values in every column.
df.isna().sum()

Player Name              0
Rounds                 634
Fairway Percentage     634
Year                     0
Avg Distance           634
gir                    634
Average Putts          634
Average Scrambling     634
Average Score          634
Points                  16
Wins                  2019
Top 10                 854
Average SG Putts       634
Average SG Total       634
SG:OTT                 634
SG:APR                 634
SG:ARG                 634
Money                   12
dtype: int64

In [36]:
# Keep only the rows that have a 'Rounds' value.
# `.copy()` makes the filtered result an independent DataFrame, so the column
# assignments in the following cells do not operate on a slice of the original
# frame (which triggers SettingWithCopyWarning and ambiguous write behaviour).
df = df[df['Rounds'].notna()].copy()

# Re-check the per-column missing-value counts after the filter.
df.isnull().sum()

Player Name              0
Rounds                   0
Fairway Percentage       0
Year                     0
Avg Distance             0
gir                      0
Average Putts            0
Average Scrambling       0
Average Score            0
Points                   4
Wins                  1395
Top 10                 325
Average SG Putts         0
Average SG Total         0
SG:OTT                   0
SG:APR                   0
SG:ARG                   0
Money                    4
dtype: int64

In [39]:
# Replace NaN with 0 in 'Top 10' and make it an int.
# The filled Series is assigned back instead of calling
# `df['Top 10'].fillna(0, inplace=True)`: the in-place form mutates a temporary
# Series and, under pandas copy-on-write, leaves `df` unchanged, so the integer
# cast below would then fail on the remaining NaN values.
df['Top 10'] = df['Top 10'].fillna(0).astype(int)

In [40]:
# Replace NaN with 0 in 'Wins' and make it an int.
# The filled Series is assigned back instead of calling
# `df['Wins'].fillna(0, inplace=True)`: the in-place form mutates a temporary
# Series and, under pandas copy-on-write, leaves `df` unchanged, so the integer
# cast below would then fail on the remaining NaN values.
df['Wins'] = df['Wins'].fillna(0).astype(int)

In [41]:
# Store 'Rounds' as an integer column.
df = df.astype({'Rounds': int})

In [62]:
# Clean the 'Money' column: strip the '$' sign and thousands separators,
# treat missing values as 0 and store the result as an integer.
#
# - `astype(str)` keeps the cell re-runnable: it also accepts a column that
#   has already been converted to numbers by a previous run.
# - One vectorised regex replace removes both '$' and ',' (inside a character
#   class '$' is a literal), instead of two per-element `apply` passes.
# - Missing values are filled after the float conversion and the result is
#   assigned back, rather than using a chained `fillna(..., inplace=True)`,
#   which does not update `df` under pandas copy-on-write.
# - The float step is kept before the int cast so values written like
#   '123.0' still convert.
money_text = df['Money'].astype(str).str.replace(r'[$,]', '', regex=True)
df['Money'] = money_text.astype(float).fillna(0).astype(int)

In [64]:
# This is finally how our cleaned dataset looks now.
# NOTE(review): per the output below, 'Points' is still an object column with
# 4 missing values — it has not been cleaned in the cells above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1678 entries, 0 to 1677
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Player Name         1678 non-null   object 
 1   Rounds              1678 non-null   int64  
 2   Fairway Percentage  1678 non-null   float64
 3   Year                1678 non-null   int64  
 4   Avg Distance        1678 non-null   float64
 5   gir                 1678 non-null   float64
 6   Average Putts       1678 non-null   float64
 7   Average Scrambling  1678 non-null   float64
 8   Average Score       1678 non-null   float64
 9   Points              1674 non-null   object 
 10  Wins                1678 non-null   int64  
 11  Top 10              1678 non-null   int64  
 12  Average SG Putts    1678 non-null   float64
 13  Average SG Total    1678 non-null   float64
 14  SG:OTT              1678 non-null   float64
 15  SG:APR              1678 non-null   float64
 16  SG:ARG

In [69]:
# List the column labels of the cleaned frame.
df.columns

Index(['Player Name', 'Rounds', 'Fairway Percentage', 'Year', 'Avg Distance',
       'gir', 'Average Putts', 'Average Scrambling', 'Average Score', 'Points',
       'Wins', 'Top 10', 'Average SG Putts', 'Average SG Total', 'SG:OTT',
       'SG:APR', 'SG:ARG', 'Money'],
      dtype='object')

## The current dataset now contains all the cleaned results from 2010 to 2018, and this is the one we will work with.

In [67]:
# Save the cleaned frame for the follow-up notebook.
# NOTE(review): `index=False` is not passed, so the DataFrame index is written as
# the first, unnamed CSV column — confirm the downstream notebook expects that.
df.to_csv('./data/golf_data.csv')

### Now let's continue with this dataset in another Jupyter notebook...