# Which player has the most points to be next year's best golfer?

In [1]:
import numpy as np 
import pandas as pd
import matplotlib as plt
import seaborn as sns
import requests
import json
from bs4 import BeautifulSoup

#### I am going to use pgaTourData for the years 2017 and 2018. To check whether the valuations make sense, I'm going to use the PGA Tour website to see who truly were the best golfers in the 2019 season (2018-2019).

In [2]:
# Load the raw PGA Tour statistics from the CSV file in the working directory.
df = pd.read_csv('pgaTourData.csv')

#### Although the dataset seems pretty clean, it still contains some null values that we want to get rid of, and we only want to keep the data for the years 2017 and 2018. Moreover, for greater simplicity, I'm going to rename most headers to replace the spaces between words with '_'.
#### So... let's do some cleaning!!


In [3]:
# Keep only the rows that have a value in the Rounds column.
df = df[df['Rounds'].notna()]

In [4]:
# Rename the headers so they contain no spaces or colons (e.g. 'SG:OTT' -> 'SG_OTT').
df_golf = df.rename(
    columns={
        'Player Name': 'Player_Name',
        'Fairway Percentage': 'Fairway_Percentage',
        'Avg Distance': 'Avg_Distance',
        'Average Putts': 'Average_Putts',
        'Average Scrambling': 'Average_Scrambling',
        'Average Score': 'Average_Score',
        'Top 10': 'Top_10',
        'Average SG Putts': 'Average_SG_Putts',
        'Average SG Total': 'Average_SG_Total',
        'SG:OTT': 'SG_OTT',
        'SG:APR': 'SG_APR',
        'SG:ARG': 'SG_ARG',
    }
)

In [5]:
# Preview the first rows to confirm the renamed headers.
df_golf.head()

Unnamed: 0,Player_Name,Rounds,Fairway_Percentage,Year,Avg_Distance,gir,Average_Putts,Average_Scrambling,Average_Score,Points,Wins,Top_10,Average_SG_Putts,Average_SG_Total,SG_OTT,SG_APR,SG_ARG,Money
0,Henrik Stenson,60.0,75.19,2018,291.5,73.51,29.93,60.67,69.617,868,,5.0,-0.207,1.153,0.427,0.96,-0.027,"$2,680,487"
1,Ryan Armour,109.0,73.58,2018,283.5,68.22,29.31,60.13,70.758,1006,1.0,3.0,-0.058,0.337,-0.012,0.213,0.194,"$2,485,203"
2,Chez Reavie,93.0,72.24,2018,286.5,68.67,29.12,62.27,70.432,1020,,3.0,0.192,0.674,0.183,0.437,-0.137,"$2,700,018"
3,Ryan Moore,78.0,71.94,2018,289.2,68.8,29.17,64.16,70.015,795,,5.0,-0.271,0.941,0.406,0.532,0.273,"$1,986,608"
4,Brian Stuard,103.0,71.44,2018,278.9,67.12,29.11,59.23,71.038,421,,3.0,0.164,0.062,-0.227,0.099,0.026,"$1,089,763"


In [6]:
# Replace NaN with 0 in Top_10 and store the column as int.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas warns about and which
# no longer updates df_golf under copy-on-write.
df_golf['Top_10'] = df_golf['Top_10'].fillna(0).astype(int)

In [7]:
# Replace NaN with 0 in Wins and store the column as int.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas warns about and which
# no longer updates df_golf under copy-on-write.
df_golf['Wins'] = df_golf['Wins'].fillna(0).astype(int)

In [8]:
# Store Rounds as whole numbers.
df_golf['Rounds'] = df_golf.Rounds.astype(int)

In [9]:
# Drop every row that still contains a missing value in any column, then
# remove the thousands separators from Points and store the column as int.
df_golf.dropna(axis='index', how='any', inplace=True)
points_without_commas = df_golf['Points'].str.replace(',', '')
df_golf['Points'] = points_without_commas.astype(int)

In [10]:
# Strip the currency formatting from Money ("$2,680,487" -> 2680487) and store
# the column as int.
# regex=False is passed explicitly: '$' is a regex end-of-string anchor, and the
# default of the `regex` argument differs between pandas versions, so both
# characters are matched literally.
df_golf['Money'] = (
    df_golf['Money']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

In [11]:
# Index labels of every row whose Year is 2016 or earlier.
indexYears = df_golf.index[df_golf['Year'] <= 2016]

In [12]:
# Remove those rows in place.
df_golf.drop(index=indexYears, inplace=True)

In [None]:
# Preview the first rows after the cleaning steps.
df_golf.head()

#### To make predictions, I'm going to take what I consider to be the most important golf metrics and weight them according to their importance to the game. To obtain their 'importance' I will use their correlations with the average score:

In [13]:
# Pearson correlation matrix of the numeric columns.
# The numeric columns are selected explicitly because df_golf also holds the
# text column Player_Name, and pandas >= 2.0 raises on non-numeric columns in
# corr() instead of silently skipping them.
df_golf.select_dtypes(include='number').corr(method='pearson')

Unnamed: 0,Rounds,Fairway_Percentage,Year,Avg_Distance,gir,Average_Putts,Average_Scrambling,Average_Score,Points,Wins,Top_10,Average_SG_Putts,Average_SG_Total,SG_OTT,SG_APR,SG_ARG,Money
Rounds,1.0,0.096178,0.022152,0.145598,0.304078,0.010795,0.14707,-0.345654,0.375095,0.125912,0.400573,0.033003,0.317026,0.221764,0.276473,0.10504,0.301076
Fairway_Percentage,0.096178,1.0,0.111276,-0.43946,0.381394,0.236871,0.230514,-0.189344,0.026938,-0.045935,0.048697,-0.039612,0.181384,0.237628,0.161663,-0.085005,0.009446
Year,0.022152,0.111276,1.0,0.226611,0.186522,0.09246,-0.087525,-0.040506,0.029641,0.007024,0.031122,0.023346,0.028998,0.016834,0.009417,0.007282,0.037892
Avg_Distance,0.145598,-0.43946,0.226611,1.0,0.359577,0.173347,-0.154406,-0.403167,0.409369,0.272563,0.389985,-0.123221,0.402398,0.643721,0.215262,-0.079841,0.415772
gir,0.304078,0.381394,0.186522,0.359577,1.0,0.548651,0.129249,-0.600835,0.375015,0.18763,0.373561,-0.204436,0.577352,0.629901,0.593165,-0.019838,0.358602
Average_Putts,0.010795,0.236871,0.09246,0.173347,0.548651,1.0,-0.450432,0.161003,-0.193433,-0.14771,-0.171583,-0.663268,-0.169504,0.313572,0.131752,-0.419566,-0.199834
Average_Scrambling,0.14707,0.230514,-0.087525,-0.154406,0.129249,-0.450432,1.0,-0.54402,0.313745,0.112102,0.311153,0.366179,0.543458,0.099399,0.27134,0.607236,0.306073
Average_Score,-0.345654,-0.189344,-0.040506,-0.403167,-0.600835,0.161003,-0.54402,1.0,-0.758952,-0.404509,-0.732059,-0.277791,-0.961917,-0.602422,-0.707785,-0.417551,-0.748177
Points,0.375095,0.026938,0.029641,0.409369,0.375015,-0.193433,0.313745,-0.758952,1.0,0.780882,0.861735,0.20441,0.733906,0.477488,0.537391,0.312214,0.970333
Wins,0.125912,-0.045935,0.007024,0.272563,0.18763,-0.14771,0.112102,-0.404509,0.780882,1.0,0.500958,0.100277,0.38546,0.269806,0.268201,0.167685,0.761538


In [None]:
# Important correlations (in my golfing opinion).
# The values below are copied from the Average_Score row of the Pearson
# correlation matrix; all four are negative, i.e. a higher metric goes with a
# lower average score.
'''
coor(Avg_Distance, Average_Score) = -0.403167
coor(Average_SG_Putts, Average_Score) = -0.277791
coor(SG_APR, Average_Score) = -0.707785
coor(SG_ARG, Average_Score) = -0.417551
'''

#### Golfers who have made over 4,500,000 dollars are flagged as top golfers and will appear from now on in the charts as orange dots.

In [None]:
# Joint plot of driving distance (Avg_Distance) against accuracy
# (Fairway_Percentage); the hue separates players whose Money exceeds 4,500,000.
sns.jointplot(
    x=df_golf['Avg_Distance'],
    y=df_golf['Fairway_Percentage'],
    hue=df_golf['Money'] > 4500000,
    height=20,
)

In [None]:
# Longest hitters: driving columns ordered by average distance.
# Sorted in descending order so that head() lists the longest drivers; an
# ascending sort would show the shortest hitters instead.
df_driving_ranking = df_golf[['Year', 'Player_Name', 'Avg_Distance', 'Fairway_Percentage']].copy()
df_driving_ranking.sort_values(by='Avg_Distance', inplace=True, ascending=False)
df_driving_ranking.head()


In [None]:
# Build a ranking of the longest drivers: after the ascending sort, each row's
# new 0..n-1 position is stored as driving_ranking (0 = shortest average
# distance, highest value = longest).
df_golf.sort_values(by='Avg_Distance', inplace=True, ascending=True)
# drop=True discards the previous index. Without it reset_index inserts the old
# index as an extra 'index' column (then 'level_0' on a second call, and raises
# on a third), which breaks repeating this pattern for further rankings.
df_golf.reset_index(drop=True, inplace=True)
df_golf['driving_ranking'] = df_golf.index


df_golf.head()


In [None]:
# Line readers (putters): putting columns ordered by Average_SG_Putts.
# Sorted in descending order so that head() lists the players with the highest
# Average_SG_Putts; an ascending sort would show the lowest values instead.
df_best_putters = df_golf[['Year', 'Player_Name', 'Wins', 'Average_SG_Putts']].copy()
df_best_putters.sort_values(by='Average_SG_Putts', inplace=True, ascending=False)
df_best_putters.head()

In [None]:
# Build a ranking of the best putters: after the ascending sort, each row's new
# 0..n-1 position is stored as best_putters (0 = lowest Average_SG_Putts,
# highest value = best).
df_golf.sort_values(by='Average_SG_Putts', inplace=True, ascending=True)
# drop=True discards the previous index. Without it reset_index inserts the old
# index as an extra 'index' column (or 'level_0' if 'index' already exists, and
# raises once both exist).
df_golf.reset_index(drop=True, inplace=True)
df_golf['best_putters'] = df_golf.index


df_golf.head()


In [None]:
# Joint plot of Average_Score against Average_SG_Putts; the hue separates
# players whose Money exceeds 4,500,000.
sns.jointplot(
    x=df_golf['Average_Score'],
    y=df_golf['Average_SG_Putts'],
    hue=df_golf['Money'] > 4500000,
    height=20,
)

In [None]:
# 7, 8, 9 & pitching irons: approach columns ordered by SG_APR.
# Sorted in descending order so that head() lists the players with the highest
# SG_APR; an ascending sort would show the lowest values instead.
df_best_approachers = df_golf[['Year', 'Player_Name', 'Wins', 'SG_APR']].copy()
df_best_approachers.sort_values(by='SG_APR', inplace=True, ascending=False)
df_best_approachers.head()

In [None]:
# We rank best approachers
# NOTE(review): the ranking code in this cell is disabled (wrapped in a string
# literal), so no 'best_approachers' column is created on df_golf.


# df_golf['APR_Masters'] = df_golf.sort_values(by = 'SG_APR', inplace = True, ascending = True)

'''
df_golf.sort_values(by = 'SG_APR', inplace = True, ascending = True)
df_golf.reset_index(inplace = True)
df_golf['best_approachers'] = df_golf.index
'''

#df_golf.head()

In [None]:
# Approach skills among the best players (7, 8, 9 and pitching irons): joint
# plot of Average_Score against SG_APR; the hue separates players whose Money
# exceeds 4,500,000.
sns.jointplot(
    x=df_golf['Average_Score'],
    y=df_golf['SG_APR'],
    hue=df_golf['Money'] > 4500000,
    height=20,
)

In [None]:
# Pitch and wedge specialists: around-the-green columns, highest SG_ARG first.
pw_columns = ['Year', 'Player_Name', 'Wins', 'SG_ARG']
df_best_pw = df_golf[pw_columns].sort_values(by='SG_ARG', ascending=False)
df_best_pw.head()

In [None]:
# Pitch and wedge skills among the best players: joint plot of Average_Score
# against SG_ARG; the hue separates players whose Money exceeds 4,500,000.
sns.jointplot(
    x=df_golf['Average_Score'],
    y=df_golf['SG_ARG'],
    hue=df_golf['Money'] > 4500000,
    height=20,
)

In [None]:
# We rank best pitchers
# NOTE(review): the ranking code in this cell is disabled (wrapped in a string
# literal), so no 'best_pw' column is created on df_golf.
'''
df_golf.sort_values(by = 'SG_ARG', inplace = True, ascending = True)
df_golf.reset_index(inplace = True)
df_golf['best_pw'] = df_golf.index
'''

#df_golf.head()

#### With all these new columns, I am going to create another one that multiplies each player's ranking by its correlation to the average score. This new column should order players by skill and should be a good indicator of who is best prepared to be next year's top golfer.

In [None]:
# df_golf['Predictive_Ranking'] = (df_golf.driving_ranking * 0.403167) + (df_golf.best_putters * 0.277791) + (df_golf.best_approachers * 0.707785) + (df_golf.best_pw * 0.417551)

### WEB SCRAPING 

In [None]:
# PGA Tour statistics page that is scraped below.
# NOTE(review): the URL contains "y2020" while the introduction of this notebook
# talks about checking the 2019 season — confirm the intended year.
url = 'https://www.pgatour.com/stats/stat.120.y2020.html'

In [None]:
# Download the page. A timeout is set because requests waits indefinitely by
# default, which would hang the notebook if the server never answers.
response = requests.get(url, timeout=10)

In [None]:
# Show the HTTP status code of the download.
response.status_code

In [None]:
# Parse the downloaded HTML. The parser is named explicitly: without it
# BeautifulSoup picks whichever parser happens to be installed (and warns), so
# the parsed tree could differ between environments.
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Show the page title without surrounding whitespace.
soup.title.get_text().strip()

In [None]:
# All <td> cells carrying the CSS class "player-name".
player = soup.find_all('td', attrs={'class': 'player-name'})

In [None]:
# Take the first player cell to inspect its structure.
p= player[0]

In [None]:
# Show the first <a> element inside that cell.
p.find('a')

In [None]:
# Player names: the stripped text of the <a> element inside each player cell.
[cell.find("a").get_text().strip() for cell in player]

In [None]:
# Collect the <td> cells of the page. The previous second argument '/td' was
# interpreted by find_all as a CSS class filter, so it could only match cells
# written as <td class="/td">.
# NOTE(review): this returns every table cell, not only the average-score
# column — narrow the filter once the markup of that column is confirmed.
avg_score = soup.find_all('td')

In [None]:
# Display the collected cells.
avg_score

In [None]:
a