In [294]:
# Importing Pandas for data analysis and scikit-learn for Min/Max Scaling
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

### Strokes Gained Off The TEE

In [295]:
# Get strokes gained off the tee data from PGA tour website
dfOTT = pd.read_html("https://www.pgatour.com/stats/stat.02567.html")[1]


In [296]:
# figuring out all of the header names in the table
dfOTT.columns

Index(['RANK THIS WEEK', 'RANK LAST WEEK', 'PLAYER NAME', 'ROUNDS', 'AVERAGE',
       'TOTAL SG:OTT', 'MEASURED ROUNDS'],
      dtype='object')

In [297]:
# Creating a new final data frame with the only columns we need. Player name and Average Strokes Gained Off The Tee.
dfOTTf = dfOTT[['PLAYER NAME', 'AVERAGE']]

In [298]:
# Finding the value to adjust by to make all numbers non negative for each golfer.
numofrows = len(dfOTTf) - 1
adjustmentvalue = abs(dfOTTf.iloc[numofrows, 1])

In [299]:
# Adjustment to make all numbers non-negative by adding the absolute value of the lowest number to each value in the AVERAGE column. This makes each of the numbers in the AVERAGE column positive.
dfOTTf['AVERAGE'] = dfOTTf['AVERAGE'] + adjustmentvalue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfOTTf['AVERAGE'] = dfOTTf['AVERAGE'] + adjustmentvalue


In [300]:
# Data frame with adjusted numbers (no negatives)
dfOTTf

Unnamed: 0,PLAYER NAME,AVERAGE
0,Jon Rahm,2.364
1,Keith Mitchell,1.880
2,Cameron Young,1.877
3,Sergio Garcia,1.858
4,Brendan Steele,1.817
...,...,...
200,Brian Gay,0.244
201,Justin Lower,0.191
202,Stephan Jaeger,0.040
203,Wesley Bryan,0.008


In [301]:
# Min/Max scaling all average strokes gained off the tee numbers to between 0-1. This way strokes gained off of the tee will be on the same scale as all of the other statistics.
# Using the scikit-learn module imported above to automate min/max scaling
scaler = MinMaxScaler()
dfOTTf['AVERAGE_OTT_scaled'] = scaler.fit_transform(dfOTTf['AVERAGE'].values.reshape(-1,1))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfOTTf['AVERAGE_OTT_scaled'] = scaler.fit_transform(dfOTTf['AVERAGE'].values.reshape(-1,1))


In [302]:
# New Dataframe
dfOTTf

Unnamed: 0,PLAYER NAME,AVERAGE,AVERAGE_OTT_scaled
0,Jon Rahm,2.364,1.000000
1,Keith Mitchell,1.880,0.795262
2,Cameron Young,1.877,0.793993
3,Sergio Garcia,1.858,0.785956
4,Brendan Steele,1.817,0.768613
...,...,...,...
200,Brian Gay,0.244,0.103215
201,Justin Lower,0.191,0.080795
202,Stephan Jaeger,0.040,0.016920
203,Wesley Bryan,0.008,0.003384


In [303]:
dfOTTfinal = dfOTTf[['PLAYER NAME', 'AVERAGE_OTT_scaled']]
dfOTTfinal

Unnamed: 0,PLAYER NAME,AVERAGE_OTT_scaled
0,Jon Rahm,1.000000
1,Keith Mitchell,0.795262
2,Cameron Young,0.793993
3,Sergio Garcia,0.785956
4,Brendan Steele,0.768613
...,...,...
200,Brian Gay,0.103215
201,Justin Lower,0.080795
202,Stephan Jaeger,0.016920
203,Wesley Bryan,0.003384


### Strokes Gained Approaching The Green

In [304]:
# Get strokes gained approach the green data from pga tour website

dfATG = pd.read_html("https://www.pgatour.com/stats/stat.02568.html")[1]

In [305]:
# view column names
dfATG.columns


Index(['RANK THIS WEEK', 'RANK LAST WEEK', 'PLAYER NAME', 'ROUNDS', 'AVERAGE',
       'TOTAL SG:APP', 'MEASURED ROUNDS'],
      dtype='object')

In [306]:
# Creating a new final data frame with the only columns we need. Player name and Average Strokes Approaching the green.
dfATGf = dfATG[['PLAYER NAME', 'AVERAGE']]

In [307]:
# Finding the value to adjust by to make all numbers non negative for each golfer.
numofrowsatg = len(dfATGf) - 1
adjustmentvalueatg = abs(dfATGf.iloc[numofrowsatg, 1])

In [308]:
# Adjustment to make all numbers non-negative by adding the absolute value of the lowest number to each value in the AVERAGE column. This makes each of the numbers in the AVERAGE column positive.
dfATGf['AVERAGE'] = dfATGf['AVERAGE'] + adjustmentvalueatg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfATGf['AVERAGE'] = dfATGf['AVERAGE'] + adjustmentvalueatg


In [309]:
dfATGf

Unnamed: 0,PLAYER NAME,AVERAGE
0,Russell Henley,2.465
1,Will Zalatoris,2.437
2,Viktor Hovland,2.408
3,Justin Thomas,2.324
4,Shane Lowry,2.312
...,...,...
200,Lucas Herbert,0.275
201,Martin Trainer,0.209
202,Andrew Landry,0.180
203,Jonas Blixt,0.124


In [310]:
# Min/Max scaling all average strokes gained off the tee numbers to between 0-1. This way strokes gained ATG will be on the same scale as all of the other statistics.
# Using the scikit-learn module imported above to automate min/max scaling
scaler = MinMaxScaler()
dfATGf['AVERAGE_ATG_scaled'] = scaler.fit_transform(dfATGf['AVERAGE'].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfATGf['AVERAGE_ATG_scaled'] = scaler.fit_transform(dfATGf['AVERAGE'].values.reshape(-1,1))


In [311]:
dfATGf

Unnamed: 0,PLAYER NAME,AVERAGE,AVERAGE_ATG_scaled
0,Russell Henley,2.465,1.000000
1,Will Zalatoris,2.437,0.988641
2,Viktor Hovland,2.408,0.976876
3,Justin Thomas,2.324,0.942799
4,Shane Lowry,2.312,0.937931
...,...,...,...
200,Lucas Herbert,0.275,0.111562
201,Martin Trainer,0.209,0.084787
202,Andrew Landry,0.180,0.073022
203,Jonas Blixt,0.124,0.050304


In [312]:
dfATGfinal = dfATGf[["PLAYER NAME", "AVERAGE_ATG_scaled"]]

### Strokes Gained Around The Green

In [313]:
# Get strokes gained around the green data from pga tour website

dfARTG = pd.read_html("https://www.pgatour.com/stats/stat.02569.html")[1]

In [314]:
dfARTG.columns

Index(['RANK THIS WEEK', 'RANK LAST WEEK', 'PLAYER NAME', 'ROUNDS', 'AVERAGE',
       'TOTAL SG:ARG', 'MEASURED ROUNDS'],
      dtype='object')

In [315]:
# Creating a new final data frame with the only columns we need. Player name and Average Strokes Around the green.
dfARTGf = dfARTG[['PLAYER NAME', 'AVERAGE']]

In [316]:
# Finding the value to adjust by to make all numbers non negative for each golfer.
numofrowsartg = len(dfARTGf) - 1
adjustmentvalueartg = abs(dfARTGf.iloc[numofrowsartg, 1])

In [317]:
# Adjustment to make all numbers non-negative by adding the absolute value of the lowest number to each value in the AVERAGE column. This makes each of the numbers in the AVERAGE column positive.
dfARTGf['AVERAGE'] = dfARTGf['AVERAGE'] + adjustmentvalueartg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfARTGf['AVERAGE'] = dfARTGf['AVERAGE'] + adjustmentvalueartg


In [318]:
dfARTGf

Unnamed: 0,PLAYER NAME,AVERAGE
0,Jim Knous,1.597
1,Danny Willett,1.502
2,Matt Kuchar,1.477
3,Tommy Fleetwood,1.403
4,Matt Jones,1.395
...,...,...
200,Carlos Ortiz,0.251
201,Callum Tarren,0.177
202,Hudson Swafford,0.153
203,Seth Reeves,0.139


In [319]:
# Min/Max scaling all average strokes gained off the tee numbers to between 0-1. This way strokes gained ARTG will be on the same scale as all of the other statistics.
# Using the scikit-learn module imported above to automate min/max scaling
scaler = MinMaxScaler()
dfARTGf['AVERAGE_ARTG_scaled'] = scaler.fit_transform(dfARTGf['AVERAGE'].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfARTGf['AVERAGE_ARTG_scaled'] = scaler.fit_transform(dfARTGf['AVERAGE'].values.reshape(-1,1))


In [320]:
dfARTGf

Unnamed: 0,PLAYER NAME,AVERAGE,AVERAGE_ARTG_scaled
0,Jim Knous,1.597,1.000000
1,Danny Willett,1.502,0.940513
2,Matt Kuchar,1.477,0.924859
3,Tommy Fleetwood,1.403,0.878522
4,Matt Jones,1.395,0.873513
...,...,...,...
200,Carlos Ortiz,0.251,0.157170
201,Callum Tarren,0.177,0.110833
202,Hudson Swafford,0.153,0.095805
203,Seth Reeves,0.139,0.087038


In [321]:
dfARTGfinal = dfARTGf[['PLAYER NAME', 'AVERAGE_ARTG_scaled']]

### Strokes Gained Putting

In [322]:
# Get strokes gained putting data from pga tour website

dfP = pd.read_html("https://www.pgatour.com/stats/stat.02564.html")[1]

In [323]:
dfP.columns

Index(['RANK THIS WEEK', 'RANK LAST WEEK', 'PLAYER NAME', 'ROUNDS', 'AVERAGE',
       'TOTAL SG:PUTTING', 'MEASURED ROUNDS'],
      dtype='object')

In [324]:
# Creating a new final data frame with the only columns we need. Player name and Average Strokes Putting.
dfPf = dfP[['PLAYER NAME', 'AVERAGE']]

In [325]:
# Finding the value to adjust by to make all numbers non negative for each golfer.
numofrowsp = len(dfPf) - 1
adjustmentvaluep = abs(dfPf.iloc[numofrowsp, 1])

In [326]:
# Adjustment to make all numbers non-negative by adding the absolute value of the lowest number to each value in the AVERAGE column. This makes each of the numbers in the AVERAGE column positive.
dfPf['AVERAGE'] = dfPf['AVERAGE'] + adjustmentvaluep

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfPf['AVERAGE'] = dfPf['AVERAGE'] + adjustmentvaluep


In [327]:
dfPf

Unnamed: 0,PLAYER NAME,AVERAGE
0,Tyrrell Hatton,2.261
1,Brian Gay,2.206
2,Cameron Smith,2.060
3,Lucas Herbert,1.958
4,Kelly Kraft,1.921
...,...,...
200,Joseph Bramlett,0.257
201,Luke List,0.195
202,Charl Schwartzel,0.114
203,Brett Drewitt,0.071


In [328]:
# Min/Max scaling all average strokes gained off the tee numbers to between 0-1. This way strokes gained putting will be on the same scale as all of the other statistics.
# Using the scikit-learn module imported above to automate min/max scaling
scaler = MinMaxScaler()
dfPf['AVERAGE_putting_scaled'] = scaler.fit_transform(dfPf['AVERAGE'].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfPf['AVERAGE_putting_scaled'] = scaler.fit_transform(dfPf['AVERAGE'].values.reshape(-1,1))


In [329]:
dfPf

Unnamed: 0,PLAYER NAME,AVERAGE,AVERAGE_putting_scaled
0,Tyrrell Hatton,2.261,1.000000
1,Brian Gay,2.206,0.975674
2,Cameron Smith,2.060,0.911101
3,Lucas Herbert,1.958,0.865989
4,Kelly Kraft,1.921,0.849624
...,...,...,...
200,Joseph Bramlett,0.257,0.113667
201,Luke List,0.195,0.086245
202,Charl Schwartzel,0.114,0.050420
203,Brett Drewitt,0.071,0.031402


In [330]:
dfPfinal = dfPf[['PLAYER NAME', 'AVERAGE_putting_scaled']]

### Create Master Table With All Strokes Gained for Each Player So Far In 2022

In [331]:
dfmaster = dfPfinal.merge(dfOTTfinal, on= "PLAYER NAME", how ="left")
dfmaster = dfmaster.merge(dfARTGfinal, on= "PLAYER NAME", how ="left")
dfmaster = dfmaster.merge(dfATGfinal, on= "PLAYER NAME", how = "left")
dfmaster22 = dfmaster
%store dfmaster22

Stored 'dfmaster22' (DataFrame)
