In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [24]:
#load csv
pga_df = pd.read_csv('pga_rawdata.csv')

pga_df

Unnamed: 0,Player_initial_last,tournament id,player id,hole_par,strokes,hole_DKP,hole_FDP,hole_SDP,streak_DKP,streak_FDP,...,purse,season,no_cut,Finish,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total
0,A. Ancer,401353224,9261,288,289,60.0,51.1,56,3,7.6,...,12.0,2022,0,T32,0.20,-0.13,-0.08,0.86,0.65,0.85
1,A. Hadwin,401353224,5548,288,286,72.5,61.5,61,8,13.0,...,12.0,2022,0,T18,0.36,0.75,0.31,0.18,1.24,1.60
2,A. Lahiri,401353224,4989,144,147,21.5,17.4,27,0,0.0,...,12.0,2022,0,CUT,-0.56,0.74,-1.09,0.37,0.02,-0.54
3,A. Long,401353224,6015,144,151,20.5,13.6,17,0,0.4,...,12.0,2022,0,CUT,-1.46,-1.86,-0.02,0.80,-1.08,-2.54
4,A. Noren,401353224,3832,144,148,23.5,18.1,23,0,1.2,...,12.0,2022,0,CUT,0.53,-0.36,-1.39,0.19,-1.56,-1.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36859,V. Singh,2271,392,144,146,33.0,26.4,26,0,0.6,...,6.0,2015,0,,,,,,,
36860,W. Kim,2271,7082,144,150,18.5,12.9,21,0,0.2,...,6.0,2015,0,,,,,,,
36861,W. McGirt,2271,3532,216,215,44.5,40.6,45,0,6.2,...,6.0,2015,0,,,,,,,
36862,Z. Blair,2271,9040,288,278,73.0,70.8,74,3,23.2,...,6.0,2015,0,,,,,,,


In [25]:
#data cleaning

#drop unneeded columns
kept_columns = ['player', 'tournament name', 'season', 'pos', 'hole_par', 'strokes', 'sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total']
dropped_columns = [col for col in pga_df.columns if col not in kept_columns]
filtered_df = pga_df.drop(columns=dropped_columns)

#removes players who missed the cut
df_cuts = filtered_df[filtered_df['hole_par'] > 160]

#removes rows that don't have strokes gained data
df_no_sg = df_cuts.dropna(subset=['sg_total'])

#filters data to courses that have a par of 72 (or 288 over 4 golf rounds)
df_par = df_no_sg[df_no_sg['hole_par'] == 288]

#removes last of rows with null values
df = df_par.dropna()

#adds score column
df['score'] = df['strokes'] - df['hole_par']

#final dataframe
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['score'] = df['strokes'] - df['hole_par']


Unnamed: 0,hole_par,strokes,pos,player,tournament name,season,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,score
0,288,289,32.0,Abraham Ancer,The Memorial Tournament pres. by Nationwide,2022,0.20,-0.13,-0.08,0.86,0.65,0.85,1
1,288,286,18.0,Adam Hadwin,The Memorial Tournament pres. by Nationwide,2022,0.36,0.75,0.31,0.18,1.24,1.60,-2
6,288,287,26.0,Aaron Rai,The Memorial Tournament pres. by Nationwide,2022,2.05,0.74,-1.32,-0.12,-0.70,1.35,-1
7,288,287,26.0,Adam Schenk,The Memorial Tournament pres. by Nationwide,2022,-0.96,-0.01,1.84,0.48,2.31,1.35,-1
8,288,299,67.0,Adam Scott,The Memorial Tournament pres. by Nationwide,2022,-0.82,-1.79,2.00,-1.04,-0.83,-1.65,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36457,288,288,61.0,Sean O'Hair,Sanderson Farms Championship,2015,-0.84,0.62,0.15,-0.47,0.30,-0.54,0
36461,288,282,25.0,Shawn Stefani,Sanderson Farms Championship,2015,0.09,-0.56,0.62,0.81,0.87,0.96,-6
36466,288,284,35.0,Tom Hoge,Sanderson Farms Championship,2015,0.03,0.33,0.37,-0.27,0.43,0.46,-4
36471,288,280,20.0,Vaughn Taylor,Sanderson Farms Championship,2015,1.95,-0.23,0.96,-1.23,-0.50,1.46,-8


In [26]:
#check for null values

df.isna().any()

hole_par           False
strokes            False
pos                False
player             False
tournament name    False
season             False
sg_putt            False
sg_arg             False
sg_app             False
sg_ott             False
sg_t2g             False
sg_total           False
score              False
dtype: bool

In [27]:
top_golfers = df.nlargest(5, 'sg_total')

print("Golfers with the highest strokes gained in a single tournament:")
print(top_golfers[['player', 'sg_total']])

Golfers with the highest strokes gained in a single tournament:
                player  sg_total
11996    Abraham Ancer      5.70
12048   Grayson Murray      5.53
33156        Jason Day      5.36
7703        Si Woo Kim      5.06
7677   Patrick Cantlay      5.04


In [28]:
#extract 'strokes gained' stats for each golfer and scale them
scaler = StandardScaler()
numerical_features = ['sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total']

df[numerical_features] = scaler.fit_transform(df[numerical_features])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numerical_features] = scaler.fit_transform(df[numerical_features])


In [29]:
#calculates cosine similarity based on golfers' strokes gained
def find_similarities(golfer, df):
    golfer_row = df[df['player'] == golfer].index[0]
    golfer_features = df.iloc[golfer_row][numerical_features].values.reshape(1,-1)
    similarities = cosine_similarity(golfer_features, df[numerical_features])
    
    return similarities

In [30]:
#finds most similar golfers
def most_similar_golfers(similarity_scores, df, top_n=10):
    similar_golfers_indices = similarity_scores.argsort()[0][::-1][1:top_n+1]
    similar_golfers = df.iloc[similar_golfers_indices]
    
    return similar_golfers

In [34]:
#result
golfer = 'Scottie Scheffler'
similarity_scores = find_similarities(golfer, df)
most_similar = most_similar_golfers(similarity_scores, df)
print('The 10 golf performances most similar to the skillset of', golfer,'are:')
print(most_similar[['player', 'tournament name', 'score']])

The 10 golf performances most similar to the skillset of Scottie Scheffler are:
                 player                              tournament name  score
20928   Alexander Noren                     The Players Championship    -10
28422      Troy Merritt                                 Safeway Open    -12
6856      Brooks Koepka                     WGC-Workday Championship    -15
27372    Phil Mickelson                       Farmers Insurance Open     -7
15241       Justin Rose  The Memorial Tournament pres. by Nationwide     -8
6282    Sebastian Munoz                            Valero Texas Open     -8
18825      Chase Wright                                 Safeway Open    -10
34071   David Lingmerth  The Memorial Tournament pres. by Nationwide    -15
12131       Sepp Straka                         The American Express    -20
28480  Charl Schwartzel                             BMW Championship    -12


In [38]:
#result
golfer = 'Dustin Johnson'
similarity_scores = find_similarities(golfer, df)
most_similar = most_similar_golfers(similarity_scores, df)
print('The 10 golf performances most similar to the skillset of', golfer,'are:')
print(most_similar[['player', 'tournament name', 'score']])

The 10 golf performances most similar to the skillset of Dustin Johnson are:
                player                              tournament name  score
10213      Ryan Palmer  The Memorial Tournament pres. by Nationwide     -6
7229   Matthew NeSmith                     AT&T Pebble Beach Pro-Am     -9
7202     Jordan Spieth                     AT&T Pebble Beach Pro-Am    -15
7678        Paul Casey                         The American Express    -14
13498     Zach Johnson                 Sanderson Farms Championship    -13
35795       J.J. Henry                       Farmers Insurance Open     -6
16958       Matt Jones                       Farmers Insurance Open    -11
34914     Scott Piercy                           Shell Houston Open    -11
13247    Cameron Percy                                 Safeway Open    -12
31881        K.J. Choi                       Farmers Insurance Open     -5


In [32]:
#function to compute Euclidean distance
def compute_euclidean_distance(golfer, df):
    golfer_row = df[df['player'] == golfer].index[0]
    golfer_features = df.iloc[golfer_row][numerical_features].values.reshape(1, -1)
    
    distances = []
    for index, row in df.iterrows():
        if row['player'] != golfer:
            golfer_row_features = row[numerical_features].values.reshape(1, -1)
            distance = euclidean_distances(golfer_features, golfer_row_features)
            distances.append((row['player'], distance[0][0]))
    
    return sorted(distances, key=lambda x: x[1])

In [33]:
#result
golfer = 'Tiger Woods'
closest_golfers = compute_euclidean_distance(golfer, df)[:10]
print(f"The golf performances most similar to {golfer}'s are:")
for golfer, distance in closest_golfers:
    print(f"{golfer}: {distance:.2f}")

The golf performances most similar to Tiger Woods's are:
Danny Willett: 0.00
Scott Piercy: 0.16
Robby Shelton: 0.28
Dylan Frittelli: 0.51
John Huh: 0.53
Chad Collins: 0.58
Patrick Reed: 0.59
Matt Jones: 0.65
Brett Stegmaier: 0.65
Vijay Singh: 0.67
