# Notebook Description

In this notebook, I attempt to use machine learning to predict whether or not a golfer will make the cut at the Masters Tournament. The models take full-season aggregate statistics from the previous year as inputs to make the predictions for a given year's Masters Tournament (e.g., the models use 2018 full-season data to predict 2019 results).

Additionally, we will use the subset of golfers that make the cut to attempt to predict the top 10 come Sunday evening at Augusta using the same inputs as before.

# Necessary Packages and Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Webscrapping
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

# Sklearn
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Class imbalance
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler as ROS

# Graphing
%matplotlib inline
sns.set_style('darkgrid')
figsize = (15,10)
hspace = 0.5

# Pickling
import pickle
from sklearn.externals import joblib



# Helper Functions

## Web Scraping

In [2]:
# PGATour.com Webscrapping Function
def get_PGA_Tour_data(stats, seasons):

    """
    Scrape season-level statistics from the PGA Tour stats pages.

    For every (season, stat) pair the stat_id is inserted into the stats url,
    the first <tbody> of the returned page is parsed, and one record per table
    row is appended to the data container.

    Parameters
    ----------
    stats : sequence of dict
        Each dict must provide 'stat_id' (inserted into the url) and
        'stat_name' (used as the key of the scraped value).
    seasons : iterable
        Season identifiers inserted into the url.

    Returns
    -------
    list of dict
        One dict per table row per stat, with the keys 'season', 'full_name'
        and the stat's name. The stat value is the raw cell text.
    """

    data = []

    for season in seasons:
        print(f"Beginning {season} season:")
        for stat in stats:
            url = f"https://www.pgatour.com/content/pgatour/stats/stat.{stat['stat_id']}.y{season}.html"

            # Open the connection and grab the page. The context manager closes
            # every connection, even if reading fails, and nothing is left to
            # close when there are no seasons or stats at all.
            with uReq(url) as uClient:
                page_html = uClient.read()

            # html parsing using BeautifulSoup
            page_soup = soup(page_html, 'html.parser')

            # find the table where stats are kept
            tbody = page_soup.find('tbody')
            if tbody is None:
                # Page without a stats table: report it and keep scraping
                print(f" No table found for {stat['stat_name']} in the {season} season; skipped")
                continue

            # each golfer is separated by a <tr> tag
            for raw_golfer in tbody.findAll('tr'):
                golfer = {}
                golfer['season'] = season
                name = raw_golfer.find('td', {'class': 'player-name'}).a.text
                golfer['full_name'] = name.replace(' ', '_').lower()
                # the first <td> without a class attribute holds the stat value
                golfer[f"{stat['stat_name']}"] = raw_golfer.find('td', {'class': None}).text
                data.append(golfer)
            print(f" {stat['stat_name']} stats added for the {season} season")
        print(f"{season} season completed.\n")

    return data

## Convert Strings to Feet

In [3]:
# Convert stats to feet function
def convert_to_feet(x):
    """
    Convert a feet-and-inches string into decimal feet.

    The single and double quote marks are stripped and the remainder is split
    on whitespace: the first number is read as feet and the optional second
    number as inches (e.g. 33' 10" -> 33 + 10/12).

    Parameters
    ----------
    x : str or any
        Distance text such as 36' 7" or 36'.

    Returns
    -------
    float
        The distance in feet. np.nan is returned for any non-string input and
        for strings that contain no number at all.
    """
    if not isinstance(x, str):
        return np.nan

    parts = x.replace("'", "").replace('"', "").split()

    # Empty / whitespace-only text carries no measurement
    if not parts:
        return np.nan

    feet = int(parts[0])
    # Inches are optional; a lone number is whole feet
    inches = int(parts[1]) if len(parts) > 1 else 0
    return feet + (inches / 12)

## Format Golfer

In [5]:
# Format golfer names from results Dataframe
def format_golfer(x):
    """Return the golfer name lower-cased with every space turned into an underscore."""
    lowered = x.lower()
    return lowered.replace(' ', '_')

## Plotter Functions

### Confusion Matrix Plotter

In [6]:
# creates the confusion matrix
def plot_confusion_matrix(cm, classes):
    """
    Draw a confusion matrix as an annotated heat map and show it.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D array of integer counts. Rows are labelled 'Actual' and columns
        'Predicted'.
    classes : sequence of str
        Tick labels, one per row/column of cm.
    """
    class_labels = list(classes)  # accept tuples / arrays as well as lists

    fig = plt.figure(figsize=(5, 4))
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm, cmap=plt.cm.Blues)
    fig.colorbar(cax)

    # Pin one tick per cell before labelling so labels cannot drift
    ax.set_xticks(range(cm.shape[1]))
    ax.set_yticks(range(cm.shape[0]))
    ax.set_xticklabels(class_labels)
    ax.set_yticklabels(class_labels)

    # Cells darker than the midpoint get white text so the count stays readable
    thresh = cm.max() / 2.
    text_style = {
        'fontsize': 18,
        'horizontalalignment': "center",
        'verticalalignment': "center"
    }
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], 'd'), text_style,
                    color="white" if cm[i, j] > thresh else "black")

    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

### Feature Importance Plotter

In [7]:
# creates feature importance graphs
def plot_feature_importances(X_train, model, n_features, FI_labels):
    """
    Draw a horizontal bar chart of model.feature_importances_.

    Parameters
    ----------
    X_train : any
        Accepted for call compatibility; not read by this function.
    model : fitted estimator
        Must expose a feature_importances_ attribute.
    n_features : int
        Number of bars to draw.
    FI_labels : sequence of str
        Y-axis tick labels, one per bar.
    """
    bar_positions = np.arange(n_features)
    importances = model.feature_importances_

    plt.figure(figsize=(8, 8))
    plt.barh(bar_positions, importances, align='center')
    plt.yticks(bar_positions, FI_labels)
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature')

# Data Gathering and Cleaning

## Data Gathering

### Desired Features

In [None]:
# # The stat_id will be inserted into the url for each ping
# stats = [
#     {'stat_name': 'driving_distance',
#      'stat_id': '101'},
#     {'stat_name': 'driving_accuracy', 
#      'stat_id': '102'},
#     {'stat_name': 'greens_in_regulation', 
#      'stat_id': '103'},
#     {'stat_name': 'proximity', 
#      'stat_id': '331'},
#     {'stat_name': 'scrambling_rough',  
#      'stat_id': '363'},
#     {'stat_name': 'scrambling_sand',  
#      'stat_id': '362'},
#     {'stat_name': 'putting_conversion',
#      'stat_id': '115'},
#     {'stat_name': 'putting_rating',
#      'stat_id': '402'},
#     {'stat_name': 'sg_off_tee',
#      'stat_id': '02567'},
#     {'stat_name': 'sg_approach',
#      'stat_id': '02568'},
#     {'stat_name': 'sg_scrambling',  
#      'stat_id': '02569'},
#     {'stat_name': 'sg_putting',
#      'stat_id': '02564'},
#     {'stat_name': 'sg_tee_to_green',  
#      'stat_id': '02674'},
#     {'stat_name': 'sg_total',  
#      'stat_id': '02675'},
#     {'stat_name': 'scoring_avg',  
#      'stat_id': '120'},
# ]

### Seasons Considered

In [None]:
# # Since strokes_gained stats only go back to 2004, we will consider 2004-2019
# seasons = range(2004, 2020)

### Web Scraping and Dataframe Conversion

In [None]:
# # Call the web scrapping helper function defined above. Save the results and examine Dataframe
# data = get_PGA_Tour_data(stats, seasons)

In [None]:
# # Convert data to Dataframe and save raw data locally
# df = pd.DataFrame(data)
# df.to_csv('csv_files/raw_golfer_data.csv')

## Data Cleaning

### Groupby Season and Golfer

In [41]:
# Load the saved raw scrape, discard the str_diff_to_field column
# and renumber the rows from zero
df = (
    pd.read_csv('csv_files/raw_golfer_data.csv', index_col=0)
    .drop(columns='str_diff_to_field')
    .reset_index(drop=True)
)

# Size and first rows of the raw data
print(df.shape)
df.head()

(48592, 17)


Unnamed: 0,season,full_name,driving_distance,driving_accuracy,greens_in_regulation,proximity,scrambling_rough,scrambling_sand,putting_conversion,putting_rating,sg_off_tee,sg_approach,sg_scrambling,sg_putting,sg_tee_to_green,sg_total,scoring_avg
0,2004,hank_kuehne,314.4,,,,,,,,,,,,,,
1,2004,scott_hend,312.6,,,,,,,,,,,,,,
2,2004,john_daly,306.0,,,,,,,,,,,,,,
3,2004,mike_heinen,305.2,,,,,,,,,,,,,,
4,2004,chris_smith,304.0,,,,,,,,,,,,,,


In [42]:
# Stat columns to collapse into one row per (season, golfer)
stat_columns = [
    'driving_distance',
    'driving_accuracy',
    'greens_in_regulation',
    'proximity',
    'scrambling_rough',
    'scrambling_sand',
    'putting_conversion',
    'putting_rating',
    'sg_off_tee',
    'sg_approach',
    'sg_scrambling',
    'sg_putting',
    'sg_tee_to_green',
    'sg_total',
    'scoring_avg'
]

# Group stats by season and golfer. The raw data holds one row per golfer per
# stat, so the mean collapses them into a single row. Several columns must be
# selected with a list (tuple selection was removed in pandas 2.0).
# numeric_only=True makes the handling of non-numeric columns explicit.
# NOTE(review): proximity presumably still holds feet/inches text here and is
# therefore left out of the result - confirm whether it should be converted first.
df = df.groupby(['season', 'full_name'])[stat_columns].mean(numeric_only=True)

# Save grouped data locally
df.to_csv('csv_files/golfer_data.csv')

# Reload (flattens the season/full_name index into columns) and examine data
df = pd.read_csv('csv_files/golfer_data.csv')

print(df.shape)
df.head()

(3036, 16)


Unnamed: 0,season,full_name,driving_distance,driving_accuracy,greens_in_regulation,scrambling_rough,scrambling_sand,putting_conversion,putting_rating,sg_off_tee,sg_approach,sg_scrambling,sg_putting,sg_tee_to_green,sg_total,scoring_avg
0,2004,aaron_baddeley,288.0,53.08,58.17,57.69,53.51,30.15,1.576,-0.53,-0.679,0.201,0.579,-1.008,-0.429,71.614
1,2004,adam_scott,295.4,57.65,65.6,53.01,61.7,32.9,1.611,0.18,0.571,-0.147,0.824,0.603,1.427,70.096
2,2004,alex_cejka,285.8,64.21,63.81,51.08,57.65,28.77,1.625,0.119,0.255,0.02,-0.006,0.394,0.388,71.153
3,2004,andre_stolz,297.9,58.97,63.0,46.32,52.44,28.38,1.628,-0.333,-0.532,-0.137,-0.246,-1.002,-1.247,72.341
4,2004,arjun_atwal,289.4,60.48,62.52,59.79,41.07,29.8,1.606,0.013,-0.097,-0.116,-0.034,-0.2,-0.234,71.688


### Add Missing Golfers

In [43]:
# Read the csv file holding the stat lines of the missing golfers
# NOTE(review): no cell visible in this part of the notebook appends `missing`
# to df - confirm where these rows are merged in.
missing_path = 'csv_files/masters_results - missing.csv'
missing = pd.read_csv(missing_path)

# Size and first rows
print(missing.shape)
missing.head()

(28, 17)


Unnamed: 0,season,full_name,driving_distance,driving_accuracy,greens_in_regulation,proximity,scrambling_rough,scrambling_sand,putting_conversion,putting_rating,sg_off_tee,sg_approach,sg_scrambling,sg_putting,sg_tee_to_green,sg_total,scoring_avg
0,2007,andres_romero,312.5,56.25,58.89,"33' 10""",76.47,62.16,26.42,1.572,1.988,-0.457,0.342,0.416,1.873,2.289,68.653
1,2005,ángel_cabrera,311.2,53.3,63.33,"37' 10""",54.29,47.5,29.03,1.665,0.531,0.067,0.141,-0.136,0.739,0.603,70.669
2,2019,c.t._pan,285.0,64.77,64.16,"36' 7""",58.39,57.45,28.98,1.595,-0.114,0.054,0.18,-0.062,0.12,0.059,70.966
3,2010,charl_schwartzel,300.8,62.05,59.72,"36' 0""",55.88,63.64,29.65,1.615,-0.505,0.865,0.338,0.528,0.699,1.227,70.879
4,2019,charl_schwartzel,291.3,50.67,61.28,"40' 11""",70.51,56.06,30.11,1.583,-1.526,-0.025,0.234,0.407,-1.317,-0.91,72.117


In [17]:
# Read the grouped golfer data back from disk and take a first look
golfer_data_path = 'csv_files/golfer_data.csv'
df = pd.read_csv(golfer_data_path)

print(df.shape)
df.head()

(3036, 18)


Unnamed: 0,season,full_name,driving_distance,driving_accuracy,greens_in_regulation,proximity,scrambling_rough,scrambling_sand,putting_conversion,putting_rating,sg_off_tee,sg_approach,sg_scrambling,sg_putting,sg_tee_to_green,sg_total,str_diff_to_field,scoring_avg
0,2004,aaron_baddeley,288.0,0.5308,0.5817,39.333333,0.5769,0.5351,0.3015,1.576,-0.53,-0.679,0.201,0.579,-1.008,-0.429,-0.44,71.614
1,2004,adam_scott,295.4,0.5765,0.656,35.333333,0.5301,0.617,0.329,1.611,0.18,0.571,-0.147,0.824,0.603,1.427,1.3,70.096
2,2004,alex_cejka,285.8,0.6421,0.6381,36.083333,0.5108,0.5765,0.2877,1.625,0.119,0.255,0.02,-0.006,0.394,0.388,0.27,71.153
3,2004,andre_stolz,297.9,0.5897,0.63,36.416667,0.4632,0.5244,0.2838,1.628,-0.333,-0.532,-0.137,-0.246,-1.002,-1.247,-0.99,72.341
4,2004,arjun_atwal,289.4,0.6048,0.6252,35.916667,0.5979,0.4107,0.298,1.606,0.013,-0.097,-0.116,-0.034,-0.2,-0.234,-0.25,71.688


### Convert Features

In [15]:
# Turn the proximity text into decimal feet
# NOTE(review): convert_to_feet returns NaN for every non-string value, so
# running this on an already-converted column would blank it - confirm input.
df['proximity'] = df['proximity'].apply(convert_to_feet)

# Express the percentage features as fractions of 1
percent_features = [
    'driving_accuracy',
    'greens_in_regulation',
    'scrambling_rough',
    'scrambling_sand',
    'putting_conversion',
]
for pct_col in percent_features:
    df[pct_col] = df[pct_col]/100

### Incorporate Historical Tournament Data

In [18]:
# Read the Masters results, bring golfer names into the same
# lower_case_with_underscores form as the stats data, and order the rows
# by tournament year and golfer
masters_results = (
    pd.read_csv('csv_files/masters_results - data.csv')
    .assign(full_name=lambda results: results['full_name'].apply(format_golfer))
    .sort_values(by=['masters_year', 'full_name'])
)

print(masters_results.shape)
masters_results.head()

(1469, 6)


Unnamed: 0,masters_year,full_name,total_score,made_cut,top_10,champion
32,2005,adam_scott,294,1,0,0
51,2005,austin_eaton_iii,315,0,0,0
52,2005,ben_crenshaw,315,0,0,0
53,2005,ben_curtis,315,0,0,0
19,2005,bernhard_langer,289,1,0,0


In [19]:
# Left-join every season stat line with every Masters result of the same
# golfer, then discard rows that contain any missing value
df = pd.merge(df, masters_results, how='left', on='full_name').dropna()

# Keep only the pairings where the Masters took place the year after the
# stats season; masters_year is not needed afterwards
follows_season = (df['masters_year']-df['season']) == 1
df = df[follows_season].drop('masters_year', axis=1).reset_index(drop=True)

# Store the Masters result columns as integers
for result_col in ['made_cut', 'top_10', 'champion', 'total_score']:
    df[result_col] = df[result_col].astype(int)

# Put identifiers and targets first, followed by the season statistics
cols = [
    'season', 'full_name',
    'champion', 'top_10', 'made_cut', 'total_score',
    'driving_distance', 'driving_accuracy', 'greens_in_regulation', 'proximity',
    'scrambling_rough', 'scrambling_sand', 'putting_conversion', 'putting_rating',
    'sg_off_tee', 'sg_approach', 'sg_scrambling', 'sg_putting', 'sg_tee_to_green',
    'sg_total', 'str_diff_to_field', 'scoring_avg',
]
df = df[cols]

print(df.shape)
df.head()

(922, 22)


Unnamed: 0,season,full_name,champion,top_10,made_cut,total_score,driving_distance,driving_accuracy,greens_in_regulation,proximity,...,putting_conversion,putting_rating,sg_off_tee,sg_approach,sg_scrambling,sg_putting,sg_tee_to_green,sg_total,str_diff_to_field,scoring_avg
0,2004,adam_scott,0,0,1,294,295.4,0.5765,0.656,35.333333,...,0.329,1.611,0.18,0.571,-0.147,0.824,0.603,1.427,1.3,70.096
1,2004,ben_curtis,0,0,0,315,282.1,0.6434,0.6335,37.333333,...,0.2513,1.62,0.171,0.274,0.223,-0.174,0.669,0.494,0.09,71.578
2,2004,bernhard_langer,0,0,1,289,282.2,0.6257,0.6525,34.416667,...,0.2768,1.608,0.029,0.616,0.055,-0.229,0.7,0.471,0.39,71.246
3,2004,bo_van_pelt,0,0,0,315,294.4,0.6515,0.6768,36.333333,...,0.3136,1.623,0.512,0.194,0.094,0.346,0.8,1.146,0.99,70.245
4,2004,carlos_franco,0,0,0,315,290.6,0.5933,0.6846,36.083333,...,0.2838,1.637,0.13,0.413,0.236,-0.134,0.779,0.645,0.61,70.768


In [39]:
# List the column names of the merged dataframe
df.columns

Index(['season', 'full_name', 'champion', 'top_10', 'made_cut', 'total_score',
       'driving_distance', 'driving_accuracy', 'greens_in_regulation',
       'proximity', 'scrambling_rough', 'scrambling_sand',
       'putting_conversion', 'putting_rating', 'sg_off_tee', 'sg_approach',
       'sg_scrambling', 'sg_putting', 'sg_tee_to_green', 'sg_total',
       'str_diff_to_field', 'scoring_avg'],
      dtype='object')

# Feature Engineering

## Par or Better Distance
Model the area around the hole as a circle whose radius is the maximum distance from the hole from which the golfer is expected to make par or better.

In [None]:
# Area of the circle whose radius is the golfer's proximity
on_the_green_area = df['proximity']**2 * np.pi

# Scale that area by the putting conversion rate and turn it back into a radius
birdie_or_better_area = on_the_green_area*df['putting_conversion']
df['birdie_or_better_distance'] = np.sqrt(birdie_or_better_area/np.pi)

# Remainder of the proximity radius
df['birdie_or_worse_distance'] = df['proximity']-df['birdie_or_better_distance']

# The two source columns are replaced by the derived distances
# NOTE(review): later cells refer to par_or_better_distance /
# par_or_worse_distance rather than the birdie_* names created here - confirm.
df = df.drop(columns=['proximity', 'putting_conversion'])

## Average Score

In [None]:
# Each golfer's mean Masters total score over every season except 2019,
# truncated to an integer and stored as avg_score
before_2019 = df[df['season'] != 2019]
hist_scores = (
    before_2019.groupby('full_name')['total_score']
    .mean()
    .astype(int)
    .rename('avg_score')
    .reset_index()
)

# Inner join: golfers without a pre-2019 row are removed from df
df = pd.merge(df, hist_scores, how='inner', on='full_name')

# NOTE(review): 'scrambling' is not among the columns listed by df.columns
# earlier in the notebook (only scrambling_rough / scrambling_sand) - confirm
# where it is created, otherwise this selection raises KeyError.
cols = [
    'season', 'full_name',
    'champion', 'top_10', 'made_cut', 'total_score', 'avg_score',
    'driving_distance', 'driving_accuracy', 'greens_in_regulation',
    'scrambling', 'putting_rating',
    'birdie_or_better_distance', 'birdie_or_worse_distance',
]
df = df[cols]

df.head()

In [None]:
# Spot-check the rows of a single golfer
df.loc[df['full_name'].eq('zach_johnson')]

## Prior Tournament Score

In [None]:
# Columns carried through the grouping
# NOTE(review): the par_or_*_distance and scrambling names differ from the
# columns created in the earlier feature-engineering cells - confirm.
grouped_columns = [
    'driving_distance',
    'driving_accuracy',
    'greens_in_regulation',
    'par_or_better_distance',
    'par_or_worse_distance',
    'scrambling',
    'putting_rating',
    'total_score',
    'made_cut',
    'top_10',
    'champion',
]

# Group data by golfer then season in order to lag the total score column.
# Several columns must be selected with a list (tuple selection was removed
# in pandas 2.0). The result is sorted by golfer, then season.
df = df.groupby(['full_name', 'season'])[grouped_columns].mean()

# Save grouped data locally by overwriting the current csv
df.to_csv('csv_files/noSG/golfer_data.csv')

# Reload (flattens the full_name/season index into columns) and examine new dataframe
df = pd.read_csv('csv_files/noSG/golfer_data.csv')

In [None]:
# Score assigned as "prior score" to a golfer's first row in the data
# NOTE(review): 315 matches the total_score shown for golfers who missed the
# cut in the Masters results - confirm this is the intended placeholder.
FIRST_APPEARANCE_SCORE = 315

# Generate a list of unique golfers
golfers = df['full_name'].unique().tolist()

# Lag total_score by one row within each golfer: every row receives the score
# of that golfer's previous row, and the first row receives the placeholder.
# Grouping keeps the lag aligned per golfer even when a golfer's rows are not
# adjacent, and replaces the nested row-by-row loops.
df['prior_score'] = (
    df.groupby('full_name')['total_score']
    .shift(1)
    .fillna(FIRST_APPEARANCE_SCORE)
    .astype(int)
)

## Masters Experience
Create another column that quantifies how many times a golfer has played at the Masters at that point in time

In [None]:
golfers = df['full_name'].unique().tolist()

# Running count of a golfer's rows, in row order: 1 for the first row of a
# golfer, 2 for the second, and so on. Assigning the whole column at once
# avoids the chained `df['experience'].iloc[i] = ...` writes, which are not
# guaranteed to reach df, and the golfer-by-row double loop.
# NOTE(review): only appearances present in df are counted - confirm that
# earlier Masters starts without stats should be ignored.
df['experience'] = df.groupby('full_name').cumcount() + 1

## Field Average Score
Create a column that is the average score of the field that year for each golfer

In [None]:
# Mean total score per season over the golfers present in df,
# truncated to an integer and stored as field_avg_score
hist_scores = (
    df.groupby('season')['total_score']
    .mean()
    .astype(int)
    .rename('field_avg_score')
    .reset_index()
)

# Attach the season average to every golfer row of that season
df = pd.merge(df, hist_scores, how='inner', on='season')

## Strokes Gained Over Field
Create a new column that calculates how much better (or worse) the golfer did than the field average that year

In [None]:
# Positive values mean the golfer scored lower than the field average
df = df.assign(sg_over_field=df['field_avg_score'].sub(df['total_score']))

# Identifiers and targets first, then the engineered and season features
cols = [
    'full_name', 'season', 'experience',
    'champion', 'top_10', 'made_cut',
    'field_avg_score', 'total_score', 'sg_over_field', 'prior_score',
    'driving_distance', 'driving_accuracy', 'greens_in_regulation',
    'par_or_better_distance', 'par_or_worse_distance',
    'scrambling', 'putting_rating',
]
df = df.loc[:, cols]
df.head()

# Holdout Set

In [None]:
# Split off the 2019 season as the holdout set (inputs for the 2020 Masters
# predictions); every other season stays in df
is_holdout_season = df['season'] == 2019
holdout = df[is_holdout_season]
df = df[~is_holdout_season]

# Report the size of both parts
print(f"Holdout Set: {holdout.shape}")
print(f"Validation Set: {df.shape}")

# Persist both parts; the grouped data file is overwritten
# NOTE(review): the EDA section reads 'final_golfer_data.csv', which is not
# the file written here - confirm.
holdout.to_csv('csv_files/noSG/holdout_data.csv')
df.to_csv('csv_files/noSG/golfer_data.csv')

# Exploratory Data Analysis

## General Information

In [None]:
# Load the dataset used for the exploratory analysis; the first csv column
# is the saved row index
final_data_path = 'csv_files/noSG/final_golfer_data.csv'
df = pd.read_csv(final_data_path, index_col=0)

print(df.shape)
df.head(10)

In [None]:
# Rows of golfers who finished inside the top 10
top10 = df.loc[df['top_10'].eq(1)]

In [None]:
# Column dtypes and non-null counts for the full dataset
df.info()

In [None]:
# Column dtypes and non-null counts for the top-10 subset
top10.info()

In [None]:
# Summary statistics of the features (season and result columns left out)
df.drop(columns=['season', 'total_score', 'made_cut', 'top_10', 'champion']).describe()

In [None]:
# Same summary statistics, restricted to the top-10 finishers
top10.drop(columns=['season', 'total_score', 'made_cut', 'top_10', 'champion']).describe()

In [None]:
# Load the cut line data
cut_lines_path = 'csv_files/masters_results - cutline.csv'
cut_lines = pd.read_csv(cut_lines_path)

print(cut_lines.shape)
cut_lines.head()

In [None]:
target_variable = 'made_cut'
title = target_variable.replace('_', ' ').title()
fig = plt.figure(figsize=(13,5))

# ---- Left panel: number of golfers who made / missed the cut ----
# groupby sorts the target values, so position 0 is made_cut == 0 (missed)
# and position 1 is made_cut == 1 (made)
height = df.groupby(target_variable)['season'].count().tolist()
missed_cut, made_cut = height[0], height[1]
n_golfers = made_cut+missed_cut

ax1 = fig.add_subplot(1, 2, 1)
bar_specs = [
    # (x position, bar height, face colour, legend label)
    (0, made_cut, 'darkgreen', f'Made Cut: {made_cut/n_golfers : 0.1%}'),
    (1, missed_cut, 'yellow', f'Missed Cut: {missed_cut/n_golfers : 0.1%}'),
]
for x_pos, bar_height, face_colour, bar_label in bar_specs:
    ax1.bar(x=x_pos, height=bar_height, ec='darkgreen', fc=face_colour, label=bar_label)
ax1.set_title("Percent of Golfers Cut", {'fontsize' : 16})
ax1.set_ylabel('Number of Golfers', {'fontsize' : 14})
ax1.set_xticks(ticks=[0,1])
ax1.set_xticklabels(labels=['Made the Cut', 'Missed the Cut'], fontdict={'fontsize' : 14})
ax1.legend(loc='upper right', fontsize='large')

# ---- Right panel: cut line per season ----
avg_cutline = int(cut_lines['relative_to_par'].mean())
ax2 = fig.add_subplot(1, 2, 2)

cutline_by_season = cut_lines.groupby('season')['relative_to_par'].mean()
line_style = dict(
    kind='line',
    fontsize=12,
    marker='o',
    mec='darkgreen',
    mfc='yellow',
    linestyle='--',
    linewidth=1.0,
    color='red',
)
cutline_by_season.plot(ax=ax2, label=f'Avg. Cut Line: +{avg_cutline}', **line_style)
ax2.set_title("Tournament Cut Line by Year", {'fontsize' : 16})
ax2.set_ylabel('Score Relative To Par', {'fontsize' : 14})
# Golf notation on the y axis: E for even par, +n for over par
ax2.set_yticks(range(0, 11))
ax2.set_yticklabels(['E'] + [f'+{over_par}' for over_par in range(1, 11)])
ax2.legend(loc='upper right', fontsize='large');

In [None]:
target_variable = 'top_10'
title = target_variable.replace('_', ' ').title()
fig = plt.figure(figsize=(13,5));

# Percent of Golfers Inside Top 10
# groupby sorts the target values, so position 0 is top_10 == 0 (outside)
# and position 1 is top_10 == 1 (inside)
height = df.groupby(target_variable).season.count().tolist()
outside_top10 = height[0]
inside_top10 = height[1]
total_golfers = inside_top10+outside_top10
ax1 = fig.add_subplot(121);
ax1.bar(
    x=0,
    height=inside_top10,
    ec='darkgreen',
    fc='darkgreen',
    label=f'Inside: {inside_top10/total_golfers : 0.1%}'
);
# The bar height must be the outside-top-10 count so that it matches its
# legend label (it previously reused missed_cut from the made-cut figure)
ax1.bar(
    x=1,
    height=outside_top10,
    ec='darkgreen',
    fc='yellow',
    label=f'Outside: {outside_top10/total_golfers : 0.1%}'
);
ax1.set_title("Top 10 Summary", {'fontsize' : 16});
ax1.set_ylabel('Number of Golfers', {'fontsize' : 14});
ax1.set_xticks(ticks=[0,1]);
ax1.set_xticklabels(labels=['Inside', 'Outside'], fontdict={'fontsize' : 14});
ax1.legend(loc='upper left', fontsize='large');

# Top 10 overtime: highest (worst) total score among top-10 finishers per season
top_10 = pd.DataFrame(df[df['top_10'] == 1].groupby('season').total_score.max())
top_10.reset_index(drop=False, inplace=True)
# Shift the stats season by one to obtain the tournament year
top_10['season'] = top_10['season']+1
# Four rounds of par 72
top_10['relative_to_par'] = top_10['total_score']-(72*4)
avg_top10 = int(top_10['relative_to_par'].mean())
ax2 = fig.add_subplot(122);

top_10.groupby('season').relative_to_par.mean().plot(
    kind='line',
    ax=ax2,
    fontsize=12,
    marker='o',
    mec='darkgreen',
    mfc='yellow',
    linestyle='--',
    linewidth=1.0,
    color='red',
    label=f'Avg. Top 10 Score: {avg_top10}'
);
ax2.set_title("Top 10 by Year", {'fontsize' : 16});
ax2.set_ylabel('Score Relative To Par', {'fontsize' : 14});
ax2.legend(loc='upper right', fontsize='large');
# Golf notation on the y axis: -n under par, E for even par, +n over par
ax2.set_yticks(range(-11, 9));
ax2.set_yticklabels(
    ['E' if to_par == 0 else f'{to_par:+d}' for to_par in range(-11, 9)]
);

In [None]:
# NOTE(review): within the visible notebook df_corr is first assigned in the
# following cell, so on a fresh kernel (Restart & Run All) this line raises
# NameError - confirm the intended cell order.
df_corr.columns

In [None]:
# Targets and features shown in the correlation matrix; selecting them
# directly leaves out season, full_name, field_avg_score, total_score and
# sg_over_field
cols=['champion', 'top_10', 'made_cut', 'prior_score', 'experience',
       'driving_distance', 'driving_accuracy', 'greens_in_regulation',
       'par_or_better_distance', 'par_or_worse_distance', 'scrambling',
       'putting_rating']
df_corr = df[cols]

# Compute the correlation matrix once and reuse it for mask and plot
corr = df_corr.corr()

fig = plt.figure(figsize=(11,11))

# Create mask that hides the upper triangle (including the diagonal), which
# only mirrors the lower one. The builtin bool replaces the np.bool alias,
# which no longer exists in current NumPy releases.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Create color scheme
cmap = sns.diverging_palette(220, 10, as_cmap=True)

ax1 = fig.add_subplot(111)
sns.heatmap(corr,
            mask=mask,
            cmap=cmap,
            center=0,
            vmin=-1,
            vmax=1,
            square=True,
            linewidths=0.5,
            cbar_kws={'shrink': 0.5},
            ax=ax1);

## Visualizations by Feature

### Strokes Gained Per Round

In [None]:
# One box per experience level showing the spread of sg_over_field
fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(1, 1, 1)

sns.boxplot(x='experience', y='sg_over_field', data=df, ax=ax)

# Reference line at zero: level with the field average
ax.axhline(0, color='black', linestyle='--')
ax.set_ylabel('Strokes Gained Per Round', fontsize=12);

### Prior Score

In [None]:
# Leave out the 2002 season, for which no prior data exists
# NOTE(review): the seasons gathered above start at 2004, so this filter may
# not remove anything - confirm the intended year.
df_no_2002 = df.loc[df['season'].ne(2002)]

# Sub-dataframes per outcome, used by the visualizations below
cut_flag = df_no_2002['made_cut']
top10_flag = df_no_2002['top_10']
made_cut = df_no_2002[cut_flag == 1]
missed_cut = df_no_2002[cut_flag == 0]
inside_top10 = df_no_2002[top10_flag == 1]
outside_top10 = df_no_2002[top10_flag == 0]

In [None]:
# Settings shared by all six panels
feature = 'prior_score'
xmin, xmax = 265, 320   # x range of the distribution and scatter panels
ymin, ymax = 285, 320   # y range of the average-over-time panels

# Group means and minima quoted in the legends
mean1_made = round(made_cut[feature].mean(), 1)
mean1_missed = round(missed_cut[feature].mean(), 1)
mean2_in = round(inside_top10[feature].mean(), 1)
mean2_out = round(outside_top10[feature].mean(), 1)
missed_cut_min = df_no_2002[df_no_2002['made_cut']==0][feature].min()
out_t10_min = df_no_2002[df_no_2002['top_10']==0][feature].min()

fig = plt.figure(figsize=figsize)
fig.subplots_adjust(hspace=hspace)

# One entry per row of the 2x3 grid: top row = made cut, bottom row = top 10.
# "pos" is the blue group (target == 1), "neg" the red group (target == 0).
panel_rows = [
    {
        'target': 'made_cut',
        'name': 'Made Cut',
        'pos': made_cut,
        'neg': missed_cut,
        'pos_mean': mean1_made,
        'neg_mean': mean1_missed,
        'pos_mean_label': f'Made Cut Avg: {mean1_made}',
        'neg_mean_label': f'Missed Cut Avg: {mean1_missed}',
        'neg_min': missed_cut_min,
        'neg_min_label': f'Missed Cut Min: {missed_cut_min}',
        'pos_trend_label': 'Made Cut Avg.',
        'neg_trend_label': 'Missed Cut Avg.',
    },
    {
        'target': 'top_10',
        'name': 'Top 10',
        'pos': inside_top10,
        'neg': outside_top10,
        'pos_mean': mean2_in,
        'neg_mean': mean2_out,
        'pos_mean_label': f'Inside Avg: {mean2_in}',
        'neg_mean_label': f'Outside Avg: {mean2_out}',
        'neg_min': out_t10_min,
        'neg_min_label': f'Outside Top 10 Min: {out_t10_min}',
        'pos_trend_label': 'Inside Top 10 Avg.',
        'neg_trend_label': 'Outside Top 10 Avg.',
    },
]

for row_number, row in enumerate(panel_rows):
    first_slot = 3*row_number + 1

    # Column 1 - distribution of the feature for both groups, with group means
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases.
    ax_dist = fig.add_subplot(2, 3, first_slot)
    for group, colour in ((row['pos'], 'blue'), (row['neg'], 'red')):
        sns.distplot(group[feature], bins='auto', color=colour, ax=ax_dist)
    ax_dist.axvline(row['pos_mean'], linestyle='--', color='blue', label=row['pos_mean_label'])
    ax_dist.axvline(row['neg_mean'], linestyle='--', color='red', label=row['neg_mean_label'])
    ax_dist.set_title(f"Distribution Plot - {row['name']}")
    ax_dist.set_xlim(xmin, xmax)
    ax_dist.legend()

    # Column 2 - feature against the 0/1 target, with the red group's minimum
    ax_scatter = fig.add_subplot(2, 3, first_slot + 1)
    sns.scatterplot(
        x=df_no_2002[feature],
        y=df_no_2002[row['target']],
        hue=df_no_2002[row['target']],
        palette=['red', 'blue'],
        ax=ax_scatter
    )
    ax_scatter.axvline(row['neg_min'], linestyle = '--', color ='red', label=row['neg_min_label'])
    ax_scatter.set_title(f"Scatter Plot - {row['name']}")
    ax_scatter.set_xlim(xmin, xmax)
    ax_scatter.legend(loc='center')

    # Column 3 - seasonal mean of the feature for both groups
    ax_trend = fig.add_subplot(2, 3, first_slot + 2)
    trend_lines = (
        (row['pos'], 'blue', row['pos_trend_label']),
        (row['neg'], 'red', row['neg_trend_label']),
    )
    for group, colour, trend_label in trend_lines:
        group.groupby(['season'])[feature].mean().plot(
            kind='line',
            ax=ax_trend,
            fontsize=12,
            color=colour,
            marker='o',
            linestyle='--',
            label=trend_label
        )
    ax_trend.set_title(f"Average Overtime - {row['name']}", {'fontsize' : 12})
    ax_trend.set_ylim(ymin, ymax)
    ax_trend.legend()

### Driving Distance

In [None]:
# Rebuild the four outcome subsets from the full `df` so the panels below
# compare golfers by cut result and by top-10 result.
_cut_flag = df['made_cut']
_top10_flag = df['top_10']
made_cut, missed_cut = df[_cut_flag == 1], df[_cut_flag == 0]
inside_top10, outside_top10 = df[_top10_flag == 1], df[_top10_flag == 0]

In [None]:
# Driving distance: six-panel figure. Top row compares made vs. missed cut,
# bottom row compares inside vs. outside the top 10.
feature = 'driving_distance'

# Class averages (one decimal), drawn as dashed lines on the distribution panels
mean1_made, mean1_missed, mean2_in, mean2_out = [
    round(_subset[feature].mean(), 1)
    for _subset in (made_cut, missed_cut, inside_top10, outside_top10)
]

fig = plt.figure(figsize=figsize)
fig.subplots_adjust(hspace=hspace)

# x-limits are used by the distribution/scatter panels, y-limits by the season panels
xmin, xmax = 260, 330
ymin, ymax = 270, 310

# Per-class extremes, drawn as dashed lines on the scatter panels
made_cut_min = df.loc[df['made_cut'] == 1, feature].min()
missed_cut_max = df.loc[df['made_cut'] == 0, feature].max()
in_t10_min = df.loc[df['top_10'] == 1, feature].min()
out_t10_max = df.loc[df['top_10'] == 0, feature].max()

# Made Cut - Dist Plot
ax1 = fig.add_subplot(2, 3, 1)
for _subset, _color in ((made_cut, 'blue'), (missed_cut, 'red')):
    sns.distplot(_subset[feature], bins='auto', color=_color, ax=ax1)
ax1.axvline(mean1_made, linestyle='--', color='blue', label=f'Made Cut Avg: {mean1_made}')
ax1.axvline(mean1_missed, linestyle='--', color='red', label=f'Missed Cut Avg: {mean1_missed}')
ax1.set_title('Distribution Plot - Made Cut')
ax1.set_xlim(xmin, xmax)
ax1.legend(loc='upper right')

# Made Cut - Scatter Plot
ax2 = fig.add_subplot(2, 3, 2)
sns.scatterplot(x=df[feature], y=df['made_cut'], hue=df['made_cut'], palette=['red', 'blue'], ax=ax2)
ax2.axvline(made_cut_min, linestyle='--', color='blue', label=f'Made Cut Min: {made_cut_min}')
ax2.axvline(missed_cut_max, linestyle='--', color='red', label=f'Missed Cut Max: {missed_cut_max}')
ax2.set_title('Scatter Plot - Made Cut')
ax2.set_xlim(xmin, xmax)
ax2.legend(loc='center')

# Made Cut - Mean per season
ax3 = fig.add_subplot(2, 3, 3)
for _subset, _color, _label in (
    (made_cut, 'blue', 'Made Cut Avg.'),
    (missed_cut, 'red', 'Missed Cut Avg.'),
):
    _subset.groupby(['season'])[feature].mean().plot(
        kind='line', ax=ax3, fontsize=12, color=_color,
        marker='o', linestyle='--', label=_label
    )
ax3.set_title('Average Overtime - Made Cut', {'fontsize' : 12})
ax3.set_ylim(ymin, ymax)
ax3.legend(loc='upper left')

# Top 10 - Dist Plot
ax4 = fig.add_subplot(2, 3, 4)
for _subset, _color in ((inside_top10, 'blue'), (outside_top10, 'red')):
    sns.distplot(_subset[feature], bins='auto', color=_color, ax=ax4)
ax4.axvline(mean2_in, linestyle='--', color='blue', label=f'Inside Avg: {mean2_in}')
ax4.axvline(mean2_out, linestyle='--', color='red', label=f'Outside Avg: {mean2_out}')
ax4.set_title('Distribution Plot - Top 10')
ax4.set_xlim(xmin, xmax)
ax4.legend(loc='upper right')

# Top 10 - Scatter Plot
ax5 = fig.add_subplot(2, 3, 5)
sns.scatterplot(x=df[feature], y=df['top_10'], hue=df['top_10'], palette=['red', 'blue'], ax=ax5)
ax5.axvline(in_t10_min, linestyle='--', color='blue', label=f'Inside Top 10 Min: {in_t10_min}')
ax5.axvline(out_t10_max, linestyle='--', color='red', label=f'Outside Top 10 Max: {out_t10_max}')
ax5.set_title('Scatter Plot - Top 10')
ax5.set_xlim(xmin, xmax)
ax5.legend(loc='center')

# Top 10 - Mean per season
ax6 = fig.add_subplot(2, 3, 6)
for _subset, _color, _label in (
    (inside_top10, 'blue', 'Inside Top 10 Avg.'),
    (outside_top10, 'red', 'Outside Top 10 Avg.'),
):
    _subset.groupby(['season'])[feature].mean().plot(
        kind='line', ax=ax6, fontsize=12, color=_color,
        marker='o', linestyle='--', label=_label
    )
ax6.set_title('Average Overtime - Top 10', {'fontsize' : 12})
ax6.set_ylim(ymin, ymax)
ax6.legend(loc='upper left');

### Driving Accuracy

In [None]:
# Driving accuracy: six-panel figure. Top row compares made vs. missed cut,
# bottom row compares inside vs. outside the top 10.
feature = 'driving_accuracy'

# Unrounded class averages; the legend labels render them as percentages
mean1_made, mean1_missed, mean2_in, mean2_out = [
    _subset[feature].mean()
    for _subset in (made_cut, missed_cut, inside_top10, outside_top10)
]

fig = plt.figure(figsize=figsize)
fig.subplots_adjust(hspace=hspace)

# x-limits are used by the distribution/scatter panels, y-limits by the season panels
xmin, xmax = 0.4, 0.9
ymin, ymax = 0.55, 0.7

# Per-class extremes, drawn as dashed lines on the scatter panels
made_cut_min = df.loc[df['made_cut'] == 1, feature].min()
missed_cut_max = df.loc[df['made_cut'] == 0, feature].max()
in_t10_min = df.loc[df['top_10'] == 1, feature].min()
out_t10_max = df.loc[df['top_10'] == 0, feature].max()

# Made Cut - Dist Plot
ax1 = fig.add_subplot(2, 3, 1)
for _subset, _color in ((made_cut, 'blue'), (missed_cut, 'red')):
    sns.distplot(_subset[feature], bins='auto', color=_color, ax=ax1)
ax1.axvline(mean1_made, linestyle='--', color='blue', label=f'Made Cut Avg: {mean1_made : 0.2%}')
ax1.axvline(mean1_missed, linestyle='--', color='red', label=f'Missed Cut Avg: {mean1_missed : 0.2%}')
ax1.set_title('Distribution Plot - Made Cut')
ax1.set_xlim(xmin, xmax)
ax1.legend(loc='upper right')

# Made Cut - Scatter Plot
ax2 = fig.add_subplot(2, 3, 2)
sns.scatterplot(x=df[feature], y=df['made_cut'], hue=df['made_cut'], palette=['red', 'blue'], ax=ax2)
ax2.axvline(made_cut_min, linestyle='--', color='blue', label=f'Made Cut Min: {made_cut_min : 0.2%}')
ax2.axvline(missed_cut_max, linestyle='--', color='red', label=f'Missed Cut Max: {missed_cut_max : 0.2%}')
ax2.set_title('Scatter Plot - Made Cut')
ax2.set_xlim(xmin, xmax)
ax2.legend(loc='center')

# Made Cut - Mean per season
ax3 = fig.add_subplot(2, 3, 3)
for _subset, _color, _label in (
    (made_cut, 'blue', 'Made Cut Avg.'),
    (missed_cut, 'red', 'Missed Cut Avg.'),
):
    _subset.groupby(['season'])[feature].mean().plot(
        kind='line', ax=ax3, fontsize=12, color=_color,
        marker='o', linestyle='--', label=_label
    )
ax3.set_title('Average Overtime - Made Cut', {'fontsize' : 12})
ax3.set_ylim(ymin, ymax)
ax3.legend(loc='upper right')

# Top 10 - Dist Plot
ax4 = fig.add_subplot(2, 3, 4)
for _subset, _color in ((inside_top10, 'blue'), (outside_top10, 'red')):
    sns.distplot(_subset[feature], bins='auto', color=_color, ax=ax4)
ax4.axvline(mean2_in, linestyle='--', color='blue', label=f'Inside Avg: {mean2_in : 0.2%}')
ax4.axvline(mean2_out, linestyle='--', color='red', label=f'Outside Avg: {mean2_out : 0.2%}')
ax4.set_title('Distribution Plot - Top 10')
ax4.set_xlim(xmin, xmax)
ax4.legend(loc='upper right')

# Top 10 - Scatter Plot
ax5 = fig.add_subplot(2, 3, 5)
sns.scatterplot(x=df[feature], y=df['top_10'], hue=df['top_10'], palette=['red', 'blue'], ax=ax5)
ax5.axvline(in_t10_min, linestyle='--', color='blue', label=f'Inside Top 10 Min: {in_t10_min : 0.2%}')
ax5.axvline(out_t10_max, linestyle='--', color='red', label=f'Outside Top 10 Max: {out_t10_max : 0.2%}')
ax5.set_title('Scatter Plot - Top 10')
ax5.set_xlim(xmin, xmax)
ax5.legend(loc='center')

# Top 10 - Mean per season
ax6 = fig.add_subplot(2, 3, 6)
for _subset, _color, _label in (
    (inside_top10, 'blue', 'Inside Top 10 Avg.'),
    (outside_top10, 'red', 'Outside Top 10 Avg.'),
):
    _subset.groupby(['season'])[feature].mean().plot(
        kind='line', ax=ax6, fontsize=12, color=_color,
        marker='o', linestyle='--', label=_label
    )
ax6.set_title('Average Overtime - Top 10', {'fontsize' : 12})
ax6.set_ylim(ymin, ymax)
ax6.legend(loc='upper right');

### Greens in Regulation

In [None]:
# Greens in regulation: six-panel figure. Top row compares made vs. missed cut,
# bottom row compares inside vs. outside the top 10.
feature = 'greens_in_regulation'

# Unrounded class averages; the legend labels render them as percentages
mean1_made, mean1_missed, mean2_in, mean2_out = [
    _subset[feature].mean()
    for _subset in (made_cut, missed_cut, inside_top10, outside_top10)
]

fig = plt.figure(figsize=figsize)
fig.subplots_adjust(hspace=hspace)

# x-limits are used by the distribution/scatter panels, y-limits by the season panels
xmin, xmax = 0.5, 0.8
ymin, ymax = 0.62, 0.7

# Per-class extremes, drawn as dashed lines on the scatter panels
made_cut_min = df.loc[df['made_cut'] == 1, feature].min()
missed_cut_max = df.loc[df['made_cut'] == 0, feature].max()
in_t10_min = df.loc[df['top_10'] == 1, feature].min()
out_t10_max = df.loc[df['top_10'] == 0, feature].max()

# Made Cut - Dist Plot
ax1 = fig.add_subplot(2, 3, 1)
for _subset, _color in ((made_cut, 'blue'), (missed_cut, 'red')):
    sns.distplot(_subset[feature], bins='auto', color=_color, ax=ax1)
ax1.axvline(mean1_made, linestyle='--', color='blue', label=f'Made Cut Avg: {mean1_made : 0.2%}')
ax1.axvline(mean1_missed, linestyle='--', color='red', label=f'Missed Cut Avg: {mean1_missed : 0.2%}')
ax1.set_title('Distribution Plot - Made Cut')
ax1.set_xlim(xmin, xmax)
ax1.legend(loc='upper right')

# Made Cut - Scatter Plot
ax2 = fig.add_subplot(2, 3, 2)
sns.scatterplot(x=df[feature], y=df['made_cut'], hue=df['made_cut'], palette=['red', 'blue'], ax=ax2)
ax2.axvline(made_cut_min, linestyle='--', color='blue', label=f'Made Cut Min: {made_cut_min : 0.2%}')
ax2.axvline(missed_cut_max, linestyle='--', color='red', label=f'Missed Cut Max: {missed_cut_max : 0.2%}')
ax2.set_title('Scatter Plot - Made Cut')
ax2.set_xlim(xmin, xmax)
ax2.legend(loc='center')

# Made Cut - Mean per season
ax3 = fig.add_subplot(2, 3, 3)
for _subset, _color, _label in (
    (made_cut, 'blue', 'Made Cut Avg.'),
    (missed_cut, 'red', 'Missed Cut Avg.'),
):
    _subset.groupby(['season'])[feature].mean().plot(
        kind='line', ax=ax3, fontsize=12, color=_color,
        marker='o', linestyle='--', label=_label
    )
ax3.set_title('Average Overtime - Made Cut', {'fontsize' : 12})
ax3.set_ylim(ymin, ymax)
ax3.legend(loc='upper left')

# Top 10 - Dist Plot
ax4 = fig.add_subplot(2, 3, 4)
for _subset, _color in ((inside_top10, 'blue'), (outside_top10, 'red')):
    sns.distplot(_subset[feature], bins='auto', color=_color, ax=ax4)
ax4.axvline(mean2_in, linestyle='--', color='blue', label=f'Inside Avg: {mean2_in : 0.2%}')
ax4.axvline(mean2_out, linestyle='--', color='red', label=f'Outside Avg: {mean2_out : 0.2%}')
ax4.set_title('Distribution Plot - Top 10')
ax4.set_xlim(xmin, xmax)
ax4.legend(loc='upper right')

# Top 10 - Scatter Plot
ax5 = fig.add_subplot(2, 3, 5)
sns.scatterplot(x=df[feature], y=df['top_10'], hue=df['top_10'], palette=['red', 'blue'], ax=ax5)
ax5.axvline(in_t10_min, linestyle='--', color='blue', label=f'Inside Top 10 Min: {in_t10_min : 0.2%}')
ax5.axvline(out_t10_max, linestyle='--', color='red', label=f'Outside Top 10 Max: {out_t10_max : 0.2%}')
ax5.set_title('Scatter Plot - Top 10')
ax5.set_xlim(xmin, xmax)
ax5.legend(loc='center')

# Top 10 - Mean per season
ax6 = fig.add_subplot(2, 3, 6)
for _subset, _color, _label in (
    (inside_top10, 'blue', 'Inside Top 10 Avg.'),
    (outside_top10, 'red', 'Outside Top 10 Avg.'),
):
    _subset.groupby(['season'])[feature].mean().plot(
        kind='line', ax=ax6, fontsize=12, color=_color,
        marker='o', linestyle='--', label=_label
    )
ax6.set_title('Average Overtime - Top 10', {'fontsize' : 12})
ax6.set_ylim(ymin, ymax)
ax6.legend(loc='upper left');

### Par Or Better Distance

In [None]:
# Par-or-better distance: six-panel figure. Top row compares made vs. missed cut,
# bottom row compares inside vs. outside the top 10.
feature = 'par_or_better_distance'

# Class averages (one decimal), drawn as dashed lines on the distribution panels
mean1_made, mean1_missed, mean2_in, mean2_out = [
    round(_subset[feature].mean(), 1)
    for _subset in (made_cut, missed_cut, inside_top10, outside_top10)
]

fig = plt.figure(figsize=figsize)
fig.subplots_adjust(hspace=hspace)

# x-limits are used by the distribution/scatter panels, y-limits by the season panels
xmin, xmax = 14, 24
ymin, ymax = 18, 21

# Per-class extremes (one decimal), drawn as dashed lines on the scatter panels
made_cut_min = round(df.loc[df['made_cut'] == 1, feature].min(), 1)
missed_cut_max = round(df.loc[df['made_cut'] == 0, feature].max(), 1)
in_t10_min = round(df.loc[df['top_10'] == 1, feature].min(), 1)
out_t10_max = round(df.loc[df['top_10'] == 0, feature].max(), 1)

# Made Cut - Dist Plot
ax1 = fig.add_subplot(2, 3, 1)
for _subset, _color in ((made_cut, 'blue'), (missed_cut, 'red')):
    sns.distplot(_subset[feature], bins='auto', color=_color, ax=ax1)
ax1.axvline(mean1_made, linestyle='--', color='blue', label=f'Made Cut Avg: {mean1_made}')
ax1.axvline(mean1_missed, linestyle='--', color='red', label=f'Missed Cut Avg: {mean1_missed}')
ax1.set_title('Distribution Plot - Made Cut')
ax1.set_xlim(xmin, xmax)
ax1.legend(loc='upper right')

# Made Cut - Scatter Plot
ax2 = fig.add_subplot(2, 3, 2)
sns.scatterplot(x=df[feature], y=df['made_cut'], hue=df['made_cut'], palette=['red', 'blue'], ax=ax2)
ax2.axvline(made_cut_min, linestyle='--', color='blue', label=f'Made Cut Min: {made_cut_min}')
ax2.axvline(missed_cut_max, linestyle='--', color='red', label=f'Missed Cut Max: {missed_cut_max}')
ax2.set_title('Scatter Plot - Made Cut')
ax2.set_xlim(xmin, xmax)
ax2.legend(loc='center')

# Made Cut - Mean per season
ax3 = fig.add_subplot(2, 3, 3)
for _subset, _color, _label in (
    (made_cut, 'blue', 'Made Cut Avg.'),
    (missed_cut, 'red', 'Missed Cut Avg.'),
):
    _subset.groupby(['season'])[feature].mean().plot(
        kind='line', ax=ax3, fontsize=12, color=_color,
        marker='o', linestyle='--', label=_label
    )
ax3.set_title('Average Overtime - Made Cut', {'fontsize' : 12})
ax3.set_ylim(ymin, ymax)
ax3.legend(loc='upper left')

# Top 10 - Dist Plot
ax4 = fig.add_subplot(2, 3, 4)
for _subset, _color in ((inside_top10, 'blue'), (outside_top10, 'red')):
    sns.distplot(_subset[feature], bins='auto', color=_color, ax=ax4)
ax4.axvline(mean2_in, linestyle='--', color='blue', label=f'Inside Avg: {mean2_in}')
ax4.axvline(mean2_out, linestyle='--', color='red', label=f'Outside Avg: {mean2_out}')
ax4.set_title('Distribution Plot - Top 10')
ax4.set_xlim(xmin, xmax)
ax4.legend(loc='upper right')

# Top 10 - Scatter Plot
ax5 = fig.add_subplot(2, 3, 5)
sns.scatterplot(x=df[feature], y=df['top_10'], hue=df['top_10'], palette=['red', 'blue'], ax=ax5)
ax5.axvline(in_t10_min, linestyle='--', color='blue', label=f'Inside Top 10 Min: {in_t10_min}')
ax5.axvline(out_t10_max, linestyle='--', color='red', label=f'Outside Top 10 Max: {out_t10_max}')
ax5.set_title('Scatter Plot - Top 10')
ax5.set_xlim(xmin, xmax)
ax5.legend(loc='center')

# Top 10 - Mean per season
ax6 = fig.add_subplot(2, 3, 6)
for _subset, _color, _label in (
    (inside_top10, 'blue', 'Inside Top 10 Avg.'),
    (outside_top10, 'red', 'Outside Top 10 Avg.'),
):
    _subset.groupby(['season'])[feature].mean().plot(
        kind='line', ax=ax6, fontsize=12, color=_color,
        marker='o', linestyle='--', label=_label
    )
ax6.set_title('Average Overtime - Top 10', {'fontsize' : 12})
ax6.set_ylim(ymin, ymax)
ax6.legend(loc='upper left');

### Scrambling

In [None]:
# Drop the scrambling outlier: keep every made-cut row, but keep a missed-cut
# row only when its scrambling value is below 0.68.
_made = df['made_cut'] == 1
_missed_in_range = (df['made_cut'] == 0) & (df['scrambling'] < 0.68)
scram_only = df[_made | _missed_in_range]

# Rebuild the four outcome subsets from the outlier-free frame
_cut_flag = scram_only['made_cut']
_top10_flag = scram_only['top_10']
made_cut, missed_cut = scram_only[_cut_flag == 1], scram_only[_cut_flag == 0]
inside_top10, outside_top10 = scram_only[_top10_flag == 1], scram_only[_top10_flag == 0]

In [None]:
# Scrambling: six-panel figure built from the outlier-free `scram_only` frame.
# Top row compares made vs. missed cut, bottom row inside vs. outside the top 10.
feature = 'scrambling'

# Unrounded class averages; the legend labels render them as percentages
mean1_made, mean1_missed, mean2_in, mean2_out = [
    _subset[feature].mean()
    for _subset in (made_cut, missed_cut, inside_top10, outside_top10)
]

fig = plt.figure(figsize=figsize)
fig.subplots_adjust(hspace=hspace)

# x-limits are used by the distribution/scatter panels, y-limits by the season panels
xmin, xmax = 0.45, 0.75
ymin, ymax = 0.55, 0.62

# Per-class extremes, drawn as dashed lines on the scatter panels
made_cut_min = scram_only.loc[scram_only['made_cut'] == 1, feature].min()
missed_cut_max = scram_only.loc[scram_only['made_cut'] == 0, feature].max()
in_t10_min = scram_only.loc[scram_only['top_10'] == 1, feature].min()
out_t10_max = scram_only.loc[scram_only['top_10'] == 0, feature].max()

# Made Cut - Dist Plot
ax1 = fig.add_subplot(2, 3, 1)
for _subset, _color in ((made_cut, 'blue'), (missed_cut, 'red')):
    sns.distplot(_subset[feature], bins='auto', color=_color, ax=ax1)
ax1.axvline(mean1_made, linestyle='--', color='blue', label=f'Made Cut Avg: {mean1_made : 0.2%}')
ax1.axvline(mean1_missed, linestyle='--', color='red', label=f'Missed Cut Avg: {mean1_missed : 0.2%}')
ax1.set_title('Distribution Plot - Made Cut')
ax1.set_xlim(xmin, xmax)
ax1.legend(loc='upper right')

# Made Cut - Scatter Plot
ax2 = fig.add_subplot(2, 3, 2)
sns.scatterplot(
    x=scram_only[feature], y=scram_only['made_cut'], hue=scram_only['made_cut'],
    palette=['red', 'blue'], ax=ax2
)
ax2.axvline(made_cut_min, linestyle='--', color='blue', label=f'Made Cut Min: {made_cut_min : 0.2%}')
ax2.axvline(missed_cut_max, linestyle='--', color='red', label=f'Missed Cut Max: {missed_cut_max : 0.2%}')
ax2.set_title('Scatter Plot - Made Cut')
ax2.set_xlim(xmin, xmax)
ax2.legend(loc='center')

# Made Cut - Mean per season
ax3 = fig.add_subplot(2, 3, 3)
for _subset, _color, _label in (
    (made_cut, 'blue', 'Made Cut Avg.'),
    (missed_cut, 'red', 'Missed Cut Avg.'),
):
    _subset.groupby(['season'])[feature].mean().plot(
        kind='line', ax=ax3, fontsize=12, color=_color,
        marker='o', linestyle='--', label=_label
    )
ax3.set_title('Average Overtime - Made Cut', {'fontsize' : 12})
ax3.set_ylim(ymin, ymax)
ax3.legend(loc='upper right')

# Top 10 - Dist Plot
ax4 = fig.add_subplot(2, 3, 4)
for _subset, _color in ((inside_top10, 'blue'), (outside_top10, 'red')):
    sns.distplot(_subset[feature], bins='auto', color=_color, ax=ax4)
ax4.axvline(mean2_in, linestyle='--', color='blue', label=f'Inside Avg: {mean2_in : 0.2%}')
ax4.axvline(mean2_out, linestyle='--', color='red', label=f'Outside Avg: {mean2_out : 0.2%}')
ax4.set_title('Distribution Plot - Top 10')
ax4.set_xlim(xmin, xmax)
ax4.legend(loc='upper right')

# Top 10 - Scatter Plot
ax5 = fig.add_subplot(2, 3, 5)
sns.scatterplot(
    x=scram_only[feature], y=scram_only['top_10'], hue=scram_only['top_10'],
    palette=['red', 'blue'], ax=ax5
)
ax5.axvline(in_t10_min, linestyle='--', color='blue', label=f'Inside Top 10 Min: {in_t10_min : 0.2%}')
ax5.axvline(out_t10_max, linestyle='--', color='red', label=f'Outside Top 10 Max: {out_t10_max : 0.2%}')
ax5.set_title('Scatter Plot - Top 10')
ax5.set_xlim(xmin, xmax)
ax5.legend(loc='center')

# Top 10 - Mean per season
ax6 = fig.add_subplot(2, 3, 6)
for _subset, _color, _label in (
    (inside_top10, 'blue', 'Inside Top 10 Avg.'),
    (outside_top10, 'red', 'Outside Top 10 Avg.'),
):
    _subset.groupby(['season'])[feature].mean().plot(
        kind='line', ax=ax6, fontsize=12, color=_color,
        marker='o', linestyle='--', label=_label
    )
ax6.set_title('Average Overtime - Top 10', {'fontsize' : 12})
ax6.set_ylim(ymin, ymax)
ax6.legend(loc='upper right');

### Putting Rating

In [None]:
# Putting rating: six-panel figure. Top row compares made vs. missed cut,
# bottom row compares inside vs. outside the top 10.
feature = 'putting_rating'

# Rebuild the outcome subsets from the full `df`. The scrambling section
# redefined them from `scram_only` (one row removed), while the scatter panels
# and min/max lines in this cell read `df`; without this reset the panels of
# one figure would describe different sets of rows.
made_cut = df[df['made_cut'] == 1]
missed_cut = df[df['made_cut'] == 0]
inside_top10 = df[df['top_10'] == 1]
outside_top10 = df[df['top_10'] == 0]

# Class averages (one decimal), drawn as dashed lines on the distribution panels
mean1_made = round(made_cut[feature].mean(), 1)
mean1_missed = round(missed_cut[feature].mean(), 1)
mean2_in = round(inside_top10[feature].mean(), 1)
mean2_out = round(outside_top10[feature].mean(), 1)

fig = plt.figure(figsize=figsize)
fig.subplots_adjust(hspace=hspace)

# x-limits are used by the distribution/scatter panels, y-limits by the season panels
xmin, xmax = 0, 400
ymin, ymax = 150, 250

# Reference lines for the scatter panels. Unlike the other feature sections,
# this one marks the maximum of the positive class and the minimum of the
# negative class.
made_cut_max = made_cut[feature].max()
missed_cut_min = missed_cut[feature].min()
in_t10_max = inside_top10[feature].max()
out_t10_min = outside_top10[feature].min()

# Made Cut - Dist Plot
ax1 = fig.add_subplot(2, 3, 1)
for _subset, _color in ((made_cut, 'blue'), (missed_cut, 'red')):
    sns.distplot(_subset[feature], bins='auto', color=_color, ax=ax1)
ax1.axvline(mean1_made, linestyle='--', color='blue', label=f'Made Cut Avg: {mean1_made}')
ax1.axvline(mean1_missed, linestyle='--', color='red', label=f'Missed Cut Avg: {mean1_missed}')
ax1.set_title('Distribution Plot - Made Cut')
ax1.set_xlim(xmin, xmax)
ax1.legend(loc='upper right')

# Made Cut - Scatter Plot
ax2 = fig.add_subplot(2, 3, 2)
sns.scatterplot(x=df[feature], y=df['made_cut'], hue=df['made_cut'], palette=['red', 'blue'], ax=ax2)
ax2.axvline(made_cut_max, linestyle='--', color='blue', label=f'Made Cut Max: {made_cut_max}')
ax2.axvline(missed_cut_min, linestyle='--', color='red', label=f'Missed Cut Min: {missed_cut_min}')
ax2.set_title('Scatter Plot - Made Cut')
ax2.set_xlim(xmin, xmax)
ax2.legend(loc='center')

# Made Cut - Mean per season
ax3 = fig.add_subplot(2, 3, 3)
for _subset, _color, _label in (
    (made_cut, 'blue', 'Made Cut Avg.'),
    (missed_cut, 'red', 'Missed Cut Avg.'),
):
    _subset.groupby(['season'])[feature].mean().plot(
        kind='line', ax=ax3, fontsize=12, color=_color,
        marker='o', linestyle='--', label=_label
    )
ax3.set_title('Average Overtime - Made Cut', {'fontsize' : 12})
ax3.set_ylim(ymin, ymax)
ax3.legend(loc='upper left')

# Top 10 - Dist Plot
ax4 = fig.add_subplot(2, 3, 4)
for _subset, _color in ((inside_top10, 'blue'), (outside_top10, 'red')):
    sns.distplot(_subset[feature], bins='auto', color=_color, ax=ax4)
ax4.axvline(mean2_in, linestyle='--', color='b', label=f'Inside Avg: {mean2_in}')
ax4.axvline(mean2_out, linestyle='--', color='r', label=f'Outside Avg: {mean2_out}')
ax4.set_title('Distribution Plot - Top 10')
ax4.set_xlim(xmin, xmax)
ax4.legend(loc='upper right')

# Top 10 - Scatter Plot
ax5 = fig.add_subplot(2, 3, 5)
sns.scatterplot(x=df[feature], y=df['top_10'], hue=df['top_10'], palette=['red', 'blue'], ax=ax5)
ax5.axvline(in_t10_max, linestyle='--', color='blue', label=f'Inside Top 10 Max: {in_t10_max}')
ax5.axvline(out_t10_min, linestyle='--', color='red', label=f'Outside Top 10 Min: {out_t10_min}')
ax5.set_title('Scatter Plot - Top 10')
ax5.set_xlim(xmin, xmax)
ax5.legend(loc='center')

# Top 10 - Mean per season
ax6 = fig.add_subplot(2, 3, 6)
for _subset, _color, _label in (
    (inside_top10, 'blue', 'Inside Top 10 Avg.'),
    (outside_top10, 'red', 'Outside Top 10 Avg.'),
):
    _subset.groupby(['season'])[feature].mean().plot(
        kind='line', ax=ax6, fontsize=12, color=_color,
        marker='o', linestyle='--', label=_label
    )
ax6.set_title('Average Overtime - Top 10', {'fontsize' : 12})
ax6.set_ylim(ymin, ymax)
ax6.legend(loc='upper left');

# Modeling

## Load Data

In [None]:
# Load the modeling dataset; the first CSV column becomes the index
_golfer_csv = 'csv_files/noSG/final_golfer_data.csv'
df = pd.read_csv(_golfer_csv, index_col=0)

# Show the dimensions, then preview the first rows
print(df.shape)
df.head()

## Made Cut

### Define Key Variables

In [None]:
# Shared settings for every "made cut" model below
SEED = 13                 # random state passed to the split, the resampler and the estimators
test_size = 0.2           # fraction of rows held out by train_test_split
cv = 3                    # number of cross-validation folds used by GridSearchCV
scoring = 'accuracy'      # metric GridSearchCV optimizes

# Class names for the reports and confusion matrices
target_names = ['missed_cut', 'made_cut']

# Feature names for the feature-importance plots: every column that is not an
# identifier, a target, or one of the listed score columns
FI_labels = df.drop(
    columns=['season', 'full_name', 'min', 'mean', 'made_cut', 'top_10', 'total_score']
).columns.values

### Train, Test, and Split

In [None]:
# Target is the cut flag; features are the remaining non-identifier, non-outcome columns
y = df['made_cut']
X = df.drop(
    columns=['season', 'full_name', 'min', 'mean', 'made_cut', 'top_10', 'total_score']
)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=SEED
)

# Oversample the training rows only with SMOTE to handle the class imbalance;
# the test split is left untouched
smt = SMOTE(random_state=SEED)
X_train, y_train = smt.fit_resample(X_train, y_train)

### Standardization

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits
std = StandardScaler().fit(X_train)

X_train = std.transform(X_train)
X_test = std.transform(X_test)

### Base Model

In [None]:
# Baseline model: DummyClassifier predicts without looking at the features,
# so its scores are the floor every real model below has to beat.
classifier = DummyClassifier(random_state = SEED)
model_name = 'Dummy Classifier Model'

# Create param grid for GridSearch.
# The 'constant' strategy cannot be fitted unless a `constant` label is given,
# so it gets its own sub-grid that tries each class label.
# Assumes `made_cut` is encoded 0/1, which precision_score's default
# pos_label=1 below already relies on.
param_grid = [
    {
        'strategy' : [
            'stratified',
            'most_frequent',
            'prior',
            'uniform'
        ]
    },
    {
        'strategy' : ['constant'],
        'constant' : [0, 1]
    }
]

# Instantiate GridSearch
dummy_clf = GridSearchCV(
    classifier,
    param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1
)

# Fit and make predictions
dummy_clf.fit(X_train, y_train)
y_hat_pred = dummy_clf.predict(X_train)
y_pred = dummy_clf.predict(X_test)

# Print Best Params
print('--'*27)
print(f'{model_name} Best Params:')
print('--'*27)
best_params = dummy_clf.best_params_
keys = list(best_params.keys())
for key in keys:
    print(f" {key} : {best_params[key]}")

# Print Precision Score (train vs. test)
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print('--'*27)
print(f'{model_name} Precision Scores:')
print('--'*27)
print(f' train: {p_hat : 0.2%}')
print(f' test: {p : 0.2%}')

# Print classification report
print('--'*27)
print(f'{model_name} Classification Report')
print('--'*27)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))
print('--'*27)

# Print confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
classes = target_names
plot_confusion_matrix(cm, classes)

# Pickle model (the fitted GridSearchCV object, not just the best estimator)
filename = 'models/noSG/base_model.pkl'
model = dummy_clf
joblib.dump(model, filename)

### Logistic Regression

In [None]:
# Logistic regression, tuned with a cross-validated grid search
classifier = LogisticRegression(random_state=SEED)
model_name = 'Logistic Regression Model'

# Create param grid for GridSearch.
# Of the solvers tried here only 'liblinear' supports the L1 penalty
# ('newton-cg' and 'lbfgs' are L2-only), so the grid is split per penalty
# to avoid fits that raise on an unsupported solver/penalty pair.
_C_values = np.logspace(-10, 10, 5)
param_grid = [
    {
        'penalty' : ['l1'],
        'solver' : ['liblinear'],
        'C' : _C_values,
    },
    {
        'penalty' : ['l2'],
        'solver' : ['newton-cg', 'lbfgs', 'liblinear'],
        'C' : _C_values,
    },
]

# Instantiate GridSearch
log_clf = GridSearchCV(
    classifier,
    param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1
)

# Fit and make predictions
log_clf.fit(X_train, y_train)
y_hat_pred = log_clf.predict(X_train)
y_pred = log_clf.predict(X_test)

# Print Best Params
print('--'*27)
print(f'{model_name} Best Params:')
print('--'*27)
best_params = log_clf.best_params_
keys = list(best_params.keys())
for key in keys:
    print(f" {key} : {best_params[key]}")

# Print Precision Score (train vs. test)
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print('--'*27)
print(f'{model_name} Precision Scores:')
print('--'*27)
print(f' train: {p_hat : 0.2%}')
print(f' test: {p : 0.2%}')

# Print classification report
print('--'*27)
print(f'{model_name} Classification Report')
print('--'*27)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))
print('--'*27)

# Print confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
classes = target_names
plot_confusion_matrix(cm, classes)

# Pickle model (the fitted GridSearchCV object, not just the best estimator)
filename = 'models/noSG/log_model.pkl'
model = log_clf
joblib.dump(model, filename)

### K-Nearest Neighbors (KNN)

In [None]:
# K-nearest neighbors, with the neighbor count tuned by a cross-validated grid search
model_name = 'KNN Model'
classifier = KNeighborsClassifier()

# Candidate neighbor counts: ten values, two apart, starting at the rounded
# natural log of the number of training rows
k = int(round(np.log(len(X_train)), 0))
param_grid = {'n_neighbors' : range(k, k + 20, 2)}

# Grid search over the candidates
knn_clf = GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1
)
knn_clf.fit(X_train, y_train)

# Predictions on both splits, for the train-vs-test comparison below
y_hat_pred = knn_clf.predict(X_train)
y_pred = knn_clf.predict(X_test)

_rule = '--'*27

# Best hyper-parameters
print(_rule)
print(f'{model_name} Best Params:')
print(_rule)
best_params = knn_clf.best_params_
keys = list(best_params)
for key in keys:
    print(f" {key} : {best_params[key]}")

# Precision on train and test
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print(_rule)
print(f'{model_name} Precision Scores:')
print(_rule)
print(f' train: {p_hat : 0.2%}')
print(f' test: {p : 0.2%}')

# Classification report on the test split
print(_rule)
print(f'{model_name} Classification Report')
print(_rule)
print(metrics.classification_report(y_test, y_pred, target_names = target_names))
print(_rule)

# Confusion matrix on the test split
classes = target_names
cm = metrics.confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes)

# Persist the fitted grid search
model = knn_clf
filename = 'models/noSG/knn_model.pkl'
joblib.dump(model, filename)

### Decision Tree

In [None]:
# Decision tree, tuned with a cross-validated grid search
model_name = 'Decision Tree Model'
classifier = DecisionTreeClassifier(random_state=SEED)

# Hyper-parameter candidates
param_grid = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [2, 5],
    'min_samples_split' : [5, 10],
    'min_samples_leaf' : [5, 10]
}

# Grid search over the candidates
dt_clf = GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1
)
dt_clf.fit(X_train, y_train)

# Predictions on both splits, for the train-vs-test comparison below
y_hat_pred = dt_clf.predict(X_train)
y_pred = dt_clf.predict(X_test)

_rule = '--'*27

# Best hyper-parameters
print(_rule)
print(f'{model_name} Best Params:')
print(_rule)
best_params = dt_clf.best_params_
keys = list(best_params)
for key in keys:
    print(f" {key} : {best_params[key]}")

# Precision on train and test
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print(_rule)
print(f'{model_name} Precision Scores:')
print(_rule)
print(f' train: {p_hat : 0.2%}')
print(f' test: {p : 0.2%}')

# Classification report on the test split
print(_rule)
print(f'{model_name} Classification Report')
print(_rule)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))
print(_rule)

# The confusion-matrix plot is disabled for this model; uncomment to restore:
# cm = metrics.confusion_matrix(y_test, y_pred)
# classes = target_names
# plot_confusion_matrix(cm, classes)

# Feature importance: refit a standalone tree with the winning hyper-parameters
# (best_params holds exactly the four keys of param_grid)
dt_optimized = DecisionTreeClassifier(random_state=SEED, **best_params)
dt_optimized.fit(X_train, y_train)

n_features = X_train.shape[1]
plot_feature_importances(X_train, dt_optimized, n_features, FI_labels)

# Persist the fitted grid search
model = dt_clf
filename = 'models/noSG/dt_model.pkl'
joblib.dump(model, filename)

### Random Forest

In [None]:
# Random forest, tuned with a cross-validated grid search
classifier = RandomForestClassifier(random_state=SEED)
model_name = 'Random Forest Model'

# Create param grid for GridSearch
param_grid = {
    'criterion' : ['gini', 'entropy'],
    'n_estimators' : [50, 100],
    'max_depth' : [2, 5],
    'min_samples_split' : [5, 10],
    'min_samples_leaf' : [5, 10]
}

# Instantiate GridSearch
rf_clf = GridSearchCV(
    classifier,
    param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1
)

# Fit and make predictions
rf_clf.fit(X_train, y_train)
y_hat_pred = rf_clf.predict(X_train)
y_pred = rf_clf.predict(X_test)

# Print Best Params
print('--'*27)
print(f'{model_name} Best Params:')
print('--'*27)
best_params = rf_clf.best_params_
keys = list(best_params.keys())
for key in keys:
    print(f" {key} : {best_params[key]}")

# Print Precision Score (train vs. test)
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print('--'*27)
print(f'{model_name} Precision Scores:')
print('--'*27)
print(f' train: {p_hat : 0.2%}')
print(f' test: {p : 0.2%}')

# Print classification report
print('--'*27)
print(f'{model_name} Classification Report')
print('--'*27)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))
print('--'*27)

# Print confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
classes = target_names
plot_confusion_matrix(cm, classes)

# Feature importance: refit a standalone forest with the winning hyper-parameters
rf_optimized = RandomForestClassifier(
    criterion = best_params['criterion'],
    n_estimators = best_params['n_estimators'],
    max_depth = best_params['max_depth'],
    min_samples_split = best_params['min_samples_split'],
    min_samples_leaf = best_params['min_samples_leaf'],
    random_state = SEED
)

rf_optimized.fit(X_train, y_train)

# Plot the forest's own importances (this previously passed the decision
# tree from the earlier cell, so the chart showed the wrong model)
n_features = X_train.shape[1]
plot_feature_importances(X_train, rf_optimized, n_features, FI_labels)

# Pickle model (the fitted GridSearchCV object, not just the best estimator)
filename = 'models/noSG/rf_model.pkl'
model = rf_clf
joblib.dump(model, filename)

### AdaBoost

In [None]:
# AdaBoost: tune with GridSearchCV, report train/test metrics, plot the
# confusion matrix and feature importances, then pickle the fitted search.

# Instantiate classifier and define model
classifier = AdaBoostClassifier(random_state=SEED)
model_name = 'AdaBoost Model'

# Create param grid for GridSearch
param_grid = {
    'n_estimators' : [50, 100],
    'learning_rate' : [0.1, 0.2, 0.5, 1]
}

# Instantiate GridSearch (cv and scoring are expected from an earlier cell)
ab_clf = GridSearchCV(
    classifier,
    param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1
)

# Fit and make predictions on both splits so train and test scores can be compared
ab_clf.fit(X_train, y_train)
y_hat_pred = ab_clf.predict(X_train)
y_pred = ab_clf.predict(X_test)

# Print Best Params
print('--'*27)
print(f'{model_name} Best Params:')
print('--'*27)
best_params = ab_clf.best_params_
keys = list(best_params.keys())
for key in keys:
    print(f" {key} : {best_params[key]}")

# Print Precision Score
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print('--'*27)
print(f'{model_name} Precision Scores:')
print('--'*27)
print(f' train: {p_hat : 0.2%}')
print(f' test: {p : 0.2%}')

# Print classification report
print('--'*27)
print(f'{model_name} Classification Report')
print('--'*27)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))
print('--'*27)

# Print confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
classes = target_names
plot_confusion_matrix(cm, classes)

# # Print Precision-Recall Curve
# disp = metrics.plot_precision_recall_curve(ab_clf, X_test, y_test);

# Feature importance: refit a standalone AdaBoost model with the winning hyperparameters
ab_optimized = AdaBoostClassifier(
    n_estimators = best_params['n_estimators'],
    learning_rate = best_params['learning_rate'],
    random_state = SEED
)

ab_optimized.fit(X_train, y_train)

n_features = X_train.shape[1]
# Pass the AdaBoost model fitted above so the chart reflects this model,
# not the decision tree from an earlier section.
plot_feature_importances(X_train, ab_optimized, n_features, FI_labels)

# Pickle model: the whole fitted grid search is persisted, not only the best estimator
filename = 'models/noSG/ab_model.pkl'
model = ab_clf
joblib.dump(model, filename)

### Gradient Boost

In [None]:
# Gradient Boosting: tune with GridSearchCV, report train/test metrics, plot the
# confusion matrix and feature importances, then pickle the fitted search.

# Instantiate classifier and define model
classifier = GradientBoostingClassifier(random_state=SEED)
model_name = 'Gradient Boost Model'

# Create param grid for GridSearch
param_grid = {
    'n_estimators' : [50],
    'learning_rate' : [0.01, 0.1, 0.2]
}

# Instantiate GridSearch (cv and scoring are expected from an earlier cell)
gb_clf = GridSearchCV(
    classifier,
    param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1
)

# Fit and make predictions on both splits so train and test scores can be compared
gb_clf.fit(X_train, y_train)
y_hat_pred = gb_clf.predict(X_train)
y_pred = gb_clf.predict(X_test)

# Print Best Params
print('--'*27)
print(f'{model_name} Best Params:')
print('--'*27)
best_params = gb_clf.best_params_
keys = list(best_params.keys())
for key in keys:
    print(f" {key} : {best_params[key]}")

# Print Precision Score
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print('--'*27)
print(f'{model_name} Precision Scores:')
print('--'*27)
print(f' train: {p_hat : 0.2%}')
print(f' test: {p : 0.2%}')

# Print classification report
print('--'*27)
print(f'{model_name} Classification Report')
print('--'*27)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))
print('--'*27)

# Print confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
classes = target_names
plot_confusion_matrix(cm, classes)

# Feature importance: refit a standalone booster with the winning hyperparameters
gb_optimized = GradientBoostingClassifier(
    n_estimators = best_params['n_estimators'],
    learning_rate = best_params['learning_rate'],
    random_state = SEED
)

gb_optimized.fit(X_train, y_train)

n_features = X_train.shape[1]
# Pass the gradient boosting model fitted above so the chart reflects this model,
# not the decision tree from an earlier section.
plot_feature_importances(X_train, gb_optimized, n_features, FI_labels)

# Pickle model: the whole fitted grid search is persisted, not only the best estimator
filename = 'models/noSG/gb_model.pkl'
model = gb_clf
joblib.dump(model, filename)

### Support Vector Machine

In [None]:
# # Instantiate classifier and define model
# classifier = SVC(random_state=SEED)
# model_name = 'Support Vector Machine Model'

# # Create param grid for GridSearch
# param_grid = {
#     'C' : [0.1, 1, 10],
#     'kernel' : ['linear', 'rbf', 'sigmoid'],
#     'gamma' : [0.1, 1, 10]
# }

# # Instantiate GridSearch
# svm_clf = GridSearchCV(
#     classifier,
#     param_grid,
#     cv=cv,
#     scoring=scoring,
#     verbose=2,
#     n_jobs=-1
# )

# # Fit and make predictions
# svm_clf.fit(X_train, y_train)
# y_hat_pred = svm_clf.predict(X_train)
# y_pred = svm_clf.predict(X_test)

# # Print Best Params
# print('--'*27)
# print(f'{model_name} Best Params:')
# print('--'*27)
# best_params = svm_clf.best_params_
# keys = list(best_params.keys())
# for key in keys:
#     print(f"{key} : {best_params[key]}")

# # Print Precision Score
# p_hat = metrics.precision_score(y_train, y_hat_pred)
# p = metrics.precision_score(y_test, y_pred)
# print('--'*27)
# print(f'{model_name} Precision Scores:')
# print('--'*27)
# print(f' train: {p_hat : 0.2%}')
# print(f' test: {p : 0.2%}')

# # Print classification report
# print('--'*27)
# print(f'{model_name} Classification Report')
# print('--'*27)
# print(metrics.classification_report(y_test, y_pred, target_names=target_names))
# print('--'*27)

# # Print confusion matrix
# cm = metrics.confusion_matrix(y_test, y_pred)
# classes = target_names
# plot_confusion_matrix(cm, classes)

# # Print Precision-Recall Curve
# disp = metrics.plot_precision_recall_curve(svm_clf, X_test, y_test);

# # Pickle model
# filename = 'models/svm_model.pkl'
# model = svm_clf
# joblib.dump(model, filename)

### Results Summary

In [None]:
# Summarize test-set accuracy and precision (as percentages, 2 decimals)
# for every fitted grid search in this section.
classifiers = [dummy_clf, log_clf, knn_clf, dt_clf, rf_clf, ab_clf, gb_clf]
names = ['base', 'log_reg', 'knn', 'decTree', 'randomF', 'adaB', 'gradB']

results = []
# Pair each label with its model directly; only test-set predictions are
# needed for the reported metrics, so the training split is not re-scored.
for name, classifier in zip(names, classifiers):
    y_pred = classifier.predict(X_test)
    results.append({
        'model': name,
        'accuracy_mc': round(metrics.accuracy_score(y_test, y_pred)*100,2),
        'precision_mc': round(metrics.precision_score(y_test, y_pred)*100,2),
    })

# One row per model; later cells add further columns to this frame
results_df = pd.DataFrame(results)

print(results_df.shape)
results_df

## Top 10

### Define Key Variables

In [None]:
# Random state reused by the train/test split and the seeded estimators
SEED = 13

# Fraction of rows held out as the test split
test_size = 0.2

# Class names for the reports/plots and feature names for the importance charts
target_names = ['outside_top_10', 'inside_top_10']
FI_labels = df.drop(
    columns=['season', 'full_name', 'min', 'mean', 'made_cut', 'top_10', 'total_score']
).columns.values

# Number of folds used by every GridSearchCV in this section
cv = 3

# Metric each GridSearchCV optimizes
scoring = 'accuracy'

# Candidate class weightings searched by estimators that accept `class_weight`
class_weight = ['balanced'] + [
    {0: weight_0, 1: weight_1}
    for weight_0, weight_1 in ((0.1, 0.9), (0.3, 0.7), (0.5, 0.5))
]

### Train, Test, Split

In [None]:
# Target is the top-10 flag; features are every column not listed in the drop below
y = df['top_10']
X = df.drop(
    columns=['season', 'full_name', 'min', 'mean', 'made_cut', 'top_10', 'total_score']
)

# Seeded hold-out split so the evaluation is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=SEED,
)

# # Optional (disabled): random oversampling of the training split to handle class imbalance
# ros = ROS(random_state=SEED)
# X_train, y_train = ros.fit_resample(X_train, y_train)

### Standardization

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
std = StandardScaler().fit(X_train)

X_train = std.transform(X_train)
X_test = std.transform(X_test)

### Base Model

In [None]:
# Baseline (Top 10): grid-search the DummyClassifier strategies, report
# train/test metrics, plot the confusion matrix, then pickle the fitted search.

# Instantiate classifier and define model
classifier = DummyClassifier(random_state = SEED)
model_name = 'Dummy Classifier Model'

# Create param grid for GridSearch
# 'constant' is intentionally left out: that strategy requires a `constant`
# target value, so without one every fit for that candidate fails.
param_grid = {
    'strategy' : [
        'stratified', 
        'most_frequent',
        'prior',
        'uniform'
    ]
}

# Instantiate GridSearch
dummy_clf = GridSearchCV(
    classifier,
    param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1
)

# Fit and make predictions on both splits so train and test scores can be compared
dummy_clf.fit(X_train, y_train)
y_hat_pred = dummy_clf.predict(X_train)
y_pred = dummy_clf.predict(X_test)

# Print Best Params
print('--'*27)
print(f'{model_name} Best Params:')
print('--'*27)
best_params = dummy_clf.best_params_
keys = list(best_params.keys())
for key in keys:
    print(f" {key} : {best_params[key]}")

# Print Precision Score
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print('--'*27)
print(f'{model_name} Precision Scores:')
print('--'*27)
print(f' train: {p_hat : 0.2%}')
print(f' test: {p : 0.2%}')

# Print classification report
print('--'*27)
print(f'{model_name} Classification Report')
print('--'*27)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))
print('--'*27)

# Print confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
classes = target_names
plot_confusion_matrix(cm, classes)

# Pickle model: the whole fitted grid search is persisted
# NOTE(review): other sections of this notebook also write under models/noSG/ with
# matching file names; confirm this does not overwrite an earlier section's model.
filename = 'models/noSG/base_model.pkl'
model = dummy_clf
joblib.dump(model, filename)

### Logistic Regression

In [None]:
# Logistic Regression (Top 10): tune with GridSearchCV, report train/test
# metrics, plot the confusion matrix, then pickle the fitted search.

# Instantiate classifier and define model
classifier = LogisticRegression(random_state=SEED)
model_name = 'Logistic Regression Model'

# Create param grid for GridSearch
# The grid is split in two because the 'newton-cg' and 'lbfgs' solvers only
# support the L2 penalty; crossing them with 'l1' produces fits that always fail.
param_grid = [
    {
        'penalty' : ['l1'],
        'solver' : ['liblinear'],
        'C' : np.logspace(-10, 10, 5),
        'class_weight': class_weight
    },
    {
        'penalty' : ['l2'],
        'solver' : ['newton-cg', 'lbfgs', 'liblinear'],
        'C' : np.logspace(-10, 10, 5),
        'class_weight': class_weight
    }
]

# Instantiate GridSearch
log_clf = GridSearchCV(
    classifier,
    param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1
)

# Fit and make predictions on both splits so train and test scores can be compared
log_clf.fit(X_train, y_train)
y_hat_pred = log_clf.predict(X_train)
y_pred = log_clf.predict(X_test)

# Print Best Params
print('--'*27)
print(f'{model_name} Best Params:')
print('--'*27)
best_params = log_clf.best_params_
keys = list(best_params.keys())
for key in keys:
    print(f" {key} : {best_params[key]}")

# Print Precision Score
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print('--'*27)
print(f'{model_name} Precision Scores:')
print('--'*27)
print(f' train: {p_hat : 0.2%}')
print(f' test: {p : 0.2%}')

# Print classification report
print('--'*27)
print(f'{model_name} Classification Report')
print('--'*27)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))
print('--'*27)

# Print confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
classes = target_names
plot_confusion_matrix(cm, classes)

# Pickle model: the whole fitted grid search is persisted
# NOTE(review): other sections of this notebook also write under models/noSG/ with
# matching file names; confirm this does not overwrite an earlier section's model.
filename = 'models/noSG/log_model.pkl'
model = log_clf
joblib.dump(model, filename)

### KNearest Neighbors (KNN)

In [None]:
# K-Nearest Neighbors (Top 10): grid-search neighbor count and weighting,
# report train/test metrics, plot the confusion matrix, then pickle the search.
model_name = 'KNN Model'
classifier = KNeighborsClassifier()
divider = '--'*27

# Odd neighbor counts from 3 to 79, each with uniform and distance weighting
param_grid = dict(
    n_neighbors=range(3,80,2),
    weights=['uniform', 'distance'],
)

# Cross-validated search over the grid
knn_clf = GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1,
)

# Fit once, then predict on both splits
knn_clf.fit(X_train, y_train)
y_hat_pred = knn_clf.predict(X_train)
y_pred = knn_clf.predict(X_test)

# Winning hyperparameters
best_params = knn_clf.best_params_
keys = list(best_params)
print(divider, f'{model_name} Best Params:', divider, sep='\n')
for key in keys:
    print(f" {key} : {best_params[key]}")

# Precision on the training and test splits
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print(divider, f'{model_name} Precision Scores:', divider, sep='\n')
print(f' train: {p_hat : 0.2%}', f' test: {p : 0.2%}', sep='\n')

# Per-class report on the test split
print(divider, f'{model_name} Classification Report', divider, sep='\n')
print(metrics.classification_report(y_test, y_pred, target_names = target_names), divider, sep='\n')

# Confusion matrix on the test split
classes = target_names
cm = metrics.confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes)

# Persist the whole fitted grid search
# NOTE(review): other sections of this notebook also write under models/noSG/ with
# matching file names; confirm this does not overwrite an earlier section's model.
model = knn_clf
filename = 'models/noSG/knn_model.pkl'
joblib.dump(model, filename)

### Decision Tree

In [None]:
# Decision Tree (Top 10): tune with GridSearchCV, report train/test metrics,
# plot feature importances, then pickle the fitted search.

# Instantiate classifier and define model
classifier = DecisionTreeClassifier(random_state=SEED)
model_name = 'Decision Tree Model'

# Create param grid for GridSearch
param_grid = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : range(2, 10),
    'min_samples_split' : range(5, 50, 5),
    'min_samples_leaf' : range(5, 50, 5),
    'class_weight' : class_weight
}

# Instantiate GridSearch
dt_clf = GridSearchCV(
    classifier,
    param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1
)

# Fit and make predictions on both splits so train and test scores can be compared
dt_clf.fit(X_train, y_train)
y_hat_pred = dt_clf.predict(X_train)
y_pred = dt_clf.predict(X_test)

# Print Best Params
print('--'*27)
print(f'{model_name} Best Params:')
print('--'*27)
best_params = dt_clf.best_params_
keys = list(best_params.keys())
for key in keys:
    print(f" {key} : {best_params[key]}")

# Print Precision Score
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print('--'*27)
print(f'{model_name} Precision Scores:')
print('--'*27)
print(f' train: {p_hat : 0.2%}')
print(f' test: {p : 0.2%}')

# Print classification report
print('--'*27)
print(f'{model_name} Classification Report')
print('--'*27)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))
print('--'*27)

# # Print confusion matrix
# cm = metrics.confusion_matrix(y_test, y_pred)
# classes = target_names
# plot_confusion_matrix(cm, classes)

# Feature importance: refit a standalone tree with ALL tuned hyperparameters.
# class_weight is part of the search grid, so it must be passed here too;
# otherwise the plotted importances describe a different model than the one selected.
dt_optimized = DecisionTreeClassifier(
    criterion = best_params['criterion'],
    max_depth = best_params['max_depth'],
    min_samples_split = best_params['min_samples_split'],
    min_samples_leaf = best_params['min_samples_leaf'],
    class_weight = best_params['class_weight'],
    random_state = SEED
)

dt_optimized.fit(X_train, y_train)

n_features = X_train.shape[1]
plot_feature_importances(X_train, dt_optimized, n_features, FI_labels)

# Pickle model: the whole fitted grid search is persisted
# NOTE(review): the earlier (pre-"Top 10") Decision Tree cell writes to this same
# path, so running this cell replaces that file; confirm this is intended.
filename = 'models/noSG/dt_model.pkl'
model = dt_clf
joblib.dump(model, filename)

### Random Forest

In [None]:
# Random Forest (Top 10): tune with GridSearchCV, report train/test metrics, plot
# the confusion matrix and feature importances, then pickle the fitted search.

# Instantiate classifier and define model
classifier = RandomForestClassifier(random_state=SEED)
model_name = 'Random Forest Model'

# Create param grid for GridSearch
param_grid = {
    'criterion' : ['gini', 'entropy'],
    'n_estimators' : [100, 200],
    'max_depth' : [None, 2, 5, 10],
    'min_samples_split' : [2, 5],
    'min_samples_leaf' : [1, 5]
}

# Instantiate GridSearch
rf_clf = GridSearchCV(
    classifier,
    param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1
)

# Fit and make predictions on both splits so train and test scores can be compared
rf_clf.fit(X_train, y_train)
y_hat_pred = rf_clf.predict(X_train)
y_pred = rf_clf.predict(X_test)

# Print Best Params
print('--'*27)
print(f'{model_name} Best Params:')
print('--'*27)
best_params = rf_clf.best_params_
keys = list(best_params.keys())
for key in keys:
    print(f" {key} : {best_params[key]}")

# Print Precision Score
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print('--'*27)
print(f'{model_name} Precision Scores:')
print('--'*27)
print(f' train: {p_hat : 0.2%}')
print(f' test: {p : 0.2%}')

# Print classification report
print('--'*27)
print(f'{model_name} Classification Report')
print('--'*27)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))
print('--'*27)

# Print confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
classes = target_names
plot_confusion_matrix(cm, classes)

# Feature importance: refit a standalone forest with the winning hyperparameters
rf_optimized = RandomForestClassifier(
    criterion = best_params['criterion'],
    n_estimators = best_params['n_estimators'],
    max_depth = best_params['max_depth'],
    min_samples_split = best_params['min_samples_split'],
    min_samples_leaf = best_params['min_samples_leaf'],
    random_state = SEED
)

rf_optimized.fit(X_train, y_train)

n_features = X_train.shape[1]
# Pass the random forest fitted above so the chart reflects this model,
# not the decision tree from the previous section.
plot_feature_importances(X_train, rf_optimized, n_features, FI_labels)

# Pickle model: the whole fitted grid search is persisted
# NOTE(review): the earlier (pre-"Top 10") Random Forest cell writes to this same
# path, so running this cell replaces that file; confirm this is intended.
filename = 'models/noSG/rf_model.pkl'
model = rf_clf
joblib.dump(model, filename)

### AdaBoost

In [None]:
# AdaBoost (Top 10): tune with GridSearchCV, report train/test metrics, plot
# the confusion matrix and feature importances, then pickle the fitted search.

# Instantiate classifier and define model
classifier = AdaBoostClassifier(random_state=SEED)
model_name = 'AdaBoost Model'

# Create param grid for GridSearch
param_grid = {
    'n_estimators' : [100, 200, 500],
    'learning_rate' : [0.01, 0.1, 0.2, 0.5]
}

# Instantiate GridSearch
ab_clf = GridSearchCV(
    classifier,
    param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1
)

# Fit and make predictions on both splits so train and test scores can be compared
ab_clf.fit(X_train, y_train)
y_hat_pred = ab_clf.predict(X_train)
y_pred = ab_clf.predict(X_test)

# Print Best Params
print('--'*27)
print(f'{model_name} Best Params:')
print('--'*27)
best_params = ab_clf.best_params_
keys = list(best_params.keys())
for key in keys:
    print(f" {key} : {best_params[key]}")

# Print Precision Score
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print('--'*27)
print(f'{model_name} Precision Scores:')
print('--'*27)
print(f' train: {p_hat : 0.2%}')
print(f' test: {p : 0.2%}')

# Print classification report
print('--'*27)
print(f'{model_name} Classification Report')
print('--'*27)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))
print('--'*27)

# Print confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
classes = target_names
plot_confusion_matrix(cm, classes)

# # Print Precision-Recall Curve
# disp = metrics.plot_precision_recall_curve(ab_clf, X_test, y_test);

# Feature importance: refit a standalone AdaBoost model with the winning hyperparameters
ab_optimized = AdaBoostClassifier(
    n_estimators = best_params['n_estimators'],
    learning_rate = best_params['learning_rate'],
    random_state = SEED
)

ab_optimized.fit(X_train, y_train)

n_features = X_train.shape[1]
# Pass the AdaBoost model fitted above so the chart reflects this model,
# not the decision tree from an earlier section.
plot_feature_importances(X_train, ab_optimized, n_features, FI_labels)

# Pickle model: the whole fitted grid search is persisted
# NOTE(review): the earlier (pre-"Top 10") AdaBoost cell writes to this same
# path, so running this cell replaces that file; confirm this is intended.
filename = 'models/noSG/ab_model.pkl'
model = ab_clf
joblib.dump(model, filename)

### Gradient Boost

In [None]:
# Gradient Boosting (Top 10): tune with GridSearchCV, report train/test metrics, plot
# the confusion matrix and feature importances, then pickle the fitted search.

# Instantiate classifier and define model
classifier = GradientBoostingClassifier(random_state=SEED)
model_name = 'Gradient Boost Model'

# Create param grid for GridSearch
param_grid = {
    'n_estimators' : [100, 200, 500],
    'learning_rate' : [0.01, 0.1, 0.2, 0.5]
}

# Instantiate GridSearch
gb_clf = GridSearchCV(
    classifier,
    param_grid,
    cv=cv,
    scoring=scoring,
    verbose=2,
    n_jobs=-1
)

# Fit and make predictions on both splits so train and test scores can be compared
gb_clf.fit(X_train, y_train)
y_hat_pred = gb_clf.predict(X_train)
y_pred = gb_clf.predict(X_test)

# Print Best Params
print('--'*27)
print(f'{model_name} Best Params:')
print('--'*27)
best_params = gb_clf.best_params_
keys = list(best_params.keys())
for key in keys:
    print(f" {key} : {best_params[key]}")

# Print Precision Score
p_hat = metrics.precision_score(y_train, y_hat_pred)
p = metrics.precision_score(y_test, y_pred)
print('--'*27)
print(f'{model_name} Precision Scores:')
print('--'*27)
print(f' train: {p_hat : 0.2%}')
print(f' test: {p : 0.2%}')

# Print classification report
print('--'*27)
print(f'{model_name} Classification Report')
print('--'*27)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))
print('--'*27)

# Print confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
classes = target_names
plot_confusion_matrix(cm, classes)

# Feature importance: refit a standalone booster with the winning hyperparameters
gb_optimized = GradientBoostingClassifier(
    n_estimators = best_params['n_estimators'],
    learning_rate = best_params['learning_rate'],
    random_state = SEED
)

gb_optimized.fit(X_train, y_train)

n_features = X_train.shape[1]
# Pass the gradient boosting model fitted above so the chart reflects this model,
# not the decision tree from an earlier section.
plot_feature_importances(X_train, gb_optimized, n_features, FI_labels)

# Pickle model: the whole fitted grid search is persisted
# NOTE(review): the earlier (pre-"Top 10") Gradient Boost cell writes to this same
# path, so running this cell replaces that file; confirm this is intended.
filename = 'models/noSG/gb_model.pkl'
model = gb_clf
joblib.dump(model, filename)

### Support Vector Machines

### Results Summary

In [None]:
# Add Top 10 test-set accuracy and precision (as percentages, 2 decimals)
# to the summary table built in the earlier results cell.

# Rebuild the model list here: the `classifiers` list created in the earlier
# summary cell still holds the grid searches fitted for that section, because
# re-assigning dummy_clf, log_clf, ... in this section does not update a list
# that was built before the re-assignment.
classifiers = [dummy_clf, log_clf, knn_clf, dt_clf, rf_clf, ab_clf, gb_clf]
names = ['base', 'log_reg', 'knn', 'decTree', 'randomF', 'adaB', 'gradB']

results = []
for name, classifier in zip(names, classifiers):
    # Only test-set predictions feed the reported metrics
    y_pred = classifier.predict(X_test)
    results.append({
        'model': name,
        'accuracy': round(metrics.accuracy_score(y_test, y_pred)*100,2),
        'precision': round(metrics.precision_score(y_test, y_pred)*100,2),
    })

# Assign whole columns at once instead of writing through chained indexing
# (df[col].iloc[i] = ...), which may modify a temporary copy and not the frame.
# Values are matched to results_df rows by position, so the model order above
# must be the same order used when results_df was created.
results_df['accuracy_t10'] = [result['accuracy'] for result in results]
results_df['precision_t10'] = [result['precision'] for result in results]

results_df

# Predictions