In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

import itertools
from sklearn import metrics
from sklearn.metrics import pairwise_distances

from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import SpectralClustering

# Background

There is a lot of information on runners and their performance for the Boston Marathon. Pick a year (post-2012 has more info) and do some clustering.

Specifically, use the tools at hand to determine which clustering solution, including number of clusters and algorithm used, is best for the marathon data. Once you have a solution you like, write a data story, including visualizations, where you teach the reader something about the Boston Marathon based on your clusters. Write up your report, including your process from start to finish, in a Jupyter notebook and submit it below.

# Data Cleaning

In [2]:
# Load the race results from the CSV file into a DataFrame.
results_2013 = pd.read_csv('boston_results.csv')

# Preview the first five rows to see what the data looks like.
results_2013.head(n=5)

Unnamed: 0,25k,age,name,division,10k,gender,half,official,bib,ctz,...,overall,pace,state,30k,5k,genderdiv,20k,35k,city,40k
0,49.87,28,"Cassidy, Josh R.",9,18.18,M,40.93,90.9,W1,,...,9,3.47,ON,62.07,8.9,9,38.8,74.73,Toronto,85.55
1,77.27,30,"Korir, Wesley",5,30.9,M,64.9,132.5,1,,...,5,5.07,,92.97,15.9,5,61.52,108.78,Kenya,124.77
2,77.23,23,"Desisa, Lelisa",1,30.9,M,64.92,130.37,2,,...,1,4.98,,92.72,15.93,1,61.53,108.68,Ambo,123.78
3,50.5,32,"Fearnley, Kurt H.",5,18.73,M,42.0,88.43,W2,,...,5,3.38,,61.35,8.98,5,39.88,73.0,Hamilton,83.43
4,48.75,39,"Hokinoue, Kota",3,18.18,M,40.57,87.22,W3,,...,3,3.33,,59.92,8.92,3,38.55,71.68,Iizuka,81.88


In [3]:
# Inspect the dtype of every column; the listing also shows how many columns there are.
results_2013.dtypes

25k           object
age            int64
name          object
division       int64
10k           object
gender        object
half          object
official     float64
bib           object
ctz           object
country       object
overall        int64
pace         float64
state         object
30k           object
5k            object
genderdiv      int64
20k           object
35k           object
city          object
40k           object
dtype: object

In [4]:
# Summarize the frame: non-null count and dtype per column, to spot missing data.
results_2013.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16164 entries, 0 to 16163
Data columns (total 21 columns):
25k          16164 non-null object
age          16164 non-null int64
name         16164 non-null object
division     16164 non-null int64
10k          16164 non-null object
gender       16164 non-null object
half         16164 non-null object
official     16164 non-null float64
bib          16164 non-null object
ctz          757 non-null object
country      16164 non-null object
overall      16164 non-null int64
pace         16164 non-null float64
state        14701 non-null object
30k          16164 non-null object
5k           16164 non-null object
genderdiv    16164 non-null int64
20k          16164 non-null object
35k          16164 non-null object
city         16163 non-null object
40k          16164 non-null object
dtypes: float64(2), int64(4), object(15)
memory usage: 2.6+ MB


In [5]:
# 'ctz' has only 757 non-null values out of 16164 rows, so it is mostly missing.
# 'name', 'country', 'bib', 'city' and 'state' are string columns; drop them too.
results_2013 = results_2013.drop(
    columns=['ctz', 'name', 'country', 'bib', 'city', 'state']
)

In [6]:
# Preview the frame again now that the string columns have been dropped.
results_2013.head()

Unnamed: 0,25k,age,division,10k,gender,half,official,overall,pace,30k,5k,genderdiv,20k,35k,40k
0,49.87,28,9,18.18,M,40.93,90.9,9,3.47,62.07,8.9,9,38.8,74.73,85.55
1,77.27,30,5,30.9,M,64.9,132.5,5,5.07,92.97,15.9,5,61.52,108.78,124.77
2,77.23,23,1,30.9,M,64.92,130.37,1,4.98,92.72,15.93,1,61.53,108.68,123.78
3,50.5,32,5,18.73,M,42.0,88.43,5,3.38,61.35,8.98,5,39.88,73.0,83.43
4,48.75,39,3,18.18,M,40.57,87.22,3,3.33,59.92,8.92,3,38.55,71.68,81.88


In [7]:
# Encode gender as a binary indicator: 'F' -> 0, any other value -> 1.
# Compare by value (==), not identity (`is`): identity against a string literal
# only works by accident of CPython string caching and triggers a SyntaxWarning
# on Python 3.8+.
# NOTE: this cell overwrites the column in place, so re-running it without
# reloading the data would map every (already numeric) value to 1.
results_2013['gender'] = results_2013.gender.map(lambda x: 0 if x == 'F' else 1)

In [8]:
# Convert the split-time columns from strings (object dtype) to numbers.
cols = ['25k', 'half', '30k','10k', '20k', '35k', '40k', '5k']

# errors='coerce' turns any value that cannot be parsed into NaN, so those rows
# can be removed afterwards with dropna().
# Apply column-wise (the default axis): one pd.to_numeric call per column rather
# than one per row, which gives the same values with far fewer calls.
results_2013[cols] = results_2013[cols].apply(pd.to_numeric, errors='coerce')

In [10]:
# Remove every row that contains at least one missing value.
results_2013 = results_2013.dropna(axis=0, how='any')

In [11]:
# Check how much data survived the cleaning: 16054 of the original 16164 rows
# remain, so only a small fraction was lost.
results_2013.shape

(16054, 15)

# Exploring Clusters