You now have a pretty varied suite of clustering and clustering evaluation methods; we'd be remiss if we didn't give you the opportunity to try them out on some real data. So here we go!

There is a lot of information on runners and their performance for the Boston Marathon. Pick a year (post-2012 has more info) and do some clustering.

Specifically, use the tools at hand to determine which clustering solution, including number of clusters and algorithm used, is best for the marathon data. Once you have a solution you like, write a data story, including visualizations, where you teach the reader something about the Boston Marathon based on your clusters. Write up your report, including your process from start to finish, in a Jupyter notebook and submit it below.

In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

%matplotlib inline

In [2]:
# Load the race results; the path is relative to the current working directory.
df=pd.read_csv('results.csv')

In [3]:
from IPython.display import display

# Display settings: do not wrap wide frames across lines, and show every column.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', None)

# Put the columns in alphabetical order; later cells slice columns by position.
alphabetical = sorted(df.columns)
df = df.reindex(columns=alphabetical)
df.head()

Unnamed: 0,10k,20k,25k,30k,35k,40k,5k,age,bib,city,country,ctz,division,gender,genderdiv,half,name,official,overall,pace,state
0,17.37,37.65,47.67,59.18,71.4,80.43,8.02,47,W1,Fukuoka,JPN,,8,M,8,39.72,"Yamamoto, Hiroyuki",85.25,8,3.27,
1,32.58,65.83,82.43,99.33,116.37,132.1,16.22,33,F1,Eldoret,KEN,,1,F,1,69.47,"Jeptoo, Rita",138.95,21,5.3,
2,16.62,36.1,45.8,56.45,67.42,76.1,7.75,41,W2,Paarl,RSA,,1,M,1,38.03,"Van Dyk, Ernst F.",80.6,1,3.08,
3,32.57,65.83,82.43,99.33,116.37,132.95,16.2,24,F2,Shoa,ETH,,3,F,3,69.47,"Dibaba, Mare",140.58,27,5.37,
4,17.12,36.58,46.37,57.03,67.83,76.72,8.02,40,W3,Nogata Fukuoka,JPN,,2,M,2,38.6,"Hokinoue, Kota",81.23,2,3.1,


In [4]:
# Inspect dtypes to see which columns were parsed as strings (object) rather than numbers.
df.dtypes

10k           object
20k           object
25k           object
30k           object
35k           object
40k           object
5k            object
age            int64
bib           object
city          object
country       object
ctz           object
division       int64
gender        object
genderdiv      int64
half          object
name          object
official     float64
overall        int64
pace         float64
state         object
dtype: object

In [5]:
# Select the seven split columns by name rather than by position, so the
# selection does not depend on the column order established in an earlier cell.
# They are listed in the same (alphabetical) order the positional slice produced.
X=df[['10k','20k','25k','30k','35k','40k','5k']]

In [6]:
# Row/column count before cleaning, for comparison once rows have been dropped.
df.shape

(31984, 21)

## Feature Selection

In [7]:
# Names of the selected feature columns, as a NumPy array.
col=X.columns.values
col

array(['10k', '20k', '25k', '30k', '35k', '40k', '5k'], dtype=object)

## Data Cleaning

In [8]:
# Data cleaning for the split columns:
#   1. drop every row where any split is the placeholder string '-'
#   2. convert the remaining split values from strings to floats
# One vectorised mask replaces the per-column in-place drop, so the frame is
# scanned once and `df` is rebound to a fresh copy instead of mutated in place.
col = X.columns.values
has_missing_split = df[col].eq('-').any(axis=1)
df = df.loc[~has_missing_split].copy()
df[col] = df[col].astype(float)

In [9]:
# Row count after dropping the rows with a missing split time.
df.shape

(31687, 21)

In [10]:
# Confirm the cleaned split columns are now numeric (float) rather than object.
df.dtypes

10k          float64
20k          float64
25k          float64
30k          float64
35k          float64
40k          float64
5k           float64
age            int64
bib           object
city          object
country       object
ctz           object
division       int64
gender        object
genderdiv      int64
half          object
name          object
official     float64
overall        int64
pace         float64
state         object
dtype: object

In [11]:
# Preview the cleaned frame.
df.head()

Unnamed: 0,10k,20k,25k,30k,35k,40k,5k,age,bib,city,country,ctz,division,gender,genderdiv,half,name,official,overall,pace,state
0,17.37,37.65,47.67,59.18,71.4,80.43,8.02,47,W1,Fukuoka,JPN,,8,M,8,39.72,"Yamamoto, Hiroyuki",85.25,8,3.27,
1,32.58,65.83,82.43,99.33,116.37,132.1,16.22,33,F1,Eldoret,KEN,,1,F,1,69.47,"Jeptoo, Rita",138.95,21,5.3,
2,16.62,36.1,45.8,56.45,67.42,76.1,7.75,41,W2,Paarl,RSA,,1,M,1,38.03,"Van Dyk, Ernst F.",80.6,1,3.08,
3,32.57,65.83,82.43,99.33,116.37,132.95,16.2,24,F2,Shoa,ETH,,3,F,3,69.47,"Dibaba, Mare",140.58,27,5.37,
4,17.12,36.58,46.37,57.03,67.83,76.72,8.02,40,W3,Nogata Fukuoka,JPN,,2,M,2,38.6,"Hokinoue, Kota",81.23,2,3.1,


## Shuffle

In [12]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed so the row order, and every chunk built
# from it further down, is identical on each Restart & Run All.
df = shuffle(df, random_state=42)

In [13]:
# Preview the shuffled rows; the original index labels are still attached at this point.
df.head()

Unnamed: 0,10k,20k,25k,30k,35k,40k,5k,age,bib,city,country,ctz,division,gender,genderdiv,half,name,official,overall,pace,state
2924,43.75,86.55,108.25,131.57,155.73,179.35,22.2,32,3335,Nevada,USA,,2338,M,3677,91.13,"Mcveigh, Sean P.",189.37,4012,7.23,IA
20796,56.82,114.17,143.98,183.83,222.2,267.95,28.03,50,23729,Sherman,USA,,1164,F,11123,120.43,"Lee, Teresa O",287.03,25735,10.95,CT
10845,46.7,94.53,119.3,146.03,176.02,204.15,23.25,31,12420,Philadelphia,USA,,2147,F,2998,99.85,"Shull, Stephanie A",216.05,11145,8.25,PA
27971,65.5,141.57,187.72,231.63,279.92,324.33,31.68,55,31480,Laguna Beach,USA,,1696,M,16830,149.88,"Maguire, Paul T.",342.83,30272,13.08,CA
17577,53.77,108.3,136.48,166.83,198.58,232.77,27.13,41,20096,Hudson,USA,,1401,F,7766,114.23,"Mutschler, Kelli",247.35,19548,9.45,WI


## Reset dataframe Index

In [14]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old labels
# instead of keeping them as a column.
df=df.reset_index(drop=True)

In [15]:
# Preview after the index reset.
df.head()

Unnamed: 0,10k,20k,25k,30k,35k,40k,5k,age,bib,city,country,ctz,division,gender,genderdiv,half,name,official,overall,pace,state
0,43.75,86.55,108.25,131.57,155.73,179.35,22.2,32,3335,Nevada,USA,,2338,M,3677,91.13,"Mcveigh, Sean P.",189.37,4012,7.23,IA
1,56.82,114.17,143.98,183.83,222.2,267.95,28.03,50,23729,Sherman,USA,,1164,F,11123,120.43,"Lee, Teresa O",287.03,25735,10.95,CT
2,46.7,94.53,119.3,146.03,176.02,204.15,23.25,31,12420,Philadelphia,USA,,2147,F,2998,99.85,"Shull, Stephanie A",216.05,11145,8.25,PA
3,65.5,141.57,187.72,231.63,279.92,324.33,31.68,55,31480,Laguna Beach,USA,,1696,M,16830,149.88,"Maguire, Paul T.",342.83,30272,13.08,CA
4,53.77,108.3,136.48,166.83,198.58,232.77,27.13,41,20096,Hudson,USA,,1401,F,7766,114.23,"Mutschler, Kelli",247.35,19548,9.45,WI


## Divide the dataset into 4 evenly sized subsets

In [16]:
# Trim the frame so its length is an exact multiple of 4, and keep only the
# first 8 columns (the seven split columns plus age).
n_total = len(df)
rows = n_total - (n_total % 4)
df = df.iloc[:rows, :8]
df.head()

Unnamed: 0,10k,20k,25k,30k,35k,40k,5k,age
0,43.75,86.55,108.25,131.57,155.73,179.35,22.2,32
1,56.82,114.17,143.98,183.83,222.2,267.95,28.03,50
2,46.7,94.53,119.3,146.03,176.02,204.15,23.25,31
3,65.5,141.57,187.72,231.63,279.92,324.33,31.68,55
4,53.77,108.3,136.48,166.83,198.58,232.77,27.13,41


In [17]:
# The row count should now be a multiple of 4.
df.shape

(31684, 8)

In [18]:
# Split into 4 equal, consecutive chunks using plain positional slicing.
# np.split(df, 4) is avoided because, on a DataFrame, it routes through
# DataFrame.swapaxes, which pandas has deprecated (2.1+).
# NOTE: `df` is rebound here from a DataFrame to a list of 4 DataFrames.
n_chunks = 4
if len(df) % n_chunks:
    # np.split raises on uneven input; keep that contract rather than silently
    # dropping the remainder.
    raise ValueError('array split does not result in an equal division')
chunk_len = len(df) // n_chunks
df = [df.iloc[i * chunk_len:(i + 1) * chunk_len] for i in range(n_chunks)]

In [19]:
# Display the first of the four chunks.
df[0]

Unnamed: 0,10k,20k,25k,30k,35k,40k,5k,age
0,43.75,86.55,108.25,131.57,155.73,179.35,22.20,32
1,56.82,114.17,143.98,183.83,222.20,267.95,28.03,50
2,46.70,94.53,119.30,146.03,176.02,204.15,23.25,31
3,65.50,141.57,187.72,231.63,279.92,324.33,31.68,55
4,53.77,108.30,136.48,166.83,198.58,232.77,27.13,41
5,56.33,115.68,147.05,182.57,224.45,258.13,27.83,67
6,65.10,134.23,173.07,217.02,257.70,294.52,31.82,39
7,42.32,84.68,106.10,128.52,152.23,176.58,21.28,46
8,49.90,100.95,127.62,155.80,185.12,212.58,24.90,41
9,40.78,84.15,106.90,131.90,160.25,194.30,20.30,34


## Feature Selection

In [20]:
# Show the seven split columns of the first chunk. No row bound is applied:
# `rows` is the pre-split total and exceeds the length of any single chunk,
# so slicing by it would select every row anyway.
df[0].iloc[:, :7]

Unnamed: 0,10k,20k,25k,30k,35k,40k,5k
0,43.75,86.55,108.25,131.57,155.73,179.35,22.20
1,56.82,114.17,143.98,183.83,222.20,267.95,28.03
2,46.70,94.53,119.30,146.03,176.02,204.15,23.25
3,65.50,141.57,187.72,231.63,279.92,324.33,31.68
4,53.77,108.30,136.48,166.83,198.58,232.77,27.13
5,56.33,115.68,147.05,182.57,224.45,258.13,27.83
6,65.10,134.23,173.07,217.02,257.70,294.52,31.82
7,42.32,84.68,106.10,128.52,152.23,176.58,21.28
8,49.90,100.95,127.62,155.80,185.12,212.58,24.90
9,40.78,84.15,106.90,131.90,160.25,194.30,20.30


In [21]:
# Per-chunk containers: X collects feature frames, y collects the matching age series.
X, y = [], []

In [22]:
# Append the first chunk's features (the seven split columns) and its ages
# (column 7). No row bound is applied: `rows` is the pre-split total and is
# larger than any single chunk, so it never limited the selection.
# NOTE: re-running this cell without first re-running the cell that resets
# X and y appends the same chunk again.
X.append(df[0].iloc[:, :7])
y.append(df[0].iloc[:, 7])

In [23]:
# Preview the feature frame stored for the first chunk.
X[0].head()

Unnamed: 0,10k,20k,25k,30k,35k,40k,5k
0,43.75,86.55,108.25,131.57,155.73,179.35,22.2
1,56.82,114.17,143.98,183.83,222.2,267.95,28.03
2,46.7,94.53,119.3,146.03,176.02,204.15,23.25
3,65.5,141.57,187.72,231.63,279.92,324.33,31.68
4,53.77,108.3,136.48,166.83,198.58,232.77,27.13


In [25]:
# Preview the age series stored for the first chunk.
y[0].head()

0    32
1    50
2    31
3    55
4    41
Name: age, dtype: int64