In [10]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import nationality_helpers
from nationality_helpers import create_top_medalist

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import tree

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df = create_top_medalist()

In [4]:
df.shape

(1072, 13)

### To do:

### Split into male/female.  Then split athletics vs not athletics.  Athletics contains all of track.  This is a large range from distance runners to shot put throwers.  Create age column.

In [5]:
df.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,medal_or_nm,country_count
0,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1,1,69
1,769580282,Akeem Haynes,CAN,male,3/11/92,1.68,71.0,athletics,0,0,1,1,69
2,373002185,Allison Beveridge,CAN,female,6/1/93,1.69,62.0,cycling,0,0,1,1,69
3,686662012,Allysha Chapman,CAN,female,1/25/89,1.6,58.0,football,0,0,1,1,69
4,857846421,Andre de Grasse,CAN,male,11/10/94,1.76,70.0,athletics,0,1,2,3,69


### There are a lot of sports with small amounts of athletes.  Let's narrow it down to sports with more than 30 athletes.

In [7]:
df.groupby('sport')['country_count'].agg('count')

sport
aquatics             221
archery               11
athletics             83
badminton              8
basketball            48
canoe                 27
cycling               46
equestrian            39
fencing               40
football              54
golf                   3
gymnastics            50
handball              60
hockey                49
judo                  28
modern pentathlon      3
rowing                79
rugby sevens          36
sailing               20
shooting              28
table tennis          18
taekwondo              9
tennis                 9
triathlon              4
volleyball            65
weightlifting          9
wrestling             25
Name: country_count, dtype: int64

In [15]:
athlete_count = pd.DataFrame(df.groupby('sport')['name'].agg('count'))
athlete_count.columns = ['athlete_count']

In [17]:
athlete_count.head(3)

Unnamed: 0_level_0,athlete_count
sport,Unnamed: 1_level_1
aquatics,221
archery,11
athletics,83


In [18]:
df = df.merge(athlete_count, on='sport')

In [20]:
df = df[df.athlete_count > 30]

In [21]:
df.describe()

Unnamed: 0,id,height,weight,gold,silver,bronze,medal_or_nm,country_count,athlete_count
count,870.0,870.0,870.0,870.0,870.0,870.0,870.0,870.0,870.0
mean,505131400.0,1.798437,74.056322,0.402299,0.351724,0.37931,1.133333,138.274713,99.367816
std,288454600.0,0.122346,15.586731,0.58677,0.512639,0.508645,0.477977,69.773523,72.255941
min,4071248.0,1.4,33.0,0.0,0.0,0.0,1.0,53.0,36.0
25%,256745200.0,1.71,63.0,0.0,0.0,0.0,1.0,82.0,49.0
50%,501741400.0,1.8,72.0,0.0,0.0,0.0,1.0,109.0,65.0
75%,760133000.0,1.89,83.75,1.0,1.0,1.0,1.0,159.0,221.0
max,999374100.0,2.21,136.0,5.0,2.0,2.0,6.0,258.0,221.0


### Checking for data integrity issues.  Height and weight seem feasible.

In [11]:
df.dob = pd.to_datetime(df.dob)

In [22]:
df.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,medal_or_nm,country_count,athlete_count
0,435962603,Aaron Brown,CAN,male,1992-05-27,1.98,79.0,athletics,0,0,1,1,69,83
1,769580282,Akeem Haynes,CAN,male,1992-03-11,1.68,71.0,athletics,0,0,1,1,69,83
2,857846421,Andre de Grasse,CAN,male,1994-11-10,1.76,70.0,athletics,0,1,2,3,69,83
3,321655820,Brendon Rodney,CAN,male,1992-04-09,1.95,80.0,athletics,0,0,1,1,69,83
4,542571086,Brianne Theisen Eaton,CAN,female,1988-12-18,1.75,64.0,athletics,0,0,1,1,69,83


In [27]:
categorical_features = df.dtypes==object

In [28]:
categorical_cols = df.columns[categorical_features].tolist()

In [29]:
le = LabelEncoder()

In [32]:
df[categorical_cols] = df[categorical_cols].apply(lambda col: le.fit_transform(col))

In [33]:
df.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,medal_or_nm,country_count,athlete_count
0,435962603,0,1,1,1992-05-27,1.98,79.0,1,0,0,1,1,69,83
1,769580282,8,1,1,1992-03-11,1.68,71.0,1,0,0,1,1,69,83
2,857846421,50,1,1,1994-11-10,1.76,70.0,1,0,1,2,3,69,83
3,321655820,101,1,1,1992-04-09,1.95,80.0,1,0,0,1,1,69,83
4,542571086,103,1,0,1988-12-18,1.75,64.0,1,0,0,1,1,69,83


In [34]:
df['age']] = 2016-df['dob'].dt.year

KeyError: "['age'] not in index"

In [None]:
# train, test = train_test_split(df, test_size=.3, random_state=123, stratify=df[['e_or_p']])