In [1]:
import warnings
# NOTE(review): this silences *every* warning category for the whole kernel
# session, including pandas/scikit-learn deprecation and convergence warnings.
# Consider narrowing it (e.g. by category or module) once the noisy source is known.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import sport_helpers
from sport_helpers import create_top_sport
from sport_helpers import create_enc

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import tree

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Build the working DataFrame from the project helper in `sport_helpers`.
# The preview further down shows one row per athlete with nationality, sex,
# height, weight, sport, medal counts and age.
# NOTE(review): how the helper loads/filters the data is not visible here —
# confirm in sport_helpers.create_top_sport.
df = create_top_sport()

In [5]:
# Preview the first rows to sanity-check the columns.
# NOTE(review): the output contains an 'Unnamed: 0' column, which looks like a
# saved index that was read back in as data — TODO confirm and drop it in the helper.
df.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,age
0,736041664,A Jesus Garcia,ESP,male,1969-10-17,1.72,64.0,athletics,0,0,0,47
1,532037425,A Lam Shin,KOR,female,1986-09-23,1.68,56.0,fencing,0,0,0,30
2,435962603,Aaron Brown,CAN,male,1992-05-27,1.98,79.0,athletics,0,0,1,24
3,521041435,Aaron Cook,MDA,male,1991-01-02,1.83,80.0,taekwondo,0,0,0,25
4,33922579,Aaron Gate,NZL,male,1990-11-26,1.81,71.0,cycling,0,0,0,26


In [7]:
# (rows, columns) of the working frame.
df.shape

(10858, 12)

In [10]:
# Athletes per sport: count of non-null `id` values within each sport group.
(
    df.groupby(by='sport')['id']
    .count()
)

sport
aquatics             1396
archery               126
athletics            2187
badminton             164
basketball            284
canoe                 327
cycling               502
equestrian            215
fencing               245
football              583
golf                  115
gymnastics            319
handball              349
hockey                432
judo                  377
modern pentathlon      72
rowing                535
rugby sevens          296
sailing               372
shooting              380
table tennis          168
taekwondo             125
tennis                191
triathlon             109
volleyball            382
weightlifting         258
wrestling             349
Name: id, dtype: int64

In [9]:
# Number of distinct sports present in the data.
df['sport'].nunique()

27

### We have 27 different sports listed.  All but one sport (modern pentathlon, with 72) have more than 100 athletes.  

### Let's look at a few hypotheses:

### H1: The mean height and weight for basketball players will be greater than for gymnastics or equestrian.

### H2: The mean age for gymnastics will be much lower than the overall mean.

### H3: Archery, sailing, or golf may have the highest average age.

In [21]:
# Mean height per sport, ordered from shortest to tallest (used to check H1).
(
    df.groupby(by='sport')['height']
    .mean()
    .sort_values()
)

sport
gymnastics           1.638119
weightlifting        1.666628
table tennis         1.721845
wrestling            1.722951
shooting             1.728921
triathlon            1.732110
hockey               1.733981
judo                 1.734085
archery              1.735952
badminton            1.745610
golf                 1.745739
football             1.746003
cycling              1.747151
equestrian           1.749349
athletics            1.752039
rugby sevens         1.755270
sailing              1.761478
modern pentathlon    1.762361
fencing              1.772449
canoe                1.781101
aquatics             1.788338
taekwondo            1.797200
tennis               1.802251
handball             1.840401
rowing               1.849477
volleyball           1.896702
basketball           1.919120
Name: height, dtype: float64

In [22]:
# Mean weight per sport, ordered from lightest to heaviest (used to check H1).
(
    df.groupby(by='sport')['weight']
    .mean()
    .sort_values()
)

sport
gymnastics           54.278997
triathlon            60.633028
table tennis         65.065476
modern pentathlon    65.958333
equestrian           67.493023
athletics            67.717421
cycling              67.820717
taekwondo            68.088000
football             68.433962
badminton            68.774390
hockey               68.900463
fencing              70.661224
sailing              71.169355
golf                 71.443478
archery              72.190476
aquatics             72.277221
tennis               73.162304
shooting             73.905263
judo                 76.960212
canoe                77.015291
wrestling            77.742120
rugby sevens         78.756757
rowing               79.938318
weightlifting        79.980620
volleyball           80.102094
handball             83.710602
basketball           87.750000
Name: weight, dtype: float64

In [23]:
# Mean age per sport, ordered from youngest to oldest (used to check H2 and H3).
(
    df.groupby(by='sport')['age']
    .mean()
    .sort_values()
)

sport
equestrian           20.990698
gymnastics           22.517241
aquatics             23.906160
football             23.943396
taekwondo            24.584000
weightlifting        25.201550
modern pentathlon    25.930556
archery              25.984127
shooting             26.036842
rugby sevens         26.398649
hockey               26.506944
judo                 26.543767
athletics            26.896662
canoe                26.941896
wrestling            26.957020
table tennis         27.065476
rowing               27.091589
cycling              27.260956
badminton            27.317073
sailing              27.709677
volleyball           27.842932
fencing              27.881633
basketball           28.031690
triathlon            28.055046
handball             28.246418
tennis               28.335079
golf                 30.052174
Name: age, dtype: float64

### Pretty good on the thoughts.  Gymna