In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the Game of Thrones age-at-death CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='datasets/game-of-thrones/GoT_age_at_death.csv',
)

In [4]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,Character,Age,Dead (1 = dead),Gender (1=male),Affiliation
0,Sandor Clegan,29,1,1,4
1,Benjen Stark,35,1,1,10
2,Syrio Forel,41,1,1,1
3,Tysha,29,0,0,4
4,Jeyne Pool,12,1,0,1


In [5]:
# Replace the verbose CSV headers with short lowercase names.
# The list is positional, so it must stay in the same order as the file's columns.
df.columns = [
    'character',
    'age',
    'dead',
    'gender',
    'affiliation',
]

In [6]:
# Show a single row to confirm the new column names took effect.
df.head(n=1)

Unnamed: 0,character,age,dead,gender,affiliation
0,Sandor Clegan,29,1,1,4


In [7]:
# The integer-coded label columns are categories, not quantities:
# convert each one to pandas' 'category' dtype.
for col in ('dead', 'gender', 'affiliation'):
    df[col] = df[col].astype('category')

In [8]:
# Mean age computed two ways: the pandas Series method and the NumPy function.

pandas_mean = df['age'].mean()
numpy_mean = np.mean(df['age'])

# Print both values next to each other for comparison.
print('pandas_mean', pandas_mean)
print('numpy_mean', numpy_mean)

pandas_mean 35.59891598915989
numpy_mean 35.59891598915989


In [9]:
# A plain Python list has no .mean() method, so x.mean() would fail;
# np.mean() accepts array-likes such as lists and works here.
x = list(range(1, 6))

np.mean(x)

3.0

In [10]:
# Standard deviation of age, computed under both conventions.

# Population standard deviation: ddof=0 (NumPy's default) divides by N.
pop_stdev = np.std(df['age'], ddof=0)

# Sample standard deviation: ddof=1 divides by N - 1.
sampl_stdev = np.std(df['age'], ddof=1)

print('pop_stdev', pop_stdev)
print('sampl_stdev', sampl_stdev)

pop_stdev 18.99184246263994
sampl_stdev 19.01762909021605


In [11]:
# calculate the median

# np.median returns the middle value (50th percentile) of the ages.
# This cell previously called np.mean and so just repeated the mean.
np.median(df.age)

35.0

In [12]:
# Summary statistics for age: count, mean, std, min, quartiles (25/50/75%) and max.

df['age'].describe()

count    369.000000
mean      35.598916
std       19.017629
min        0.000000
25%       20.000000
50%       35.000000
75%       47.000000
max      102.000000
Name: age, dtype: float64

In [13]:
# 65th percentile of age.
np.percentile(df['age'], q=65)

41.0

65% of the ages are at or below 41, while the remaining 35% are above 41.

In [14]:
# Largest and smallest age in the data set.

max_age = np.max(df['age'])
min_age = np.min(df['age'])

print('max_age', max_age)
print('min_age', min_age)

max_age 102
min_age 0


### Bivariate Analysis

In [15]:
# Load the stature / hand length / foot length measurements.
df2 = pd.read_csv(
    filepath_or_buffer='datasets/stature-hand-foot.csv',
)

In [16]:
# Preview the first five rows of the measurements.
df2.head(n=5)

Unnamed: 0,gender,height,hand length,foot length
0,1,1760.2,208.6,269.6
1,1,1730.1,207.6,251.3
2,1,1659.6,173.2,193.6
3,1,1751.3,258.0,223.8
4,1,1780.6,212.3,282.1


In [17]:
# Use names without spaces so the columns can be reached as attributes
# (e.g. df2.hand_length). The first column keeps the name 'gender'.
df2.columns = [
    'gender',
    'height',
    'hand_length',
    'foot_length',
]
# gender is an integer-coded label, so store it as a categorical.
df2['gender'] = df2['gender'].astype('category')

In [18]:
# Confirm the renamed columns.
df2.head(n=5)

Unnamed: 0,gender,height,hand_length,foot_length
0,1,1760.2,208.6,269.6
1,1,1730.1,207.6,251.3
2,1,1659.6,173.2,193.6
3,1,1751.3,258.0,223.8
4,1,1780.6,212.3,282.1


In [19]:
# Covariance between hand length and foot length.

df2['hand_length'].cov(df2['foot_length'])

195.07014411395065

In [20]:
# Correlation coefficient between hand length and foot length.

df2['hand_length'].corr(df2['foot_length'])

0.78822430812387168

### More Practice

In [22]:
# Back to the Game of Thrones data: preview the first five rows.
df.head(n=5)

Unnamed: 0,character,age,dead,gender,affiliation
0,Sandor Clegan,29,1,1,4
1,Benjen Stark,35,1,1,10
2,Syrio Forel,41,1,1,1
3,Tysha,29,0,0,4
4,Jeyne Pool,12,1,0,1


In [25]:
# get mean age for each gender

# 'gender' is a categorical column, so pass observed=True explicitly:
# it reports only the categories present in the data and avoids the
# FutureWarning that recent pandas emits when `observed` is left unset.
df.groupby(['gender'], observed=True)['age'].mean()

gender
0    29.467391
1    37.635379
Name: age, dtype: float64

In [26]:
# get mean age for each gender - affiliation combination

# Both keys are categorical. With observed=False pandas returns every
# gender x affiliation pair, including pairs that never occur (as NaN rows).
# observed=True keeps only the combinations that actually appear in the data.
df.groupby(['gender', 'affiliation'], observed=True)['age'].mean()

gender  affiliation
0       0              29.000000
        1              33.000000
        2              25.300000
        3              31.750000
        4              28.714286
        5              24.166667
        6              20.600000
        9              19.000000
        12             30.333333
        13             75.000000
        14             30.666667
        15             68.000000
        16             35.666667
1       0              33.375000
        1              35.588235
        2              35.437500
        3              38.279070
        4              41.645161
        5              31.166667
        6              40.933333
        8              59.000000
        9              30.500000
        10             35.939394
        11             36.666667
        12             36.222222
        13             44.894737
        14             33.142857
        15             44.777778
        16             33.857143
Name: age, dtype: float64

In [27]:
# List the column labels of df.
df.columns

Index(['character', 'age', 'dead', 'gender', 'affiliation'], dtype='object')

In [31]:
# Keep only the rows whose 'dead' flag is 0 (the source column was
# labelled "Dead (1 = dead)", so 0 marks characters who are not dead).
not_dead = df['dead'] == 0
alive_chars = df[not_dead]
alive_chars.head()

Unnamed: 0,character,age,dead,gender,affiliation
3,Tysha,29,0,0,4
7,Jhiqui,17,0,0,3
12,Craster's Younger Wife,23,0,0,12
13,Palla,18,0,0,1
16,Hallis Mollen,40,0,1,1


In [32]:
# Mean age across all characters.
np.mean(df['age'])

35.59891598915989

In [33]:
# Mean age of the characters who are still alive.
np.mean(alive_chars.age)

33.00568181818182

In [34]:
# Population standard deviation of age (ddof=0 is NumPy's default).
np.std(df['age'], ddof=0)

18.99184246263994

In [35]:
# Sample standard deviation of age (divides by N - 1).
np.std(df['age'], ddof=1)

19.01762909021605

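ddof = delta degrees of freedom; the divisor used is N - ddof. Use ddof = 1 (divide by n - 1) for a sample standard deviation. The default, ddof = 0 (divide by N), gives the population standard deviation.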

Symmetry of distribution: when the mean is greater than the median, the data are skewed to the right. When the median is greater than the mean, the data are skewed to the left. When the mean and median are the same, the data are approximately symmetrical. In a right-skewed distribution, most data points lie below the mean, but a few high values pull the mean higher.

In [37]:
# Median age, to compare against the mean when judging skew.
np.median(df['age'])

35.0

In [38]:
# Mean age, to compare against the median when judging skew.
np.mean(df['age'])

35.59891598915989

In [39]:
# 65th percentile of age.
np.percentile(df['age'], q=65)

41.0

In [40]:
# Summary statistics for the ages of characters who are still alive.
alive_chars['age'].describe()

count    176.000000
mean      33.005682
std       17.829830
min        1.000000
25%       18.000000
50%       32.000000
75%       44.250000
max       92.000000
Name: age, dtype: float64

In [41]:
# Interquartile range: the 75th percentile of age minus the 25th percentile.
q1, q3 = np.percentile(df['age'], [25, 75])
print('IQR: ', q3 - q1)

IQR:  27.0


In [42]:
# Back to the body-measurement data: preview the first five rows.
df2.head(n=5)

Unnamed: 0,gender,height,hand_length,foot_length
0,1,1760.2,208.6,269.6
1,1,1730.1,207.6,251.3
2,1,1659.6,173.2,193.6
3,1,1751.3,258.0,223.8
4,1,1780.6,212.3,282.1


In [43]:
# Split the measurements into two frames by gender code.
# NOTE(review): the meaning of codes 1 and 2 is not documented in this
# notebook, and the gender == 1 rows shown by df2.head() have heights of
# roughly 1.7 m. Confirm against the dataset's codebook that 1 is female
# and 2 is male before relying on these labels.
is_code_1 = df2['gender'] == 1
is_code_2 = df2['gender'] == 2
female = df2[is_code_1]
male = df2[is_code_2]

In [44]:
# Correlation matrix of the three measurements. The numeric columns are
# selected explicitly because 'gender' is categorical and pandas >= 2.0
# no longer drops non-numeric columns from corr() automatically.
df2[['height', 'hand_length', 'foot_length']].corr()

Unnamed: 0,height,hand_length,foot_length
height,1.0,0.873295,0.88128
hand_length,0.873295,1.0,0.788224
foot_length,0.88128,0.788224,1.0


In [48]:
# Correlation matrix for the `male` subset. The numeric columns are selected
# explicitly because 'gender' is categorical and pandas >= 2.0 no longer
# drops non-numeric columns from corr() automatically.
male[['height', 'hand_length', 'foot_length']].corr()

Unnamed: 0,height,hand_length,foot_length
height,1.0,0.709244,0.6991
hand_length,0.709244,1.0,0.609538
foot_length,0.6991,0.609538,1.0


In [49]:
# Covariance matrix for the `male` subset. The numeric columns are selected
# explicitly because 'gender' is categorical and pandas >= 2.0 no longer
# drops non-numeric columns from cov() automatically.
male[['height', 'hand_length', 'foot_length']].cov()

Unnamed: 0,height,hand_length,foot_length
height,2424.11756,325.975892,416.985944
hand_length,325.975892,87.141622,68.931676
foot_length,416.985944,68.931676,146.760912


In [46]:
# Correlation matrix for the `female` subset. The numeric columns are selected
# explicitly because 'gender' is categorical and pandas >= 2.0 no longer
# drops non-numeric columns from corr() automatically.
female[['height', 'hand_length', 'foot_length']].corr()

Unnamed: 0,height,hand_length,foot_length
height,1.0,0.722356,0.715975
hand_length,0.722356,1.0,0.473
foot_length,0.715975,0.473,1.0


In [47]:
# Covariance matrix for the `female` subset. The numeric columns are selected
# explicitly because 'gender' is categorical and pandas >= 2.0 no longer
# drops non-numeric columns from cov() automatically.
female[['height', 'hand_length', 'foot_length']].cov()

Unnamed: 0,height,hand_length,foot_length
height,3754.465848,407.545449,545.01288
hand_length,407.545449,84.781441,54.106147
foot_length,545.01288,54.106147,154.336922
