### Titanic dataset from seaborn

In [29]:
# Usual imports
import numpy as np
import pandas as pd
import seaborn as sns

In [30]:
# Load Titanic dataset from seaborn
df = sns.load_dataset('titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [31]:
# Check for NaN values
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [32]:
# Deck values have lots of NaN values 688/891 = 77.2%
# Drop deck dolumn, because it hard to impute
df = df.drop('deck', axis=1)
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


In [33]:
# Check again missing values
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
embark_town      2
alive            0
alone            0
dtype: int64

In [34]:
# Check age based values by cross table grouping
# We get mean age by sex, class and embark_town
df.groupby(['class', 'embark_town', 'sex']).mean('age')

  df.groupby(['class', 'embark_town', 'sex']).mean('age')


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
class,embark_town,sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
First,Cherbourg,female,0.976744,1.0,36.052632,0.511628,0.302326,115.640309,0.0,0.418605
First,Cherbourg,male,0.404762,1.0,40.111111,0.238095,0.333333,93.536707,1.0,0.547619
First,Queenstown,female,1.0,1.0,33.0,1.0,0.0,90.0,0.0,0.0
First,Queenstown,male,0.0,1.0,44.0,2.0,0.0,90.0,1.0,0.0
First,Southampton,female,0.958333,1.0,32.704545,0.604167,0.625,99.02691,0.0,0.291667
First,Southampton,male,0.35443,1.0,41.897188,0.329114,0.253165,52.949947,0.962025,0.658228
Second,Cherbourg,female,1.0,2.0,19.142857,0.714286,0.571429,25.268457,0.0,0.285714
Second,Cherbourg,male,0.2,2.0,25.9375,0.5,0.5,25.42125,0.9,0.4
Second,Queenstown,female,1.0,2.0,30.0,0.0,0.0,12.35,0.0,1.0
Second,Queenstown,male,0.0,2.0,57.0,0.0,0.0,12.35,1.0,1.0


In [35]:
# Create helper function to fill missing NaN values in age column
# Use age mean from previous cell block
def impute_age(row):
    pclass = row['pclass']
    age = row['age']
    town = row['embark_town']
    sex = row['sex']

    if pd.isnull(age):
        if pclass == 1:
            if town == 'Cherbourg':
                if sex == 'male':
                    return 40
                else:
                    return 36
            elif town == 'Queenstown':
                if sex == 'male':
                    return 44
                else:
                    return 33
            else:
                if sex == 'male':
                    return 41
                else:
                    return 32
        elif pclass == 2:
            if town == 'Cherbourg':
                if sex == 'male':
                    return 26
                else:
                    return 19
            elif town == 'Queenstown':
                if sex == 'male':
                    return 57
                else:
                    return 30
            else:
                if sex == 'male':
                    return 30
                else:
                    return 30
        else:
            if town == 'Cherbourg':
                if sex == 'male':
                    return 25
                else:
                    return 14
            elif town == 'Queenstown':
                if sex == 'male':
                    return 28
                else:
                    return 23
            else:
                if sex == 'male':
                    return 26
                else:
                    return 23
                
    else:
        return age


In [36]:
# Check NaN values in age column
df.tail(15)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
876,0,3,male,20.0,0,0,9.8458,S,Third,man,True,Southampton,no,True
877,0,3,male,19.0,0,0,7.8958,S,Third,man,True,Southampton,no,True
878,0,3,male,,0,0,7.8958,S,Third,man,True,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,Cherbourg,yes,False
880,1,2,female,25.0,0,1,26.0,S,Second,woman,False,Southampton,yes,False
881,0,3,male,33.0,0,0,7.8958,S,Third,man,True,Southampton,no,True
882,0,3,female,22.0,0,0,10.5167,S,Third,woman,False,Southampton,no,True
883,0,2,male,28.0,0,0,10.5,S,Second,man,True,Southampton,no,True
884,0,3,male,25.0,0,0,7.05,S,Third,man,True,Southampton,no,True
885,0,3,female,39.0,0,5,29.125,Q,Third,woman,False,Queenstown,no,False


In [37]:
# Apply helper function to age column
df['age'] = df.apply(impute_age, axis=1)

In [38]:
# Check if NaN values are replaced
df.tail(15)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
876,0,3,male,20.0,0,0,9.8458,S,Third,man,True,Southampton,no,True
877,0,3,male,19.0,0,0,7.8958,S,Third,man,True,Southampton,no,True
878,0,3,male,26.0,0,0,7.8958,S,Third,man,True,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,Cherbourg,yes,False
880,1,2,female,25.0,0,1,26.0,S,Second,woman,False,Southampton,yes,False
881,0,3,male,33.0,0,0,7.8958,S,Third,man,True,Southampton,no,True
882,0,3,female,22.0,0,0,10.5167,S,Third,woman,False,Southampton,no,True
883,0,2,male,28.0,0,0,10.5,S,Second,man,True,Southampton,no,True
884,0,3,male,25.0,0,0,7.05,S,Third,man,True,Southampton,no,True
885,0,3,female,39.0,0,5,29.125,Q,Third,woman,False,Queenstown,no,False


In [39]:
# Check what NaN values still exists
df.isna().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       2
class          0
who            0
adult_male     0
embark_town    2
alive          0
alone          0
dtype: int64

In [40]:
# Drop rows with NaN values, since there is only few of them
df = df.dropna()

In [41]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,23.0,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


In [42]:
# Use factorize to automatically assing number for each category
values, names = pd.factorize(df['sex'], sort=False)

In [43]:
# Assing binary category to sex column 
df['sex'] = values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sex'] = values


In [44]:
# Check how well common statistical measurements relate to data
df.describe()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare
count,889.0,889.0,889.0,889.0,889.0,889.0,889.0
mean,0.382452,2.311586,0.350956,29.211665,0.524184,0.382452,32.096681
std,0.48626,0.8347,0.477538,13.271115,1.103705,0.806761,49.697504
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.8958
50%,0.0,3.0,0.0,27.0,0.0,0.0,14.4542
75%,1.0,3.0,1.0,36.0,1.0,0.0,31.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


In [45]:
# NaN values looking good
df.isna().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [46]:
# Create mapper for embark_town column to replace ordinal category
c_map = {'Cherbourg': 1, 'Queenstown': 2, 'Southampton': 3}
# Apply mapper for new column
df['Embark_town'] = df['embark_town'].replace(c_map)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Embark_town'] = df['embark_town'].replace(c_map)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone,Embark_town
0,0,3,0,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False,3
1,1,1,1,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False,1
2,1,3,1,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True,3
3,1,1,1,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False,3
4,0,3,0,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True,3
887,1,1,1,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True,3
888,0,3,1,23.0,1,2,23.4500,S,Third,woman,False,Southampton,no,False,3
889,1,1,0,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True,1


In [47]:
# Create helper function for age_groups
def age_group(row):
    age = row['age']

    if age < 30:
        return 1
    elif 30 <= age < 50:
        return 2
    else:
        return 3

In [48]:
# Apply helper function for new column
df['age_group'] = df.apply(age_group, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['age_group'] = df.apply(age_group, axis=1)


In [49]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone,Embark_town,age_group
0,0,3,0,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False,3,1
1,1,1,1,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False,1,2
2,1,3,1,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True,3,1
3,1,1,1,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False,3,2
4,0,3,0,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True,3,1
887,1,1,1,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True,3,1
888,0,3,1,23.0,1,2,23.4500,S,Third,woman,False,Southampton,no,False,3,1
889,1,1,0,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True,1,1


In [55]:
# Lets start to check for more deeper info about passengers who survived
df.groupby(['sex'])['survived'].sum().sort_values(ascending=False)

sex
1    231
0    109
Name: survived, dtype: int64

**By sex:**
- 68 %, (231) where female
- 32 %, (109) were male

In [67]:
# Lets start to check for more deeper info about passengers who survived
df.groupby(['embark_town'])['survived'].sum().sort_values(ascending=False)

embark_town
Southampton    217
Cherbourg       93
Queenstown      30
Name: survived, dtype: int64

**By embark town:**
- 64 % embarked from Southampton
- 27 % embarked from Cherbourg
- 9 % embarked from Queenstown

In [56]:
# By sex and age group
df.groupby(['sex', 'age_group'])['survived'].sum().sort_values(ascending=False)

sex  age_group
1    1            130
     2             82
0    1             61
     2             41
1    3             19
0    3              7
Name: survived, dtype: int64

**By sex and age group**
- 56 % females were under 30 years --> 130 person
    - 38 % of all survivors
- 55 % of males were under 30 years --> 61 person
    - 18 % of all survivors
- 92 % females were under 50 years --> 212 person
    - 62 % of all survivors

In [68]:
# By sex and embark town
df.groupby(['sex', 'embark_town'])['survived'].sum().sort_values(ascending=False)

sex  embark_town
1    Southampton    140
0    Southampton     77
1    Cherbourg       64
0    Cherbourg       29
1    Queenstown      27
0    Queenstown       3
Name: survived, dtype: int64

**By sex and embark town**
- 60 % of females embarked from Southampton
- 28 % of females embarked from Cherbourg
- 12 % of females embarked from Queenstown
- 70 % of males embarked from Southampton
- 27 % of males embarked from Cherbourg
- 3 % of males embarked from Queenstown

In [69]:
# By sex, embark town and age group
df.groupby(['sex', 'embark_town', 'age_group'])['survived'].sum().sort_values(ascending=False)

sex  embark_town  age_group
1    Southampton  1            72
                  2            56
0    Southampton  1            40
1    Cherbourg    1            34
0    Southampton  2            32
1    Queenstown   1            24
     Cherbourg    2            23
0    Cherbourg    1            18
1    Southampton  3            12
0    Cherbourg    2             9
1    Cherbourg    3             7
0    Southampton  3             5
1    Queenstown   2             3
0    Queenstown   1             3
     Cherbourg    3             2
     Queenstown   3             0
                  2             0
Name: survived, dtype: int64

**By sex, embark town and age group**
- 31 % were females under 30 who embarked from Southampton
- 56 % were females under 50 who embarked from Southampton
- 17 % were males under 30 who embarked from Southampton

In [70]:
# By sex, embark town, age group and pclass
df.groupby(['sex', 'age_group', 'embark_town', 'pclass'])['survived'].sum().sort_values(ascending=False)

sex  age_group  embark_town  pclass
1    1          Southampton  2         31
                             3         25
     2          Southampton  2         25
     1          Queenstown   3         24
0    1          Southampton  3         24
1    2          Southampton  1         24
                Cherbourg    1         23
0    2          Southampton  1         17
1    1          Southampton  1         16
                Cherbourg    3         15
                             1         12
0    2          Southampton  3         10
     1          Cherbourg    3         10
                Southampton  2          9
     2          Cherbourg    1          9
1    2          Southampton  3          7
     1          Cherbourg    2          7
0    1          Southampton  1          7
1    3          Cherbourg    1          7
0    1          Cherbourg    1          6
1    3          Southampton  1          6
0    2          Southampton  2          5
1    3          Southampton  2          

**By sex, age group, embark town and pclass**
- 24 % were females under 30 years who embarked from Southampton and travelled in 2nd or 3dr class.

### Conclusions:
- Typical survivor was a female age under 30 years who embarked from Southampton and travelled in 2nd or 3rd class (24 %)
- Majority of survivors were females (68 %)
- Majority of survivors were females under 30 years (38 %)