In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
titanic_df = pd.read_csv("data/titanic.csv")
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,156.0,156.0,156.0,126.0,156.0,156.0,156.0
mean,78.5,0.346154,2.423077,28.141508,0.615385,0.397436,28.109587
std,45.177428,0.477275,0.795459,14.61388,1.056235,0.870146,39.401047
min,1.0,0.0,1.0,0.83,0.0,0.0,6.75
25%,39.75,0.0,2.0,19.0,0.0,0.0,8.00315
50%,78.5,0.0,3.0,26.0,0.0,0.0,14.4542
75%,117.25,1.0,3.0,35.0,1.0,0.0,30.37185
max,156.0,1.0,3.0,71.0,5.0,5.0,263.0


In [4]:
titanic_df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,156,156,156,31,155
unique,156,2,145,28,3
top,"Braund, Mr. Owen Harris",male,113803,C123,S
freq,1,100,2,2,110


In [5]:
# Survivors.
survivors = titanic_df[titanic_df['Survived'] == 1]
n_survivors = survivors.shape[0]

# Deceased.
deceased = titanic_df[titanic_df['Survived'] == 0]
n_deceased = deceased.shape[0]

print("Survivors: {}\nDeceased: {}".format(n_survivors, n_deceased))

Survivors: 54
Deceased: 102


In [6]:
# Survival rate per gender.
survival_by_sex = pd.DataFrame(columns = ['Sex', 'Survivors', 'Deceased', 'Total', 'Survival Rate %'])

# Get names for each sex
sexes = titanic_df.Sex.unique()

# Create and append a new row to the data frame with data for each sex
for i in range(len(sexes)):    
    current_sex = sexes[i]
    n_survivors_current_sex = survivors[survivors.Sex == current_sex].Sex.count()
    n_deceased_current_sex = deceased[deceased.Sex == current_sex].Sex.count()
    total_current_sex = n_survivors_current_sex + n_deceased_current_sex
    new_row = {
        'Sex': current_sex,
        'Survivors': n_survivors_current_sex,
        'Deceased': n_deceased_current_sex,
        'Total': total_current_sex,
        'Survival Rate %': round((n_survivors_current_sex / total_current_sex) * 100, 2)
    }
    survival_by_sex.loc[len(survival_by_sex)] = new_row

# Display data frame
survival_by_sex

Unnamed: 0,Sex,Survivors,Deceased,Total,Survival Rate %
0,male,14,86,100,14.0
1,female,40,16,56,71.43


In [7]:
# Age group function.
def createAgeGroup(row_val):
    return 'Infants/children' if row_val >= 0 and row_val < 12 else 'Teenagers' if row_val >= 12 and row_val < 20 else 'Adults' if row_val >= 20 and row_val < 65 else 'Elders' if row_val >= 65 else 'Unknown'

# Age group column creation.
survivors['AgeGroup'] = survivors.Age.apply(lambda x: createAgeGroup(x))
deceased['AgeGroup'] = deceased.Age.apply(lambda x: createAgeGroup(x))
titanic_df['AgeGroup'] = titanic_df.Age.apply(lambda x: createAgeGroup(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survivors['AgeGroup'] = survivors.Age.apply(lambda x: createAgeGroup(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deceased['AgeGroup'] = deceased.Age.apply(lambda x: createAgeGroup(x))


In [8]:
# Create data frame to display survival data by age group
survival_data_by_age_group = pd.DataFrame(columns = ['Age Group', 'Survivors', 'Deceased', 'Total', 'Survival Rate %'])

# Get names for age groups
age_groups = titanic_df.AgeGroup.unique()

# Create and append a new row to the data frame with data for each age group
for i in range(len(age_groups)):    
    current_group = age_groups[i]
    n_survivors_current_group = survivors[survivors.AgeGroup == current_group].AgeGroup.count()
    n_deceased_current_group = deceased[deceased.AgeGroup == current_group].AgeGroup.count()
    total_current_group = n_survivors_current_group + n_deceased_current_group
    new_row = {
        'Age Group': current_group,
        'Survivors': n_survivors_current_group,
        'Deceased': n_deceased_current_group,
        'Total': total_current_group,
        'Survival Rate %': round((n_survivors_current_group / total_current_group) * 100, 2)
    }
    survival_data_by_age_group.loc[len(survival_data_by_age_group)] = new_row

# Display data frame
survival_data_by_age_group

Unnamed: 0,Age Group,Survivors,Deceased,Total,Survival Rate %
0,Adults,29,60,89,32.58
1,Unknown,13,17,30,43.33
2,Infants/children,4,8,12,33.33
3,Teenagers,8,13,21,38.1
4,Elders,0,4,4,0.0


In [9]:
# Create data frame to display survival data by passenger's class
survival_by_class = pd.DataFrame(columns = ['Passenger Class', 'Survivors', 'Deceased', 'Total', 'Survival Rate %'])

# Get names for classes
classes = titanic_df.Pclass.unique()

# Create and append a new row to the data frame with data for each class
for i in range(len(classes)):    
    current_class = classes[i]
    n_survivors_current_class = survivors[survivors.Pclass == current_class].Pclass.count()
    n_deceased_current_class = deceased[deceased.Pclass == current_class].Pclass.count()
    total_current_class = n_survivors_current_class + n_deceased_current_class
    new_row = {
        'Passenger Class': current_class,
        'Survivors': n_survivors_current_class,
        'Deceased': n_deceased_current_class,
        'Total': total_current_class,
        'Survival Rate %': round((n_survivors_current_class / total_current_class) * 100, 2)
    }
    survival_by_class.loc[len(survival_by_class)] = new_row

# Display data frame
survival_by_class

Unnamed: 0,Passenger Class,Survivors,Deceased,Total,Survival Rate %
0,3,28,68,96,29.17
1,1,12,18,30,40.0
2,2,14,16,30,46.67


In [10]:
# Gender
# Passenger class
# Age group

# Create data frame to display survival data by gender, passenger class and age group
survival_by_gender_class_age = pd.DataFrame(columns = ['Sex', 'Passenger Class', 'Age Group', 'Survivors', 'Deceased', 'Total', 'Survival Rate %'])

# Create and append a new row to the data frame with data for each age group
for i in range(len(sexes)):    
    current_sex = sexes[i]
    current_survivors_sex_df = survivors[survivors.Sex == current_sex]
    current_deceased_sex_df = deceased[deceased.Sex == current_sex]
    for j in range(len(classes)):
        current_class = classes[j]
        current_survivors_sex_class_df = current_survivors_sex_df[current_survivors_sex_df.Pclass == current_class]
        current_deceased_sex_class_df = current_deceased_sex_df[current_deceased_sex_df.Pclass == current_class]
        for k in range(len(age_groups)):
            current_group = age_groups[k]
            current_survivors_sex_class_ageGroup_df = current_survivors_sex_class_df[current_survivors_sex_class_df.AgeGroup == current_group]
            current_deceased_sex_class_ageGroup_df = current_deceased_sex_class_df[current_deceased_sex_class_df.AgeGroup == current_group]
            n_survivors_current_sca_group = current_survivors_sex_class_ageGroup_df.Sex.count()
            n_deceased_current_sca_group = current_deceased_sex_class_ageGroup_df.Sex.count()
            total_current_sca_group = n_survivors_current_sca_group + n_deceased_current_sca_group
            new_row = {
                'Sex': current_sex,
                'Passenger Class': current_class,
                'Age Group': current_group,
                'Survivors': n_survivors_current_sca_group,
                'Deceased': n_deceased_current_sca_group,
                'Total': total_current_sca_group,
                'Survival Rate %': 0 if n_survivors_current_sca_group == total_current_sca_group and total_current_sca_group == 0 else round((n_survivors_current_sca_group / total_current_sca_group) * 100, 2)
            }
            survival_by_gender_class_age.loc[len(survival_by_gender_class_age)] = new_row

# Display data frame
survival_by_gender_class_age

Unnamed: 0,Sex,Passenger Class,Age Group,Survivors,Deceased,Total,Survival Rate %
0,male,3,Adults,4,28,32,12.5
1,male,3,Unknown,3,15,18,16.67
2,male,3,Infants/children,0,5,5,0.0
3,male,3,Teenagers,1,4,5,20.0
4,male,3,Elders,0,1,1,0.0
5,male,1,Adults,2,14,16,12.5
6,male,1,Unknown,1,1,2,50.0
7,male,1,Infants/children,0,0,0,0.0
8,male,1,Teenagers,0,1,1,0.0
9,male,1,Elders,0,2,2,0.0
