# HW1: Titanic Data

In [2]:
# First, import all packages
!pip install seaborn

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# Configure how pandas renders frames in this notebook:
#   show every column, cap long frames at 20 rows, and round floats
#   to 3 decimal places when displayed
pd.options.display.max_columns = None
pd.options.display.max_rows = 20
pd.options.display.precision = 3



In [3]:
# Read the passenger list from the CSV file sitting next to this notebook
df = pd.read_csv(filepath_or_buffer='titanic.csv')

In [4]:
# Question #1: How many passengers are in our passenger list?

# The number of rows in the data frame is taken as the passenger count
total_passengers = len(df)

# Single-argument print() behaves the same on Python 2 and Python 3
print(total_passengers)

891


In [5]:
# Question #2: What is the overall survival rate?

# Count the number of records where Survived == 1
total_survivors = len(df[df.Survived == 1])

# Divide total_survivors by total_passengers to get
#   overall_survival_rate. float() keeps the division from
#   truncating to an integer on Python 2.
overall_survival_rate = float(total_survivors) / total_passengers

# Single-argument print() behaves the same on Python 2 and Python 3
print(overall_survival_rate)

0.383838383838


In [6]:
# Question #3: How many male passengers were onboard?

# Count the number of records where Sex == 'male'
total_males = len(df[df.Sex == 'male'])

# Single-argument print() behaves the same on Python 2 and Python 3
print(total_males)

577


In [7]:
# Question #4: How many female passengers were onboard?

# Count the number of records where Sex == 'female'
total_females = len(df[df.Sex == 'female'])

# Single-argument print() behaves the same on Python 2 and Python 3
print(total_females)

314


In [8]:
# Question #5: What is the overall survival rate of male passengers?

# Count the number of records where Sex == 'male'
#   and Survived == 1
total_survivors_male = len(df[(df.Sex == 'male') & (df.Survived == 1)])

# Divide total_survivors_male by total_males to get
#   overall_survival_rate_male. float() keeps the division from
#   truncating to an integer on Python 2.
overall_survival_rate_male = float(total_survivors_male) / total_males

# Single-argument print() behaves the same on Python 2 and Python 3
print(overall_survival_rate_male)

0.188908145581


In [9]:
# Question #6: What is the overall survival rate of female passengers?

# Count the number of records where Sex == 'female'
#   and Survived == 1
total_survivors_female = len(df[(df.Sex == 'female') & (df.Survived == 1)])

# Divide total_survivors_female by total_females to
#   get overall_survival_rate_female. float() keeps the division
#   from truncating to an integer on Python 2.
overall_survival_rate_female = float(total_survivors_female) / total_females

# Single-argument print() behaves the same on Python 2 and Python 3
print(overall_survival_rate_female)

0.742038216561


In [19]:
# Question #7: What is the average age of all passengers onboard?

# Use both mean() and median() methods on Age to get
#   different average age calculations
overall_median_age = df.Age.median()
overall_mean_age = df.Age.mean()

# Build the whole line with format() and hand print() one string, so the
#   output is the same on Python 2 and Python 3. The doubled spaces
#   reproduce the spacing the comma-separated print statement emitted.
print('median:  {} | mean:  {}'.format(overall_median_age, overall_mean_age))

median:  28.0 | mean:  29.6991176471


In [20]:
# Question #7a: Note that some of the passengers do not have an age value. 
#   How did you deal with this? What are some other ways of dealing with this?

# I left values as-is in the last example, but one way to deal with null values
#   is to ignore records with null values by dropping them from the data frame
#   on which the calculations would be done:
df_no_nulls = df[df.Age.notnull()]

# Use both mean() and median() methods on Age to get
#   different average age calculations
overall_median_age_no_nulls = df_no_nulls.Age.median()
overall_mean_age_no_nulls = df_no_nulls.Age.mean()

# Build the whole line with format() and hand print() one string, so the
#   output is the same on Python 2 and Python 3. The doubled spaces
#   reproduce the spacing the comma-separated print statement emitted.
print('median:  {} | mean:  {}'.format(overall_median_age_no_nulls, overall_mean_age_no_nulls))

# It looks like the median() and mean() functions ignore null values when calculating
#   on fields containing null values.

median:  28.0 | mean:  29.6991176471


In [33]:
# Question #8: What is the average age of passengers who survived?
# Question #9: What is the average age of passengers who did not survive?

# Split the passengers by their Survived value, then take the mean of
#   the Age column within each group
df.groupby(['Survived'])['Age'].mean()

Survived
0           30.63
1           28.34
Name: Age, dtype: float64

In [31]:
# Split the passengers by their Survived value, then take the median of
#   the Age column within each group
df.groupby(['Survived'])['Age'].median()

Survived
0           28
1           28
Name: Age, dtype: float64

In [34]:
# Question #10: At this (early) point in our analysis, 
#   what might you infer about any patterns you are seeing?

# Based on the data, it looks like you're more likely to survive if
#   you're female, and maybe a little more likely the younger you are.

In [37]:
# Question #11: How many passengers are in each of the three classes 
#   of service (e.g. First, Second, Third?)

# Tally how many records carry each distinct Pclass value
df['Pclass'].value_counts()

3    491
1    216
2    184
dtype: int64

In [38]:
# Question #12: What is the survival rate for passengers in 
#   each of the three classes of service?

# There are at least two ways to do this (that I can see):

# Method #1: Group the records by Pclass and tally the Survived values
#   inside each group. This shows all of the raw counts in one table;
#   it does not compute the rates itself, so those still have to be
#   worked out by hand from the counts.
df.groupby('Pclass')['Survived'].value_counts()

Pclass   
1       1    136
        0     80
2       0     97
        1     87
3       0    372
        1    119
dtype: int64

In [44]:
# Method #2: Use boolean conditions to store the values needed to
#   perform the calculations for each class' survival rate

# Count survivors and total passengers for Pclass == 1
total_first_class_survivors = len(df[(df.Pclass == 1) & (df.Survived == 1)])
total_first_class_passengers = len(df[df.Pclass == 1])

# float() keeps the division from truncating to an integer on Python 2
first_class_survival_rate = float(total_first_class_survivors) / total_first_class_passengers

# Single-argument print() behaves the same on Python 2 and Python 3
print(first_class_survival_rate)

0.62962962963


In [45]:
# Count survivors and total passengers for Pclass == 2
total_second_class_survivors = len(df[(df.Pclass == 2) & (df.Survived == 1)])
total_second_class_passengers = len(df[df.Pclass == 2])

# float() keeps the division from truncating to an integer on Python 2
second_class_survival_rate = float(total_second_class_survivors) / total_second_class_passengers

# Single-argument print() behaves the same on Python 2 and Python 3
print(second_class_survival_rate)

0.472826086957


In [46]:
# Count survivors and total passengers for Pclass == 3
total_third_class_survivors = len(df[(df.Pclass == 3) & (df.Survived == 1)])
total_third_class_passengers = len(df[df.Pclass == 3])

# float() keeps the division from truncating to an integer on Python 2
third_class_survival_rate = float(total_third_class_survivors) / total_third_class_passengers

# Single-argument print() behaves the same on Python 2 and Python 3
print(third_class_survival_rate)

0.242362525458


In [40]:
# Question #13: What else might you conclude?

# It looks like it's a helluva lot better to be younger, in a higher 
#   fare class, and female if you want to survive. I feel bad for the
#   old, poor males :(

In [None]:
# Question #14: Last, if we were to build a predictive model, which features 
#   in the data do you think we should include in the model and 
#   which can we leave out? Why?

# Based on what we've observed in this lab, age, fare class, and gender seem to be
#   features worth exploring. The 'SibSp' and 'Embarked' features also seem to have
#   potentially relevant groupings, so they may be worth exploring. Other fields like 
#   'Ticket' and 'Name' don't seem to have meaningful groupings that could lead to
#   correlations.