### Load libraries and data

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the passenger records; adjust the path if your copy of the CSV lives elsewhere
titanic = pd.read_csv(filepath_or_buffer="./data/titanic.csv")

In [None]:
# info() prints the column overview; the summary statistics are shown as the cell output
titanic.info()
summary_stats = titanic.describe()
summary_stats

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themselves
titanic['family size'] = titanic['SibSp'].add(titanic['Parch']).add(1)

In [None]:
# Convert Survived and Pclass to categorical variables with readable labels
d_Survived={0:"No",1:"Yes"}
d_Pclass={1:"1st",2:"2nd",3:"3rd"}

# rename_categories relabels the categories through the mapping and leaves any category
# that is not a key untouched, so re-running this cell is harmless (the previous
# apply(lambda x: d[x]) version raised KeyError on a second run).
titanic['Survived'] = titanic['Survived'].astype('category').cat.rename_categories(d_Survived)
titanic['Pclass'] = titanic['Pclass'].astype('category').cat.rename_categories(d_Pclass)
titanic.head()

### 1. Generate a series of bar charts to describe the gender, ticket class and survival of the passengers onboard.


In [None]:
# Bar chart on gender
gender_counts = titanic.groupby('Sex').size()
gender_counts.plot.bar(title='Gender of the passengers onboard',
                       ylabel="number of passengers",
                       xlabel="gender")
plt.show()

# Note: fig, ax = plt.subplots() was not called here, and no ax was passed to the pandas plot call.
# Creating fig, ax first and passing ax into the DataFrame/Series plot() is possible, but not required.

In [None]:
# Bar chart on gender (alternative using kind='bar'; other suitable options are 'barh', 'pie')
gender_bar_options = dict(title='Gender of the passengers onboard',
                          ylabel="number of passengers",
                          xlabel="gender")
titanic.groupby('Sex').size().plot(kind='bar', **gender_bar_options)
plt.show()

In [None]:
# Bar chart on gender (sort value by descending order)
gender_counts_desc = titanic.groupby('Sex').size().sort_values(ascending=False)
#alternative way to obtain the counts: titanic['Sex'].value_counts().sort_values(ascending=False)
gender_counts_desc.plot(kind='bar',
                        title='Gender of the passengers onboard',
                        ylabel="number of passengers",
                        xlabel="gender")
plt.show()

In [None]:
# Bar chart on gender (with data values shown) (method 1)

fig, ax = plt.subplots()

plot_data = titanic.groupby('Sex').size()

ax.bar(x=plot_data.index, height=plot_data,
       alpha=0.6, #alpha=0.6 makes bars slightly transparent
       width = 0.5) #argument width=0.5 makes the bars thinner

ax.set_title('Gender of the passengers onboard')
ax.set_ylabel("number of passengers")
ax.set_xlabel("gender")

# Walk over (label, count) pairs instead of plot_data[i]: plot_data is indexed by the
# 'Sex' labels, and integer keys on a label-indexed Series are treated as positions only
# through a fallback that pandas >= 2.1 deprecates.
for gender, count in plot_data.items():
    # the label is drawn 50 units below the top of its bar, centred horizontally
    ax.text(x=gender, y=count-50, s=int(count), ha='center')

plt.show()

In [None]:
# Bar chart on gender (with data values shown) (method 2)
# Reference -- https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9

fig, ax = plt.subplots()

plot_data = titanic.groupby('Sex').size()

# Draw onto the axes created above
plot_data.plot(kind='bar', ax=ax,
               title='Gender of the passengers onboard',
               ylabel="number of passengers", xlabel="gender",
               alpha=0.6, width=0.5)

# Each patch is one bar: place its value 30 units below the top, centred on the bar
for patch in ax.patches:
    left, bottom = patch.get_xy()
    bar_width = patch.get_width()
    bar_height = patch.get_height()
    ax.text(x=left + bar_width/2,
            y=bottom + bar_height - 30,
            s=int(bar_height),
            ha='center', va='center')

plt.show()

In [None]:
# Bar chart on ticket class
# Pclass was converted to a categorical column earlier in the notebook. observed=False is
# passed explicitly: it is the behaviour this cell has always relied on, and pandas >= 2.1
# emits a FutureWarning when a categorical grouper uses the default.
class_counts = titanic.groupby('Pclass', observed=False).size()
class_counts.plot.bar(title = 'Ticket class of the passengers onboard',
                      xlabel = "ticket class", ylabel = "number of passengers")
plt.show()

In [None]:
# Bar chart on ticket class (sort by values in descending order)
# observed=False is explicit because Pclass is categorical (pandas >= 2.1 warns when the
# default is used with a categorical grouper).
class_counts_desc = (
    titanic.groupby('Pclass', observed=False)
    .size()
    .sort_values(ascending=False)
)
ax = class_counts_desc.plot(kind='bar',
                            title='Ticket class of the passengers onboard',
                            xlabel="ticket class",
                            ylabel="number of passengers")
plt.show()

In [None]:
# Bar chart on ticket class (manually reorder categories)
# We create a new column 'Pclass2' whose category order is 3rd, 2nd, 1st;
# groupby then emits the groups in that category order.
titanic['Pclass2'] = titanic['Pclass'].cat.reorder_categories(['3rd','2nd','1st'])

# observed=False is explicit because Pclass2 is categorical (pandas >= 2.1 warns when the
# default is used with a categorical grouper).
ax = titanic.groupby('Pclass2', observed=False).size().plot.bar(
    title='Ticket class of the passengers onboard',
    xlabel="ticket class", ylabel="number of passengers")
plt.show()

In [None]:
# Bar chart on survival
# Survived was converted to a categorical column earlier in the notebook; observed=False is
# passed explicitly to keep the existing behaviour and avoid the pandas >= 2.1 FutureWarning.
survival_counts = titanic.groupby('Survived', observed=False).size()
survival_counts.plot.bar(title = 'Survival of the passengers onboard',
                         xlabel = "survived", ylabel = "number of passengers")
plt.show()

### 2. Generate a histogram for the passengers’ age. Furthermore, describe the passengers’ age using the following two boxplots: age per ticket class and age based on survival.

In [None]:
# Histogram on age
age_hist_title = "Age of the passengers onboard"
titanic['Age'].plot(kind='hist', title=age_hist_title);

In [None]:
# Histogram on age (alternative: the .plot.hist accessor instead of kind='hist')
passenger_age = titanic['Age']
passenger_age.plot.hist(title="Age of the passengers onboard");

In [None]:
# Boxplot on age (group by ticket class)

fig, ax = plt.subplots(figsize=(8, 4))

#note: in seaborn, alpha (and other box properties) is set through the boxprops dictionary
sns.boxplot(x='Pclass', y='Age', data=titanic, ax=ax,
            width=0.3, boxprops={'alpha': 0.6})

ax.set_title("Age of the passengers onboard")
ax.set_xlabel("Ticket class")
plt.show()

In [None]:
# Boxplot on age (group by survival)

fig, ax = plt.subplots(figsize=(8, 4))

#note: in seaborn, alpha (and other box properties) is set through the boxprops dictionary
sns.boxplot(x='Survived', y='Age', data=titanic, ax=ax,
            width=0.3, boxprops={'alpha': 0.6})

ax.set_title("Age of the passengers onboard")
ax.set_xlabel("Survived")
plt.show()

### 3. Generate a histogram for the travel fare and a table showing the number of people who did not pay – you may want to check on Google why a handful of people were on board for free!

In [None]:
# Histogram on travel fare
fare = titanic['Fare']
fare.plot(kind='hist',
          title="Travel fare of the passengers onboard",
          xlabel='fare');

In [None]:
# Table of passengers by whether their fare was zero (True = travelled for free)
paid_nothing = titanic[['Fare']].eq(0)
paid_nothing.groupby('Fare').size()

### 4. A chart of your choice to describe the family size per ticket class


In [None]:
# Bar chart on ticket class and family size
# observed=False is explicit because Pclass is categorical (pandas >= 2.1 warns when the
# default is used with a categorical grouper).
family_by_class = (
    titanic.groupby(['Pclass', 'family size'], observed=False)
    .size()
    .unstack()    # one column per family size, one row per ticket class
    .fillna(0)    # combinations without passengers count as 0
)
# NOTE(review): width=1.2 exceeds the unit spacing between ticket classes, so neighbouring
# bar groups can overlap -- confirm this is intended.
family_by_class.plot.bar(width=1.2, xlabel='ticket class', ylabel='frequency')
plt.show()

In [None]:
# Passenger counts per (ticket class, family size) pair.
# observed=False is explicit because Pclass is categorical (pandas >= 2.1 warns when the
# default is used with a categorical grouper).
titanic.groupby(['Pclass', 'family size'], observed=False).size()

In [None]:
# Same counts, with the family size moved from the row index into the columns.
# observed=False is explicit because Pclass is categorical (pandas >= 2.1 warns when the
# default is used with a categorical grouper).
titanic.groupby(['Pclass', 'family size'], observed=False).size().unstack()
#titanic.groupby(['Pclass', 'family size'], observed=False).size().unstack(level=-1)
#titanic.groupby(['Pclass', 'family size'], observed=False).size().unstack(level=1)

#note: level refers to the level(s) of the index to unstack; -1 is the last level

In [None]:
# Unstacked counts with missing combinations replaced by 0.
# observed=False is explicit because Pclass is categorical (pandas >= 2.1 warns when the
# default is used with a categorical grouper).
titanic.groupby(['Pclass', 'family size'], observed=False).size().unstack().fillna(0)

In [None]:
# Histogram to show the family size per ticket class
# (numpy, matplotlib and warnings are imported in the first cell of the notebook)

# Silence NumPy's VisibleDeprecationWarning, as the original cell did -- presumably because
# ax.hist() receives groups of different lengths (TODO confirm it is still needed).
# np.warnings was removed in NumPy 1.24, so the stdlib warnings module is used instead;
# the warning class moved to np.exceptions in newer NumPy, hence the getattr fallback.
visible_deprecation = getattr(np, "exceptions", np).VisibleDeprecationWarning
warnings.filterwarnings('ignore', category=visible_deprecation)

# One Series of family sizes per ticket class, in the same order as the legend labels
family_size_groups = [titanic.loc[titanic.Pclass==class_idx, "family size"]
                      for class_idx in ['1st','2nd','3rd']]

# The same data is drawn twice: first normalised (density), then as raw counts (frequency)
for use_density, y_label in [(True, "density"), (False, "frequency")]:
    fig, ax = plt.subplots()
    ax.hist(family_size_groups,
            density=use_density,
            label=['1st Class','2nd Class','3rd Class'])
    ax.set_ylabel(y_label)
    ax.set_xlabel("family size")
    plt.tight_layout()
    plt.legend(loc='upper right')
    plt.show()

In [None]:
# Boxplots to show the family size per ticket class

fig, ax = plt.subplots()

#in seaborn, alpha (and other box properties) is set through the boxprops dictionary
box_style = {'alpha': 0.6}  #equivalent spelling: dict(alpha=0.6)

#plots the boxplots on the axes created above
sns.boxplot(x='Pclass', y='family size', data=titanic, ax=ax,
            width=0.3, boxprops=box_style)

ax.set_xlabel('ticket class')
ax.set_ylabel('family size')

plt.tight_layout()
plt.show()

In [None]:
# Additional plot 1: violin plot of family size per ticket class
sns.catplot(data=titanic, kind='violin', x='Pclass', y='family size')

# label the current axes, i.e. the ones catplot has just drawn on
current_axes = plt.gca()
current_axes.set_xlabel("ticket class")
current_axes.set_ylabel("family size")
plt.show()

In [None]:
# Additional plot 2: mean family size per ticket class, split by survival
# NOTE(review): newer seaborn releases appear to deprecate `ci` in favour of errorbar=None --
# confirm against the installed version before switching.
mean_bar_options = dict(kind='bar', ci=False, estimator=np.mean)  #default estimator is mean
sns.catplot(data=titanic, x='Pclass', y='family size', hue='Survived',
            **mean_bar_options)
plt.xlabel("ticket class")
plt.ylabel("family size (mean)")
plt.show()

In [None]:
# Additional plot 2 (with data values shown)
# Reference -- https://pretagteam.com/question/seaborn-catplot-set-values-over-the-bars

# NOTE(review): newer seaborn releases appear to deprecate `ci` in favour of errorbar=None --
# confirm against the installed version before switching.
g = sns.catplot(data=titanic, x='Pclass', y='family size', hue='Survived',
                kind='bar', ci=False, estimator=np.mean)  #default estimator is mean

plt.xlabel("ticket class")
plt.ylabel("family size (mean)")

# extract the Matplotlib axes_subplot object from the FacetGrid (row 0, column 0)
ax = g.facet_axis(0, 0)

# write each bar's height (rounded to 2 decimals) just below its top, centred on the bar
for patch in ax.patches:
    left, bottom = patch.get_xy()
    bar_width = patch.get_width()
    bar_height = patch.get_height()
    ax.text(x=left + bar_width/2,
            y=bottom + bar_height - 0.1,
            s=round(bar_height, 2),
            size=10,
            ha='center')

plt.show()

### 5. A series of stacked bar charts to show how survival differs by gender and ticket class

In [None]:
# Stacked bar chart on survival by gender
# Survived is categorical, so observed=False is passed explicitly (pandas >= 2.1 warns when
# the default is used with a categorical grouper).
# unstack() turns the Survived level into columns, giving one stacked segment per outcome.
survival = titanic.groupby(['Sex','Survived'], observed=False).size().unstack()
survival.plot(kind='bar', stacked=True, ylabel="number of passengers", xlabel="gender",
              title = "Survival by gender");

In [None]:
# Stacked bar chart on survival by gender (with data values shown)

# Survived is categorical, so observed=False is passed explicitly (pandas >= 2.1 warns when
# the default is used with a categorical grouper).
survival = titanic.groupby(['Sex','Survived'], observed=False).size().unstack()
ax = survival.plot(kind='bar', stacked=True, ylabel="number of passengers", xlabel="gender",
                   title = "Survival by gender")

# Each patch is one stacked segment; its own height (not the cumulative total) is
# written 30 units below the segment's top edge, centred horizontally.
for bar in ax.patches:
    width = bar.get_width()
    height = bar.get_height()
    x, y = bar.get_xy()
    ax.text(x=x+width/2,
            y=y+height-30,
            s=int(height),
            ha='center', va='center')

In [None]:
# Stacked bar chart on survival by ticket class
# Pclass and Survived are both categorical, so observed=False is passed explicitly
# (pandas >= 2.1 warns when the default is used with categorical groupers).
survival = titanic.groupby(['Pclass','Survived'], observed=False).size().unstack()
survival.plot(kind='bar', stacked = True, ylabel = "number of passengers", xlabel = "ticket class",
              title = "Survival by ticket class");

### 6. A violin chart describing how survival is related to age and gender


In [None]:
# Split violins: one half per Survived value, so the age distributions of survivors
# and non-survivors sit side by side for each gender
ax = sns.violinplot(x='Sex', y='Age', hue='Survived',
                    data=titanic,
                    split=True, width=0.3)
ax.set(xlabel="gender")
plt.show()

### 7. A violin chart describing the survival rate related to age and ticket class


In [None]:
# Split violins: one half per Survived value, so the age distributions of survivors
# and non-survivors sit side by side for each ticket class
ax = sns.violinplot(x='Pclass', y='Age', hue='Survived',
                    data=titanic,
                    split=True, width=0.3)
ax.set(xlabel="ticket class")
plt.show()