Importing libraries

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

Seaborn comes with pre-loaded datasets. Let's explore one of these, the 'titanic' dataset.

In [None]:
# Load seaborn's 'titanic' example dataset into a DataFrame and preview the first rows
df = sns.load_dataset(name="titanic")
df.head(n=5)

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes);
# the non-null counts give a first hint of which columns have missing values.
df.info()

First, let's build a boolean mask of the null values in our dataset and display it as a heat map.

In [None]:
# Boolean mask with the same shape as df: True wherever a value is missing
df.isna()

In [None]:
# Draw the missing-value mask on an explicit Axes: each cell is one (row, column) entry
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    df.isna(),
    cbar=False,  # the data is boolean, so a colour bar adds nothing
    cmap="viridis",
    ax=ax,
)
ax.set_title("Missing values heat map")
ax.set_xlabel("Features")
ax.set_ylabel("Sample index")
plt.show()

Let's now find the correlations between numeric features of our dataset

In [None]:
# Pairwise Pearson correlations (the pandas default), restricted to numeric columns
correlation_matrix = df.corr(
    method="pearson",
    numeric_only=True,
)
correlation_matrix

In [None]:
# Displaying this as a heat map
# Correlation coefficients lie in [-1, 1]. Pinning the colour scale to that range and
# centring it at 0 makes the neutral colour of the diverging 'coolwarm' palette mean
# "no correlation", rather than the midpoint of whatever values happen to be present.
plt.figure(figsize = (8, 6))
sns.heatmap(
    correlation_matrix,
    annot = True,
    cmap = 'coolwarm',
    fmt = '.2f',  # two decimals are enough to compare coefficients
    vmin = -1,
    vmax = 1,
    center = 0,
)
plt.title('Correlation matrix of the titanic dataset')
plt.show()

Heat maps can also be used to view value counts. For instance, we could view the number of people within each class and sex.

In [None]:
# Contingency table: rows are passenger classes, columns are sexes, cells are head counts
class_sex_counts = pd.crosstab(
    index=df["pclass"],
    columns=df["sex"],
)
class_sex_counts

In [None]:
# Render the class-by-sex contingency table as an annotated heat map
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    class_sex_counts,
    annot=True,
    cmap="viridis",
    fmt="d",  # the cells are counts, so annotate them as integers
    ax=ax,
)
ax.set_title("Value counts of passengers by class and gender")
plt.show()

We could also create our own categories. Here, let's bin ages and fares using `pd.cut()` and depict the value counts of passengers within these bins.

In [None]:
# Bin ages and fares into intervals with pd.cut(). Bins are right-closed by default,
# e.g. (0, 25], so a value exactly equal to the lowest edge falls outside every bin and
# becomes NaN. A fare of 0 would therefore be dropped silently from the counts below;
# include_lowest = True closes the first fare interval on the left so those rows are kept
# (the first fare bin is then labelled (-0.001, 25.0]).
age_bins = pd.cut(df['age'], bins = [0, 18, 30, 40, 50, 60, 80])  # presumably every age is > 0; verify with df['age'].min()
fare_bins = pd.cut(df['fare'], bins = [0, 25, 50, 100, 200, 600], include_lowest = True)
# Rows whose age or fare is missing, or lies outside the bin range, do not appear in the crosstab
age_fare_counts = pd.crosstab(age_bins, fare_bins)
age_fare_counts

In [None]:
# Displaying this as a heat map
plt.figure(figsize = (10, 6))
sns.heatmap(age_fare_counts, annot = True, cmap = 'YlOrBr', fmt = 'd')
plt.title('Passenger counts by age and fare bins')
plt.xlabel('Fare bin')
plt.ylabel('Age group')
plt.show()

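We could also use aggregation functions when displaying our data. Here, let's find the survival rate for each combination of age bin and fare bin. The survival rate per group is given by

$$\text{Survival rate} = \dfrac{\text{Number of survivors in group}}{\text{Total number of passengers in group}}$$

Because `survived` is coded as 0 or 1, this rate is simply the mean of `survived` within each group, which is why we aggregate with `'mean'`.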

In [None]:
# Mean of 'survived' for every (age bin, fare bin) pair; assuming 'survived' is a 0/1
# indicator, that mean is the survival rate of the group.
survival_by_age = pd.pivot_table(
    df,
    values="survived",
    index=age_bins,
    columns=fare_bins,
    aggfunc="mean",
    observed=False,  # keep bin combinations even if no passenger falls into them
)
survival_by_age

In [None]:
# Displaying this as a heat map
plt.figure(figsize = (10, 6))
sns.heatmap(survival_by_age, annot = True, cmap = 'YlOrBr')
plt.title('Passenger survival rate by age and fare bins')
plt.xlabel('Fare bin')
plt.ylabel('Age group')
plt.show()