# Notebook 7: Data visualization with Seaborn

Here we will see how Seaborn can greatly speed up the visualization of data. Again we will work with Fisher's Iris dataset. 

In [None]:
# Load the standard libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Apply seaborn's "ticks" look to every figure drawn from here on
sns.set_style(style="ticks")

In [None]:
# Fetch the Iris example dataset through seaborn and preview its first rows
df = sns.load_dataset("iris")
df.head(5)

Suppose we want to make a bar chart showing the mean sepal length of each species. What do we need to do using standard Matplotlib?

In [None]:
# Step 1: collapse the table to one row per species, holding the mean of every column
means_df = (
    df
    .groupby("species")
    .mean()
)
means_df

In [None]:
# Get the heights: the mean sepal length of each species (one entry per row of means_df)
heights = means_df['sepal_length']

# Set the positions: one integer slot per species. Derived from the data rather
# than hard-coded, so the cell keeps working if the number of groups changes.
xs = list(range(len(means_df)))

# Set the colors: consecutive entries of matplotlib's default color cycle ('C0', 'C1', ...)
colors = [f'C{i}' for i in xs]

# Plot
plt.bar(x=xs, height=heights, color=colors)

# Label the y axis so the figure can be read on its own
plt.ylabel('sepal_length')

# And set xticks and labels (the species names are the index of means_df)
plt.xticks(xs, means_df.index)

In [None]:
# The same chart in a single seaborn call: it works on the raw rows and does
# the per-species aggregation for us
sns.barplot(x="species", y="sepal_length", data=df)

In [None]:
# Swapping the roles of x and y turns the bars sideways
sns.barplot(y="species", x="sepal_length", data=df)

In [None]:
# Box plot of sepal length, one box per species
sns.boxplot(x="species", y="sepal_length", data=df)

# How to read a box plot (seaborn defaults):
#   Middle line = median
#   Box = spans the 1st to the 3rd quartile, i.e. the interquartile range (IQR)
#   Bottom stem = reaches down to the smallest observation >= 1st quartile - 1.5*IQR
#   Top stem = reaches up to the largest observation <= 3rd quartile + 1.5*IQR
#   Dots = observations beyond the stems, drawn individually as "outliers"

In [None]:
# Swarm plot: every observation is drawn as its own point, grouped by species
sns.swarmplot(x="species", y="sepal_length", data=df)

In [None]:
# Violin plot of sepal length per species, with quartile lines drawn inside each violin
sns.violinplot(x="species", y="sepal_length", inner="quartile", data=df)
# Other values to try for inner: None, "box", "quartile", "stick"

In [None]:
# To compare the different measurements side by side, reshape ("melt") the frame
# from wide to long: species stays as the identifier, and every other column is
# unpivoted into a (variable, value) pair
melt_df = pd.melt(df, id_vars="species")
melt_df.head(10)

In [None]:
# Grouped bars: one cluster per measured variable, one bar per species inside it.
# A wider figure is opened first so the clusters have room.
plt.figure(figsize=(10, 6))
sns.barplot(x="variable", y="value", hue="species", data=melt_df)

In [None]:
# Petal width against petal length, with each species in its own hue
sns.scatterplot(x="petal_length", y="petal_width", hue="species", data=df)

In [None]:
### Create a multi-panel figure

# We need the "gridspec" functionality of matplotlib
import matplotlib.gridspec as gridspec

# Create a gridspec object. This will tell matplotlib where to position Axes
gs = gridspec.GridSpec(1, 3)

# Create a figure
fig = plt.figure(figsize=[10,5])

# Create an Axes spanning two columns
ax = plt.subplot(gs[0, 0:2])

# Draw a barplot on this Axes
sns.barplot(data=melt_df, x="variable", y="value", hue="species", ax=ax)
ax.set_title('Bar chart')

# Create an Axes in the third column
ax = plt.subplot(gs[0, 2])

# Draw a scatterplot on this Axes
sns.scatterplot(data=df, x="petal_length", y="petal_width", hue="species")
ax.set_title('Scatter plot')

# Add panel labes. Note that x and y are in Figure coordinates, not Axes coordinates
t = fig.text(x=.01, y=.99, s='(A)', fontsize=20, horizontalalignment="left", verticalalignment="top")
fig.text(x=.66, y=.99, s='(B)', fontsize=20, horizontalalignment="left", verticalalignment="top")

# Adjust plot positions
plt.tight_layout(pad=2)

# Save figure
file_name ='7_seaborn_1.pdf' 
plt.savefig(file_name)

# Open figure in Preview
!open $file_name

It should be noted that seaborn is particularly good for data exploration, i.e., quickly surveying the structure of your data by making lots and lots of plots.

In [None]:
# One call draws a grid of plots covering every pair of numeric columns, colored by species
sns.pairplot(df, hue="species")

# That's it! Thanks for participating, and see you next week.