# Intro to Data Visualization with Seaborn

## 0 - Setup

##### Import libraries & initialize datasets

In [1]:
# Importing the course packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the four course datasets from the local `datasets/` folder.
# countries-of-the-world.csv is parsed with "," as the decimal separator.
country_data = pd.read_csv(
    'datasets/countries-of-the-world.csv',
    decimal=",",
)
mpg = pd.read_csv('datasets/mpg.csv')
# For the two survey-style files the first CSV column becomes the index.
student_data = pd.read_csv(
    'datasets/student-alcohol-consumption.csv',
    index_col=0,
)
survey_data = pd.read_csv(
    'datasets/young-people-survey-responses.csv',
    index_col=0,
)

# Add supplemental, human-readable columns so that the course example code
# works properly. The source columns (including "Parents' advice" and
# Siblings) are intentionally left in place.
parent_advise_lookup = {1: 'Never', 2: 'Rarely', 3: 'Sometimes', 4: 'Often', 5: 'Always'}
num_of_siblings_lookup = {0: '0',  1: '1 -2',  2: '1 -2',  3: '3+',  4: '3+', 5: '3+',  6: '3+',  10: '3+', np.nan: np.nan}
feels_lonely_lookup = {1: False, 2: False, 3: False, 4: True, 5: True}

# Threshold-based columns. `na_action='ignore'` propagates missing responses
# as NaN instead of letting them fall through to the `else` branch, where they
# would be counted as 'Less than 21' / True / 'Yes'. This matches the
# lookup-based columns below, which also leave missing responses missing.
survey_data['Age Category'] = survey_data['Age'].map(
    lambda age: '21+' if age >= 21 else 'Less than 21',
    na_action='ignore',
)
survey_data['Interested in Math'] = survey_data['Mathematics'].map(
    lambda score: bool(score > 3),
    na_action='ignore',
)
survey_data['Interested in Pets'] = survey_data['Pets'].map(
    lambda score: 'No' if score <= 3 else 'Yes',
    na_action='ignore',
)
survey_data['Likes Techno'] = survey_data['Techno'].map(
    lambda score: bool(score > 3),
    na_action='ignore',
)

# Lookup-based columns: values absent from a lookup map to NaN.
survey_data['Parents Advice'] = survey_data["Parents' advice"].map(parent_advise_lookup)
survey_data['Number of Siblings'] = survey_data['Siblings'].map(num_of_siblings_lookup)
survey_data['Feels Lonely'] = survey_data['Loneliness'].map(feels_lonely_lookup)

## 1 - Intro to Seaborn

##### Scatter Plot

In [None]:
# Scatter plot: absences on x, final grade (G3) on y, points colored by location
sns.scatterplot(
    data=student_data,
    x='absences',
    y='G3',
    hue='location',
)
plt.show()

##### Count Plot - Subgroup on Location

In [None]:
# Fixed color for each location subgroup
palette_colors = {
    'Rural': 'green',
    'Urban': 'blue',
}

# Count of students per school, split into location subgroups
sns.countplot(
    data=student_data,
    x='school',
    hue='location',
    palette=palette_colors,
)
plt.show()

## 2 - Two Quantitative Variables

##### Creating Subplots with col and row

In [None]:
# One scatter subplot per study_time value, stacked as rows (not columns)
sns.relplot(
    data=student_data,
    kind='scatter',
    x='absences',
    y='G3',
    row='study_time',
)
plt.show()

##### Creating Two-Factor Subplots

In [None]:
# Grid of scatter plots: columns by school support, rows by family support,
# with 'yes' placed before 'no' on both axes of the grid
sns.relplot(
    data=student_data,
    kind='scatter',
    x='G1',
    y='G3',
    col='schoolsup',
    col_order=['yes', 'no'],
    row='famsup',
    row_order=['yes', 'no'],
)
plt.show()

##### Changing the Size of Scatter Plot Points

In [None]:
# Horsepower vs. mpg; the cylinder count drives both point size and color
sns.relplot(
    data=mpg,
    kind='scatter',
    x='horsepower',
    y='mpg',
    size='cylinders',
    hue='cylinders',
)
plt.show()

##### Changing the Style of Scatter Plot Points

In [None]:
# Acceleration vs. mpg; origin drives both marker style and color
sns.relplot(
    data=mpg,
    kind='scatter',
    x='acceleration',
    y='mpg',
    style='origin',
    hue='origin',
)
plt.show()

##### Visualizing Standard Deviation with Line Plots

In [None]:
# Line plot of mpg per model year; the shaded band shows the standard deviation.
# NOTE(review): newer seaborn releases deprecate `ci` in favor of `errorbar` —
# confirm against the installed seaborn version before switching.
sns.relplot(
    data=mpg,
    kind="line",
    x="model_year",
    y="mpg",
    ci='sd',
)
plt.show()

##### Plotting Subgroups in Line Plots

In [None]:
# One line per origin: colored by origin, a marker at each point, solid lines
# for every subgroup (dashes disabled) and no confidence band
sns.relplot(
    data=mpg,
    kind='line',
    x='model_year',
    y='horsepower',
    hue='origin',
    style='origin',
    markers=True,
    dashes=False,
    ci=None,
)
plt.show()

## 3 - Categorical Variables

##### Count Plots

In [None]:
# Horizontal count plot of internet usage, one column of subplots per age category
sns.catplot(
    data=survey_data,
    kind='count',
    y='Internet usage',
    col='Age Category',
)
plt.show()

##### Bar Plots with Percentages

In [None]:
# Bar plot of the 'Interested in Math' column for each gender
sns.catplot(
    data=survey_data,
    kind='bar',
    x='Gender',
    y='Interested in Math',
)
plt.show()

##### Customizing Bar Plots

In [None]:
# Study-time categories, lowest to highest
category_order = [
    '<2 hours',
    '2 to 5 hours',
    '5 to 10 hours',
    '>10 hours',
]

# Bar plot of G3 per study-time category, with confidence intervals turned off
sns.catplot(
    data=student_data,
    kind='bar',
    x='study_time',
    y='G3',
    order=category_order,
    ci=None,
)
plt.show()

##### Creating Box Plots

In [None]:
# Order in which the study-time categories appear on the x-axis
study_time_order = [
    '<2 hours',
    '2 to 5 hours',
    '5 to 10 hours',
    '>10 hours',
]

# Box plot of G3 per study-time category, in the order given above
sns.catplot(
    data=student_data,
    kind='box',
    x='study_time',
    y='G3',
    order=study_time_order,
)
plt.show()

##### Omitting Outliers

In [None]:
# Box plot of G3 by internet access with location subgroups, omitting outliers.
# `showfliers=False` suppresses the outlier markers outright instead of drawing
# them with an empty marker (`sym=''`); it is accepted both by seaborn releases
# that forward extra keywords to matplotlib's `boxplot` and by newer ones that
# no longer accept `sym`.
sns.catplot(
    data=student_data,
    kind='box',
    x='internet',
    y='G3',
    hue='location',
    showfliers=False,
)
plt.show()

##### Adjusting the Whiskers

In [None]:
# Whiskers at the 0th and 100th percentiles, i.e. the min and max values
sns.catplot(
    data=student_data,
    kind='box',
    x='romantic',
    y='G3',
    whis=[0, 100],
)
plt.show()

##### Customizing Point Plots

In [None]:
# Point plot of absences per family-relationship score: capped error bars,
# and no lines joining the points
sns.catplot(
    data=student_data,
    kind='point',
    x='famrel',
    y='absences',
    capsize=0.2,
    join=False,
)
plt.show()

##### Point Plots with Subgroups

In [None]:
from numpy import median

# Point plot of absences by relationship status, one series per school.
# The estimator is numpy's median (instead of the default mean) and
# confidence intervals are turned off.
sns.catplot(
    data=student_data,
    kind='point',
    x='romantic',
    y='absences',
    hue='school',
    estimator=median,
    ci=None,
)
plt.show()

## 4 - Customizing Seaborn Plots

##### Changing Style and Palette

In [None]:
# Global look: white grid background with the 'RdBu' color palette
sns.set_style('whitegrid')
sns.set_palette('RdBu')

# Answer categories in the order they should appear on the x-axis
category_order = [
    'Never',
    'Rarely',
    'Sometimes',
    'Often',
    'Always',
]

# Count plot of the 'Parents Advice' survey responses
sns.catplot(
    data=survey_data,
    kind='count',
    x='Parents Advice',
    order=category_order,
)
plt.show()

##### Changing Output Scale, i.e., Size

In [None]:
# Plot scale preset; possible values: 'paper', 'notebook', 'talk', 'poster'
sns.set_context('notebook')

# Bar plot of 'Feels Lonely' for each sibling-count category
sns.catplot(
    data=survey_data,
    kind='bar',
    x='Number of Siblings',
    y='Feels Lonely',
)
plt.show()

##### Using a Custom Palette

In [None]:
# Dark grid background
sns.set_style('darkgrid')

# Custom two-color palette given as hex codes
sns.set_palette([
    '#39A7D0',
    '#36ADA4',
])

# Box plot of the age distribution for each gender
sns.catplot(
    data=survey_data,
    kind='box',
    x='Gender',
    y='Age',
)
plt.show()

##### `FacetGrid`s vs. `AxesSubplot`s

In [None]:
# Scatter plot of car weight vs. horsepower; relplot returns a grid object
# whose figure carries the overall title
g = sns.relplot(
    data=mpg,
    kind='scatter',
    x='weight',
    y='horsepower',
)
g.fig.suptitle('Car Weight vs. Horsepower')
plt.show()

# Show the type of the object returned by relplot
# (fixed the misspelled 'catlot' in the printed message)
print(f'sns.relplot() and sns.catplot() return type: {type(g)}')

##### Adding a Title and Axis Labels

In [None]:
# Average mpg per (model_year, origin) pair, flattened back to regular columns.
# The aggregation is named by the string alias 'mean' rather than by passing
# the NumPy callable, which recent pandas versions warn about; the result is
# the same.
mpg_mean = (
    mpg
    .groupby(['model_year', 'origin'])
    .agg({'mpg': 'mean'})
    .reset_index()
    .rename(columns={'mpg': 'mpg_mean'})
)

# Line plot of the yearly averages, one line per origin.
# sns.lineplot returns a single Axes, so the title is set with set_title.
g = sns.lineplot(
    data=mpg_mean,
    x='model_year',
    y='mpg_mean',
    hue='origin',
)

# Add a title 'Average MPG Over Time'
g.set_title('Average MPG Over Time')

# Add x-axis and y-axis labels
g.set(xlabel='Car Model Year', ylabel='Average MPG')
plt.show()

##### Rotating X-Tick Labels

In [None]:
# Point plot of acceleration per origin: capped error bars, points not joined
sns.catplot(
    data=mpg,
    kind='point',
    x='origin',
    y='acceleration',
    capsize=0.1,
    join=False,
)

# Turn the x-tick labels by 90 degrees
plt.xticks(rotation=90)
plt.show()

##### Box Plots with Subgroups

In [None]:
# Use the 'Blues' palette
sns.set_palette('Blues')

# Box plot of age per gender, split into 'Interested in Pets' subgroups
g = sns.catplot(
    data=survey_data,
    kind='box',
    x='Gender',
    y='Age',
    hue='Interested in Pets',
)

# Figure-level title
g.fig.suptitle('Age of Those Interested in Pets vs. Not')
plt.show()

##### Bar Plots with Subplots

In [None]:
# Set the figure style to 'dark'
sns.set_style('dark')

# Bar plot of 'Likes Techno' by place of residence, one subplot column per gender
g = sns.catplot(
    data=survey_data,
    kind='bar',
    x='Village - town',
    y='Likes Techno',
    col='Gender',
)

# Figure-level title (y=1.02 places it slightly above the subplots) and
# axis labels applied to the subplots
g.fig.suptitle('Percentage of Young People Who Like Techno', y=1.02)
g.set(xlabel='Location of Residence', ylabel='% Who Like Techno')
plt.show()

# Show the type of the object returned by catplot
# (fixed the misspelled 'catlot' in the printed message)
print(f'sns.catplot() and sns.relplot() return type: {type(g)}')

In [1]:
from zip_util import compress_folder, decompress_folder
import os, sys
from pathlib import Path

# Archive the `datasets` folder of the current working directory into
# `sns_datasets.zip`, placed next to it.
cwd = Path.cwd()
datasets = cwd / 'datasets'
archive = cwd / 'sns_datasets.zip'
# NOTE(review): the meaning of the third positional argument (True) is defined
# by zip_util.compress_folder — confirm there.
compress_folder(datasets, archive, True)

Compressing /work/files/workspace/datasets to /work/files/workspace/sns_datasets.zip
Compressed /work/files/workspace/datasets to /work/files/workspace/sns_datasets.zip


In [None]:
# Unpack the archive into a `test` folder under the working directory.
# Relies on `cwd` and `archive` from the compression cell.
# NOTE(review): the meaning of the third positional argument (True) is defined
# by zip_util.decompress_folder — confirm there.
test = cwd / 'test'
decompress_folder(archive, test, True)