# Visualizing data in Python


In [None]:
# The following imports a number of libraries that we'll need, as well as
# configures a number of options that will make interacting with the notebook
# a little easier. If you don't have them installed, create a cell below
# to install them using the magic pip command: 
#     %pip install LIBRARY
# or install them from the command line:
#     python3 -m pip install LIBRARY

import pandas as pd    # For reading and manipulating tabular data.
import seaborn as sns  # For making pretty plots.
import matplotlib.pyplot as plt # For plotting options.

## Import geopandas library for doing maps. This works with the map data in the `usa-map/` directory.
## (You can download the `usa-map.tbz2` file, which is a bzip2 tar ball of the same directory. Untar it
## with `tar xjf usa-map.tbz2`)
import geopandas 

## Datasets

Aside from the continental USA map data (see above), we'll be using the following datasets in this
notebook:
  * **iris**—a classic dataset looking at various features across three species of irises
  * **names**—first names on U.S. social security card applications between 1880 and 2017
  * **namesByState**—first names on U.S. social security card applications by state from 1910 to 2017

In [None]:
# Load the example iris dataset from seaborn (this is a classic dataset often used to demonstrate stats and visualization).
iris = sns.load_dataset("iris")
iris.head()

In [None]:
# "Melt" the dataset to "long-form" representation. Take a look at how this
# looks different from the "unmelted" (wide-form) version.
iris_long = pd.melt(iris, "species", var_name="measurement")
iris_long.head()

In [None]:
## Load the names data (we'll use this for time series). 
names = pd.read_csv('../data/names_1880-20180304.csv.bz2').sort_values(by='yob')

In [None]:
names.head()

In [None]:
namesByState = pd.read_csv('../data/names-by-states_1910-20180304.csv.bz2')

In [None]:
namesByState.head()



---


## Distributions
  * box plots
  * swarm plots
  * histograms

In [None]:
# Make a box plot in pandas:
iris.plot.box()

In [None]:
# Box plot in Seaborn.
sns.boxplot(data=iris)

In [None]:
# Box plot looking at each species separately.
sns.boxplot(x='measurement', y='value', hue='species', data=iris_long)

In [None]:
sns.boxplot(x='measurement', y='value', hue='species', data=iris_long[iris_long['measurement']=='sepal_length'])

In [None]:
# Swarm plot.
sns.swarmplot(data=iris)

In [None]:
# Swarm plot -- each species has its own color.
sns.swarmplot(x='measurement', y='value', hue='species', data=iris_long)

In [None]:
# Histogram.
iris.plot.hist()


In [None]:
# Plot the distribution of a single variable.
sns.histplot(
    iris['petal_length'], kde=True,
    stat="density", kde_kws=dict(cut=3)
)



---

## Relationships

  * scatter plots
  * joint plots
  * pair plots
  * line plots

In [None]:
# Basic scatter plot.
iris.plot.scatter(x='petal_length', y='petal_width')

In [None]:
# Seaborn makes it easy to add in color by a third categorical variable.
sns.lmplot(x='petal_length', y='petal_width', data=iris, hue='species', fit_reg=False)

In [None]:
# You can also easily include regression fits by leaving out fit_reg=False:
sns.lmplot(x='petal_length', y='petal_width', data=iris, hue='species')

In [None]:
# To see the distribution of each variable in the scatter plot, use a joint plot:
sns.jointplot(x='petal_length', y='petal_width', data=iris)

In [None]:
# Quickly see the relationship between different variables as well as their
# individual distributions.
sns.pairplot(iris);

In [None]:
# Let's add some color.
sns.pairplot(iris, hue='species');

In [None]:
# Line plots for time series.
names[(names['gender'] == 'F') & (names['name'] == 'Mary')].plot(x='yob', y='count')

In [None]:
# Plot multiple lines on the same graph. The first plot() call is given ax=None,
# so it creates the axes; every later call draws onto that same axes.
ax = None
for girl_name in ['Mary', 'Margaret', 'Makenzie']:
    is_girl_with_name = (names['gender'] == 'F') & (names['name'] == girl_name)
    ax = names[is_girl_with_name].plot(x='yob', y='count', ax=ax)
plt.legend(['Mary', 'Margaret', 'Makenzie'])
plt.xlabel('Year of birth')
plt.ylabel('Number registered')

In [None]:
# This isn't a heatmap, but it acts kind of like one with continuous data; helps
# see density when points overlap.
sns.jointplot(x='petal_length', y='petal_width', data=iris, kind='hex', gridsize=20)



---

## Comparison

  * bar charts

In [None]:
# Compare frequencies for 2017: the ten most-registered names for each gender,
# ordered from most to least common.
top2017MaleNames = (
    names
    .query("yob == 2017 and gender == 'M'")
    .sort_values(by='count', ascending=False)
    .head(10)
)
top2017FemaleNames = (
    names
    .query("yob == 2017 and gender == 'F'")
    .sort_values(by='count', ascending=False)
    .head(10)
)

In [None]:
top2017MaleNames.plot.bar(x='name', y='count')

In [None]:
sns.barplot(x='name', y='count', data=top2017FemaleNames, color='lightblue')

In [None]:
# Bar plots with subcategories -- this also calculates the means for us.
sns.barplot(x='measurement', y='value', hue='species', data=iris_long)



---

## Geospatial data

In [None]:
# Read in the map data -- This works with the map data in the `usa-map/` directory.
## (You can download the `usa-map.tbz2` file into your ../data directory.  This is a bzip2 tar ball that you can untar
## with `tar xjf usa-map.tbz2` from inside your ../data directory -- do that before you run this cell)
continental_usa = geopandas.read_file('../data/usa-map/usa-states-census-2014.shp')

In [None]:
# Here's what the map data looks like:
continental_usa.head()

In [None]:
# The data we're using for maps is in the WGS 84 coordinate reference system (plain
# longitude/latitude). Here, we'll reproject it to World Mercator (EPSG:3395), which looks much like --
# but is not identical to -- the Spherical/Web Mercator (EPSG:3857) used by Google Maps, etc.
continental_usa = continental_usa.to_crs('EPSG:3395')

In [None]:
# Let's see what it looks like.
continental_usa.plot() 

In [None]:
name = 'John'
year = 2015
x = continental_usa.merge(namesByState[(namesByState.name == name) & (namesByState.yob == year)], 
    left_on='STUSPS', right_on='state')
x.head()

In [None]:
def plotNameToAx(name, year, ax):
    """Draw a map of raw registration counts for one name in one year.

    Reads two notebook-level globals: ``continental_usa`` (the state map) and
    ``namesByState`` (name counts per state and year). The map is joined to the
    matching name rows on the state abbreviation, and each state is colored by
    its ``count`` value.

    The join is an inner join, so states with no row for this name and year
    are left out of the drawn map.

    Parameters:
        name (str): The name whose popularity should be plotted.
        year (int): The year of birth to plot.
        ax (Axes): The Matplotlib Axes object to draw the map on.

    Returns:
        Whatever ``GeoDataFrame.plot`` returns (the Axes that was drawn on),
        so callers can keep styling it, e.g. ``.set_title(...)``.
    """
    # Rows for just this name and year.
    nameRows = namesByState[
        (namesByState['name'] == name) & (namesByState['yob'] == year)]

    # Attach the counts to the state geometries via the two-letter state code.
    mapData = continental_usa.merge(nameRows, left_on='STUSPS', right_on='state')

    return mapData.plot(column='count', ax=ax, legend=True)

# Make a 2x2 figure (four total plots), one panel per name.
panel_year = 2015
panel_names = ['Mary', 'Margaret', 'Elizabeth', 'Makenzie']

f, axarr = plt.subplots(2, 2, figsize=(11, 6))

plt.suptitle(f'Popularity of names by state in {panel_year}', fontsize=16)
plt.subplots_adjust(top=0.95)

# axarr.flat walks the grid row by row: top-left, top-right, bottom-left,
# bottom-right -- the same order the panels were filled in before.
for panel_ax, panel_name in zip(axarr.flat, panel_names):
    plotNameToAx(panel_name, panel_year, panel_ax)
    panel_ax.set_title(panel_name)

In [None]:
# Hmm, the most populated states unsurprisingly have the highest occurrences.
# Let's normalize by state: divide each name's count by the total number of
# registrations recorded for that state in the same year.

def plotNameToAx(name, year, ax):
    """Draw a map of one name's share of each state's registrations in a year.

    NOTE: this intentionally replaces the earlier raw-count ``plotNameToAx``
    defined above in this notebook; after this cell runs, that version is gone.

    Reads two notebook-level globals: ``continental_usa`` (the state map) and
    ``namesByState`` (name counts per state and year). For each state the
    plotted value is::

        count for `name` in `year` / sum of all counts for that state in `year`

    i.e. the normalization is by the total registrations present in
    ``namesByState`` for that state and year, not by the state's population.

    The join is an inner join, so states with no row for this name and year
    are left out of the drawn map.

    Parameters:
        name (str): The name whose popularity should be plotted.
        year (int): The year of birth to plot.
        ax (Axes): The Matplotlib Axes object to draw the map on.

    Returns:
        Whatever ``GeoDataFrame.plot`` returns (the Axes that was drawn on),
        so callers can keep styling it, e.g. ``.set_title(...)``.
    """
    # Restrict to the requested year once; both the totals and the name rows
    # come from this slice.
    yearRows = namesByState[namesByState['yob'] == year]

    # Total registrations per state, computed in a single pass instead of
    # re-filtering the whole table for every state on the map.
    stateTotals = yearRows.groupby('state')['count'].sum()

    # Attach this name's counts to the state geometries via the state code.
    mapData = continental_usa.merge(
        yearRows[yearRows['name'] == name],
        left_on='STUSPS', right_on='state')

    mapData['total_state'] = mapData['state'].map(stateTotals)
    mapData['prop_within_state'] = mapData['count'] / mapData['total_state']

    return mapData.plot(column='prop_within_state', ax=ax, legend=True)


# Make a 2x2 figure (four total plots), one panel per name.
panel_year = 2015
panel_names = ['Mary', 'Margaret', 'Elizabeth', 'Makenzie']

f, axarr = plt.subplots(2, 2, figsize=(11, 6))

# These maps show each name's proportion of its state's registrations, so the
# title says so; otherwise this figure is indistinguishable from the raw-count one.
plt.suptitle(
    f'Popularity of names by state in {panel_year} (share of state registrations)',
    fontsize=16)
plt.subplots_adjust(top=0.95)

# axarr.flat walks the grid row by row: top-left, top-right, bottom-left,
# bottom-right -- the same order the panels were filled in before.
for panel_ax, panel_name in zip(axarr.flat, panel_names):
    plotNameToAx(panel_name, panel_year, panel_ax)
    panel_ax.set_title(panel_name)

In [None]:
# Number of girls named Mary registered per birth year. 'count' is selected
# before summing so the text columns (name, gender) are never aggregated.
names[(names['gender'] == 'F') & (names['name'] == 'Mary')].groupby('yob')['count'].sum()

In [None]:
# Total registrations per birth year. numeric_only=True keeps the text columns
# (name, gender) out of the sum; recent pandas would otherwise "sum" them by
# concatenating every string in each group.
names.groupby('yob').sum(numeric_only=True)

In [None]:
# Share of each birth year's registrations that went to girls named Mary.
# Both sides sum numeric columns only: dividing the summed text columns
# (name, gender) would raise a TypeError on recent pandas.
(names[(names['gender'] == 'F') & (names['name'] == 'Mary')].groupby('yob').sum(numeric_only=True)
 / names.groupby('yob').sum(numeric_only=True))

In [None]:
# Plot the share of each birth year's registrations that went to girls named
# Mary. Both sides sum numeric columns only: dividing the summed text columns
# (name, gender) would raise a TypeError on recent pandas.
(names[(names['gender'] == 'F') & (names['name'] == 'Mary')].groupby('yob').sum(numeric_only=True)
 / names.groupby('yob').sum(numeric_only=True)).plot()