# Python Data Visualizations and Demo

In [1]:
!pip install plotly

In [2]:
import numpy as np
import pandas as pd
import random as random
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits import mplot3d
import plotly
import plotly.graph_objs as go
import plotly.express as px

In [3]:
# Iris Reproduction Dataset
#
# Builds a synthetic, iris-like frame `df` (50 rows per species) by drawing each
# measurement from a normal distribution with the per-species mean and standard
# deviation tabulated below.

IRIS_SEED = 42                        # fixed seed: the same data on every Restart & Run All
iris_rng = random.Random(IRIS_SEED)   # private generator; the global `random` state is untouched

means_dict = {'species': ['setosa', 'versicolor', 'virginica'],
              'sepal_length': [5.006, 5.936, 6.588],
              'sepal_width': [3.428, 2.770, 2.974],
              'petal_length': [1.462, 4.260, 5.552],
              'petal_width': [0.246, 1.326, 2.026]}

stds_dict = {'species': ['setosa', 'versicolor', 'virginica'],
             'sepal_length': [0.352490, 0.516171, 0.635880],
             'sepal_width': [0.379064, 0.313798, 0.322497],
             'petal_length': [0.173664, 0.469911, 0.551895],
             'petal_width': [0.105386, 0.197753, 0.274650]}

means = pd.DataFrame(data=means_dict)
stds = pd.DataFrame(data=stds_dict)

species = ['setosa'] * 50 + ['versicolor'] * 50 + ['virginica'] * 50
species_id = [1] * 50 + [2] * 50 + [3] * 50
sepal_length, sepal_width, petal_length, petal_width = [], [], [], []
variables = [sepal_length, sepal_width, petal_length, petal_width]
measurement_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Draw order is row by row, then measurement by measurement.
for sp in species_id:
    row = sp - 1  # species ids are 1-based; the rows of `means` / `stds` are 0-based
    for samples, name in zip(variables, measurement_names):
        # Look statistics up by column name so a reordered table cannot silently
        # pair a measurement with the wrong mean / standard deviation.
        samples.append(iris_rng.normalvariate(means.loc[row, name], stds.loc[row, name]))

df = pd.DataFrame({'species': species,
                   'species_id': species_id,
                   'sepal_length': sepal_length,
                   'sepal_width': sepal_width,
                   'petal_length': petal_length,
                   'petal_width': petal_width})

## Matplotlib

Matplotlib is the standard, most commonly used plotting platform in Python.  It's pretty easy and relatively versatile.  

In [4]:
# evenly sampled time at 200ms intervals
t = np.arange(0., 5., 0.2)

# red dashes, blue squares and green triangles
# (series: t, t**2 and t**4; the triangles are green so they are not confused
# with the red dashed line)
plt.figure(figsize=(10,7))
plt.plot(t, t, 'r--', t, t**2, 'bs', t, t**4, 'g^')
plt.show()

In [5]:
# Columns for the scatter plot, addressed by key: 'a' is the x value, 'c' the
# colour value and 'd' the marker size.
data = dict(
    a=np.arange(50),
    c=np.random.randint(0, 50, 50),
    d=np.random.randn(50),
)
# 'b' is 'a' plus noise; 'd' becomes a non-negative size. 'b' is drawn first,
# so the random numbers are consumed in the same order as before.
data.update(
    b=data['a'] + 10 * np.random.randn(50),
    d=np.abs(data['d']) * 100,
)

# Bubble chart: every string below is a key looked up in `data`.
plt.figure(figsize=(10, 7))
plt.scatter(x='a', y='b', s='d', c='c', data=data)
plt.xlabel('entry a')
plt.ylabel('entry b')
plt.show()

In [6]:
names = ['group_a', 'group_b', 'group_c']
values = [1, 10, 100]

# The same categorical data drawn three ways, side by side: bars, points, line.
fig, (bar_ax, scatter_ax, line_ax) = plt.subplots(1, 3, figsize=(12, 4))
bar_ax.bar(names, values)
scatter_ax.scatter(names, values)
line_ax.plot(names, values)
fig.suptitle('Categorical Plotting')
plt.show()

In [7]:
# 10,000 normally distributed samples with mean 100 and standard deviation 15.
mu, sigma = 100, 15
x = mu + sigma * np.random.randn(10000)

# the histogram of the data (50 bins, normalised to a density)
plt.figure(figsize=(10, 7))
n, bins, patches = plt.hist(x, bins=50, density=True, facecolor='g', alpha=0.75)

plt.title('Histogram of IQ')
plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.text(81, .0289, r'$\mu=100,\ \sigma=15$')  # annotation placed in data coordinates
plt.axis([40, 160, 0, 0.03])                   # [xmin, xmax, ymin, ymax]
plt.grid(True)
plt.show()

In [8]:
# Setup

def f(x, y):
    """Surface height ``(1 - x**3) + 100 * (y**2 - x)**2`` for scalars or NumPy arrays."""
    return (1 - x**3) + 100 * (y**2 - x)**2


# Evaluate the surface on a 100 x 100 grid covering [-2, 2] x [-2, 2].
x, y = np.linspace(-2, 2, 100), np.linspace(-2, 2, 100)
X, Y = np.meshgrid(x, y)

Z = f(X, Y)

# Contour Plotting
# Draws Z over the (X, Y) grid as a 3-D surface.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(
    X, Y, Z,
    rstride=1, cstride=1,
    cmap='viridis', edgecolor='none',
)
ax.set_xlabel('x', fontsize=14)
ax.set_ylabel('y', fontsize=14)
ax.set_title('Contour Visualization\n', fontsize=18)
plt.show()

In [9]:
# Setup
# trajectories: filled by the descent loop with (learning rate, x, y, f(x, y)) tuples
trajectories = []
# lr: the learning rates to compare; the loop performs one descent run per value
lr = [0.001, 0.0005, 0.00001]
# n_iter: number of gradient steps taken in each run
n_iter = 1000

def grad_f(x, y):
    """Return ``np.array([df/dx, df/dy])`` at ``(x, y)``.

    These are the partial derivatives of
    ``(1 - x**3) + 100 * (y**2 - x)**2``.
    """
    residual = y**2 - x  # shared factor of both partial derivatives
    return np.array([-3 * x**2 - 200 * residual,
                     400 * y * residual])

# Gradient Descent
# One independent run per learning rate, each restarted from (-1.5, 1.5).
for step_size in lr:

    x, y = -1.5, 1.5
    trajectories.append((step_size, x, y, f(x, y)))  # record the starting point

    for _ in range(n_iter):
        dfdx, dfdy = grad_f(x, y)
        x -= step_size * dfdx
        y -= step_size * dfdy
        trajectories.append((step_size, x, y, f(x, y)))

# Reformat collected trajectories for use
# (one row per recorded point: learning rate, x, y, f(x, y))
traj = np.array(trajectories)

# Learning Rate Plotting
# Overlays each descent trajectory on a translucent copy of the surface.
fig = plt.figure(figsize = (12, 12))
ax = plt.axes(projection='3d')
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='viridis', edgecolor='none', alpha = 0.5)
ax.set_title('Contour Visualization\n', fontsize = 18);
ax.set_xlabel('x', fontsize = 14)
ax.set_ylabel('y', fontsize = 14)
ax.set_zlabel('z', fontsize = 14)

list_colors = ['red', 'green', 'blue']

for i, line_color in zip(lr, list_colors):
    indexes = np.isin(traj[:,0], i)     # rows of traj whose first column is this learning rate
    temp = traj[indexes]                # the trajectory recorded for that learning rate
    # Label the line itself. Passing only a list of labels to legend() matches
    # labels to artists purely by order, and the surface drawn above is an
    # artist too, so (depending on the Matplotlib version) it can take the
    # first label and shift the rest.
    ax.plot(temp[:, 1], temp[:, 2], temp[:, 3], color = line_color, lw = 1, label = str(i))

# With no positional arguments the legend lists only artists that carry a label.
ax.legend(title = "Learning Rates", title_fontsize = 15, fontsize = 13)

ax.view_init(40, -20)
plt.show()

## Seaborn

Seaborn specializes in easy, beautiful statistical plots, but does many different kinds of graphs well.  

In [None]:
# Default figure size (inches) for the seaborn figures drawn from here on.
sns.set(rc={"figure.figsize": (11.7, 8.27)})

# Stacked histogram of sepal length, one colour per species.
sns.histplot(
    data=df,
    x="sepal_length",
    hue="species",
    multiple="stack",
)

In [10]:
# Stacked kernel density estimate of sepal length, one colour per species.
sns.kdeplot(
    data=df,
    x="sepal_length",
    hue="species",
    multiple="stack",
)

In [11]:
# Two panels: sepal vs. petal length on the left, rows per species on the right.
# The figure is bound to `fig`, not `f`: `f` is the objective function used by
# the surface and gradient-descent cells, and rebinding it to a Figure would
# break those cells when they are re-run.
fig, axs = plt.subplots(1, 2, figsize=(12, 6), gridspec_kw=dict(width_ratios=[4, 3]))
sns.scatterplot(data=df, x="sepal_length", y="petal_length", hue="species", ax=axs[0])
sns.histplot(data=df, x="species", hue="species", shrink=.8, alpha=.8, legend=False, ax=axs[1])
fig.tight_layout()

In [12]:
# Joint distribution of sepal length and width, coloured by species.
sns.jointplot(
    data=df,
    x="sepal_length",
    y="sepal_width",
    hue="species",
    height=10,
)

## Bokeh

Another interactive graphing program, Bokeh specializes in streaming datasets, but also does many things well.  Go ahead, try to interact with the graphs and you'll be surprised with what you may find.

In [13]:
# Bokeh entry points used by the cells below, plus the two NumPy functions the
# first Bokeh example calls unqualified.
from bokeh.plotting import figure 
from bokeh.io import output_notebook, show
from numpy import cos, linspace
# Configure Bokeh so that subsequent show() calls render inside the notebook.
output_notebook()

In [15]:
# cos(x) sampled at 100 evenly spaced points on [-6, 6], drawn as translucent markers.
x = linspace(-6, 6, 100)
y = cos(x)

p = figure(height=500, width=500)
p.circle(x=x, y=y, alpha=0.5, color="firebrick", size=7)
show(p)

In [17]:
from bokeh.sampledata.autompg import autompg

# Per-year mean and standard deviation of mpg over the whole table.
grouped = autompg.groupby("yr")
mpg = grouped.mpg
avg = mpg.mean()
std = mpg.std()
years = list(grouped.groups)

# Rows with origin code 1 and 3 (named American and Japanese in this example).
american = autompg[autompg["origin"] == 1]
japanese = autompg[autompg["origin"] == 3]

p = figure(title="MPG by Year (Japan and US)")

# Shaded band of one standard deviation around the yearly mean.
# (A wider band can be added the same way with bottom=avg-2*std, top=avg+2*std.)
p.vbar(
    x=years, bottom=avg - std, top=avg + std, width=0.8,
    fill_alpha=0.2, line_color=None, legend_label="MPG 1 stddev",
)
# Individual cars on top of the band.
p.circle(x=japanese["yr"], y=japanese["mpg"],
         size=10, alpha=0.5, color="red", legend_label="Japanese")
p.triangle(x=american["yr"], y=american["mpg"],
           size=10, alpha=0.3, color="blue", legend_label="American")
p.legend.location = "top_left"

show(p)

In [19]:
import numpy as np

from bokeh.io import output_file, show
from bokeh.models import HoverTool
from bokeh.plotting import figure

# 500 points: both coordinates are 2 + 2 * standard normal; x is drawn before y.
n = 500
x = 2 + 2 * np.random.standard_normal(n)
y = 2 + 2 * np.random.standard_normal(n)

p = figure(
    title="Hexbin for 500 points",
    match_aspect=True,
    tools="wheel_zoom,reset",
    background_fill_color='#440154',
)
p.grid.visible = False

# Keep the renderer returned by hexbin(): the hover tool below is restricted to it.
r, bins = p.hexbin(x, y, size=0.5, hover_alpha=0.8)

# Raw points drawn over the bins.
p.circle(x, y, color="white", size=1)

hover = HoverTool(
    tooltips=[("count", "@c"), ("(q,r)", "(@q, @r)")],
    mode="mouse",
    point_policy="follow_mouse",
    renderers=[r],
)
p.add_tools(hover)

output_file("hexbin.html")

show(p)

In [20]:
from collections import OrderedDict
from io import StringIO
from math import log, sqrt

import numpy as np
import pandas as pd

from bokeh.plotting import figure, output_file, show

# Inline CSV: one row per bacterium with a value for each of the three drugs and
# the gram result. The leading blank line and the padding after each comma are
# handled by the read_csv options further down.
antibiotics = """
bacteria,                        penicillin, streptomycin, neomycin, gram
Mycobacterium tuberculosis,      800,        5,            2,        negative
Salmonella schottmuelleri,       10,         0.8,          0.09,     negative
Proteus vulgaris,                3,          0.1,          0.1,      negative
Klebsiella pneumoniae,           850,        1.2,          1,        negative
Brucella abortus,                1,          2,            0.02,     negative
Pseudomonas aeruginosa,          850,        2,            0.4,      negative
Escherichia coli,                100,        0.4,          0.1,      negative
Salmonella (Eberthella) typhosa, 1,          0.4,          0.008,    negative
Aerobacter aerogenes,            870,        1,            1.6,      negative
Brucella antracis,               0.001,      0.01,         0.007,    positive
Streptococcus fecalis,           1,          1,            0.1,      positive
Staphylococcus aureus,           0.03,       0.03,         0.001,    positive
Staphylococcus albus,            0.007,      0.1,          0.001,    positive
Streptococcus hemolyticus,       0.001,      14,           10,       positive
Streptococcus viridans,          0.005,      10,           40,       positive
Diplococcus pneumoniae,          0.005,      11,           10,       positive
"""

# Colour per drug; insertion order is the order the hand-drawn legend uses.
drug_color = OrderedDict(
    Penicillin="#0d3362",
    Streptomycin="#c64737",
    Neomycin="black",
)

# Background colour per gram result.
gram_color = OrderedDict(
    negative="#e69584",
    positive="#aeaeb8",
)

# skiprows=1 drops the blank first line of the literal; skipinitialspace strips
# the alignment padding that follows each comma.
df = pd.read_csv(
    StringIO(antibiotics),
    engine='python',
    skipinitialspace=True,
    skiprows=1,
)

# Figure size in pixels, and the radial band (data units) the drug bars live in.
width = height = 800
inner_radius = 90
outer_radius = 300 - 10

# Linear map r = a * s + b of s = sqrt(log(1e4 * value)), anchored so that a
# value of 1000 lands on inner_radius and a value of 0.001 on outer_radius.
minr = sqrt(log(.001 * 1E4))
maxr = sqrt(log(1000 * 1E4))
a = (outer_radius - inner_radius) / (minr - maxr)
b = inner_radius - a * maxr


def rad(mic):
    """Map a value (scalar or array) to its radius; larger values give smaller radii."""
    scaled = np.sqrt(np.log(mic * 1E4))
    return a * scaled + b

# One angular slot per row of df, plus one spare slot; each slot is split into
# seven equal slices used to position the per-drug bars.
big_angle = 2.0 * np.pi / (len(df) + 1)
small_angle = big_angle / 7

# Use width/height rather than plot_width/plot_height: the plot_* spellings were
# deprecated and then removed in Bokeh 3, and width/height is what the earlier
# cos(x) figure in this notebook already uses.
p = figure(width=width, height=height, title="",
    x_axis_type=None, y_axis_type=None,
    x_range=(-420, 420), y_range=(-420, 420),
    min_border=0, outline_line_color="black",
    background_fill_color="#f0e1d2")

# No Cartesian grid; the circular and radial axes are drawn by hand.
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

# annular wedges
# `angles` is the end angle of each row's slot: the slots start half a slot
# below pi/2 and each later row is shifted by a further -big_angle. Each
# background wedge spans [angles - big_angle, angles] and is coloured by the
# row's gram value.
angles = np.pi/2 - big_angle/2 - df.index.to_series()*big_angle
colors = [gram_color[gram] for gram in df.gram]
p.annular_wedge(
    0, 0, inner_radius, outer_radius, -big_angle+angles, angles, color=colors,
)

# small wedges
# One bar per drug inside each slot, each one small_angle wide: penicillin uses
# slice 5-6, streptomycin slice 3-4, neomycin slice 1-2 (counted from the start
# of the slot). The bar runs from inner_radius out to rad(value).
p.annular_wedge(0, 0, inner_radius, rad(df.penicillin),
                -big_angle+angles+5*small_angle, -big_angle+angles+6*small_angle,
                color=drug_color['Penicillin'])
p.annular_wedge(0, 0, inner_radius, rad(df.streptomycin),
                -big_angle+angles+3*small_angle, -big_angle+angles+4*small_angle,
                color=drug_color['Streptomycin'])
p.annular_wedge(0, 0, inner_radius, rad(df.neomycin),
                -big_angle+angles+1*small_angle, -big_angle+angles+2*small_angle,
                color=drug_color['Neomycin'])

# circular axes and labels
# Reference circles at the powers of ten from 1e-3 to 1e3, placed with the same
# a * sqrt(log(1e4 * value)) + b transform as rad(); the last value (1e3) gets
# a circle but no text label.
labels = np.power(10.0, np.arange(-3, 4))
radii = a * np.sqrt(np.log(labels * 1E4)) + b
p.circle(0, 0, radius=radii, fill_color=None, line_color="white")
p.text(0, radii[:-1], [str(r) for r in labels[:-1]],
       text_font_size="11px", text_align="center", text_baseline="middle")

# radial axes
# Start and end angle are identical, so each wedge has zero angular width;
# presumably this renders as a radial separator line at the start of each slot.
p.annular_wedge(0, 0, inner_radius-10, outer_radius+10,
                -big_angle+angles, -big_angle+angles, color="black")

# bacteria labels
# Placed at radius radii[0] (the radius of the 1e-3 circle) on the centre line
# of each slot and rotated to follow it.
xr = radii[0]*np.cos(np.array(-big_angle/2 + angles))
yr = radii[0]*np.sin(np.array(-big_angle/2 + angles))
label_angle=np.array(-big_angle/2+angles)
label_angle[label_angle < -np.pi/2] += np.pi # easier to read labels on the left side
p.text(xr, yr, df.bacteria, angle=label_angle,
       text_font_size="12px", text_align="center", text_baseline="middle")

# OK, these hand drawn legends are pretty clunky, will be improved in future release
# Gram legend: two coloured dots with "Gram-<key>" captions near the bottom.
p.circle([-40, -40], [-370, -390], color=list(gram_color.values()), radius=5)
p.text([-30, -30], [-370, -390], text=["Gram-" + gr for gr in gram_color.keys()],
       text_font_size="9px", text_align="left", text_baseline="middle")

# Drug legend: three coloured swatches with the drug names, around the origin.
p.rect([-40, -40, -40], [18, 0, -18], width=30, height=13,
       color=list(drug_color.values()))
p.text([-15, -15, -15], [18, 0, -18], text=list(drug_color),
       text_font_size="12px", text_align="left", text_baseline="middle")

# Direct output to burtin.html (with the given page title), then show the figure.
output_file("burtin.html", title="burtin.py example")

show(p)

# This visualization shows xyz

In [21]:
from bokeh.io import output_file, show
from bokeh.plotting import figure
from bokeh.sampledata.periodic_table import elements
from bokeh.transform import dodge, factor_cmap

output_file("periodic.html")

# Axis categories: periods as Roman numerals, groups "1" to "18" as strings.
periods = ["I", "II", "III", "IV", "V", "VI", "VII"]
groups = [str(number) for number in range(1, 19)]

# Work on a copy of the sample table; the columns used as categorical
# coordinates or as label text are converted to strings.
df = elements.copy()
for column in ("atomic mass", "group"):
    df[column] = df[column].astype(str)
df["period"] = [periods[number - 1] for number in df.period]
# Drop rows with no group ("-") and the two elements Lr and Lu.
df = df[df.group != "-"]
df = df[~df.symbol.isin(["Lr", "Lu"])]

# Fill colour for each value of the `metal` column.
cmap = dict([
    ("alkali metal",         "#a6cee3"),
    ("alkaline earth metal", "#1f78b4"),
    ("metal",                "#d93b43"),
    ("halogen",              "#999d9a"),
    ("metalloid",            "#e08d49"),
    ("noble gas",            "#eaeaea"),
    ("nonmetal",             "#f1d4Af"),
    ("transition metal",     "#599d7A"),
])

# Hover fields; "@{...}" is needed for column names that contain spaces.
TOOLTIPS = [
    ("Name", "@name"),
    ("Atomic number", "@{atomic number}"),
    ("Atomic mass", "@{atomic mass}"),
    ("Type", "@metal"),
    ("CPK color", "$color[hex, swatch]:CPK"),
    ("Electronic configuration", "@{electronic configuration}"),
]

# Categorical axes: groups on x, periods on y (reversed so period I is at the top).
# Use width/height rather than plot_width/plot_height: the plot_* spellings were
# deprecated and then removed in Bokeh 3, and width/height is what the earlier
# cos(x) figure in this notebook already uses.
p = figure(title="Periodic Table (omitting LA and AC Series)", width=1000, height=450,
           x_range=groups, y_range=list(reversed(periods)),
           tools="hover", toolbar_location=None, tooltips=TOOLTIPS)

# One box per element, coloured by its `metal` category; the renderer is kept
# in `r` so the hover tool can be limited to it at the end.
r = p.rect(
    "group", "period", 0.95, 0.95,
    source=df, fill_alpha=0.6, legend_field="metal",
    color=factor_cmap('metal', palette=list(cmap.values()), factors=list(cmap.keys())),
)

# Keyword arguments shared by every text glyph read from df.
text_props = dict(source=df, text_align="left", text_baseline="middle")

# Text starts 0.4 of a category to the left of each box centre.
x = dodge("group", -0.4, range=p.x_range)

p.text(x=x, y="period", text="symbol", text_font_style="bold", **text_props)

# Smaller lines of text, each offset vertically within the box.
for field, offset, font_size in (
    ("atomic number", 0.3, "11px"),
    ("name", -0.35, "7px"),
    ("atomic mass", -0.2, "7px"),
):
    p.text(x=x, y=dodge("period", offset, range=p.y_range), text=field,
           text_font_size=font_size, **text_props)

# "LA" / "AC" markers in group 3 of periods VI and VII.
p.text(x=["3", "3"], y=["VI", "VII"], text=["LA", "AC"],
       text_align="center", text_baseline="middle")

# Strip the chart chrome so only the boxes and the legend remain.
p.outline_line_color = None
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_standoff = 0
p.legend.orientation = "horizontal"
p.legend.location = "top_center"
p.hover.renderers = [r]  # only hover element boxes

show(p)

In [22]:
from IPython.display import IFrame

# Embed the page at this URL in the cell output as a 950 x 1150 frame.
IFrame(src='https://demo.bokeh.org/gapminder', width=950, height=1150)

# D7 Notional Data

In [23]:
# Load the CSV from the notebook's working directory; from here on `df` refers
# to this table.
df = pd.read_csv(filepath_or_buffer='Notional Cutter Hours Boardings and Missions.csv')

In [24]:
# Preview the first five rows.
df.head(n=5)

In [25]:
# Column dtypes as inferred by read_csv.
df.dtypes

In [26]:
# Print a summary of the frame: index range, columns, non-null counts, dtypes, memory use.
df.info()