# Plotting and Visualization
---
DAT 512 Canisius College <br>
Professor Paul Lambson<br>
<br>
### Learning Objectives
- become familiar with matplotlib high level API
- modify elements of a visualization
- create visualizations from pandas data
- work with complex visualizations of matrix relationships
- review an example of geo mapping
<br>


### Sections
- [A Brief matplotlib API Primer](#brief_matplotlib_api_primer)
- [Figures and Subplots](#figure_and_subplots)
- [Colors, Markers, and Line Styles](#colors_markers_line_styles)
- [Ticks, Labels, and Legends](#ticks_lables_legends)
- [Annotations and Drawing on a Subplot](#annotations_drawing)
- [Plotting with pandas and seaborn](#plotting_pandas_seaborn)
- [Line Plots](#line_plots)
- [Bar Plots](#bar_plots)
- [Histograms and Density Plots](#histograms_density_plots)
- [Scatter or Point Plots](#scatter_point_plots)
- [Facet Grids and Categorical Data](#facet_grids_categorical_data)

In [None]:
# Core numerical / tabular libraries used throughout the notebook
import numpy as np
import pandas as pd
# Seed NumPy's global random generator so the random examples below are repeatable
np.random.seed(12345)
import matplotlib.pyplot as plt
import matplotlib
# Default size (width, height in inches) for every figure created after this point
plt.rc("figure", figsize=(10, 6))
# Print NumPy arrays with 4 decimal places and without scientific notation
np.set_printoptions(precision=4, suppress=True)

In [None]:
# matplotlib.pyplot is imported again here (harmless repeat of the setup cell);
# the new addition is plotly express, used for the interactive examples
import matplotlib.pyplot as plt
import plotly.express as px

# Brief matplotlib API Primer
<a id='brief_matplotlib_api_primer'></a>

In [None]:
# Ten consecutive integers (0 through 9) that will be drawn as a simple straight line
data = np.arange(0, 10)
data

In [None]:
# draw the array as a line; the trailing semicolon keeps the notebook from echoing the call's return value
plt.plot(data);

# Figures and Subplots
<a id='figure_and_subplots'></a>

In [None]:
# Start from an empty figure, then claim three cells of a 2 x 2 grid.
# The 3-digit code is shorthand for (rows, columns, position); positions count from 1.
fig = plt.figure()
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)

In [None]:
# drawing on ax3 populates the third subplot of the figure created above
ax3.plot(np.random.standard_normal(50).cumsum(),  # running sum of 50 normal draws (a random walk)
         color='black',
         linestyle="dashed") # not the cell's last expression, so by default Jupyter does not echo its return value
fig  # evaluating the figure last makes the notebook render it again

In [None]:
# other types of visualizations can be added to the remaining subplots
ax1.hist(np.random.standard_normal(100), 
         bins=20, 
         color="black", 
         alpha=.3); # alpha sets the transparency of the bars (0 = invisible, 1 = opaque)
ax2.scatter(np.arange(30), 
            np.arange(30) + 3 * np.random.standard_normal(30)  # y = x plus normal noise scaled by 3
           );
fig

In [None]:
# plt.subplots creates the figure and a whole grid of subplots in a single call
fig, axes = plt.subplots(nrows=2, ncols=3)
axes  # a 2 x 3 NumPy array of Axes objects; index it as axes[row, col]

In [None]:
# Draw a 2 x 2 grid of histograms whose x and y axes are shared
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
# np.ndindex(2, 2) yields (0, 0), (0, 1), (1, 0), (1, 1): the same order as a nested i/j loop
for i, j in np.ndindex(2, 2):
    axes[i, j].hist(np.random.standard_normal(500), bins=50, color="black", alpha=0.5)


In [None]:
# adjustable margins: set both the horizontal (wspace) and vertical (hspace) gaps between subplots to zero
fig.subplots_adjust(wspace=0, hspace=0)
fig  # re-render the figure with the new spacing

# Colors, Markers, and Line Styles
<a id='colors_markers_line_styles'></a>

In [None]:
# Line plot with markers
fig = plt.figure()
ax = fig.add_subplot()  # no arguments: a single subplot filling the figure
#! figure,id=mpl_marker_ex,width=4in,title="Line plot with markers"
ax.plot(np.random.standard_normal(30).cumsum(),  # random walk of 30 steps
        color="black",
        linestyle="dashed", 
        marker="o");  # circle marker at every data point

In [None]:
# a longer random walk (300 steps) drawn with Plotly Express instead of matplotlib
px.line(np.random.standard_normal(300).cumsum(),
       title='test')

In [None]:
# subsequent points are linearly interpolated by default;
# drawstyle changes how consecutive points are joined
fig = plt.figure()
ax = fig.add_subplot()
data = np.random.standard_normal(30).cumsum()
# first line: default drawing of the series
ax.plot(data, 
        color="black", 
        linestyle="dashed", 
        label="Default");
# second line: the same data drawn with the "steps-post" drawstyle
ax.plot(data, 
        color="black", 
        linestyle="dashed",
        drawstyle="steps-post", 
        label="steps-post");
# build the legend from the label= values passed to the two plot calls
ax.legend()

# Ticks, Labels, and Legends
<a id='ticks_lables_legends'></a>

In [None]:
# initiate plot for customization: a 1,000-step random walk whose ticks,
# labels and title are changed in the cells that follow
fig, ax = plt.subplots()
ax.plot(np.random.standard_normal(1000).cumsum());

In [None]:
#! ipython id=08aa54cdf68c46fa8137f4c30189fb7d
# place x ticks at five positions ...
ticks = ax.set_xticks([0, 250, 500, 750, 1000])
# ... then replace the numeric tick text with names, rotated 30 degrees in a smaller font
labels = ax.set_xticklabels(["one", "two", "three", "four", "five"],
                            rotation=30, fontsize=8)
fig

In [None]:
# changing other elements of the figure: x-axis label and subplot title
ax.set_xlabel("Stages")
ax.set_title("My first matplotlib plot")
fig  # re-render the figure so the changes are visible

In [None]:
# A deliberately busy plot: three 1,000-step random walks in the same color,
# distinguishable only by line style and by the label each one carries.
fig, ax = plt.subplots()
ax.plot(np.random.standard_normal(1000).cumsum(), color="black", label="one");
ax.plot(np.random.standard_normal(1000).cumsum(), color="black", linestyle="dashed", label="two");
ax.plot(np.random.standard_normal(1000).cumsum(), color="black", linestyle="dotted", label="three");

In [None]:
# legend takes the label= text and line attributes of each plotted line and creates a key box
ax.legend()
fig

# Annotations and Drawing on a Subplot
<a id='annotations_drawing'></a>

In [None]:
#! ipython verbatim id=00b59126f3364fdf900df06d486ed23d
from datetime import datetime

fig, ax = plt.subplots()

# Read the price history; the first CSV column becomes a parsed date index
data = pd.read_csv("examples/spx.csv", index_col=0, parse_dates=True)
spx = data["SPX"]

spx.plot(ax=ax, color="black")

# (date, caption) pairs to call out on the chart
crisis_data = [
    (datetime(2007, 10, 11), "Peak of bull market"),
    (datetime(2008, 3, 12), "Bear Stearns Fails"),
    (datetime(2008, 9, 15), "Lehman Bankruptcy"),
]

for date, label in crisis_data:
    # spx.asof(date) is the last available value on or before `date`;
    # the arrow tip sits 75 above it and the caption 225 above it
    ax.annotate(
        label,
        xy=(date, spx.asof(date) + 75),
        xytext=(date, spx.asof(date) + 225),
        arrowprops={"facecolor": "black", "headwidth": 4, "width": 2, "headlength": 4},
        horizontalalignment="left",
        verticalalignment="top",
    )

# Zoom in on 2007-2010
ax.set_xlim("1/1/2007", "1/1/2011")
ax.set_ylim(600, 1800)

ax.set_title("Important dates in the 2008-2009 financial crisis")

In [None]:
# set a main title for the graphic (same wording as before, typeset with an en dash)
ax.set_title("Important dates in the 2008–2009 financial crisis");
fig

In [None]:
# shapes (patches) are built-in classes; create each one, then attach it to the subplot
fig, ax = plt.subplots(figsize=(12, 6))
# rectangle: (x, y) anchor point, then width and height
rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color="black", alpha=0.3)
# circle: center point, then radius
circ = plt.Circle((0.7, 0.2), 0.15, color="blue", alpha=0.3)
# polygon: list of [x, y] vertices
pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],
                   color="green", alpha=0.5)
ax.add_patch(rect)
ax.add_patch(circ)
#! figure,id=vis_patch_ex,width=4in,title="Data visualization composed from three different patches"
ax.add_patch(pgon);

In [None]:
# saving plots to file; the output format (here SVG) is taken from the file extension
fig.savefig("figpath.svg")

# Plotting with pandas and seaborn
<a id='plotting_pandas_seaborn'></a>

# Line Plots
<a id='line_plots'></a>

In [None]:
#! ipython id=21edcf8da2154b7db12f7083ab4e9777
# 10-step random walk indexed by 0, 10, ..., 90
s = pd.Series(np.random.standard_normal(10).cumsum(), 
              index=np.arange(0, 100, 10))
#! figure,id=vis_series_plot_1,width=4in,title="Simple Series plot"
# Series.plot draws a line by default, with the index on the x-axis
s.plot()

In [None]:
# DataFrame’s plot method plots each of its columns as a different line on the same subplot
# cumsum(0) accumulates down the rows, so each column is its own random walk
df = pd.DataFrame(np.random.standard_normal((10, 4)).cumsum(0),
                  columns=["A", "B", "C", "D"],
                  index=np.arange(0, 100, 10))
# NOTE: style.use changes matplotlib's global settings, so the grayscale look
# also applies to plots drawn in later cells until another style is selected
plt.style.use('grayscale')
df.plot();

# Bar Plots
<a id='bar_plots'></a>

In [None]:
# bar and barh plots: the same Series drawn vertically (top) and horizontally (bottom)
fig, axes = plt.subplots(2, 1)
# 16 uniform random values labelled 'a' through 'p'
data = pd.Series(np.random.uniform(size=16), index=list("abcdefghijklmnop"))
data.plot.bar(ax=axes[0], color="black", alpha=0.7)
#! figure,id=vis_bar_plot_ex,width=4.5in,title="Horizonal and vertical bar plot"
data.plot.barh(ax=axes[1], color="black", alpha=0.7)
fig

In [None]:
#! ipython suppress id=bbbe2554feb1460caf331d0a9c1a158a
# re-seed NumPy's global generator so the random DataFrame built next is repeatable
np.random.seed(12348)

In [None]:
#! ipython id=da2de3b4af64451ca5b47b6687e40c44
# 6 x 4 frame of uniform random numbers in [0, 1); the column index itself is named "Genus"
df = pd.DataFrame(
    data=np.random.uniform(low=0.0, high=1.0, size=(6, 4)),
    index=["one", "two", "three", "four", "five", "six"],
    columns=pd.Index(["A", "B", "C", "D"], name="Genus"),
)
df

In [None]:
#! figure,id=vis_frame_barplot,width=4in,title="DataFrame bar plot"
# one group of bars per row, one bar per column; the columns' name ("Genus") titles the legend
df.plot.bar()

In [None]:
# data frame stacked bar chart: stacked=True piles each row's column values into a single horizontal bar
df.plot.barh(stacked=True, alpha=0.5)

In [None]:
#! ipython suppress id=70abd941e8d843678a1c1cf5010eaa27
# close every open matplotlib figure before moving on
plt.close("all")

In [None]:
# review the tips dataset again: load it from CSV and preview the first rows
tips = pd.read_csv("examples/tips.csv")
tips.head()

In [None]:
# Count parties by day (rows) and party size (columns), with the rows ordered Thur -> Sun
party_counts = pd.crosstab(index=tips["day"], columns=tips["size"]).reindex(
    index=["Thur", "Fri", "Sat", "Sun"]
)
party_counts

In [None]:
# take a subset of parties 2 to 5 sized; .loc slices by column label, so both 2 and 5 are included
party_counts = party_counts.loc[:, 2:5]

In [None]:
# Divide every row by its own total so the party-size shares within each day sum to 1
party_pcts = party_counts.div(party_counts.sum(axis=1), axis=0)
party_pcts

In [None]:
# Fraction of parties by size within each day, drawn as one stacked bar per day
party_pcts.plot.bar(stacked=True);

In [None]:
#! ipython suppress id=2e6c41575fc34b8ca7963b2c24167be2
# close every open matplotlib figure before the seaborn examples
plt.close("all")

In [None]:
# seaborn helps with aggregation in visualization
import seaborn as sns

# tip as a fraction of the bill excluding the tip (total_bill minus tip)
tips["tip_pct"] = tips["tip"] / (tips["total_bill"] - tips["tip"])
tips.head()

In [None]:
# Tipping percentage by day with error bars
# switch from the grayscale style selected earlier back to matplotlib's default style
# NOTE(review): 'default' probably also resets the figure size set in the setup cell - confirm
plt.style.use('default')
# one horizontal bar per day; seaborn aggregates the tip_pct rows for each day
sns.barplot(x="tip_pct", y="day", data=tips, orient="h");

In [None]:
#! ipython suppress id=05ae3cd4fb7048c5ba0d9ed6390c1f03
# close every open matplotlib figure so the next plot starts on a fresh one
plt.close("all")


In [None]:
# add a category to color: hue="time" splits each day's bar into one bar per value of the time column
sns.barplot(x="tip_pct", y="day", hue="time", data=tips, orient="h");

In [None]:
# customize elements with methods; these change seaborn's global settings for later plots
sns.set_style("whitegrid")   # background/grid theme
sns.set_palette("Greys_r")   # default color palette

# Histograms and Density Plots
<a id='histograms_density_plots'></a>

In [None]:
# histogram created automatically by pandas: tip_pct values grouped into 50 bins
tips["tip_pct"].plot.hist(bins=50);

In [None]:
# checking distribution with a density plot (a kernel density estimate of tip_pct)
tips["tip_pct"].plot.density()

In [None]:

comp1 = np.random.standard_normal(200)  # 200 standard normal draws
comp2 = 10 + 2 * np.random.standard_normal(200)  # 200 draws scaled by 2 and shifted by 10
values = pd.Series(np.concatenate([comp1, comp2]))

# Histogram of the two-component normal mixture
# (no `stat` argument is passed, so bar heights use seaborn's default statistic)
sns.histplot(values, bins=100, color="black")

# Scatter or Point Plots
<a id='scatter_point_plots'></a>

In [None]:
# load the macro dataset and keep four of its columns
macro = pd.read_csv("examples/macrodata.csv")
data = macro[["cpi", "m1", "tbilrate", "unemp"]]
# log-differences: row-to-row change in the log of each column; the first row
# has no predecessor, so it is NaN and gets dropped
trans_data = np.log(data).diff().dropna()
trans_data.tail()

In [None]:
# inspect the type of `ax`; here it is whatever the most recent cell that assigned
# `ax` left behind (presumably a matplotlib Axes - confirm by running it)
type(ax)

In [None]:
# A seaborn regression/scatter plot: scatter of the points plus a fitted regression line
# regplot's return value is kept as `ax` so the title can be set on it
ax = sns.regplot(x="m1", y="unemp", data=trans_data)
ax.set_title('Changes in log(m1) versus log(unemp)');

In [None]:
# Pair plot matrix of statsmodels macro data: a scatter plot for every pair of columns,
# a density estimate (kde) of each column on the diagonal, and semi-transparent points (alpha=0.2)
sns.pairplot(trans_data, diag_kind="kde", plot_kws={"alpha": 0.2})

# Facet Grids and Categorical Data
<a id='facet_grids_categorical_data'></a>

In [None]:
# Tipping percentage by day/time/smoker: one panel (column) per smoker value,
# bars colored by time; rows with tip_pct >= 1 are filtered out first
sns.catplot(x="day", y="tip_pct", hue="time", col="smoker",
            kind="bar", data=tips[tips.tip_pct < 1])

In [None]:
# rows rather than hue, produces rows of charts: one row of panels per time value,
# one column per smoker value; rows with tip_pct >= 1 are filtered out first
sns.catplot(x="day", y="tip_pct", row="time",
            col="smoker",
            kind="bar", data=tips[tips.tip_pct < 1])

In [None]:
# Box plot of tipping percentage by day; only rows with tip_pct below 0.5 are kept
sns.catplot(x="tip_pct", y="day", kind="box",
            data=tips[tips.tip_pct < 0.5])

# Geo Demo

In [None]:
# Quick Primer on Geo: plot Plotly's sample car-share data on an interactive tile map.
# Each marker is placed at (centroid_lat, centroid_lon), colored by peak_hour
# and sized by car_hours.
import plotly.express as px
df = px.data.carshare()
# The "stamen-*" base maps are now served by Stadia Maps and require an account/token,
# so the original "stamen-toner" style renders without a background map.
# "carto-positron" is a light base map that needs no token.
fig = px.scatter_mapbox(df, lat="centroid_lat", lon="centroid_lon",
                        color="peak_hour", size="car_hours",
                        color_continuous_scale=px.colors.cyclical.IceFire,
                        size_max=15, zoom=10,
                        mapbox_style="carto-positron")
fig.show()

In [None]:

# treat peak_hour as a categorical label instead of a number
df['peak_hour']=df['peak_hour'].astype("category")

In [None]:
# Same map again, now that peak_hour has been converted to a categorical column.
# NOTE(review): with a categorical color column Plotly is expected to use discrete
# colors, in which case color_continuous_scale has no visible effect - confirm.
# "stamen-toner" now requires a Stadia Maps account/token, so the token-free
# "carto-positron" base map is used instead.
fig = px.scatter_mapbox(df, lat="centroid_lat", lon="centroid_lon",
                        color="peak_hour", size="car_hours",
                        color_continuous_scale=px.colors.cyclical.IceFire,
                        size_max=15, zoom=10,
                        mapbox_style="carto-positron")
fig.show()

# In Class Problems

#### 0- Create a dataframe from the following link
https://raw.githubusercontent.com/sfrechette/adventureworks-neo4j/master/data/orders.csv
- Adjust any data types as needed (think dates)
- Create any new columns that make sense (think shipping time)
- Perform some exploratory data analysis on the dataframe

#### 1- Start graphing
- Create a histogram on Order Quantity
- Create a visualization on Line Total
- Look for seasonality on order date
- Create other visualizations as curiosity dictates