Warning: sometimes our graphs look bad because the resolution of the 
graph is poor.  This has a technical fix.  
Here are some pointers for getting graphics out of Matplotlib that look good when imported into other programs.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# imports data from Nat1988.txt which is about 810 MB
# and KEY1988.txt which is an ad-hoc table of columns and labels
# https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/DVS/natality/Nat1988.zip 


In [None]:
def parse_natality(natalitydata, datadictionary, numeric=False):
    '''Split fixed-width natality records into labeled columns.

    Parameters
    ----------
    natalitydata : pandas.DataFrame
        Frame whose column ``0`` holds one fixed-width record (a string)
        per row.
    datadictionary : list of [start, stop, column_name]
        1-based, inclusive character positions of each field and the label
        to give it, for example ``[[43, 44, "MATERNALAGE"]]``.
    numeric : bool, optional
        If True, pass every extracted column through ``pd.to_numeric``;
        values that cannot be parsed become NaN.  The default (False)
        returns the fields as strings.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``datadictionary``, indexed like
        ``natalitydata``.  A field is NaN when its record is missing or
        shorter than ``stop`` characters.
    '''
    records = natalitydata[0]
    record_length = records.str.len()
    columns = {}
    for start, stop, column_name in datadictionary:
        # One vectorized slice per field instead of concatenating the
        # field character by character.
        field = records.str.slice(start - 1, stop)
        # A record too short to contain the whole field yields NaN rather
        # than a silently truncated value.
        columns[column_name] = field.where(record_length >= stop)
    df = pd.DataFrame(columns)
    if numeric:
        df = df.apply(pd.to_numeric, errors="coerce")
    return df

In [None]:
# Load the table of column positions and column names into a list of lists.
# Each non-blank line of the file holds three whitespace-separated fields:
# start position, stop position (both integers) and the column name.
# NOTE(review): the path below is absolute and machine-specific; point it at
# your own copy of KEY1988.txt.
key1988 = []
# The with-block closes the file handle as soon as the table is read.
with open("/Users/wltrimbl/git/VIZ/visualization-curriculum/KEY1988.txt") as keyfile:
    for line in keyfile:
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        key1988.append([int(fields[0]), int(fields[1]), fields[2]])
key1988


In [None]:
# Load the large natality file (215 x 3.9 million cells) into a pandas frame.
# header=None stops pandas from using the first record as column names,
# so the columns get integer labels starting at 0.
natality_path = "~/Downloads/birth/NATL1988.txt"
n1988 = pd.read_csv(natality_path, header=None)

In [None]:
# Cut every record into the labeled columns listed in key1988,
# then preview the first five rows.
df = parse_natality(natalitydata=n1988, datadictionary=key1988)
df.head(n=5)

In [None]:
# Count the rows for every observed (one-minute, five-minute) Apgar pair.
# Grouping by both columns yields a Series indexed by the two scores.
apgar_columns = ["ONEMINUTEAPGAR", "FIVEMINUTEAPGAR"]
apgarapgarhist = df.groupby(apgar_columns)["FIVEMINUTEAPGAR"].count()


In [None]:
# Pivot the pair counts into a 2-D table: rows are one-minute scores,
# columns are five-minute scores.  unstack(fill_value=0) puts a zero count
# wherever a score pair never occurs in the data; a plain reshape of the
# counts would fail (or misalign) as soon as one pair is absent.
apgar_table = apgarapgarhist.unstack(fill_value=0)
if apgar_table.shape != (12, 12):
    raise ValueError(
        "expected a 12 x 12 table of Apgar counts, got %d x %d" % apgar_table.shape
    )
# chop off unknown apgar scores: keep the first 11 rows and columns
# (presumably scores 0-10, with the unknown code sorting last -- verify
# against the data dictionary)
apgarapgar = apgar_table.to_numpy()[0:11, 0:11]

In [None]:
apgarapgar  # now it's just a numpy array, so I don't have pandas
# conveniences like .head(); the whole 11 x 11 array is displayed instead

In [None]:
# Heat map of the Apgar pair counts on an exaggerated (logarithmic) color scale.
#  https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.savefig.html
#  plt.savefig() claims to support PNG, EPS, PDF, and SVG, and does support JPG
#  It stands to reason they have different optional arguments (resolution, quality, metadata)
from matplotlib.colors import LogNorm

# Divide by the grand total so each cell is a fraction of all counted pairs.
apgar_fraction = apgarapgar / apgarapgar.sum()
plt.imshow(apgar_fraction, norm=LogNorm())
plt.ylabel("One minute Apgar score")
plt.xlabel("Five minute Apgar score")
plt.colorbar()


In [None]:
# Step 1: enlarge the axis labels.
apgar_fraction = apgarapgar / apgarapgar.sum()
label_font = {"fontsize": 14}
plt.imshow(apgar_fraction, norm=LogNorm())
plt.ylabel("One minute Apgar score", **label_font)
plt.xlabel("Five minute Apgar score", **label_font)
plt.colorbar()

In [None]:
# Step 2: keep the bigger axis labels and also enlarge the tick labels.
apgar_fraction = apgarapgar / apgarapgar.sum()
label_font = {"fontsize": 14}
tick_font = {"fontsize": 13}
plt.imshow(apgar_fraction, norm=LogNorm())
plt.ylabel("One minute Apgar score", **label_font)
plt.xlabel("Five minute Apgar score", **label_font)
plt.xticks(**tick_font)
plt.yticks(**tick_font)
plt.colorbar()


In [None]:
# Step 3: also enlarge the tick labels on the color bar.
apgar_fraction = apgarapgar / apgarapgar.sum()
label_font = {"fontsize": 14}
tick_font = {"fontsize": 13}
plt.imshow(apgar_fraction, norm=LogNorm())
plt.ylabel("One minute Apgar score", **label_font)
plt.xlabel("Five minute Apgar score", **label_font)
plt.xticks(**tick_font)
plt.yticks(**tick_font)
# The color bar has its own axes, so its tick labels are sized separately.
cb = plt.colorbar()
cb.ax.tick_params(labelsize=13)

In [None]:
# Same figure as before, now written to disk with savefig's default settings.
apgar_fraction = apgarapgar / apgarapgar.sum()
label_font = {"fontsize": 14}
tick_font = {"fontsize": 13}
plt.imshow(apgar_fraction, norm=LogNorm())
plt.ylabel("One minute Apgar score", **label_font)
plt.xlabel("Five minute Apgar score", **label_font)
plt.xticks(**tick_font)
plt.yticks(**tick_font)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=13)
plt.savefig("APGAR.png")

In [None]:
# savefig results in a below-presentation-standard graphic:
# (exiftool is an external command-line program run through the shell;
#  its report includes the image dimensions quoted below)
!exiftool APGAR.png
#  432 x 288 pixels!! No wonder it looks crummy.  

In [None]:
# Write the same figure in every candidate output format so the files can be compared.
apgar_fraction = apgarapgar / apgarapgar.sum()
label_font = {"fontsize": 14}
tick_font = {"fontsize": 13}
plt.imshow(apgar_fraction, norm=LogNorm())
plt.ylabel("One minute Apgar score", **label_font)
plt.xlabel("Five minute Apgar score", **label_font)
plt.xticks(**tick_font)
plt.yticks(**tick_font)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=13)
# savefig picks the output format from the file extension.
for filename in ("APGAR.jpg", "APGAR.pdf", "APGAR.svg", "APGAR.png", "APGAR.eps"):
    plt.savefig(filename)

In [None]:
# List the saved files with human-readable sizes; the shell pattern
# APGAR.??? matches every APGAR file with a three-character extension.
!ls -lh APGAR.???
# So some are bigger than others.  
# Here JPG and PNG are bitmap formats,
# EPS and PDF are container formats that *can* contain 
# vector graphics if they are created properly, and 
# SVG is a vector format that isn't widely supported.

# Why create so many files?  To check which ones I can use
# downstream.

In [None]:
# For my presentation software, I can import EPS and SVG.
# If I were completely stuck with the bitmap formats, 
# I could improve the quality by increasing the resolution:

In [None]:
# Same figure again, saved as a bitmap at 300 dots per inch instead of the default.
apgar_fraction = apgarapgar / apgarapgar.sum()
label_font = {"fontsize": 14}
tick_font = {"fontsize": 13}
plt.imshow(apgar_fraction, norm=LogNorm())
plt.ylabel("One minute Apgar score", **label_font)
plt.xlabel("Five minute Apgar score", **label_font)
plt.xticks(**tick_font)
plt.yticks(**tick_font)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=13)
plt.savefig("APGAR300.png", dpi=300)


In [None]:
# Inspect the 300-dpi file with the external exiftool program; the text
# after '#' on the command line is a shell comment recording what it reported.
!exiftool APGAR300.png # and now it's 1800x1200 and 62kB
# It looks good enough for anywhere I am going to put it,
# but it is not infinitely zoomable like EPS or SVG.