![Control panel at Three Mile Island generating station](controlroomTMI.jpg "The Control Panel")

## Basic plotting: bar charts and histograms with altair, matplotlib, and plotnine

IPython has "magic" commands such as `%pwd` and `%cd`, which are intended to make life easier for humans: they show the current directory and let us change directory.

In [None]:
# Let us first handle the preliminary imports
import pandas as pd
import altair as alt
import plotnine as p9

Note: you can import all of the functions
defined in a module with

    from pandas import *

This means less typing (good for interactive programming),
but it also obscures which code comes from which library.

In [None]:
# Load the tab-separated table of city rainfall figures into a data frame.
rainfall = pd.read_csv("cities.csv", sep="\t")

In [None]:
head(rainfall)

Sorry about that: `.head()` is a method that belongs to pandas data frames, so I have to call `df.head()` rather than `head(df)`.

In [None]:
rainfall.head()

In [None]:
# Quick check of the data type pandas inferred for each column, one per line:
for column in ("Days", "City, State", "Inches", "Milimeters"):
    print(rainfall[column].dtype)

In [None]:
rainfall.dtypes

In [None]:
# I can add a new column that is easier to type:
rainfall["City"] = rainfall["City, State"]


In [None]:
# basic, altair-defaults bar chart:
alt.Chart(rainfall).mark_bar().encode(alt.X("City"), alt.Y("Milimeters"))

In [None]:
# Same altair-defaults bar chart, but with the encoding types spelled out in
# the field shorthand: ":N" marks City as nominal (categorical) and ":Q" marks
# Milimeters as quantitative.
alt.Chart(rainfall).mark_bar().encode(alt.X("City:N"), alt.Y("Milimeters:Q"))

In [None]:
# basic, altair-defaults dot chart:
alt.Chart(rainfall).mark_point().encode(alt.X("City:N"), alt.Y("Milimeters:Q"))

In [None]:
# basic, altair-defaults line chart:
alt.Chart(rainfall).mark_line().encode(alt.X("City:N"), alt.Y("Milimeters:Q"))

In [None]:
# We can put the city names in y, where they are easier to read:
alt.Chart(rainfall).mark_bar().encode(
    alt.Y("City:N"), 
    alt.X("Milimeters:Q"))

In [None]:
# And we can instruct altair to sort by rainfall in "Milimeters" column:
alt.Chart(rainfall).mark_bar().encode(
    alt.Y("City:N", sort=alt.Sort(field="Milimeters")), 
    alt.X("Milimeters:Q", sort="ascending"))

In [None]:
# Can I sort my data?  
alt.Chart(rainfall.sort_values("Milimeters")).mark_bar().encode(
    alt.Y("City:N"), 
    alt.X("Milimeters:Q", sort="ascending"))

Note, the syntax here (the precise magical incantation to cause the sorting) is *not* at all obvious.  We have to look this up in the altair documentation: 
https://altair-viz.github.io/user_guide/generated/channels/altair.X.html

We can reverse the order by making the "sort" argument to alt.Y a more complex alt.Sort object:

In [None]:
alt.Chart(rainfall).mark_bar().encode(alt.Y("City:N", sort=alt.Sort(field="Milimeters", order="descending")), alt.X("Milimeters:Q"))

That was altair's default.  Now, what do we remember about the matplotlib API?
We can generate plots three ways:
* `data.plot()`
* `plt.plot(data)`
* `fig, ax = plt.subplots()` followed by `ax.plot(data)`

In [None]:
import matplotlib.pyplot as plt


In [None]:
plt.bar(rainfall["City"], rainfall["Milimeters"])

In [None]:
# This is not acceptable.  Perhaps the documentation will help?
plt.bar?

In [None]:
# This doesn't help.  But a quick search reveals the solution:
plt.bar(rainfall["City"], rainfall["Milimeters"])
plt.xticks(rotation=90)
1
# I was able to use this only because xticks() is exposed in the plt interface.

In [None]:
plt.bar(rainfall["City"], rainfall["Milimeters"])
plt.xticks(rotation=90)
plt.ylabel("Annual rainfall, mm")
plt.title("The rain falls on the plains, just less of it")
1

In [None]:
print(len(rainfall))
rainfall.head()

In [None]:
# Keep a copy of the table ordered from driest to wettest, then display it.
rainfallsorted = rainfall.sort_values("Milimeters")
rainfallsorted

In [None]:
plt.bar(rainfallsorted["City"], rainfallsorted["Milimeters"])
plt.xticks(rotation=90)
plt.ylabel("Annual rainfall, mm")
plt.title("The rain falls on the plains, just less of it [MATPLOTLIB]")
1

In [None]:
# I can do the same horizontally:
# plt.barh(y, width) -- the categories go on the y axis and the bar lengths
# run along the x axis.
plt.barh(rainfallsorted["City"], rainfallsorted["Milimeters"])
# Rainfall is now measured along x, so x is the axis that carries the unit
# label.  (The vertical version labelled y, and rotated the x ticks because
# they were long city names; the x ticks are now short numbers and need no
# rotation.)
plt.xlabel("Annual rainfall, mm")
plt.title("The rain falls on the plains, just less of it [MATPLOTLIB]")
1

In [None]:
import plotnine as p9

In [None]:
#  p9.ggplot(data=surveys_complete, mapping=p9.aes(x='factor(year)')) + p9.geom_bar()    

p9.ggplot(data=rainfall, mapping=p9.aes(x="City", y="Milimeters")) + p9.geom_bar(stat="identity")

In [None]:
p9.ggplot(data=rainfall, mapping=p9.aes(x="City", y="Milimeters")) + p9.geom_bar(stat="identity")+  p9.coord_flip()

In [None]:
p9.ggplot(data=rainfallsorted, mapping=p9.aes(x="City", y="Milimeters")) + p9.geom_bar(stat="identity") + p9.theme(axis_text_x = p9.element_text(angle = 90))

In [None]:
import seaborn as sns

In [None]:
ax = sns.barplot(x="Milimeters", y="City", data=rainfall)

In [None]:
# Oh, my.  Well.  Surely there is a knob somewhere that changes the color scheme?

In [None]:
ax = sns.barplot(x="Milimeters", y="City", data=rainfallsorted)

In [None]:
sns.set_theme(style="whitegrid")
ax = sns.barplot(x="Milimeters", y="City", data=rainfallsorted)

In [None]:
sns.set_style("dark")
ax = sns.barplot(x="Milimeters", y="City", data=rainfallsorted)

In [None]:
sns.set_style("darkgrid")
ax = sns.barplot(x="Milimeters", y="City", data=rainfallsorted)

In [None]:
# OK, set_style is not going to get me out of pastel rainbow decorative bar colors.

In [None]:
sns.color_palette("rocket")
ax = sns.barplot(x="Milimeters", y="City", data=rainfallsorted)

In [None]:
# That didn't do it either.
ax = sns.barplot(x="Milimeters", y="City", data=rainfallsorted, palette="rocket")


In [None]:
ax = sns.barplot(x="Milimeters", y="City", data=rainfallsorted, palette="ch:start=.2,rot=-.3")


![City of Chicago](city-of-chicago.jpg "Chicago logo")

In [None]:
salary=pd.read_csv("Current_Employee_Names__Salaries__and_Position_Titles.csv")

In [None]:
salary.head()

In [None]:
len(salary)

In [None]:
salary.dtypes

In [None]:
salary["Salary or Hourly"]

In [None]:
salary["Salary or Hourly"] == "Salary"

In [None]:
salary[salary["Salary or Hourly"] == "Salary"]


In [None]:
salaried = salary[salary["Salary or Hourly"] == "Salary"]
hourly = salary[salary["Salary or Hourly"] == "Hourly"]

In [None]:
print(len(salaried))
print(len(hourly))

In [None]:
len(salary)

In [None]:
plt.hist(salaried["Annual Salary"])

In [None]:
salary["Annual Salary fix"] = pd.to_numeric(salary["Annual Salary"].str.replace(",", ""))

In [None]:
pd.to_numeric(salary["Annual Salary"].str.replace(",", ""))

In [None]:
salary["Annual Salary fix"].dtype

In [None]:
plt.hist(salary["Annual Salary fix"])

In [None]:
plt.hist(salary["Annual Salary fix"], bins=30)

In [None]:
plt.hist(salary["Annual Salary fix"], bins=30)
plt.xlabel("Annual salary, $")

In [None]:
a = pd.to_numeric(salary["Hourly Rate"].str.replace(",", ""))
a.head()

In [None]:
import altair as alt
# Histogram of the numeric salary column: bin=True lets altair choose the bins
# and "count()" plots the number of rows in each bin.
# Only the first 5000 rows are passed to the chart.
# NOTE(review): presumably this slice works around altair's default limit on
# data set size (MaxRowsError) -- confirm.
alt.Chart(salary[0:5000]).mark_bar().encode(alt.X("Annual Salary fix:Q", bin=True), y="count()")

In [None]:
salary[0:1000]

In [None]:
# Re-create the two subsets: the first split was made before the
# "Annual Salary fix" column was added to salary, so those earlier subsets do
# not contain it.
salaried = salary[salary["Salary or Hourly"] == "Salary"]
hourly = salary[salary["Salary or Hourly"] == "Hourly"]

In [None]:
alt.Chart(salaried[0:10]).mark_bar().encode(alt.X("Annual Salary:Q", bin=True), alt.Y("count()"))

In [None]:
# And if I want more resolution, I replace bin=True with  bin=alt.Bin(maxbins=N)

In [None]:
salaried["Annual Salary fix"]

In [None]:
salary["Department"].value_counts()

In [None]:
import altair as alt
alt.Chart(salary[0:1000]).mark_bar().encode(alt.X("Annual Salary fix:Q", bin=True), y="count()", color="Department")

In [None]:
topdepts = salary["Department"].value_counts()[0:9]

In [None]:
topdepts

In [None]:
alt.Chart(salary[0:1000]).mark_bar().encode(alt.X("Annual Salary fix:Q", bin=alt.Bin(maxbins=30)), y="count()", color="Department")

In [None]:
topdepts = salary["Department"].value_counts()[0:9]

In [None]:
topdepts


In [None]:
topdepts.index

In [None]:
deptwhitelist = set (topdepts.index)

In [None]:
"PUBLIC LIBRARY" in deptwhitelist


In [None]:
"HOGWARTS" in deptwhitelist

In [None]:
"BOARD OF ETHICS" in deptwhitelist

In [None]:
salary["Department_clean"] = salary["Department"]

In [None]:
# Preview the labels of the rows whose index label is below 10, before any
# relabeling happens.
for dept in salary.loc[salary.index < 10, "Department_clean"]:
    print(dept)

# Collapse every department that is not in the whitelist into one "OTHER"
# bucket.  A single vectorized .loc assignment replaces the row-by-row chained
# assignment (salary.Department_clean[i] = ...): chained assignment raises
# SettingWithCopyWarning and, with pandas Copy-on-Write, leaves salary
# unchanged; it also ran a Python-level loop over every row.
in_whitelist = salary["Department_clean"].isin(deptwhitelist)
salary.loc[~in_whitelist, "Department_clean"] = "OTHER"


In [None]:
salary.Department.value_counts()

In [None]:
salary.Department_clean.value_counts()

In [None]:
alt.Chart(salary[0:5000]).mark_bar().encode(alt.X("Annual Salary fix:Q", bin=alt.Bin(maxbins=30)), y="count()", color="Department_clean")