## **Getting the notebook ready for analysis**

In [0]:
# -- connect your Google Drive: mount it at /content/drive so that files
# -- stored on it can be read by path (this cell imports google.colab, so it
# -- only works when the notebook is running in Google Colab)
from google.colab import drive
drive.mount("/content/drive")

In [0]:
# -- install sodapy
!pip install sodapy

In [0]:
# -- useful imports
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
import sodapy

## **Reading in data**

In [0]:
# -- load a CSV file straight from a URL into a DataFrame (example: NOAA sea
# -- level); the first 5 lines of the file are skipped before parsing
fname = "https://www.star.nesdis.noaa.gov/sod/lsa/SeaLevelRise/slr/slr_sla_gbl_free_txj1j2_90.csv"
noaa = pd.read_csv(fname, sep=",", lineterminator="\n", skiprows=5)

In [0]:
# -- read in data from your Google Drive (example: World Bank)
# -- (the path is absolute, rooted at the /content/drive mount point, so the
# --  read does not depend on the notebook's current working directory)
fname = "/content/drive/My Drive/dstep20/data/world_bank/world_bank_gdp_leb.csv"
wbdat = pd.read_csv(fname)

In [0]:
# -- read in data from a Socrata Open Data database (example: NYC DSNY)
dom     = "data.cityofnewyork.us"  # Socrata domain hosting the data set
dsid    = "ebb7-mvp5"              # data set identifier on that domain
lim     = 100000                   # maximum number of records requested
app_tok = None                     # no application token is supplied

# -- the "with" block closes the client's connection once the request is
# -- done, even if the request raises
with sodapy.Socrata(dom, app_tok, timeout=120) as client:
    result = client.get(dsid, limit=lim)

# -- turn the returned records into a DataFrame
dsny = pd.DataFrame.from_records(result)

## **Accessing data in DataFrames**

In [0]:
# -- access a single column from a DataFrame: indexing with one column name
# -- (a string) selects just that column of wbdat
gdp = wbdat["gdp2017"]

In [0]:
# -- access multiple columns from a DataFrame: indexing with a list of column
# -- names selects those columns, in the order given in the list
cols    = ["gdp2017", "leb2017"]
gdp_leb = wbdat[cols]

In [0]:
# -- set the values in a column (or create the column if it doesn't exist);
# -- here "log_gdp" holds the base-10 logarithm of the "gdp2017" column
wbdat["log_gdp"] = np.log10(wbdat["gdp2017"])

In [0]:
# -- cast DataFrame columns to new data types in a single astype call,
# -- using a mapping of column name -> target type
dsny = dsny.astype({
    "communitydistrict": int,
    "refusetonscollected": float,
})

## **Sub-selecting data in DataFrames**

In [0]:
# -- keep only the DSNY rows whose borough is Manhattan
dsny_sub = dsny.loc[dsny["borough"].eq("Manhattan")]

In [0]:
# -- keep only the DSNY rows that are both in Manhattan and in Community
# -- District 3: build one boolean mask per condition, then combine with &
ind_boro  = dsny["borough"].eq("Manhattan")
ind_comd  = dsny["communitydistrict"].eq(3)
ind_tot   = ind_boro & ind_comd
dsny_sub2 = dsny.loc[ind_tot]

In [0]:
# -- string functions live under .str; build a mask of rows whose borough
# -- name contains "Br" and use it to select those rows
ind_str   = dsny["borough"].str.contains("Br")
dsny_sub3 = dsny.loc[ind_str]

In [0]:
# -- print the value of the Refuse column for the row at position 6
# -- (.iloc counts from 0, so position 6 is the 7th row)
dsny["refusetonscollected"].iloc[6]

In [0]:
# -- sub-select the columns at positions 3 through 5 and the rows at
# -- positions 10 through 17 (positions count from 0 and the end of a slice
# -- is excluded, so 3:6 covers 3, 4, 5 and 10:18 covers 10, ..., 17)
dsny_sub4 = dsny[dsny.columns[3:6]].iloc[10:18]

## **Manipulating data in DataFrames**

In [0]:
# -- top 10 GDP in World Bank data: sort by GDP from largest to smallest,
# -- then keep the first 10 rows
top10 = wbdat.sort_values(by="gdp2017", ascending=False).head(10)

In [0]:
# -- group DSNY data by borough and sum to get total refuse
# -- (numeric_only=True restricts the sum to numeric columns; without it any
# --  column that still holds text is aggregated too, which concatenates the
# --  strings or raises an error depending on the pandas version)
# -- NOTE: "refusetonscollected" must already have been converted to a
# --       numeric type, otherwise it is left out of the summed result
dsny_grp = dsny.groupby("borough").sum(numeric_only=True)

# -- keep only the refuse totals, with borough moved back to a regular column
nyc_tot  = dsny_grp["refusetonscollected"].reset_index()

In [0]:
# -- for every row, take the smallest value found among the listed columns
# -- and store it as a new "minlev" column
cols = ["TOPEX/Poseidon", "Jason-1", "Jason-2", "Jason-3"]
minlev = noaa.loc[:, cols].min(axis="columns")
noaa["minlev"] = minlev

## **Plotting data**

In [0]:
# -- line plot of "minlev" against "year", with both axes labelled
# -- (the label calls are assigned to names so the cell shows no text output)
ax   = noaa.plot(x="year", y="minlev", legend=False, figsize=(7, 3))
xlab = ax.set_xlabel("year")
ylab = ax.set_ylabel("sea level relative to 2000 [mm]")

In [0]:
# -- scatter plot of "leb2017" against "log_gdp", with both axes labelled
ax   = wbdat.plot.scatter(x="log_gdp", y="leb2017", figsize=(8, 4), color="red")
xlab = ax.set_xlabel("log(GDP in 2017 [USD])")
ylab = ax.set_ylabel("Life Expectancy at Birth in 2017 [years]")

In [0]:
# -- bar chart of "gdp2017" per "country" for the top-10 table; the x-axis
# -- label is set to an empty string and the y-axis label uses a larger font
ax   = top10.plot.bar(x="country", y="gdp2017", figsize=(6, 6), legend=False, color="k")
xlab = ax.set_xlabel("")
ylab = ax.set_ylabel("2017 GDP in USD", fontsize=15)