In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Load the World Bank indicators (first CSV column becomes the index) and
# give the two long indicator columns shorter names.
# NOTE(review): the column renamed to "HIV rate" holds antiretroviral therapy
# coverage according to its original header — confirm the short name is intended.
wb = pd.read_csv("data/world_bank.csv", index_col=0).rename(
    columns={
        'Antiretroviral therapy coverage: % of people living with HIV: 2015': "HIV rate",
        'Gross national income per capita, Atlas method: $: 2016': 'gni',
    }
)
wb.head()

In [None]:
# Density-normalized histogram of gross national income per capita.
sns.displot(data=wb, x="gni", kind="hist", stat="density")
plt.title("Distribution of gross national income per capita");

In [None]:
# Same variable as the histogram, drawn as a kernel density estimate.
sns.displot(data=wb, x="gni", kind='kde')
plt.title("Distribution of gross national income per capita");

In [None]:
# Empirical cumulative distribution of gross national income per capita.
sns.displot(data=wb, x="gni", kind='ecdf')
plt.title("Cumulative Distribution of gross national income per capita");

In [None]:
# Plain matplotlib scatter: per-capita growth (x) against female adult literacy (y).
plt.scatter(
    wb['per capita: % growth: 2016'],
    wb['Adult literacy rate: Female: % ages 15 and older: 2005-14'],
)
plt.xlabel("% growth per capita")
plt.ylabel("Female adult literacy rate");

In [None]:
# Same scatter drawn with seaborn so points can be coloured by the "Continent" column.
sns.scatterplot(
    data=wb,
    x='per capita: % growth: 2016',
    y='Adult literacy rate: Female: % ages 15 and older: 2005-14',
    hue="Continent",
)
plt.xlabel("% growth per capita")
plt.ylabel("Female adult literacy rate");

In [None]:
# Add independent uniform noise to each coordinate so overlapping points
# separate visually. The generator is not seeded here, so the exact picture
# changes from run to run.
random_x_noise = np.random.uniform(low=-1, high=1, size=len(wb))
random_y_noise = np.random.uniform(low=-5, high=5, size=len(wb))

plt.scatter(
    wb['per capita: % growth: 2016'] + random_x_noise,
    wb['Adult literacy rate: Female: % ages 15 and older: 2005-14'] + random_y_noise,
    s=15,  # smaller markers than the default
)

plt.xlabel("% growth per capita (jittered)")
plt.ylabel("Female adult literacy rate (jittered)");

In [None]:
# Scatter plus a fitted regression line, via seaborn's lmplot.
sns.lmplot(
    data=wb,
    x='per capita: % growth: 2016',
    y='Adult literacy rate: Female: % ages 15 and older: 2005-14',
);

In [None]:
# Hexbin joint plot of per-capita growth against female adult literacy.
# Each keyword argument sits on its own line with a trailing comma, so a
# missing separator between `y` and `kind` cannot cause a SyntaxError.
sns.jointplot(
    data=wb,
    x='per capita: % growth: 2016',
    y='Adult literacy rate: Female: % ages 15 and older: 2005-14',
    kind='hex',
);

In [None]:
# Two-dimensional kernel density estimate, drawn with filled contours.
sns.kdeplot(
    data=wb,
    x='per capita: % growth: 2016',
    y='Adult literacy rate: Female: % ages 15 and older: 2005-14',
    fill=True,
);

In [None]:
# Joint plot using kernel density estimates instead of hex bins.
sns.jointplot(
    data=wb,
    x='per capita: % growth: 2016',
    y='Adult literacy rate: Female: % ages 15 and older: 2005-14',
    kind='kde',
);

In [None]:
# Build a two-column frame (literacy measure, income) aligned on wb's index.
# NOTE(review): "lit" is the SUM of the female and male literacy percentages,
# not their average, even though the axis label below calls it a rate —
# confirm this is intended.
df = pd.DataFrame(index=wb.index)
df['lit'] = (
    wb['Adult literacy rate: Female: % ages 15 and older: 2005-14']
    + wb["Adult literacy rate: Male: % ages 15 and older: 2005-14"]
)
df['inc'] = wb['gni']
# Keep only rows where both values are present (reassign rather than mutate in place).
df = df.dropna()

plt.scatter(df["inc"], df["lit"])
plt.xlabel("Gross national income per capita")
plt.ylabel("Adult literacy rate");

In [None]:
# Log-transform income on the x-axis; literacy is left untransformed.
plt.scatter(np.log(df["inc"]), df["lit"])
plt.xlabel("Log(gross national income per capita)")
# Trailing semicolon suppresses the Text(...) repr, as in the other plot cells.
plt.ylabel("Adult literacy rate");

In [None]:
# Apply both transforms: log of income on x, literacy raised to the 4th power on y.
plt.scatter(
    np.log(df["inc"]),
    df["lit"]**4,
)
plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate (4th power)");

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a straight line to the transformed data:
#   predictor = log(inc)   (double brackets keep it a one-column DataFrame)
#   response  = lit ** 4
model = LinearRegression().fit(np.log(df[["inc"]]), df["lit"]**4)
m = model.coef_[0]
b = model.intercept_

print(f"The slope, m, of the transformed data is: {m}")
print(f"The intercept, b, of the transformed data is: {b}")

# Sort by income so the fitted line is drawn in increasing x order.
df = df.sort_values("inc")
plt.scatter(np.log(df["inc"]), df["lit"]**4, label="Transformed data")
plt.plot(
    np.log(df["inc"]),
    m*np.log(df["inc"])+b,
    c="red",
    label="Linear regression",
)
plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate (4th power)")
plt.legend();

In [None]:
# Plot the fit on the original axes by undoing the 4th-power transform
# (taking the 4th root of the linear prediction in log-income).
plt.scatter(df["inc"], df["lit"], label="Untransformed data")
plt.plot(
    df["inc"],
    (m*np.log(df["inc"])+b)**(1/4),
    c="red",
    label="Modeled relationship",
)
plt.xlabel("Gross national income per capita")
plt.ylabel("Adult literacy rate")
plt.legend();

In [None]:
# Procedure counts for 2006 and 2013, one column per procedure type.
# DataFrame is a pandas constructor (numpy has no DataFrame attribute).
ppdf = pd.DataFrame(
    dict(Cancer=[2007371, 935573], Abortion=[289750, 327000]),
    # A named Series as the index gives the index the name "Year".
    index=pd.Series([2006, 2013], name="Year"),
)
ppdf

In [None]:
# One line per column of ppdf, with a marker at each year.
ax = sns.lineplot(
    data=ppdf,
    markers=True,
)
ax.set_title("Planned Parenthood Procedures")
# Only the two years present in the data get tick marks.
ax.set_xticks([2006, 2013])
ax.set_ylabel("Service count");

In [None]:
# Percent change of each column from 2006 to 2013, relative to the 2006 value.
rel_change = (
    100 * (ppdf.loc[2013] - ppdf.loc[2006]) / ppdf.loc[2006]
).rename("Percent Change")
rel_change

In [None]:
# Bar per procedure type showing its percent change.
ax = sns.barplot(
    x=rel_change.index,
    y=rel_change,
)
# Horizontal reference line at zero separates increases from decreases.
ax.axhline(0, color='black')
ax.set_title("Percent Change in Number of Procedures");

In [None]:
# Load the education/income table from CSV and display it.
cps = pd.read_csv("data/edInc2.csv")
cps

In [None]:
# Replace the numeric codes 1-5 in the 'educ' column with readable labels,
# then give all three columns capitalized names.
cps = cps.replace(
    {
        'educ': {
            1: "<HS",
            2: "HS",
            3: "<BA",
            4: "BA",
            5: ">BA",
        }
    }
)
cps.columns = ['Education', 'Gender', 'Income']
cps

In [None]:
# Two-colour palette applied only inside the `with` block.
blue_red = ["#397eb7", "#bf1518"]
with sns.color_palette(sns.color_palette(blue_red)):
    ax = sns.pointplot(
        data=cps,
        x="Education",
        y="Income",
        hue="Gender",
    )

ax.set_title("2014 Median Weekly Earnings\nFull-Time Workers over 25 years old");

In [None]:
# Preview the first rows of the recoded table.
cps.head()

In [None]:
# Index by education level, split by gender, and drop the now-constant
# "Gender" column from each group so only "Income" remains.
cg = cps.set_index("Education").groupby("Gender")
men = cg.get_group("Men").drop(columns="Gender")
women = cg.get_group("Women").drop(columns="Gender")
display(men, women)

In [None]:
# Element-wise men/women income ratio, aligned on the shared Education index.
mfratio = men.div(women)
mfratio.columns = ["Income Ratio (M/F)"]
mfratio

In [None]:
# Single-line plot of the M/F ratio across education levels; legend hidden.
ax = sns.lineplot(
    data=mfratio,
    markers=True,
    legend=False,
)
ax.set_ylabel("Ratio")
ax.set_title("M/F Income Ratio as a function of education level");

In [None]:
# Inverse of the M/F ratio: women's income divided by men's at each education level.
fmratio = women.div(men)
fmratio.columns = ["Income Ratio (F/M)"]
fmratio

In [None]:
# Same plot as for the M/F ratio, drawn for the F/M ratio.
ax = sns.lineplot(
    data=fmratio,
    markers=True,
    legend=False,
)
ax.set_ylabel("Ratio")
ax.set_title("F/M Income Ratio as a function of education level");

In [None]:
# Read the CO2 table: skip the first two lines of the file, supply our own
# column names, and decode the file as ISO-8859-1.
co2 = pd.read_csv(
    "data/CAITcountryCO2.csv",
    skiprows=2,
    names=["Country", "Year", "CO2"],
    encoding="ISO-8859-1",
)
co2.tail()

In [None]:
# Year value of the final row of the table.
# NOTE(review): this is the latest year only if rows are ordered by year — confirm.
last_year = co2["Year"].iloc[-1]
last_year

In [None]:
# For the final year, drop the aggregate rows ('World' and
# 'European Union (15)'), sort by emissions in descending order, and keep the
# first 14 rows.
q = f"Country != 'World' and Country != 'European Union (15)' and Year == {last_year}"
# The result is bound to `top14_lasty`, the name that is displayed below and
# used by the following cell to select countries.
top14_lasty = co2.query(q).sort_values('CO2', ascending=False).iloc[:14]
top14_lasty

In [None]:
# Full history from 1950 onward for the countries selected in top14_lasty.
top14 = co2[
    co2["Country"].isin(top14_lasty["Country"])
    & (co2["Year"] >= 1950)
]
# Sanity check: number of distinct countries that survived the filter.
print(len(top14["Country"].unique()))
top14.head()

In [None]:
from cycler import cycler

# Build a property cycle from the product of the Dark2 colours and four line
# styles, so successive lines get different colour/style combinations.
linestyles = ['-', '--', ':', '-.']
colors = plt.cm.Dark2.colors
lines_c = cycler('linestyle', linestyles)
color_c = cycler('color', colors)

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_prop_cycle(color_c * lines_c)

x, y = 'Year', 'CO2'
# One line per country on a log-scaled y-axis. The loop variable is
# deliberately not named `df`, which holds the literacy/income frame built
# earlier in the notebook.
for name, country_df in top14.groupby('Country'):
    ax.semilogy(country_df[x], country_df[y], label=name)

ax.set_xlabel(x)
ax.set_ylabel(f"{y} Emissions (million tons)")
ax.legend(ncol=2, frameon=True, fontsize=11);