In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Load the World Bank dataset, using the CSV's first column as the row index,
# then preview the first few rows.
wb = pd.read_csv("data/world_bank.csv", index_col=0)
wb.head()

In [None]:
# Number of rows for each distinct value of the "Continent" column.
wb["Continent"].value_counts()

In [None]:
# Bar chart of per-continent row counts drawn directly with matplotlib:
# the value_counts index supplies the bar labels, its values the bar heights.
continents = wb["Continent"].value_counts()
plt.bar(continents.index, continents.values)
plt.xlabel("Continents")
plt.ylabel("Counts")
# Trailing semicolon suppresses the Text(...) repr of the last expression.
plt.title("Distribution of countries across the continents");

In [None]:
# Same per-continent counts, plotted through the pandas plotting accessor.
wb["Continent"].value_counts().plot.bar()
plt.title("Distribution of countries across the continents")
plt.ylabel("Counts");

In [None]:
# seaborn counts the rows per continent itself, so no value_counts() is needed.
sns.countplot(data=wb, x='Continent')
# Trailing semicolon suppresses the Text(...) repr of the last expression.
plt.title("Distribution of countries across the continents");

In [None]:
# countplot draws one bar per distinct value of x; here x is the
# GNI-per-capita column rather than a categorical column.
# NOTE(review): presumably an intentional demonstration that a count plot is a
# poor fit for this variable — confirm.
sns.countplot(data=wb, x='Gross national income per capita, Atlas method: $: 2016')
plt.title("GNI distribution for different countries");

In [None]:
# Single box plot of the GNI-per-capita column, with values on the y-axis.
sns.boxplot(data=wb, y="Gross national income per capita, Atlas method: $: 2016")
plt.title("The distribution of GNI per capita in different countries");

In [None]:
# Violin plot of the same GNI-per-capita column, with values on the y-axis.
sns.violinplot(data=wb, y="Gross national income per capita, Atlas method: $: 2016")
# Trailing semicolon suppresses the Text(...) repr of the last expression.
plt.title("The distribution of GNI per capita in different countries");

In [None]:
# Label each country by whether its 2016 GDP growth lies inside the
# interquartile range, then draw a histogram colored by that label with the
# three quartiles marked underneath as a rug.
growth_col = 'Gross domestic product: % growth : 2016'

# np.percentile does not skip NaN, so drop missing values before computing quartiles.
gdp = wb[growth_col]
gdp = gdp[~gdp.isna()]

q1, q2, q3 = np.percentile(gdp, [25, 50, 75])

wb_quartiles = wb.copy()
growth = wb_quartiles[growth_col]
# Inclusive bounds: a value exactly equal to q1 or q3 counts as part of the
# middle 50% instead of being left without a category.
in_middle = growth.between(q1, q3)
wb_quartiles['category'] = None
# Rows with missing growth stay uncategorized (None).
wb_quartiles.loc[growth.notna() & ~in_middle, 'category'] = 'Outside of the middle 50%'
wb_quartiles.loc[in_middle, 'category'] = 'In the middle 50%'

# Pin the hue order so the middle-50% group always takes the first palette
# color (blue under seaborn's default palette) regardless of row order.
sns.histplot(wb_quartiles, x=growth_col, hue="category",
             hue_order=['In the middle 50%', 'Outside of the middle 50%'])
sns.rugplot([q1, q2, q3], c="firebrick", lw=6, height=0.1)
plt.title("The distribution of GDP growth with the middle 50% highlighted in blue");

In [None]:
# Single box plot of the 2016 GDP % growth column.
sns.boxplot(data=wb, y='Gross domestic product: % growth : 2016')
plt.title("The distribution of gross domestic product: % growth");

In [None]:
# Violin plot of the same 2016 GDP % growth column.
sns.violinplot(data=wb, y='Gross domestic product: % growth : 2016')
plt.title("The distribution of gross domestic product: % growth");

In [None]:
# One box per continent (x) for the 2016 GDP % growth column (y).
sns.boxplot(data=wb, x="Continent", y = 'Gross domestic product: % growth : 2016')
plt.title("The distribution of gross domestic product for different continents");

In [None]:
# Density-normalized matplotlib histogram of GNI per capita.
# `gni` is kept as a notebook-level variable; a later histogram cell reuses it.
gni = wb["Gross national income per capita, Atlas method: $: 2016"]
plt.hist(gni, density=True, edgecolor="white")

plt.xlabel("Gross national income per capita")
plt.ylabel("Density")
plt.title("Distribution of gross national income per capita");

In [None]:
# seaborn histogram of the same GNI column; stat="density" puts density
# rather than raw counts on the y-axis.
sns.histplot(data=wb, x="Gross national income per capita, Atlas method: $: 2016", stat="density")
plt.title("Distribution of gross national income per capita");

In [None]:
# Add a "Hemisphere" column to `wb` in place by mapping each continent to a
# coarse Northern/Southern label. Rows whose continent appears in neither
# list are not assigned a value.
north = ["Asia", "Europe", "N. America"]
south = ["Africa", "Oceania", "S. America"]
wb.loc[wb["Continent"].isin(north), "Hemisphere"] = "Northern"
wb.loc[wb["Continent"].isin(south), "Hemisphere"] = "Southern"

In [None]:
# Density histogram of GNI per capita, colored by the "Hemisphere" column.
sns.histplot(data=wb, x="Gross national income per capita, Atlas method: $: 2016", hue="Hemisphere", stat="density")
# Trailing semicolon suppresses the Text(...) repr of the last expression.
plt.title("Distribution of gross national income per capita highlighted for different hemispheres");

In [None]:
# plt.hist returns the bar heights and the bin edges. With density=True the
# heights are densities, so (bin width) * (height) is the fraction of the data
# that falls in that bin.
densities, bins, _ = plt.hist(gni, density=True, edgecolor="white", bins=5)
plt.xlabel("Gross national income per capita")
plt.ylabel("Density")
plt.title("A histogram of the distribution of GNI per capita");

# Report the first bin's width, height, and the percentage of data it holds.
print(f"First bin has width {bins[1]-bins[0]} and height {densities[0]}")
print(f"This corresponds to {bins[1]-bins[0]} * {densities[0]} = {(bins[1]-bins[0])*densities[0]*100}% of the data")

In [None]:
# Give the long column a short name, then plot its density histogram with 5 bins.
# NOTE(review): the source column is antiretroviral therapy coverage, so the
# label "HIV rate" may be misleading — confirm the intended name.
wb = wb.rename(columns={'Antiretroviral therapy coverage: % of people living with HIV: 2015':"HIV rate"})

sns.histplot(data=wb, x="HIV rate", stat="density", bins=5)
plt.title("5 histogram bins");

In [None]:
# Same column and normalization, now with 10 bins.
sns.histplot(data=wb, x="HIV rate", stat="density", bins=10)
plt.title("10 histogram bins");

In [None]:
# Same column and normalization, now with 20 bins.
sns.histplot(data=wb, x="HIV rate", stat="density", bins=20)
plt.title("20 histogram bins");

In [None]:
# Density histogram of "HIV rate" with a KDE curve overlaid.
# histplot is used because the deprecated sns.distplot does not accept the
# data=/x=/stat= keywords and would raise a TypeError here.
sns.histplot(data=wb, x="HIV rate", kde=True, stat="density")
plt.title("Histogram and overlaid KDE on HIV rate distribution");

In [None]:
# Small toy dataset used by the KDE demonstrations that follow.
points = [2.2, 2.8, 3.7, 5.3, 5.7]

In [None]:
# Density histogram of the toy points with bin edges at 0, 2, 4, 6, 8.
# Trailing semicolon suppresses the (heights, edges, patches) tuple repr.
plt.hist(points, bins=range(0,10,2), ec='w', density=True);

In [4]:
def gaussian(x, z, a):
  """Evaluate a Gaussian kernel centered at `z` with bandwidth `a` at `x`.

  Computes (1 / sqrt(2*pi*a^2)) * exp(-(x - z)^2 / (2*a^2)). The NumPy
  functions used are elementwise, so `x` may be a scalar or an array.
  """
  variance = a**2
  scale = 1/np.sqrt(2*np.pi*variance)
  exponent = -(x - z)**2 / (2 * variance)
  return scale * np.exp(exponent)

def boxcar_basic(x,z,a):
  """Scalar boxcar kernel of width `a` centered at `z`.

  Returns 1/a when `x` is within a/2 of `z` (boundary included), else 0.
  """
  inside_window = np.abs(x-z) <= a/2
  return 1/a if inside_window else 0

def boxcar(x,z,a):
  """Vectorized boxcar kernel of width `a` centered at `z`.

  Returns an array shaped like `x` holding 1/a where |x - z| <= a/2 and 0
  elsewhere.

  np.piecewise allocates its output with the dtype of `x`, so integer input
  would silently truncate 1/a to an integer. Non-floating input is therefore
  promoted to float first; floating input is passed through unchanged.
  """
  x = np.asanyarray(x)
  if not np.issubdtype(x.dtype, np.inexact):
    x = x.astype(float)
  cond = np.abs(x - z)
  return np.piecewise(x, [cond <= a/2, cond > a/2], [1/a, 0])

In [None]:
def create_kde(kernel, pts, a):
  """Build a kernel density estimate function from data points.

  The returned function f(x) is the average over `pts` of
  kernel(x, pt, a), i.e. one kernel per data point, equally weighted.
  """
  def f(x):
    total = sum(kernel(x, pt, a) for pt in pts)
    return total / len(pts)
  return f

def plot_kde(kernel, pts, a):
  """Plot the KDE of `pts` for the given kernel and bandwidth `a`.

  The estimate is evaluated point by point on 1000 evenly spaced x values
  spanning 5 units beyond the smallest and largest data point.
  Returns the new (fig, ax) pair.
  """
  density = create_kde(kernel, pts, a)
  grid = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
  heights = list(map(density, grid))
  fig, ax = plt.subplots()
  ax.plot(grid, heights)
  return fig, ax

def plot_separate_kernels(kernel, pts, a, norm=False):
  """Plot one kernel curve per data point on a shared set of axes.

  Each curve is kernel(x, pt, a) evaluated on 1000 evenly spaced x values
  spanning 5 units beyond the smallest and largest data point. With
  norm=True every curve is divided by the number of points.
  Returns the new (fig, ax) pair.
  """
  grid = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
  fig, ax = plt.subplots()
  for center in pts:
    curve = kernel(grid, center, a)
    if norm:
      # In-place division, applied to the array the kernel just returned.
      curve /= len(pts)
    ax.plot(grid, curve)
  return fig, ax

In [None]:
# Rug plot of the toy points, with axis limits fixed to x in [-3, 10]
# and y in [0, 0.5].
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.rugplot(points, height = 0.5)
plt.title("sample dataset");

In [None]:
# One Gaussian kernel (a=1) per data point, each drawn as its own curve
# and not divided by the number of points.
fig, ax = plot_separate_kernels(gaussian, points, a=1)
ax.set_title("Overlaid Gaussians on each data point")
ax.set_xlim(-3, 10)
ax.set_ylim(0, 0.5);

In [None]:
# Same per-point Gaussian kernels, each divided by the number of points
# (norm=True).
fig, ax = plot_separate_kernels(gaussian, points, a=1, norm=True)
ax.set_title("Normalized overlaid Gaussians on each data point")
ax.set_xlim(-3,10)
ax.set_ylim(0,0.5);

In [None]:
# The KDE itself: the average of the per-point Gaussian kernels (a=1).
fig, ax = plot_kde(gaussian, points, a=1)
ax.set_title("KDE estimate")
ax.set_xlim(-3, 10)
ax.set_ylim(0,0.5);

In [None]:
# seaborn KDE curve (bw_method=0.65) drawn together with a 2-bin density
# histogram of the toy points.
sns.kdeplot(points, bw_method= 0.65)
sns.histplot(points, stat='density', bins=2);

In [None]:
# Histogram and KDE in a single histplot call; kde_kws passes the KDE
# options (cut, bw_method) through to the density estimate.
sns.histplot(points, bins=2, kde=True, stat='density',
             kde_kws=dict(cut=3,bw_method=0.65));

In [None]:
# KDE with bw_adjust=2 (a multiplier on the chosen bandwidth) drawn together
# with a density histogram using seaborn's default bins.
sns.kdeplot(points, bw_adjust=2)
sns.histplot(points, stat='density');

In [None]:
# KDE of the toy points with the Gaussian kernel and bandwidth a=1.
fig, ax = plot_kde(gaussian, points, a=1)
ax.set_title(r'KDE of toy data with Gaussian kernel and $\alpha$ = 1')
ax.set_xlim(-3, 10)
ax.set_ylim(0,0.5);

In [None]:
# Same toy points and bandwidth a=1, with the boxcar kernel instead.
fig, ax = plot_kde(boxcar, points, a=1)
ax.set_title(r'KDE of toy data with Boxcar kernel and $\alpha$ = 1')
ax.set_xlim(-3, 10)
ax.set_ylim(0, 0.5);

In [None]:
# Load seaborn's example "tips" dataset.
# NOTE(review): load_dataset may need network access if the data is not cached — confirm for offline runs.
tips = sns.load_dataset('tips')

In [None]:
# Preview the first few rows of the tips data.
tips.head()

In [None]:
# The total_bill column is the sample used for the KDE plots below.
vals = tips['total_bill']

In [None]:
# Histogram of total_bill with an orange rug plot drawn on the same axes.
ax = sns.histplot(vals)
sns.rugplot(vals, color='orange', ax=ax);

In [None]:
# Gaussian-kernel KDE of total_bill with bandwidth a=0.1; the y-axis is
# capped at 0.15.
fig, ax = plot_kde(gaussian, vals, a=0.1)
ax.set(title=r'KDE of tips with Gaussian kernel and $\alpha$ = 0.1',
       ylim=(0, 0.15));

In [None]:
# Gaussian-kernel KDE of total_bill with bandwidth a=1; y-axis limited to [0, 0.1].
fig, ax = plot_kde(gaussian, vals, a=1)
ax.set_title(r'KDE of tips with Gaussian kernel and $\alpha$ = 1')
ax.set_ylim(0,0.1);

In [None]:
# Gaussian-kernel KDE of total_bill with bandwidth a=2; y-axis limited to [0, 0.1].
fig, ax = plot_kde(gaussian, vals, a=2)
ax.set_title(r'KDE of tips with Gaussian kernel and $\alpha$ = 2')
ax.set_ylim(0,0.1);

In [None]:
# Gaussian-kernel KDE of total_bill with bandwidth a=5; y-axis limited to [0, 0.1].
fig, ax = plot_kde(gaussian, vals, a=5)
ax.set_title(r'KDE of tips with Gaussian kernel and $\alpha$ = 5')
ax.set_ylim(0,0.1);