# Imports

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn

# File Locations

In [None]:
# Root of the data folder, relative to this notebook. The trailing slash is
# kept so that sub-paths can be appended without a separator.
dta_dir = '../data/'

# Locations of the input CSV files (despite the `_dir` suffix, each of these
# names holds a file path, not a directory).
dhs_ind_dir = '{}ndhs/dhs_indicators.csv'.format(dta_dir)
ntl_stt_dir = '{}nightlights/nightlights_summary_stats.csv'.format(dta_dir)
osm_rod_dir = '{}osm/osm_roads.csv'.format(dta_dir)
osm_poi_dir = '{}osm/osm_pois.csv'.format(dta_dir)
osm_bld_dir = '{}osm/osm_buildings.csv'.format(dta_dir)

# NDHS Data Set Exploratory Data Analysis

In [None]:
# Read the CSV at `dhs_ind_dir` into a DataFrame used by the cells below.
dhs_ind = pd.read_csv(dhs_ind_dir)

In [None]:
from scipy.stats import spearmanr, pearsonr

def plot_regplot(
    data,
    x_label,
    y_label,
    y_var,
    x_var=None
):
    """
    Draws a LOWESS regression plot of two columns of `data` and shows it.

    The plot title reports the Spearman (rho) and Pearson (r) correlation
    coefficients computed between the two columns.

    Parameters
    ----------
    data: pandas DataFrame
        the table holding the columns to plot
    x_label: str
        the label of the x-axis; also used as the x column name
        when `x_var` is not given
    y_label: str
        the label of the y-axis
    y_var: str
        the column name of the y-axis variable
    x_var: str, optional
        the column name of the x-axis variable (defaults to `x_label`)

    Returns
    -------
    None
        the figure is displayed with `plt.show()`
    """

    # The visible file-level import is `import seaborn` (no `sns` alias),
    # so bind the alias here; without it `sns` is undefined at call time.
    import seaborn as sns

    # Backward compatible: callers that pass only `x_label` keep using it
    # as both the column name and the axis label.
    if x_var is None:
        x_var = x_label

    sns.regplot(
        x=x_var,
        y=y_var,
        data=data,
        lowess=True,
        line_kws={'color':'black', 'lw':2},
        scatter_kws={'alpha':0.3}
    )

    # Extract the two columns once and reuse them for both coefficients.
    x_values = data[x_var].tolist()
    y_values = data[y_var].tolist()
    rho = spearmanr(x_values, y_values)[0]
    r = pearsonr(x_values, y_values)[0]

    plt.ticklabel_format(style='sci', axis='x', scilimits=(1,5))
    plt.title(
        'Relationship between {}\nand {}'.format(x_label, y_label)
        + r" ($\rho$ = %.2f, $r$ = %.2f)" % (rho, r)
    )
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    plt.show()

In [None]:
# One row of three panels: one histogram per indicator column of `dhs_ind`.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(10, 10))

for panel, column in zip(
    ax,
    ('wealth_index', 'education_completed_yrs', 'electricity_access'),
):
    dhs_ind[column].hist(ax=panel)