In [3]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import folium
import seaborn as sns
from jupyterthemes import jtplot
jtplot.style()
import statsmodels.api as sm
from patsy import dmatrices
import ipywidgets as widgets
import statsmodels.formula.api as smf

# Problem 6: Presentation

In [4]:
# Read the tract-level GeoJSON into a GeoDataFrame.
tnt = gpd.read_file('../data/tnt.geojson')

# Columns carried into the modelling frame (geometry is left behind).
model_columns = [
    'tract',
    'burgs_per_1000',
    'population_density',
    'gini_index',
    'population',
    'aland',
    'median_income',
]

# De-duplicate, then keep only rows whose median_income is non-negative.
# NOTE(review): negative values are presumably "missing" sentinel codes -- confirm against the data source.
tnmod = (
    tnt[model_columns]
    .drop_duplicates()
    .loc[lambda frame: frame['median_income'] >= 0]
)

In [5]:
def nb2(data, target, offset_col, exp_vars):
    """Fit a Poisson GLM, estimate the NB2 dispersion from it, then fit an NB2 GLM.

    The procedure is:

    1. Fit a Poisson GLM of ``target`` on ``exp_vars`` with offset
       ``log(data[offset_col] / 1000)``.
    2. Regress the auxiliary variable ``((y - mu)**2 - mu) / mu`` on the fitted
       Poisson mean ``mu`` with no intercept; the slope is used as alpha.
    3. Fit a Negative Binomial GLM with that alpha, using the same design
       matrices and the same offset as the Poisson fit.

    Both model summaries, the estimated alpha and the two AICs are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Input data; it is not modified (a copy is used internally).
    target : str
        Name of the dependent-variable column.
    offset_col : str
        Name of the column whose ``log(value / 1000)`` is used as the offset.
    exp_vars : iterable of str
        Terms joined with ``+`` to form the right-hand side of the formula.

    Returns
    -------
    The fitted statsmodels GLM results object for the NB2 model.
    """
    #Copy data so the helper columns added below never touch the caller's frame
    df = data.copy()

    #Set up the regression expression in patsy notation: target ~ var1 + var2 + ...
    expr = f"{target} ~ {' + '.join(exp_vars)}"

    #Build the y and X matrices for the model
    y_train, X_train = dmatrices(expr, df, return_type='dataframe')

    #patsy drops rows containing missing values; keep df aligned with the rows
    #that survived so the offset and fitted means have matching lengths
    if len(df) != len(y_train):
        df = df.loc[y_train.index].copy()

    #Offset shared by both models so they describe the same mean structure
    #NOTE(review): Poisson/NB families model counts; if `target` is already a
    #per-1000 rate, confirm that combining it with this offset is intended.
    offset = np.log((1/1000)*df[offset_col])

    #Fit the Poisson regression model
    poisson_training_results = (sm.GLM(y_train,
                                       X_train,
                                       family=sm.families.Poisson(),
                                       offset=offset
                                       )
                                .fit()
                                )

    #print out the training summary
    print("POISSON SUMMARY")
    print(poisson_training_results.summary(), "\n\n\n")

    #Add the fitted Poisson mean (lambda) vector as a new column called 'TB_LAMBDA'
    df['TB_LAMBDA'] = poisson_training_results.mu

    #Dependent variable of the auxiliary OLS regression, computed column-wise
    #instead of row-by-row: ((y - lambda)^2 - lambda) / lambda
    df['AUX_OLS_DEP'] = ((df[target] - df['TB_LAMBDA'])**2 - df['TB_LAMBDA']) / df['TB_LAMBDA']

    #Auxiliary OLS regression through the origin; its single slope is alpha
    ols_expr = """AUX_OLS_DEP ~ TB_LAMBDA - 1"""

    #Configure and fit the OLSR model
    aux_olsr_results = smf.ols(ols_expr, df).fit()

    #Print the regression params
    print(aux_olsr_results.params, "\n\n\n")

    #Select the slope by position explicitly; params is indexed by term name
    #NOTE(review): the slope is not guaranteed to be positive -- a non-positive
    #alpha is not handled here.
    alpha = aux_olsr_results.params.iloc[0]

    #Fit the NB2 model with the estimated alpha and the SAME offset as the Poisson fit
    nb2_training_results = (sm.GLM(y_train,
                                   X_train,
                                   family=sm.families.NegativeBinomial(alpha=alpha),
                                   offset=offset
                                   )
                            .fit()
                            )

    #print the training summary
    print("NB2 Summary")
    print(nb2_training_results.summary())
    print(' Poisson AIC: '+str(poisson_training_results.aic), '\n',
          'NB2 AIC: '+str(nb2_training_results.aic)
         )

    return nb2_training_results

In [8]:
# Sentinel option meaning "use every explanatory variable".
ALL = 'ALL'

# Candidate variables are every tnmod column from the third one onward,
# with the sentinel appended at the end.
selist = [*tnmod.columns[2:].tolist(), ALL]

# Reversed copy, so the sentinel is the first option shown.
select_list = list(reversed(selist))

# Multi-select box for choosing explanatory variables; starts with ALL selected.
w = widgets.SelectMultiple(
    description='Exp. Vars',
    options=select_list,
    value=[ALL],
    disabled=False,
)

def widfunc(x):
    """Fit the NB2 model for the explanatory variables picked in the widget.

    Parameters
    ----------
    x : sequence of str
        The selected option labels.

    If 'ALL' appears anywhere in the selection (alone or alongside other
    options), every ``tnmod`` column from the third one onward is used;
    otherwise the selection itself is used as the list of explanatory
    variables.

    Returns
    -------
    Whatever ``nb2`` returns for ``tnmod`` with target 'burgs_per_1000' and
    offset column 'population'.
    """
    # Membership test rather than equality with ('ALL',): a mixed selection
    # such as ('ALL', 'median_income') must not leak the literal 'ALL' into
    # the model formula as if it were a column.
    if 'ALL' in x:
        exp_vars = tnmod.columns[2:].tolist()
    else:
        exp_vars = list(x)

    return nb2(data=tnmod,
               target='burgs_per_1000',
               offset_col='population',
               exp_vars=exp_vars)

# Wire the selector to widfunc so the models are refit whenever the selection changes.
widgets.interact(widfunc, x=w)

interactive(children=(SelectMultiple(description='Exp. Vars', index=(0,), options=('ALL', 'median_income', 'al…

<function __main__.widfunc(x)>