# Welcome to your Quantitative Social Sciences Analysis Toolkit!



## 1. load survey data into the notebook 
(Run this cell first, and run it again every time you close and reopen the notebook.)

In [1]:
import pandas as pd # load a specialized piece of software that will help us with the analysis
# Read the table of survey data, then keep only the rows whose V161002 is
# not -1 (i.e. remove any respondent that wasn't interviewed face-to-face).
data = (
    pd.read_csv('data/anes_pilot_2016.csv', low_memory=False)
    .loc[lambda frame: frame.V161002 != -1]
)

## 2. display data

In [2]:
# Display a snapshot of the raw data, transposed: the first column lists
# the variables, and every other column is one respondent's answers.
data.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1170,1171,1172,1173,1174,1175,1176,1177,1178,1179
Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1170,1171,1172,1173,1174,1175,1176,1177,1178,1179
version,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,...,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904
V160001,1,2,3,4,5,6,7,8,9,10,...,1171,1172,1173,1174,1175,1176,1177,1178,1179,1180
V160001_orig,300001,300002,300003,300004,300006,300007,300008,300012,300018,300020,...,302866,302867,302871,302872,302873,302874,302876,302877,302878,302880
V160101,0.827,1.0806,0.3878,0.3596,0.647,0.7062,3.9604,0.962,0.9761,0.6182,...,1.4991,0.7705,0.4732,0.6149,1.3355,1.4078,1.0449,1.323,0.552,1.4148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
V168524,81,82,82,82,82,81,81,81,82,82,...,82,82,81,82,81,82,-1,21,82,81
V168525,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,...,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,Roy Asberry Cooper,-1. Inap,-1. Inap
V168526,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
V168527,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,...,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,-1. Inap,Patrick L. 'Pat' McCrory,-1. Inap,-1. Inap


## 3. get category counts for a categorical variable


V161002 = gender  
V161027 = for whom did respondent vote?

In [3]:
from prettytable import PrettyTable
from ipywidgets import interact
from IPython.core.display import display, HTML

@interact(variable=data.columns.sort_values())
def categorical_table(variable='V161002'):
    """Render an HTML table of how many rows of `data` fall in each category.

    Parameters
    ----------
    variable : str
        Name of the column of `data` to count. The dropdown built by
        `interact` offers every column, sorted by name.

    The table has one row per distinct value of the column, with the value in
    the first cell and the number of rows holding it in the second.
    """
    table = PrettyTable()
    table.field_names = [variable, 'Count']
    # Walk the counts Series directly. Going through reset_index().iterrows()
    # forces each row into a single dtype, so counts were shown as floats
    # (e.g. 12.0) whenever the category values themselves were floats.
    for category, count in data.groupby(variable).size().items():
        table.add_row((category, count))
    display(HTML(table.get_html_string()))

interactive(children=(Dropdown(description='variable', index=18, options=('Unnamed: 0', 'V160001', 'V160001_or…

## 4. get average and spread for a continuous variable

V161267 = age

In [4]:
from prettytable import PrettyTable
from ipywidgets import interact, widgets
import numpy as np
from IPython.core.display import display, HTML
from matplotlib import pyplot as plt

def cast(v):
    """Convert `v` to a float, or return NaN when it is not a number.

    Parameters
    ----------
    v : object
        Any value, e.g. a number, a numeric string, or a text label.

    Returns
    -------
    float
        `float(v)` when the conversion succeeds, otherwise `np.nan`.
    """
    try:
        return float(v)
    # Only the errors float() raises for unconvertible input are treated as
    # "not a number"; a bare except would also swallow KeyboardInterrupt and
    # SystemExit, making a long-running apply() impossible to interrupt.
    except (TypeError, ValueError, OverflowError):
        return np.nan


# Dropdown for choosing which column of `data` to summarise.
variable_select = widgets.Dropdown(options=data.columns.sort_values())

# Multi-select listing the values of the chosen column that should be dropped.
drop_select = widgets.SelectMultiple(options=[])

def update_drop_select(*args):
    """Refill `drop_select` with the sorted distinct values of the chosen column.

    Accepts (and ignores) whatever arguments the widget framework passes to
    an observer callback.
    """
    # Missing values are left out before sorting: np.sort cannot compare the
    # float NaN with strings and would raise on text columns that have gaps.
    drop_select.options=np.sort(data[variable_select.value].dropna().unique())

variable_select.observe(update_drop_select, 'value')

# observe() only fires when the selection changes, so fill the list once for
# the variable that is selected when the widgets are first shown.
update_drop_select()

def printer(variable, drop_vals, drop_na, zoom=widgets.IntSlider(min=10,max=100,step=5,value=10)):
    """Show the mean, standard deviation and a histogram of one column of `data`.

    Parameters
    ----------
    variable : str
        Name of the column to summarise. Every value is passed through
        `cast`, so anything that is not a number becomes NaN.
    drop_vals : sequence
        Values to exclude before computing the statistics.
    drop_na : bool
        When true, keep only values greater than -1, the same rule the other
        comparison cells of this notebook apply.
    zoom : int
        Number of histogram bins (supplied by the slider when run through
        `interact`).

    Output is displayed rather than returned: an HTML table with the mean and
    standard deviation, then a histogram with a red line at the mean and
    black lines at 1, 2 and 3 standard deviations on either side of it.
    """
    # Only the chosen column is needed; copying the entire survey table on
    # every widget interaction was wasted work.
    values = data[variable].apply(cast)
    values = values[~values.isin(list(drop_vals))]

    if drop_na:
        # "> -1" rather than "> 0" so that a legitimate answer of 0 is kept.
        values = values[values > -1]

    if len(drop_vals):
        print('dropped values: {}'.format(drop_vals))

    # NaNs (non-numeric answers) cannot be binned; without this the histogram
    # call fails whenever drop_na is switched off.
    values = values.dropna()
    if values.empty:
        print('no numeric values left to summarise for {}'.format(variable))
        return

    table = PrettyTable()
    table.field_names = [variable, 'mean', 'standard deviation']
    mu = np.mean(values)
    sigma = np.std(values)
    table.add_row((variable, mu, sigma))

    display(HTML(table.get_html_string()))
    plt.figure(figsize=(10,5))
    plt.hist(values, bins=zoom)
    ax = plt.gca()
    ymin, ymax = ax.get_ylim()

    # range(-3, 4) covers -3..+3; the previous range(-3, 3) stopped at +2 and
    # left the markers asymmetric around the mean.
    for val in range(-3, 4):
        position = val*sigma+mu

        if val==0:
            ax.vlines(position,ymin,ymax, alpha=1, color='red')
        else:
            # Lines further from the mean are drawn shorter and fainter.
            ax.vlines(position,ymin,ymax/np.abs(val)/(zoom/10), alpha=1/np.abs(val), color='black')

interact(printer, drop_vals=drop_select, variable=variable_select, drop_na=True);

interactive(children=(Dropdown(description='variable', options=('Unnamed: 0', 'V160001', 'V160001_orig', 'V160…

## 5. Compare two categorical variables (or ordinal)

V165602 = something  
V161002 = gender  


In [7]:
from ipywidgets import interact
import scipy.stats as scs
from scipy.stats import chi2_contingency


# Dropdowns for choosing the two columns of `data` to cross-tabulate.
dependent_variable_select = widgets.Dropdown(options=data.columns.sort_values())
independent_variable_select = widgets.Dropdown(options=data.columns.sort_values())

# Multi-selects listing the values of each chosen column that should be dropped.
dependent_drop_select = widgets.SelectMultiple(options=[])
independent_drop_select = widgets.SelectMultiple(options=[])

def update_dependent_drop_select(*args):
    """Refill `dependent_drop_select` with the sorted distinct values of the
    chosen dependent column. Observer arguments are accepted and ignored."""
    # Missing values are left out before sorting: np.sort cannot compare the
    # float NaN with strings and would raise on text columns that have gaps.
    dependent_drop_select.options=np.sort(data[dependent_variable_select.value].dropna().unique())


def update_independent_drop_select(*args):
    """Refill `independent_drop_select` with the sorted distinct values of the
    chosen independent column. Observer arguments are accepted and ignored."""
    independent_drop_select.options=np.sort(data[independent_variable_select.value].dropna().unique())


dependent_variable_select.observe(update_dependent_drop_select, 'value')
independent_variable_select.observe(update_independent_drop_select, 'value')

# observe() only fires when a selection changes, so fill both lists once for
# the variables that are selected when the widgets are first shown.
update_dependent_drop_select()
update_independent_drop_select()

def categorical_table(dependent_variable, independent_variable, dep_drop_vals, indep_drop_vals,drop_na=True):
    """Cross-tabulate two columns of `data` and print a chi-square test for them.

    Parameters
    ----------
    dependent_variable, independent_variable : str
        Names of the columns to compare; the dependent variable forms the
        rows of the table and the independent variable the columns.
    dep_drop_vals, indep_drop_vals : sequence
        Values of the respective column to exclude before tabulating.
    drop_na : bool, default True
        When true, every chosen column that can be converted to float is
        converted and only its values greater than -1 are kept. Columns that
        cannot be converted are left untouched.

    Returns
    -------
    pandas.DataFrame
        The contingency table of counts. It is empty when the filters remove
        every row, in which case no test is run.
    """
    # Only the chosen columns are needed; copying the entire survey table on
    # every widget interaction was wasted work. dict.fromkeys de-duplicates
    # so that picking the same variable twice does not create twin columns.
    columns = list(dict.fromkeys([dependent_variable, independent_variable]))
    df = data[columns].copy()

    if drop_na:
        for variable in (independent_variable, dependent_variable):
            try:
                as_float = df[variable].astype(float)
            except (TypeError, ValueError):
                # Text-valued column: nothing numeric to filter on. Only
                # conversion errors are skipped, instead of the previous bare
                # except that hid every kind of failure.
                continue
            df[variable] = as_float
            df = df[df[variable] > -1].copy()

    df = df[~df[dependent_variable].isin(list(dep_drop_vals))]
    df = df[~df[independent_variable].isin(list(indep_drop_vals))]

    if len(dep_drop_vals):
        print('dropped dependent values: {}'.format(dep_drop_vals))

    if len(indep_drop_vals):
        print('dropped independent values: {}'.format(indep_drop_vals))

    if df.empty:
        # chi2_contingency raises on a table with no observations; report the
        # situation instead of showing a traceback in the widget output.
        print('no responses left after filtering -- nothing to compare')
        return pd.DataFrame()

    cross_tab = pd.crosstab(df[dependent_variable], df[independent_variable])
    stats = chi2_contingency(cross_tab)
    # Items 0 and 1 of the result are the test statistic and its p-value.
    print("chi-sq = {}, p-val = {}".format(round(stats[0],5), round(stats[1], 5)))
    return cross_tab

# Connect the four pickers to categorical_table; the trailing semicolon
# stops the cell from echoing the value interact returns.
interact(
    categorical_table,
    dependent_variable=dependent_variable_select,
    independent_variable=independent_variable_select,
    dep_drop_vals=dependent_drop_select,
    indep_drop_vals=independent_drop_select,
    drop_na=True,
);



interactive(children=(Dropdown(description='dependent_variable', options=('Unnamed: 0', 'V160001', 'V160001_or…

## 6. Compare a categorical with a numeric/ordinal

V165602 = something  
V161361x = something   
V161002 = gender  
V161267 = age  

In [8]:
from ipywidgets import interact
import scipy.stats as scs
import statsmodels.api as sm
from statsmodels.formula.api import ols


# Dropdowns for choosing the grouping column and the numeric column of `data`.
categorical_variable_select = widgets.Dropdown(options=data.columns.sort_values())
numeric_variable_select = widgets.Dropdown(options=data.columns.sort_values())

# Multi-selects listing the values of each chosen column that should be dropped.
categorical_drop_select = widgets.SelectMultiple(options=[])
numeric_drop_select = widgets.SelectMultiple(options=[])

def update_categorical_drop_select(*args):
    """Refill `categorical_drop_select` with the sorted distinct values of the
    chosen categorical column. Observer arguments are accepted and ignored."""
    # Missing values are left out before sorting: np.sort cannot compare the
    # float NaN with strings and would raise on text columns that have gaps.
    categorical_drop_select.options=np.sort(data[categorical_variable_select.value].dropna().unique())

def update_numeric_drop_select(*args):
    """Refill `numeric_drop_select` with the sorted distinct values of the
    chosen numeric column. Observer arguments are accepted and ignored."""
    numeric_drop_select.options=np.sort(data[numeric_variable_select.value].dropna().unique())


categorical_variable_select.observe(update_categorical_drop_select, 'value')
numeric_variable_select.observe(update_numeric_drop_select, 'value')

# observe() only fires when a selection changes, so fill both lists once for
# the variables that are selected when the widgets are first shown.
update_categorical_drop_select()
update_numeric_drop_select()


def categorical_table(categorical_variable, numeric_variable,cat_drop_vals, num_drop_vals, drop_na=True):
    """Compare a numeric column of `data` across the groups of a categorical one.

    Parameters
    ----------
    categorical_variable : str
        Column that defines the groups. Rejected with a printed message when
        it has more than 15 distinct values.
    numeric_variable : str
        Column whose values are compared between groups. Rejected with a
        printed message when it cannot be converted to float.
    cat_drop_vals, num_drop_vals : sequence
        Values of the respective column to exclude before the comparison.
    drop_na : bool, default True
        When true, every chosen column that can be converted to float is
        converted and only its values greater than -1 are kept.

    Returns
    -------
    pandas.DataFrame or None
        Pairwise t-test results between the groups (columns 't', 'P>|t|',
        'pvalue-hs', 'reject-hs'), or None when the inputs were rejected.
        Overlaid per-group histograms are drawn as a side effect.
    """
    # Only the chosen columns are needed; copying the entire survey table on
    # every widget interaction was wasted work. dict.fromkeys de-duplicates
    # so that picking the same variable twice does not create twin columns.
    columns = list(dict.fromkeys([categorical_variable, numeric_variable]))
    df = data[columns].copy()

    if drop_na:
        for variable in (categorical_variable, numeric_variable):
            try:
                as_float = df[variable].astype(float)
            except (TypeError, ValueError):
                # Text-valued column: nothing numeric to filter on. Only
                # conversion errors are skipped, instead of a bare except.
                continue
            df[variable] = as_float
            df = df[df[variable] > -1].copy()

    if len(df[categorical_variable].unique())>15:
        print("PLEASE CHOOSE A CATEGORICAL VARIABLE")
        return

    try:
        # Result discarded on purpose: this only checks convertibility.
        df[numeric_variable].astype(float)
    except (TypeError, ValueError):
        print("PLEASE CHOOSE A NUMERIC VARIABLE")
        return

    df = df[~df[categorical_variable].isin(list(cat_drop_vals))]
    df = df[~df[numeric_variable].isin(list(num_drop_vals))]

    # The messages previously said "dependent"/"independent", labels copied
    # from the two-categorical comparison that do not match this function.
    if len(cat_drop_vals):
        print('dropped categorical values: {}'.format(cat_drop_vals))

    if len(num_drop_vals):
        print('dropped numeric values: {}'.format(num_drop_vals))

    categories = np.sort(df[categorical_variable].unique())
    if len(categories) < 2:
        # A pairwise comparison needs at least two groups; report it instead
        # of letting the model fitting fail with a traceback.
        print("NEED AT LEAST TWO CATEGORIES TO COMPARE")
        return

    plt.figure(figsize=(10,5))
    ax=plt.gca()
    for category in categories:
        group = df[df[categorical_variable]==category]
        ax.hist(group[numeric_variable], alpha=.5, bins='doane')
    ax.legend(categories)

    # Fit numeric ~ group membership, then test every pair of groups.
    res = ols("{} ~ C({})".format(numeric_variable, categorical_variable), df).fit()
    pw = res.t_test_pairwise("C({})".format(categorical_variable))
    return pw.result_frame[['t','P>|t|','pvalue-hs','reject-hs']]



interact(categorical_table, categorical_variable=categorical_variable_select,
         numeric_variable=numeric_variable_select, cat_drop_vals=categorical_drop_select, 
         num_drop_vals=numeric_drop_select, drop_na=True)
# A final no-op statement so the cell does not echo interact's return value.
pass




interactive(children=(Dropdown(description='categorical_variable', options=('Unnamed: 0', 'V160001', 'V160001_…