# Welcome to your Quantitative Social Sciences Analysis Toolkit!



## 1. Load survey data into the notebook
(Run this first, and run it again every time you close and reopen the notebook.)

In [2]:
import pandas as pd  # pandas supplies the table (DataFrame) tools used for the analysis

# Read the survey file into a table. low_memory=False tells pandas to parse the
# file in one pass rather than in chunks.
data = pd.read_csv(
    'data/anes_pilot_2019.csv',
    low_memory=False,
)

## 2. display data

In [3]:
# Display a snapshot of the raw data, flipped on its side: the first column
# lists your variables, and the other columns are the responses.
data.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3155,3156,3157,3158,3159,3160,3161,3162,3163,3164
version,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,...,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204,ANES 2019 Pilot Study version 20200204
caseid,1,2,3,4,5,6,7,8,9,10,...,3156,3157,3158,3159,3160,3161,3162,3163,3164,3165
weight,1.34719693063187,.780822076219216,.966366930694957,1.10348514780374,1.09069730256741,1.02140871415171,.964514474045239,.83469258858232,1.53541542020853,1.32458088383641,...,1.17827101584555,.783602487218187,.792508744423736,,,,7.03646496881757,.892833236147303,1.58161278448241,.809576969671362
weight_spss,1.10160293017768,.638478211724453,.790198239229266,.902319805359118,.891863184309371,.835205905561853,.788683485426792,.682528129683763,1.25550918910451,1.08310978871303,...,.963472209656906,.640751753798312,.648034400315289,,,,5.75371740500213,.73006973719765,1.29328477387127,.661991088100273
form,1,1,1,2,2,2,2,1,1,1,...,2,1,2,2,1,2,2,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
starttime,12/31/2019 18:57:33,12/21/2019 4:19:56,12/22/2019 23:03:28,12/31/2019 19:53:14,12/21/2019 4:07:09,12/21/2019 22:45:18,12/27/2019 19:16:05,12/21/2019 23:21:55,12/25/2019 5:39:51,12/28/2019 3:09:16,...,12/31/2019 19:41:53,12/31/2019 19:40:28,12/31/2019 19:40:59,12/31/2019 19:41:26,12/31/2019 19:42:13,12/31/2019 19:38:13,12/31/2019 20:14:34,12/31/2019 20:10:04,12/31/2019 22:10:05,12/31/2019 23:27:51
endtime,12/31/2019 19:39:49,12/21/2019 4:53:19,12/22/2019 23:41:43,12/31/2019 20:23:11,12/21/2019 4:48:50,12/22/2019 0:28:27,12/27/2019 19:45:45,12/21/2019 23:40:20,12/25/2019 5:57:21,12/28/2019 3:35:48,...,12/31/2019 20:08:20,12/31/2019 20:17:50,12/31/2019 20:13:32,12/31/2019 20:22:45,12/31/2019 20:28:23,12/31/2019 20:24:56,12/31/2019 20:53:50,12/31/2019 20:29:15,12/31/2019 22:52:37,1/1/2020 0:21:59
duration,2536,2003,2295,1797,2501,6189,1780,1105,1050,1592,...,1587,2242,1953,2479,2770,2803,2356,1151,2552,3248
pop_density_public,1520,1800,70,7600,4430,11900,700,45000,5700,120,...,400,3700,2000,,,,1800,200,6600,1


## 3. get category counts for a categorical variable


In [5]:
from prettytable import PrettyTable
from ipywidgets import interact
from IPython.display import display, HTML  # public import path; IPython.core.display is deprecated for these names

@interact(variable=data.columns.sort_values())
def categorical_table(variable='V161002'):
    """Show how many rows of ``data`` fall into each category of one variable.

    Parameters
    ----------
    variable : str
        Name of the column of ``data`` to tabulate (picked from the dropdown).

    Displays an HTML table with one row per distinct value of ``variable``
    and the number of rows holding that value. Rows where the value is
    missing (NaN) are left out, because ``groupby`` skips them by default.
    """
    # One count per distinct value, keyed (and sorted) by that value.
    counts = data.groupby(variable).size()

    table = PrettyTable()
    table.field_names = [variable, 'Count']
    # Walk the (value, count) pairs directly. Going through
    # reset_index().iterrows() packs each row into a single-dtype Series, which
    # turns the integer counts into floats whenever the variable is a float.
    for value, count in counts.items():
        table.add_row((value, count))
    display(HTML(table.get_html_string()))

interactive(children=(Dropdown(description='variable', options=('CompletedSurveys', 'EnrollmentDate', 'FIPCoun…

## 4. get average and spread for a continuous variable


In [4]:
from prettytable import PrettyTable
from ipywidgets import interact, widgets
import numpy as np
from IPython.display import display, HTML  # public import path; IPython.core.display is deprecated for these names
from matplotlib import pyplot as plt

@interact(variable=data.columns.sort_values())
def categorical_table(variable='V161267', zoom=widgets.IntSlider(min=10,max=100,step=5,value=10),drop_na=False):
    """Summarise one numeric variable: mean, standard deviation and a histogram.

    Parameters
    ----------
    variable : str
        Name of the column of ``data`` to summarise.
    zoom : int
        Number of histogram bins (slider from 10 to 100).
    drop_na : bool
        When True, rows whose value is below zero are left out before computing
        anything. NOTE(review): this presumes negative codes mark missing
        answers in this survey -- confirm against the codebook.

    Displays a one-row table (variable, mean, standard deviation) and a
    histogram. A red line marks the mean; black lines mark 1, 2 and 3 standard
    deviations on either side, drawn shorter and fainter the further out they are.
    """
    try:
        values = data[variable].astype(float)
    except (TypeError, ValueError):
        # Text columns cannot be averaged; say so instead of showing a traceback.
        print("PLEASE CHOOSE A NUMERIC VARIABLE")
        return

    if drop_na:
        # Same rule as the comparison cells below (> -1), so a genuine answer
        # of 0 is kept instead of being thrown away with the negative codes.
        values = values[values > -1]
    # Use the same non-missing values for the statistics and for the plot.
    values = values.dropna()

    mu = values.mean()
    sigma = values.std(ddof=0)  # population standard deviation, as np.std computes

    table = PrettyTable()
    table.field_names = [variable, 'mean', 'standard deviation']
    table.add_row((variable, mu, sigma))
    display(HTML(table.get_html_string()))

    fig, ax = plt.subplots(figsize=(10,5))
    ax.hist(values, bins=zoom)
    ymin, ymax = ax.get_ylim()

    # -3 .. +3 inclusive, so the lines are symmetric around the mean.
    for n_sigma in range(-3, 4):
        position = mu + n_sigma*sigma
        if n_sigma == 0:
            ax.vlines(position, ymin, ymax, alpha=1, color='red')
        else:
            distance = abs(n_sigma)
            ax.vlines(position, ymin, ymax/distance/(zoom/10), alpha=1/distance, color='black')

    # Render the figure inside the widget output each time a control changes.
    plt.show()

interactive(children=(Dropdown(description='variable', index=367, options=('Unnamed: 0', 'V160001', 'V160001_o…

## 5. Compare two categorical (or ordinal) variables
 

In [15]:
from ipywidgets import interact
import pandas as pd
import scipy.stats as scs

@interact(dependent_variable=data.columns.sort_values(),
          independent_variable=data.columns.sort_values())
def categorical_table(dependent_variable='V161002', independent_variable='V161019', drop_na=True):
    """Cross-tabulate two variables and run a chi-square test of independence.

    Parameters
    ----------
    dependent_variable : str
        Column of ``data`` shown as the rows of the table.
    independent_variable : str
        Column of ``data`` shown as the columns of the table.
    drop_na : bool
        When True, each selected column that can be read as numbers is
        converted to float and rows with a value of -1 or lower are removed.
        NOTE(review): this presumes negative codes mark missing answers --
        confirm against the codebook. Columns that cannot be converted are
        left untouched.

    Prints the chi-square statistic and p-value (rounded to 5 decimals) and
    returns the table of counts, which the widget then displays.
    """
    selected = [independent_variable, dependent_variable]
    # Copy only the columns in use (de-duplicated in case both dropdowns match).
    df = data[sorted(set(selected))].copy()

    if drop_na:
        for variable in selected:
            try:
                numeric = df[variable].astype(float)
            except (TypeError, ValueError):
                # Text-valued column: there are no numeric codes to filter on.
                continue
            df[variable] = numeric
            # .copy() so the next pass assigns into a real frame, not a slice.
            df = df[df[variable] > -1].copy()

    cross_tab = pd.crosstab(df[dependent_variable], df[independent_variable])

    if cross_tab.size == 0:
        # chi2_contingency raises on an empty table; explain instead.
        print("NO RESPONSES LEFT TO COMPARE")
        return cross_tab

    result = scs.chi2_contingency(cross_tab)
    chi_sq, p_val = result[0], result[1]
    print("chi-sq = {}, p-val = {}".format(round(chi_sq,5), round(p_val, 5)))
    return cross_tab

interactive(children=(Dropdown(description='dependent_variable', index=18, options=('Unnamed: 0', 'V160001', '…

## 6. Compare a categorical variable with a numeric/ordinal variable

In [13]:
from ipywidgets import interact
import numpy as np
import pandas as pd
import scipy.stats as scs
import statsmodels.api as sm
from matplotlib import pyplot as plt
from statsmodels.formula.api import ols

@interact(categorical_variable=data.columns.sort_values(),
          numeric_variable=data.columns.sort_values())
def categorical_table(categorical_variable='V161002', numeric_variable='V161086',drop_na=True):
    """Compare a numeric variable across the groups of a categorical variable.

    Parameters
    ----------
    categorical_variable : str
        Column of ``data`` that defines the groups (between 2 and 15 distinct values).
    numeric_variable : str
        Column of ``data`` holding the numbers to compare between groups.
    drop_na : bool
        When True, each selected column that can be read as numbers is
        converted to float and rows with a value of -1 or lower are removed.
        NOTE(review): this presumes negative codes mark missing answers --
        confirm against the codebook.

    Draws one semi-transparent histogram of the numeric variable per group,
    fits ``numeric ~ C(categorical)`` by ordinary least squares, and returns a
    table of pairwise t-tests between groups with the columns
    't', 'P>|t|', 'pvalue-hs' and 'reject-hs'. Prints a message and returns
    nothing when the chosen variables are unsuitable.
    """
    # Copy only the columns in use (the set de-duplicates identical choices).
    df = data[sorted({categorical_variable, numeric_variable})].copy()

    if drop_na:
        for variable in [categorical_variable, numeric_variable]:
            try:
                numeric = df[variable].astype(float)
            except (TypeError, ValueError):
                # Text-valued column: there are no numeric codes to filter on.
                continue
            df[variable] = numeric
            # .copy() so the next pass assigns into a real frame, not a slice.
            df = df[df[variable] > -1].copy()

    # Distinct groups, ignoring missing values so NaN is not treated as a group.
    groups = np.sort(df[categorical_variable].dropna().unique())
    # Pairwise comparison needs at least two groups; more than 15 suggests the
    # variable is not really categorical.
    if len(groups) < 2 or len(groups) > 15:
        print("PLEASE CHOOSE A CATEGORICAL VARIABLE")
        return

    try:
        # Keep the converted values so the plot and the model both see numbers.
        df[numeric_variable] = df[numeric_variable].astype(float)
    except (TypeError, ValueError):
        print("PLEASE CHOOSE A NUMERIC VARIABLE")
        return

    fig, ax = plt.subplots(figsize=(10,5))
    for level in groups:
        in_group = df.loc[df[categorical_variable] == level, numeric_variable].dropna()
        ax.hist(in_group, alpha=.5, bins='doane')
    ax.legend(groups)
    plt.show()

    # Fit on a two-column frame with fixed, formula-safe names. Interpolating
    # the real column names into the formula fails for names that are not
    # valid identifiers (for example 'Unnamed: 0').
    model_df = pd.DataFrame({
        'outcome': df[numeric_variable],
        'group': df[categorical_variable],
    })
    res = ols("outcome ~ C(group)", model_df).fit()
    pw = res.t_test_pairwise("C(group)")
    return pw.result_frame[['t','P>|t|','pvalue-hs','reject-hs']]
    

interactive(children=(Dropdown(description='categorical_variable', index=18, options=('Unnamed: 0', 'V160001',…