# Welcome to your Quantitative Social Sciences Analysis Toolkit!



## 1. load survey data into the notebook 
(Run this cell first, and run it again every time you close and reopen the notebook.)

In [4]:
import pandas as pd # load a specialized piece of software that will help us with the analysis
# Read the survey table and keep only face-to-face interviews in one step.
# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, so each column gets one consistently inferred type.
# Rows where V161002 equals -1 are removed: that code marks any respondent
# who wasn't interviewed face-to-face.
data = (
    pd.read_csv('data/anes_pilot_2016.csv', low_memory=False)
    .loc[lambda frame: frame['V161002'] != -1]
)

## 2. display data

In [5]:
# Display a snapshot of the raw data turned on its side: the first column
# lists your variables, and every other column holds one respondent's answers.
data.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1170,1171,1172,1173,1174,1175,1176,1177,1178,1179
Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1170,1171,1172,1173,1174,1175,1176,1177,1178,1179
version,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,...,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904,ANES2016TimeSeries_20190904
V160001,1,2,3,4,5,6,7,8,9,10,...,1171,1172,1173,1174,1175,1176,1177,1178,1179,1180
V160001_orig,300001,300002,300003,300004,300006,300007,300008,300012,300018,300020,...,302866,302867,302871,302872,302873,302874,302876,302877,302878,302880
V160101,0.827,1.0806,0.3878,0.3596,0.647,0.7062,3.9604,0.962,0.9761,0.6182,...,1.4991,0.7705,0.4732,0.6149,1.3355,1.4078,1.0449,1.323,0.552,1.4148
V160101f,0.8877,1.1605,0.4161,0.3852,0.6931,0.7588,4.2512,1.0325,1.0481,0.6642,...,1.6069,0.8256,0.5026,0.6608,1.4348,1.513,1.1226,1.4202,0.5931,1.5059
V160101w,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V160102,0.842,1.0133,0.3672,0.3663,0.6463,0.6881,4.6151,0.9426,1.0097,0.6005,...,1.5394,0.7803,0.4861,0,1.246,1.4238,1.0454,0,0.5419,1.4584
V160102f,0.9271,1.0841,0.3985,0.4183,0.7262,0.7246,4.7902,1.041,1.0733,0.6375,...,1.6521,0.8468,0.4959,0,1.2929,1.4991,1.139,0,0.6142,1.4423
V160102w,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 3. get category counts for a categorical variable


V161002 = gender  
V161027 = for whom did respondent vote?

In [6]:
from prettytable import PrettyTable
from ipywidgets import interact
from IPython.core.display import display, HTML

@interact(variable=data.columns.sort_values())
def categorical_table(variable='V161002'):
    """Render an HTML table counting the respondents in each category of `variable`.

    Parameters
    ----------
    variable : str
        Name of a column of `data`. `interact` offers the sorted column
        names of `data` as the choices for this argument.

    Notes
    -----
    Rows whose value for `variable` is missing (NaN) are not counted, because
    `groupby` leaves NaN keys out by default. Nothing is returned; the table
    is shown with `display`.
    """
    # One entry per distinct value of the column, ordered by that value.
    counts = data.groupby(variable).size()

    table = PrettyTable()
    table.field_names = [variable, 'Count']
    # Walk the counts Series directly instead of reset_index().iterrows():
    # iterrows packs each row into a single Series, which upcasts the integer
    # count to a float (e.g. 12.0) whenever the category values are floats.
    for category, count in counts.items():
        table.add_row((category, count))

    display(HTML(table.get_html_string()))

interactive(children=(Dropdown(description='variable', index=18, options=('Unnamed: 0', 'V160001', 'V160001_or…

## 4. get average and spread for a continuous variable

V161267 = age

In [42]:
from prettytable import PrettyTable
from ipywidgets import interact, widgets
import numpy as np
from IPython.core.display import display, HTML
from matplotlib import pyplot as plt

@interact(variable=data.columns.sort_values())
def categorical_table(variable='V161267', zoom=widgets.IntSlider(min=10,max=100,step=5,value=10),drop_na=False):
    """Show the mean and standard deviation of `variable` and plot its histogram.

    Parameters
    ----------
    variable : str
        Name of a column of `data`; it needs to hold numeric values for the
        mean, standard deviation and histogram to be computed.
    zoom : int
        Number of histogram bins (a slider from 10 to 100 in steps of 5).
    drop_na : bool
        When True, only values greater than -1 are kept before anything is
        computed. This mirrors the filter used by the two-variable comparison
        cell of this notebook. NOTE(review): presumably the negative values
        are the survey's missing-data codes -- confirm against the codebook.

    Notes
    -----
    Nothing is returned; the table is shown with `display` and the figure is
    drawn with matplotlib.
    """
    values = data[variable]
    if drop_na:
        try:
            as_float = values.astype(float)
        except (ValueError, TypeError):
            # Column cannot be read as numbers; leave it untouched, exactly
            # as the comparison cell does for non-numeric columns.
            pass
        else:
            values = as_float[as_float > -1]

    mu = np.mean(values)
    # np.std keeps its default ddof=0, i.e. the population standard deviation.
    sigma = np.std(values)

    table = PrettyTable()
    table.field_names = [variable, 'mean', 'standard deviation']
    table.add_row((variable, mu, sigma))
    display(HTML(table.get_html_string()))

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(values, bins=zoom)
    ax.set(xlabel=variable, ylabel='count')
    # Read the limits after drawing the histogram so the markers are scaled
    # to the tallest bar.
    ymin, ymax = ax.get_ylim()

    # Mark the mean (red, full height) and every whole standard deviation from
    # -3 to +3. range() excludes its end value, so the stop must be 4 for the
    # +3 sigma marker to be drawn as well.
    for n_sigma in range(-3, 4):
        position = n_sigma * sigma + mu
        if n_sigma == 0:
            ax.vlines(position, ymin, ymax, alpha=1, color='red')
        else:
            distance = abs(n_sigma)
            # Markers get shorter and fainter the further they are from the mean.
            ax.vlines(position, ymin, ymax / distance / zoom, alpha=1 / distance, color='black')

interactive(children=(Dropdown(description='variable', index=367, options=('Unnamed: 0', 'V160001', 'V160001_o…

## 5. Compare two categorical variables

In [89]:
from ipywidgets import interact
import scipy.stats as scs

@interact(dependent_variable=data.columns.sort_values(),
          independent_variable=data.columns.sort_values())
def categorical_table(dependent_variable='V161027', independent_variable='V161002', drop_na=False):
    """Cross-tabulate two columns of `data` and print a chi-square test of independence.

    Parameters
    ----------
    dependent_variable : str
        Column whose categories become the rows of the table.
    independent_variable : str
        Column whose categories become the columns of the table.
    drop_na : bool
        When True, each column that can be converted to float is converted
        and only rows with a value greater than -1 are kept. Columns that
        cannot be converted are left as they are. NOTE(review): presumably
        the negative values are the survey's missing-data codes -- confirm
        against the codebook.

    Returns
    -------
    pandas.DataFrame
        The table of counts produced by `pd.crosstab`.
    """
    # Copy only the columns that are needed rather than the whole survey
    # table. dict.fromkeys removes the duplicate (keeping order) when both
    # arguments name the same column.
    columns = list(dict.fromkeys([dependent_variable, independent_variable]))
    df = data[columns].copy()

    if drop_na:
        for column in columns:
            try:
                as_float = df[column].astype(float)
            except (ValueError, TypeError):
                # Not a numeric column: nothing to filter on, keep it as is.
                continue
            df[column] = as_float
            # Copy after filtering so the next assignment writes to an
            # independent frame instead of a slice of this one.
            df = df[as_float > -1].copy()

    cross_tab = pd.crosstab(df[dependent_variable], df[independent_variable])

    if cross_tab.size == 0:
        # chi2_contingency raises on an empty table; report it instead.
        print("no rows left to compare, so the chi-square test was skipped")
        return cross_tab

    # The first two entries of the result are the test statistic and p-value.
    stats = scs.chi2_contingency(cross_tab)
    print("chi-sq = {}, p-val = {}".format(round(stats[0],5), round(stats[1], 5)))
    return cross_tab

interactive(children=(Dropdown(description='dependent_variable', index=58, options=('Unnamed: 0', 'V160001', '…