# Welcome to your Quantitative Social Sciences Analysis Toolkit!

Use this notebook to perform the following operations:
    - load data
    - make histograms of variables of interest
    - calculate group differences on variables of interest
    - calculate correlations between variables of interest

## 1. load survey data into the notebook 
(run this cell first, and re-run it every time you close and reopen the notebook)

In [1]:
import pandas as pd
import logging
import statsmodels.api as sm

# Load the survey responses into the notebook-level `df` that the cells below
# read from (they reference `df` directly rather than taking it as an argument).
df = pd.read_csv('data/anes_pilot_2018.csv', low_memory=False)

# Fit a principal component analysis on four `ft*` columns and store the
# `comp_0` component scores as a new derived column, `race_positive`.
# NOTE(review): the `ft*` columns are presumably feeling-thermometer ratings;
# confirm against the codebook, including whether non-response codes should be
# removed before fitting.
pc = sm.PCA(df[['ftasian', 'ftblack', 'fthisp', 'ftmuslim']])
df['race_positive'] = pc.scores['comp_0']

## 2. generate demographics table


In [2]:

# Derive an `age` column from the birth year, relative to 2018.
df['age'] = 2018 - df['birthyr']

# Demographics table: label the gender codes, then report mean, standard
# deviation and group size of `age` per gender. The parenthesised expression is
# the last statement of the cell, so the notebook renders it as the cell output.
(
    df.replace({'gender': {1: 'male', 2: 'female'}})
      .groupby('gender')
      .agg({'age': ['mean', 'std', 'size']})
      .reset_index()
)

Unnamed: 0_level_0,gender,age,age,age
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,size
0,female,47.834882,17.396965,1399
1,male,51.562216,16.381059,1101


## 3. visualize distribution of an individual variable


In [3]:
%matplotlib inline 

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from matplotlib import pyplot as plt

def cast(s):
    """Convert `s` to a float, or return None when it cannot be converted.

    Parameters
    ----------
    s : object
        Any value accepted by `float()` (numbers, numeric strings, ...).

    Returns
    -------
    float or None
        `float(s)` on success; None when the conversion fails.
    """
    try:
        return float(s)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures count as "not a number". A bare
        # `except:` would also swallow KeyboardInterrupt / SystemExit, making
        # a long `.apply(cast)` impossible to interrupt.
        return None
    
@interact(x=df.columns.sort_values())
def categorical_plot(x='gender', drop_non_responders=False):
    """Plot the distribution of a single column of `df`.

    Parameters
    ----------
    x : str
        Column to plot; the widget offers every column of `df`, sorted.
    drop_non_responders : bool
        When True, rows whose value equals -7 or exceeds 900 are removed
        before plotting.

    Returns
    -------
    The axes object returned by the pandas plotting call: a density plot when
    the column has more than 90 distinct values, otherwise a bar chart of the
    row count per value.
    """
    plt.figure(figsize=(10, 5))
    axes = plt.gca()

    # Work on a copy so the shared frame is never modified by the widget.
    frame = df.copy()
    frame[x] = frame[x].apply(cast)

    if drop_non_responders:
        values = frame[x]
        # Keep rows whose value is neither -7 nor above 900.
        frame = frame[(values != -7) & (values <= 900)]

    many_distinct_values = frame[x].nunique() > 90
    if many_distinct_values:
        return frame[x].plot(ax=axes, kind='density')

    counts_per_value = frame.groupby(x).size()
    return counts_per_value.plot(ax=axes, kind='bar')

pass

interactive(children=(Dropdown(description='x', index=210, options=('acaapprove', 'acaapprove_page_timing', 'a…

## 4. visualize relationship between two variables 

In [4]:
%matplotlib inline 

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import statsmodels.api as sm
from matplotlib import pyplot as plt
import numpy as np

def cast(s):
    """Convert `s` to a float, or return None when it cannot be converted.

    Parameters
    ----------
    s : object
        Any value accepted by `float()` (numbers, numeric strings, ...).

    Returns
    -------
    float or None
        `float(s)` on success; None when the conversion fails.
    """
    try:
        return float(s)
    except (TypeError, ValueError, OverflowError):
        # Catch conversion failures only; a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit raised while the cell is running.
        return None


    
@interact(dependent=df.columns.sort_values(), independent=df.columns.sort_values())
def comparison_plot(dependent='fttrump', independent='impeach1', 
                    bins=widgets.IntSlider(min=10,max=100,step=5,value=10),
                   drop_non_responders=False, easy_plot=False):
    """Plot how `dependent` is distributed across the values of `independent`.

    Parameters
    ----------
    dependent : str
        Column whose distribution is drawn.
    independent : str
        Column used to split the rows into groups (or used as the x axis of a
        scatter plot when it has more than 15 distinct values).
    bins : int
        Number of histogram bins; supplied by the slider widget.
    drop_non_responders : bool
        When True, rows where either column equals -7 or exceeds 900 are
        removed before plotting.
    easy_plot : bool
        When True, draw one density curve per group instead of overlaid
        histograms.

    Returns
    -------
    matplotlib axes
        The axes the plot was drawn on.
    """
    plt.figure(figsize=(10, 5))
    ax = plt.gca()

    dff = df.copy()
    dff = dff.dropna(subset=[dependent, independent])

    # Report the rows that actually survive the missing-value filter on BOTH
    # columns, not just the non-null count of the dependent column.
    msg = "nrows: {}; nrows after:{}".format(str(len(df)), str(len(dff)))
    logging.warning(msg)

    dff[dependent] = dff[dependent].apply(cast)
    dff[independent] = dff[independent].apply(cast)

    if drop_non_responders:
        for column in (dependent, independent):
            dff = dff[dff[column] != -7]
            dff = dff[dff[column] <= 900]

    # Many distinct x values: a grouped plot would be unreadable, use a scatter.
    if len(dff[independent].unique()) > 15:
        ax.scatter(dff[independent], dff[dependent])
        return ax

    n_cols = dff[independent].nunique()
    if n_cols == 0:
        # Nothing to group by (all rows filtered out or not numeric); an empty
        # colour cycle cannot be set, so stop here with an empty axes.
        logging.warning('No rows left to plot for `{}` by `{}`.'.format(dependent, independent))
        return ax

    # One colour per group: tab20 for more than five groups, jet otherwise.
    cmap = plt.cm.tab20 if n_cols > 5 else plt.cm.jet
    ax.set_prop_cycle('color', [cmap(i) for i in np.linspace(0, 1, n_cols)])

    # Best-effort removal of values >= 900 when the column mean is below 900.
    # Only this filtering step is guarded: if it fails the unfiltered data is
    # plotted. (Retrying identical plotting code after a plotting failure, as a
    # shared try/except would do, can only fail the same way again.)
    try:
        if dff[dependent].mean() < 900:
            dff = dff[dff[dependent] < 900]
    except Exception as e:
        # logging.warn is a deprecated alias that was removed in Python 3.13.
        logging.warning(e)

    grouped = dff.groupby(independent)[dependent]
    if easy_plot:
        grouped.plot(ax=ax, kind='density', alpha=1, linewidth=3)
    else:
        grouped.plot(ax=ax, kind='hist', alpha=.4, bins=bins)

    ax.legend()
    return ax
pass

interactive(children=(Dropdown(description='dependent', index=196, options=('acaapprove', 'acaapprove_page_tim…

## 5. statistically analyze relationship between two *continuous variables* 

In [5]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols


def cast(s):
    """Convert `s` to an int, or return None when it cannot be converted.

    Note that `int()` truncates floats toward zero (3.9 -> 3) and rejects
    decimal strings such as "3.0", which therefore come back as None.

    Parameters
    ----------
    s : object
        Any value accepted by `int()`.

    Returns
    -------
    int or None
        `int(s)` on success; None when the conversion fails (including NaN
        and infinite floats).
    """
    try:
        return int(s)
    except (TypeError, ValueError, OverflowError):
        # Catch conversion failures only; a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit raised while the cell is running.
        return None

@interact(dependent=df.columns.sort_values(), independent=df.columns.sort_values())
def continuous_comparison_test(dependent='fttrump', independent='impeach1', 
                    bins=widgets.IntSlider(min=10,max=100,step=5,value=10),
                   drop_non_responders=False):
    """Regress `dependent` on `independent` and print a plain-language summary.

    Fits `dependent ~ independent` with the statsmodels formula API and
    reports the sign and significance (p < 0.05) of the slope.

    Parameters
    ----------
    dependent : str
        Outcome column.
    independent : str
        Predictor column.
    bins : int
        Unused by the test; kept so the widget signature stays unchanged.
    drop_non_responders : bool
        When True, rows where either column equals -7 or exceeds 900 are
        removed before fitting.

    Returns
    -------
    None
        The summary is printed; any failure is logged instead of raised.
    """
    dff = df.copy()
    dff = dff.dropna(subset=[dependent, independent])

    # Report the rows that actually survive the missing-value filter on BOTH
    # columns, not just the non-null count of the dependent column.
    msg = "nrows: {}; nrows after:{}".format(str(len(df)), str(len(dff)))
    logging.warning(msg)

    # Convert to numbers without truncation. Casting through int() would floor
    # fractional values (e.g. the PCA-derived `race_positive` scores), which
    # distorts a test that is meant for continuous variables. Unparseable
    # entries become NaN and are dropped by the formula interface.
    dff[dependent] = pd.to_numeric(dff[dependent], errors='coerce')
    dff[independent] = pd.to_numeric(dff[independent], errors='coerce')

    if drop_non_responders:
        logging.warning('dropping na')
        for column in (dependent, independent):
            dff = dff[dff[column] != -7]
            dff = dff[dff[column] <= 900]

    try:
        # Drop values >= 900 from the outcome when its mean is below 900.
        if dff[dependent].mean() < 900:
            dff = dff[dff[dependent] < 900]

        # Single fit through the formula API (adds an intercept). The former
        # extra `sm.OLS(...)` fit was discarded immediately and could abort
        # the whole analysis on missing values.
        results = ols('{} ~ {}'.format(dependent, independent), data=dff).fit()

        t_val = results.tvalues[independent]
        p_val = results.pvalues[independent]
        significant = bool(p_val < 0.05)
        positive = bool(t_val >= 0)

        sig = {True: 'significant', False: 'insignificant'}
        sign = {True: 'positive', False: 'negative'}

        # The direction is only spelled out when the slope is significant.
        if not significant:
            explain = ''
        elif positive:
            explain = " People who respond more highly to `{}` also respond more highly to `{}`.".format(independent, dependent)
        else:
            explain = " People who respond more highly to `{}` tend to respond less highly to `{}`.".format(independent, dependent)

        # "a significant" / "an insignificant"
        article = 'a' if significant else 'an'

        result = "There is {} {} {} relationship between `{}` and `{}` (t={}, p={}).{}\n".format(
            article,
            sig[significant],
            sign[positive],
            independent, dependent,
            round(t_val, 2),
            round(p_val, 15),
            explain)
        print(result)

    except Exception as e:
        logging.error(e)


interactive(children=(Dropdown(description='dependent', index=196, options=('acaapprove', 'acaapprove_page_tim…

## 6. statistically analyze relationship between a *continuous dependent variable* and a *categorical independent variable*

In [16]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

def cast(s):
    """Convert `s` to an int, or return None when it cannot be converted.

    Note that `int()` truncates floats toward zero (3.9 -> 3) and rejects
    decimal strings such as "3.0", which therefore come back as None.

    Parameters
    ----------
    s : object
        Any value accepted by `int()`.

    Returns
    -------
    int or None
        `int(s)` on success; None when the conversion fails (including NaN
        and infinite floats).
    """
    try:
        return int(s)
    except (TypeError, ValueError, OverflowError):
        # Catch conversion failures only; a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit raised while the cell is running.
        return None

@interact(dependent=df.columns.sort_values(), independent=df.columns.sort_values())
def continuous_comparison_test(dependent='fttrump', independent='impeach1', 
                    bins=widgets.IntSlider(min=10,max=100,step=5,value=10),
                   drop_non_responders=False):
    """Test whether `dependent` differs across the categories of `independent`.

    Fits `dependent ~ C(independent)` with the statsmodels formula API and
    reports the overall F statistic and its p-value.

    NOTE(review): this function has the same name as the one defined in
    section 5, so whichever cell ran last owns the name. The name is kept to
    leave the interface unchanged; consider renaming.

    Parameters
    ----------
    dependent : str
        Continuous outcome column.
    independent : str
        Grouping column; rejected when it has more than 50 distinct values.
    bins : int
        Unused by the test; kept so the widget signature stays unchanged.
    drop_non_responders : bool
        When True, rows where either column equals -7 or exceeds 900 are
        removed before fitting.

    Returns
    -------
    None
        The summary is printed; any failure is logged instead of raised.
    """
    dff = df.copy()
    dff = dff.dropna(subset=[dependent, independent])

    # Report the rows that actually survive the missing-value filter on BOTH
    # columns, not just the non-null count of the dependent column.
    msg = "nrows: {}; nrows after:{}".format(str(len(df)), str(len(dff)))
    logging.warning(msg)

    # The outcome is continuous: convert without truncating fractional values.
    # The grouping variable keeps the integer category codes from `cast`.
    dff[dependent] = pd.to_numeric(dff[dependent], errors='coerce')
    dff[independent] = dff[independent].apply(cast)

    if dff[independent].nunique() > 50:
        # logging.exception is only meaningful inside an `except` block (it
        # appends the active traceback); here there is none, so log an error.
        logging.error(" Independent variable must be categorical.")
        return

    if drop_non_responders:
        logging.warning('dropping na')
        for column in (dependent, independent):
            dff = dff[dff[column] != -7]
            dff = dff[dff[column] <= 900]

    try:
        # Drop values >= 900 from the outcome when its mean is below 900.
        if dff[dependent].mean() < 900:
            dff = dff[dff[dependent] < 900]

        # Single fit through the formula API. The former extra `sm.OLS(...)`
        # fit was discarded immediately and could abort the whole analysis on
        # missing values.
        results = ols('{} ~ C({})'.format(dependent, independent), data=dff).fit()

        F_val = results.fvalue
        p_val = results.f_pvalue

        # Only claim a group difference when the overall F test is significant.
        if p_val < 0.05:
            template = "People who respond differently to `{}` also respond differently to `{}` (F={}, p={}).\n"
        else:
            template = "People who respond differently to `{}` do not respond significantly differently to `{}` (F={}, p={}).\n"

        result = template.format(independent, dependent, round(F_val, 2), round(p_val, 15))
        print(result)

    except Exception as e:
        logging.error(e)


interactive(children=(Dropdown(description='dependent', index=196, options=('acaapprove', 'acaapprove_page_tim…

## 7. statistically analyze relationship between two *categorical variables* 

In [18]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

from scipy.stats import chisquare, chi2_contingency

def cast(s):
    """Convert `s` to an int, or return None when it cannot be converted.

    Note that `int()` truncates floats toward zero (3.9 -> 3) and rejects
    decimal strings such as "3.0", which therefore come back as None.

    Parameters
    ----------
    s : object
        Any value accepted by `int()`.

    Returns
    -------
    int or None
        `int(s)` on success; None when the conversion fails (including NaN
        and infinite floats).
    """
    try:
        return int(s)
    except (TypeError, ValueError, OverflowError):
        # Catch conversion failures only; a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit raised while the cell is running.
        return None

@interact(dependent=df.columns.sort_values(), independent=df.columns.sort_values())
def categorical_comparison_test(dependent='gender', independent='direct2', 
                    bins=widgets.IntSlider(min=10,max=100,step=5,value=10),
                   drop_non_responders=False):
    """Chi-square test of independence between two categorical columns.

    Cross-tabulates `independent` against `dependent`, runs
    `scipy.stats.chi2_contingency` on the table and prints a one-sentence
    summary with the statistic and p-value (significance threshold 0.05).

    Parameters
    ----------
    dependent : str
        Column used for the crosstab columns.
    independent : str
        Column used for the crosstab rows.
    bins : int
        Unused by the test; kept so the widget signature stays unchanged.
    drop_non_responders : bool
        When True, rows where either column equals -7 or exceeds 900 are
        removed before testing.

    Returns
    -------
    None
        The summary is printed; any failure is logged instead of raised.
    """
    dff = df.copy()
    dff = dff.dropna(subset=[dependent, independent])

    # Report the rows that actually survive the missing-value filter on BOTH
    # columns, not just the non-null count of the dependent column.
    msg = "nrows: {}; nrows after:{}".format(str(len(df)), str(len(dff)))
    logging.warning(msg)

    dff[dependent] = dff[dependent].apply(cast)
    dff[independent] = dff[independent].apply(cast)

    if drop_non_responders:
        logging.warning('dropping na')
        for column in (dependent, independent):
            dff = dff[dff[column] != -7]
            dff = dff[dff[column] <= 900]

    try:
        # Drop values >= 900 from `dependent` when its mean is below 900.
        if dff[dependent].mean() < 900:
            dff = dff[dff[dependent] < 900]

        ct = pd.crosstab(dff[independent], dff[dependent])

        c, p, dof, expected = chi2_contingency(ct)

        significant = bool(p < 0.05)
        sig = {True: 'significant', False: 'insignificant'}
        # "a significant" / "an insignificant"
        article = 'a' if significant else 'an'

        result = ("The proportion of people responding in different ways to `{}` "
                  "differs according to `{}` in {} {} way (chi={}, p={})").format(
                      independent,
                      dependent,
                      article,
                      sig[significant],
                      round(c, 2),
                      round(p, 15))

        if dff[independent].nunique() > 50 or dff[dependent].nunique() > 50:
            logging.warning('One of your variables might be continuous.')

        print(result)

    except Exception as e:
        logging.error(e)


interactive(children=(Dropdown(description='dependent', index=210, options=('acaapprove', 'acaapprove_page_tim…