# Welcome to your Quantitative Social Sciences Analysis Toolkit!

Use this notebook to perform the following operations:

- load data
- make histograms of variables of interest
- calculate group differences on variables of interest
- calculate correlations between variables of interest

## 1. load survey data into the notebook 
(Run this cell first, and re-run it every time you close and reopen the notebook.)

In [130]:
import pandas as pd
import logging
import statsmodels.api as sm

# Read the survey CSV into the notebook-wide frame `df`.
# low_memory=False makes pandas parse the file in one pass rather than in
# internal chunks, which avoids mixed-dtype columns.
df = pd.read_csv('data/anes_pilot_2018.csv', low_memory=False)

# Run a PCA over the four `ft*` columns and store the scores of the first
# component (`comp_0`) as a new `race_positive` column.
thermometer_columns = ['ftasian', 'ftblack', 'fthisp', 'ftmuslim']
pc = sm.PCA(df[thermometer_columns])
df['race_positive'] = pc.scores['comp_0']

## 2. generate demographics table


In [127]:

# Age relative to 2018, derived from the birth-year column.
df['age'] = 2018 - df['birthyr']

# Recode the numeric gender codes on the DataFrame *before* grouping, so the
# group keys are the readable labels; then summarise age per group.
# The parenthesised chain is the cell's last expression, so it is displayed.
(
    df.replace({'gender': {1: 'male', 2: 'female'}})
      .groupby('gender')
      .agg({'age': ['mean', 'std', 'size']})
      .reset_index()
)

Unnamed: 0_level_0,gender,age,age,age
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,size
0,female,47.834882,17.396965,1399
1,male,51.562216,16.381059,1101


## 3. visualize distribution of an individual variable


In [128]:
%matplotlib inline 

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from matplotlib import pyplot as plt

@interact(x=df.columns)
def categorical_plot(x='gender'):
    """Draw a bar chart of how many rows of ``df`` carry each value of column ``x``.

    ``interact`` offers every column of ``df`` as a choice for ``x`` and
    re-runs this function whenever the selection changes.

    Returns the matplotlib Axes holding the bars.
    """
    _, axes = plt.subplots(figsize=(10, 5))
    rows_per_value = df.groupby(x).size()
    return rows_per_value.plot.bar(ax=axes)

# Placeholder statement; it has no effect of its own.
pass

interactive(children=(Dropdown(description='x', index=284, options=('version', 'caseid', 'weight', 'weight_sps…

## 4. visualize relationship between two variables 

In [155]:
%matplotlib inline 

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import statsmodels.api as sm
from matplotlib import pyplot as plt

def cast(s):
    """Convert ``s`` to a float, returning ``None`` when it cannot be converted.

    Only conversion failures are absorbed (wrong type, unparseable text, or an
    integer too large for a float). Anything else, such as a keyboard
    interrupt, propagates to the caller.
    """
    try:
        return float(s)
    except (TypeError, ValueError, OverflowError):
        return None


    
@interact(dependent=df.columns.sort_values(), independent=df.columns.sort_values())
def comparison_plot(dependent='fttrump', independent='impeach1', bins=list(range(10,100,5))):
    """Plot how ``dependent`` is distributed across the values of ``independent``.

    Both columns are coerced to floats, and rows where either one is missing
    or non-numeric are dropped. If ``independent`` then has more than 15
    distinct values, a scatter plot of the two columns is drawn. Otherwise
    one semi-transparent histogram of ``dependent`` is overlaid per value of
    ``independent``.

    Parameters
    ----------
    dependent : str
        Column of ``df`` whose distribution is plotted.
    independent : str
        Column of ``df`` used as the x axis (scatter) or as the grouping
        variable (histograms).
    bins
        Forwarded unchanged as the histogram ``bins`` argument.
        NOTE(review): under ``interact`` the list default is presumably
        rendered as a dropdown, so a single int from the list arrives here;
        confirm.

    Returns
    -------
    The matplotlib Axes that was drawn on.
    """
    # NOTE(review): values >= 900 are presumably survey missing-value codes;
    # confirm against the codebook.
    sentinel_floor = 900

    _, ax = plt.subplots(figsize=(10, 5))

    # Coerce with pandas rather than the notebook-level ``cast`` helper: a
    # later cell redefines ``cast`` with int semantics, which would silently
    # truncate values here once that cell has run. Only the selected columns
    # are touched, so the full frame is not copied on every widget change.
    columns = list(dict.fromkeys([dependent, independent]))
    dff = pd.DataFrame(
        {name: pd.to_numeric(df[name], errors='coerce').astype(float) for name in columns}
    ).dropna()

    # Report the rows actually kept (complete on both selected columns).
    msg = "nrows: {}; nrows after:{}".format(str(len(df)), str(len(dff)))
    logging.warning(msg)

    if dff.empty:
        logging.warning("no numeric rows left to plot")
        return ax

    # Many distinct x values: treat the relationship as continuous.
    if dff[independent].nunique() > 15:
        ax.scatter(dff[independent], dff[dependent])
        ax.set_xlabel(independent)
        ax.set_ylabel(dependent)
        return ax

    # Drop sentinel-range values, but only when they look like outliers;
    # a column whose mean is itself >= 900 is left untouched.
    if dff[dependent].mean() < sentinel_floor:
        dff = dff[dff[dependent] < sentinel_floor]

    dff.groupby(independent)[dependent].plot(ax=ax, kind='hist', alpha=.4, bins=bins)

    ax.set_xlabel(dependent)
    ax.legend(title=independent)
    return ax


interactive(children=(Dropdown(description='dependent', index=195, options=('acaapprove', 'acaapprove_page_tim…

In [157]:
## 5. statistically analyze relationship between two variables 

In [65]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols


def cast(s):
    """Convert ``s`` to an int, returning ``None`` when it cannot be converted.

    Floats are truncated toward zero (``3.9`` becomes ``3``). Only conversion
    failures are absorbed: wrong type, unparseable text, NaN, or infinity.
    Anything else, such as a keyboard interrupt, propagates to the caller.

    Note: an earlier cell of this notebook defines a float-based function with
    this same name; running this cell rebinds ``cast`` to the int version.
    """
    try:
        return int(s)
    except (TypeError, ValueError, OverflowError):
        return None


@interact(dependent=df.columns.sort_values(), independent=df.columns.sort_values())
def comparison_test(dependent='birthyr', independent='gender'):
    """Fit and print an OLS regression of ``dependent`` on ``independent``.

    Both columns are coerced to numbers, and rows where either one is missing
    or non-numeric are dropped. ``independent`` enters the model as a
    categorical term ``C(...)`` when it has fewer than 10 distinct values or
    its name contains ``'ord_'``; otherwise it enters as a plain numeric term.

    Parameters
    ----------
    dependent : str
        Column of ``df`` used as the response.
    independent : str
        Column of ``df`` used as the predictor.

    Returns
    -------
    A cross-tabulation of the two columns, or ``None`` if anything fails
    (the error is logged rather than raised).
    """
    # NOTE(review): values >= 900 are presumably survey missing-value codes;
    # confirm against the codebook.
    sentinel_floor = 900

    try:
        # Coerce with pandas so fractional values (e.g. PCA scores) survive;
        # an element-wise int() conversion would truncate them.
        columns = list(dict.fromkeys([dependent, independent]))
        dff = pd.DataFrame(
            {name: pd.to_numeric(df[name], errors='coerce') for name in columns}
        ).dropna()

        # Report the rows actually kept (complete on both selected columns).
        msg = "nrows: {}; nrows after:{}".format(str(len(df)), str(len(dff)))
        logging.warning(msg)

        # Store whole-number columns as ints so categorical levels are
        # labelled like `[T.2]` rather than `[T.2.0]` in the summary.
        whole_number_columns = {
            name: 'int64' for name in columns if (dff[name] % 1 == 0).all()
        }
        dff = dff.astype(whole_number_columns)

        # Drop sentinel-range values, but only when they look like outliers;
        # a column whose mean is itself >= 900 is left untouched.
        if dff[dependent].mean() < sentinel_floor:
            dff = dff[dff[dependent] < sentinel_floor]

        if dff[independent].nunique() < 10 or 'ord_' in independent:
            formula = '{} ~ C({})'.format(dependent, independent)
        else:
            formula = '{} ~ {}'.format(dependent, independent)
        results = ols(formula, data=dff).fit()

        print(results.summary())
        # NOTE(review): the table is built from the raw `df`, not the filtered
        # rows the model was fitted on; confirm this is intended.
        return pd.crosstab(df[independent], df[dependent])

    except Exception as e:
        # Best effort: keep the widget alive and surface the problem in the log.
        logging.error(e)


interactive(children=(Dropdown(description='dependent', index=20, options=('acaapprove', 'acaapprove_page_timi…