# Welcome to your Quantitative Social Sciences analysis toolkit!

In this notebook, you will perform the following operations:

- load data
- make histograms of variables of interest
- calculate correlations between variables of interest
- calculate group differences on variables of interest

In [1]:
import pandas as pd

# Read the CSV into `df`, the frame every later cell works from.
# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, so each column's dtype is inferred from the whole column.
df = pd.read_csv('data/anes_pilot_2018.csv', low_memory=False)

In [97]:
%matplotlib inline 

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from matplotlib import pyplot as plt

@interact(x=df.columns)
def categorical_plot(x='gender'):
    """Bar chart of the number of rows of `df` per distinct value of column `x`.

    The column is picked through the widget that `interact` builds from
    `df.columns`. Counts come from `groupby(x).size()`, so rows where `x`
    is missing are left out (pandas' groupby default).

    Returns the matplotlib Axes the bars were drawn on.
    """
    fig = plt.figure(figsize=(10, 5))
    axes = fig.gca()
    group_sizes = df.groupby(x).size()
    return group_sizes.plot(kind='bar', ax=axes)

# No-op statement: the cell ends without a trailing expression.
pass

interactive(children=(Dropdown(description='x', index=284, options=('version', 'caseid', 'weight', 'weight_sps…

In [32]:
%matplotlib inline 

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

def cast(s):
    """Convert `s` to a float, returning None when it cannot be converted.

    Parameters
    ----------
    s : any value accepted by `float()` (number, numeric string, ...).

    Returns
    -------
    float or None
        `float(s)` on success; None if `float()` raises TypeError (e.g. None),
        ValueError (e.g. non-numeric text) or OverflowError (integer too
        large for a float).
    """
    try:
        return float(s)
    # Only conversion failures are swallowed; a bare `except:` would also
    # hide KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return None

@interact(dependent=df.columns.sort_values(), independent=df.columns.sort_values())
def comparison_plot(dependent='ord_ftmetoo', independent='gender', bins=list(range(10,100,5))):
    """Plot column `dependent` of `df` split by column `independent`.

    Rows missing either column are dropped first. If `independent` has more
    than 15 distinct values, a scatter plot of the two columns is drawn.
    Otherwise one semi-transparent histogram of `dependent` is overlaid per
    value of `independent`.

    Parameters
    ----------
    dependent : str
        Column whose distribution is plotted.
    independent : str
        Column used for the x axis (scatter) or for grouping (histograms).
    bins
        Forwarded unchanged to the histogram call.

    Returns
    -------
    The matplotlib Axes that was drawn on.
    """
    plt.figure(figsize=(10,5))
    ax = plt.gca()

    # Total rows vs. rows with a non-missing `dependent` value.
    print(len(df[dependent]), len(df[dependent].dropna()))

    # Work on a private copy so the column assignment below never touches df.
    dff = df.copy()
    dff = dff.dropna(subset=[dependent, independent])

    if len(dff[independent].unique()) > 15:
        plt.scatter(dff[independent], dff[dependent])
        ax.set_xlabel(independent)
        ax.set_ylabel(dependent)
        return ax

    # Best-effort numeric clean-up. If it fails, the histogram is attempted on
    # whatever state `dff` is in, so the plot call is written once, after the try.
    # NOTE(review): `cast` is looked up among the notebook globals each time the
    # widget fires; a later cell in this notebook defines another `cast`
    # (integer conversion), which is the one used here once that cell has run.
    try:
        dff[dependent] = dff[dependent].apply(cast)
        # Drop values >= 900 when the column mean is below 900 -- presumably
        # these are large sentinel codes; TODO confirm against the codebook.
        if dff[dependent].mean() < 900:
            dff = dff[dff[dependent]<900]
    except Exception:
        # Deliberate fallback (plot without the clean-up); `Exception` rather
        # than a bare except so KeyboardInterrupt is not swallowed.
        pass

    dff.groupby(independent)[dependent].plot(ax=ax, kind='hist', alpha=.33, bins=bins)

    ax.set_xlabel(dependent)
    ax.set_title('{} by {}'.format(dependent, independent))
    ax.legend()
    return ax


interactive(children=(Dropdown(description='dependent', index=428, options=('acaapprove', 'acaapprove_page_tim…

In [45]:
import statsmodels.api as sm

from statsmodels.formula.api import ols


def cast(s):
    """Convert `s` to an int, returning None when it cannot be converted.

    Note that `int()` truncates floats toward zero (3.9 -> 3) and rejects
    decimal strings such as '3.5'.

    Parameters
    ----------
    s : any value accepted by `int()` (number, integer string, ...).

    Returns
    -------
    int or None
        `int(s)` on success; None if `int()` raises TypeError (e.g. None),
        ValueError (e.g. non-integer text, NaN) or OverflowError (infinity).
    """
    try:
        return int(s)
    # Only conversion failures are swallowed; a bare `except:` would also
    # hide KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return None


@interact(dependent=df.columns.sort_values(), independent=df.columns.sort_values())
def comparison_plot(dependent='fttrump', independent='gender'):
    """Fit and print an OLS regression of `dependent` on `independent`.

    Rows missing either column are dropped, both columns are passed through
    `cast`, and the model is fitted with the statsmodels formula interface.
    When `independent` has fewer than 10 distinct values it enters the
    formula as a categorical term, `C(independent)`; otherwise it is used
    as-is.

    Parameters
    ----------
    dependent : str
        Column name used on the left-hand side of the formula.
    independent : str
        Column name used on the right-hand side of the formula.

    Returns
    -------
    None after printing the regression summary, or the error message as a
    string if any step raises.
    """
    # Total rows vs. rows with a non-missing `dependent` value.
    print(len(df[dependent]), len(df[dependent].dropna()))

    # Work on a private copy so the column assignments below never touch df.
    dff = df.copy()
    dff = dff.dropna(subset=[dependent, independent])

    try:
        dff[dependent] = dff[dependent].apply(cast)
        dff[independent] = dff[independent].apply(cast)

        # Drop values >= 900 when the column mean is below 900 -- presumably
        # these are large sentinel codes; TODO confirm against the codebook.
        if dff[dependent].mean() < 900:
            dff = dff[dff[dependent] < 900]

        # Few distinct values -> treat the predictor as categorical.
        if len(dff[independent].unique()) < 10:
            formula = '{} ~ C({})'.format(dependent, independent)
        else:
            formula = '{} ~ {}'.format(dependent, independent)

        # Only the formula model is fitted. An earlier array-based fit was
        # discarded immediately and could fail on values `cast` turned into
        # missing, which kept this model from ever running.
        results = ols(formula, data=dff).fit()

        print(results.summary())

    except Exception as e:
        return str(e)


interactive(children=(Dropdown(description='dependent', index=195, options=('acaapprove', 'acaapprove_page_tim…

In [37]:
%debug

> [0;32m<ipython-input-36-57ad18846727>[0m(39)[0;36mcomparison_plot[0;34m()[0m
[0;32m     35 [0;31m        [0mprint[0m[0;34m([0m[0mresults[0m[0;34m.[0m[0msummary[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     36 [0;31m        [0;32mreturn[0m [0mpd[0m[0;34m.[0m[0mcrosstab[0m[0;34m([0m[0mdf[0m[0;34m[[0m[0mindependent[0m[0;34m][0m[0;34m,[0m [0mdf[0m[0;34m[[0m[0mdependent[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     37 [0;31m[0;34m[0m[0m
[0m[0;32m     38 [0;31m    [0;32mexcept[0m [0mException[0m [0;32mas[0m [0me[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 39 [0;31m        [0;32mreturn[0m [0;34m'comparison does not run'[0m [0;34m+[0m [0me[0m[0;34m.[0m[0mmessage[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> u
*** Oldest frame
ipdb> e
*** NameError: name 'e' is not defined
ipdb> d
*** Newest frame
ipdb> e
*** NameError: name 'e' is not defined
ipdb> d
*** Ne