In [None]:
import os, re, glob, json, sys, scipy
import scipy.stats  # explicit: `import scipy` alone is not guaranteed to load the stats submodule
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Resolve the project root (two levels above the working directory) and
# derive the data folder from it; the root is echoed for a quick sanity check.
base_dir = os.path.realpath('../..')
data_dir = base_dir + '/Data'
print(base_dir)

In [None]:
# Load the cleaned survey table. The first CSV column becomes the index and
# SubID is read as text rather than being parsed as a number.
ID_dat = pd.read_csv(
    data_dir + '/Cleaned/Surveys/ID_dat.csv',
    index_col=0,
    dtype={'SubID': str},
)

In [None]:
# Preview the first rows of the loaded survey table.
ID_dat.head()

In [None]:
# Show every column name of the survey table as one comma-separated string.
colnames = ','.join(name for name in ID_dat.columns)
colnames

## Exclusions

In [None]:
# Read the exclusion list and keep its 'sub' column as a plain Python list.
excl = (
    pd.read_csv(base_dir + '/Data/Subjects_and_exclusions/exclude_ideology.csv')
    .loc[:, 'sub']
    .values
    .tolist()
)
excl

In [None]:
# Keep only the rows whose SubID is not on the exclusion list. SubID was read
# as text, so it is cast to int before being matched against `excl`.
ID_dat_clean = ID_dat[
    ~ID_dat['SubID'].astype(int).isin(excl)
]
ID_dat_clean.shape

## Plotting settings

In [None]:
# Take a 7-step reversed red-blue palette and use its two extremes
# (first and last colour) as the default two-colour cycle.
ic_palette = sns.color_palette('RdBu_r', n_colors=7)
sns.set_palette([ic_palette[0], ic_palette[-1]])

## Did we manage to approximately control some variables?

In [None]:
# Columns compared between groups, and the axis label shown for each one.
# The two lists are parallel: entry i of one describes entry i of the other.
vars_to_control = [
    'Age',
    'school_num',
    'OwnIncome',
    'IdeologyScale_1',
]
vars_nice_names = [
    'Age',
    'Education\n(years)',
    'Annual\nIncome (bin)',
    'Ideology\n(Conservatism)',
]

In [None]:
# Map each income bracket label to its ordinal bin number (1 = lowest bracket).
# The brackets are listed in ascending order, so the bin is the list position.
income_mapper = {
    bracket: rank
    for rank, bracket in enumerate(
        [
            'Less than $20,000',
            '$20,000 to $34,999',
            '$35,000 to $49,999',
            '$50,000 to $74,999',
            '$75,000 to $99,999',
            '$100,000 to $149,999',
            '$150,000 to $199,999',
            '$200,000 or more',
        ],
        start=1,
    )
}

In [None]:
# Compare the two groups on every control variable: one panel per variable
# with both group distributions overlaid, titled with an independent-samples
# t-test (degrees of freedom, t statistic, p-value) between the groups.
sns.set_context('talk')
precision = 5  # not read anywhere in this cell; kept in case another cell uses it
poles = ['Liberal','Conservative']
fig, axes = plt.subplots(ncols = len(vars_to_control), nrows = 1,
                         figsize = [len(vars_to_control)*4,4])
for vi,var in enumerate(vars_to_control):
    ax = axes[vi]
    compdat = []
    for pole in poles:
        # Rows are selected by matching the `pole` column against the first
        # letter of the group label ('L' / 'C').
        abbr = pole[0]
        dat = ID_dat_clean.query('pole == @abbr')[var]
        if var == 'OwnIncome':
            # Income is stored as a bracket label; convert it to its ordinal bin.
            # Labels missing from income_mapper become NaN and are dropped below.
            dat = dat.map(income_mapper)
        # Drop missing values before plotting and testing: ttest_ind propagates
        # NaN by default (a single missing value makes t and p NaN), and the
        # degrees of freedom must count only the observations actually tested.
        dat = dat.dropna()
        compdat.append(dat)
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # consider migrating to histplot/kdeplot/rugplot when upgrading.
        sns.distplot(dat, rug = True, kde = True, label = pole, ax = ax)
    stats = scipy.stats.ttest_ind(compdat[0],compdat[1])
    df = len(compdat[0]) + len(compdat[1]) - 2
    # With kde = True the histogram is normalised, so the y-axis is a density,
    # not a frequency count.
    ax.set(ylabel = 'Density', xlabel = vars_nice_names[vi],
           title = 't(%i) = %.2f, p = %.3f'%(df,stats[0],stats[1]))
    # Keep the plotted handles but show the short group codes in the legend.
    h,_labels = ax.get_legend_handles_labels()
    ax.legend(h,['L','C'])
plt.tight_layout()
# plt.legend(loc = [1.1,.5])

# plt.savefig(base_dir + '/Results/Individual_differences/Ideology_self_report.pdf',
#             transparent = True, bbox_inches = 'tight')
# plt.savefig(base_dir + '/Results/Individual_differences/Ideology_self_report.png',
#             transparent = True, bbox_inches = 'tight', dpi = 500)

## Count Gender

In [None]:
# Number of participants in each pole x gender cell. Group size is used rather
# than a count of the Age column, because count() skips rows where Age is
# missing and would under-report those participants.
ID_dat_clean.groupby(['pole','Gender']).size().reset_index(name = 'N')

## Describe age by pole

In [None]:
# Summary statistics of Age (as produced by describe()) for each pole group.
ID_dat_clean[['pole','Age']].groupby(['pole'],as_index=False).describe()