This script loads the survey data, converts all responses to numeric types, computes subscale and total scores (taking reverse-coded items into account), and saves the result as ID_dat.csv.

In [None]:
import os
import numpy as np
import pandas as pd
import scipy as sp
import scipy.spatial
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
import re

## Load data

In [None]:
# Project root is two directory levels above the current working directory;
# the data folder lives directly underneath it.
base_dir = os.path.realpath(os.path.join(os.pardir, os.pardir))
data_dir = f'{base_dir}/Data'
print(base_dir)

In [None]:
ID_dat = pd.read_csv(data_dir + '/Raw/Surveys' +
             '/DUMP_Political_Polarization_IDItems_October+30%2C+2019_18.15.csv',
                    dtype = {'SubID':str},
                    )
ID_dat = ID_dat.iloc[2:,:] # Remove extra header rows

In [None]:
ID_dat.head()

In [None]:
ID_dat['SubID'].head()

## Clean data

##### Remove rows with missing or non-numeric subject IDs, then apply exclusions

In [None]:
ID_dat = ID_dat[pd.to_numeric(ID_dat['SubID'], errors = 'coerce').notnull()].reset_index(drop=True)

In [None]:
ID_dat['SubID'] = pd.to_numeric(ID_dat['SubID'])

In [None]:
ID_dat = ID_dat.query('SubID < 90')

In [None]:
exclusions = [1,3]
ID_dat = ID_dat.loc[~ID_dat['SubID'].isin(exclusions),:].reset_index(drop=True)

In [None]:
ID_dat['SubID'].unique()

In [None]:
ID_dat['SubID'].head()

##### Remove columns of no interest

In [None]:
# Discard the first 17 columns by position and keep everything from column 17 on.
# NOTE(review): presumably these are export metadata columns -- confirm against
# the raw file, since a change in column order would silently shift this cut.
ID_dat = ID_dat.iloc[:,17:].reset_index(drop=True)

## Explore data

In [None]:
# All column names joined into one comma-separated string. The survey sections
# below run a regex over this string to locate each survey's item columns.
colnames = ','.join(ID_dat.columns)
colnames

## Add prescreen data

In [None]:
# Load the prescreen survey and derive the demographic and ideology variables
# that are later merged into ID_dat.
file_path = data_dir + '/Cleaned/Surveys/Prescreen_data.csv'
prescreen_dat = pd.read_csv(file_path)
# Drop the first two rows (presumably extra header rows, as in the raw survey
# export -- confirm). The explicit copy makes the column assignments below act
# on an independent frame rather than on a slice of the loaded one, which
# avoids pandas' SettingWithCopyWarning.
prescreen_dat = prescreen_dat.iloc[2:,:].copy()
for int_col in ['subNr', 'IdeologyScale_1', 'Age']:
    prescreen_dat[int_col] = prescreen_dat[int_col].astype(int)
prescreen_dat = prescreen_dat.sort_values(by = 'subNr')
# Activism score = number of comma-separated entries in ActivismList. Every
# ', ' (comma + space) is first turned into a space, so only commas that are
# not followed by a space count as separators.
prescreen_dat['activism_score'] = prescreen_dat['ActivismList'].apply(
    lambda x: len(x.replace(', ',' ').split(',')))
# School level = the first two characters of SchoolCompleted parsed as an int.
prescreen_dat['school_num'] = prescreen_dat['SchoolCompleted'].apply(lambda x: int(x[:2]))
# Pole label: 'C' when IdeologyScale_1 is above 50, otherwise 'L'.
poles = np.array(['L','C'])
prescreen_dat['pole'] = np.where(prescreen_dat['IdeologyScale_1'] > 50, poles[1], poles[0])
# Keep only the variables used downstream.
prescreen_dat = prescreen_dat[['subNr','Age','Gender','Race','school_num',
                               'IdeologyScale_1','pole','IdentityChoice','PartyChoice',
                               'activism_score','ControlLife']]
display(prescreen_dat.head())

In [None]:
ID_dat = ID_dat.merge(prescreen_dat, left_on = 'SubID', right_on = 'subNr')

In [None]:
ID_dat['pole'].value_counts()

##### Visualize prescreen data

In [None]:
# One panel per prescreen variable, with the two poles overlaid in each panel.
colors = ['b','r']
cols = ['IdeologyScale_1','activism_score','Age','school_num']
bins = [np.arange(0,101,5), np.arange(0.5,9.6,1), np.arange(0,90,5), np.arange(11.5,22.6,1)]
fig, axes = plt.subplots(1,len(cols), figsize = [16,4])
for ax, col, col_bins in zip(axes, cols, bins):
    for pole, color in zip(poles, colors):
        sns.distplot(prescreen_dat.query('pole == @pole')[col],
                     kde = True, rug = True, bins = col_bins,
                     color = color, label = pole, ax = ax)
    # Attach the legend to the panel being drawn. plt.legend() would act on
    # pyplot's "current" axes, which is not the axes drawn into via ax=.
    ax.legend()
    ax.set(title = col)
axes[0].set_title('Ideology\n0 = liberal, 100 = conservative');
axes[0].set(xlim = [-10,110]);
axes[1].set(xlim = [0,10]);

## Parse political value surveys

##### SECS

In [None]:
survey_basename = 'SECS'
survey_dtype = int

Identify survey columns

In [None]:
survey_cols = []
for match in re.finditer(survey_basename,colnames):
    survey_cols.append(colnames[match.span()[0]:match.span()[1]+3].strip(','))
[print(survey_cols[i], end = '\t') for i in range(len(survey_cols))];

Adjust dtype

In [None]:
survey_dat = ID_dat[survey_cols].copy()
survey_dat = survey_dat.astype(survey_dtype)
ID_dat[survey_cols] = survey_dat[survey_cols]

Check missing data

In [None]:
survey_dat.isna().sum().sum()

Compute subscales and total

In [None]:
social_col_indices = [0,2,3,6,7,10,11]
economic_col_indices = [1,4,5,8,9]
reverse = [0,4]
survey_dat.iloc[:,reverse] = 100 - survey_dat.iloc[:,reverse]
survey_dat['SECS_total'] = survey_dat.sum(axis=1)
survey_dat['SECS_social'] = survey_dat.iloc[:,social_col_indices].sum(axis=1)
survey_dat['SECS_economic'] = survey_dat.iloc[:,economic_col_indices].sum(axis=1)
survey_dat['SubID'] = ID_dat['SubID']
survey_dat.head()

Add to ID_dat

In [None]:
ID_dat = ID_dat.merge(survey_dat[['SubID','SECS_total','SECS_social','SECS_economic']], on = 'SubID')

##### RWA

In [None]:
survey_basename = 'RWA'

Identify survey columns

In [None]:
survey_cols = []
for match in re.finditer(survey_basename,colnames):
    survey_cols.append(colnames[match.span()[0]:match.span()[1]+3].strip(','))
[print(survey_cols[i], end = '\t') for i in range(len(survey_cols))];

In [None]:
survey_dat = ID_dat[survey_cols].copy()

Set dtype or map data

In [None]:
response_mapping = {'Very strongly disagree':1,
                    'Strongly disagree':2,
                    'Moderately disagree':3,
                    'Slightly disagree':4,
                    'Neutral':5,
                    'Slightly agree':6,
                    'Moderately agree':7,
                    'Strongly agree':8,
                    'Very strongly agree':9}
survey_dat = survey_dat.applymap(lambda x: response_mapping[x])

In [None]:
ID_dat[survey_cols] = survey_dat[survey_cols]

Check for missing data

In [None]:
survey_dat.isna().sum().sum()

Compute survey totals

In [None]:
reverse = [0,2,4,6,8,10]
survey_dat.iloc[:,reverse] = 10 - survey_dat.iloc[:,reverse]

In [None]:
subscale_items = {'RWA_Aggression':[1,4,7,10],
                 'RWA_Submission':[2,5,8,11],
                 'RWA_Conventionalism':[0,3,6,9]}
survey_dat['RWA_total'] = survey_dat.sum(axis=1)
for subscale, subscale_cols in subscale_items.items():
    print(subscale, subscale_cols)
    survey_dat[subscale] = survey_dat.iloc[:,subscale_cols].sum(axis=1)
survey_dat['SubID'] = ID_dat['SubID']
survey_dat.head()

Add to ID_dat

In [None]:
ID_dat = ID_dat.merge(survey_dat[[
    'SubID','RWA_total','RWA_Aggression',
    'RWA_Submission','RWA_Conventionalism']], on = 'SubID')

##### LWA

In [None]:
survey_basename = 'LWA'

Identify survey columns

In [None]:
survey_cols = []
for match in re.finditer(survey_basename,colnames):
    survey_cols.append(colnames[match.span()[0]:match.span()[1]+3].strip(','))
[print(survey_cols[i], end = '\t') for i in range(len(survey_cols))];

In [None]:
survey_dat = ID_dat[survey_cols].copy()

Set dtype or map data

In [None]:
response_mapping = {'I disagree completely':1,
                    'I disagree somewhat':2,
                    'I disagree slightly':3,
                    'Neutral/undecided':4,
                    'I slightly agree':5,
                    'I somewhat agree':6,
                    'I completely agree':7}
survey_dat = survey_dat.applymap(lambda x: response_mapping[x])

In [None]:
ID_dat[survey_cols] = survey_dat[survey_cols].copy()

Check for missing data

In [None]:
survey_dat.isna().sum().sum()

Compute survey totals

Note: LWA_16 was not listed as reverse coded in the list of surveys for this study, but it clearly should be.

In [None]:
reverse = [1,3,5,6,8,10,12,15,17,18]
survey_dat.iloc[:,reverse] = 8 - survey_dat.iloc[:,reverse]

In [None]:
survey_dat['LWA_total'] = survey_dat.sum(axis=1)
survey_dat['SubID'] = ID_dat['SubID']
survey_dat.head()

Add to ID_dat

In [None]:
ID_dat = ID_dat.merge(survey_dat[[
    'SubID','LWA_total']], on = 'SubID')

##### S-SVS

In [None]:
survey_basename = 'SSVS'
survey_dtype = int
survey_cols = []
for match in re.finditer(survey_basename,colnames):
    survey_cols.append(colnames[match.span()[0]:match.span()[1]+3].strip(','))
[print(survey_cols[i], end = '\t') for i in range(len(survey_cols))];

In [None]:
ID_dat[survey_cols] = ID_dat[survey_cols].applymap(lambda x: int(x[0][0]))

Mean-center per subject to find hierarchy of values (see Schwartz, 2012):

In [None]:
# Subtract each subject's own mean rating from all of that subject's item
# ratings. sub(..., axis=0) matches the row means to the rows by index label,
# so this works for any number of item columns instead of assuming exactly 10.
item_ratings = ID_dat[survey_cols]
ID_dat[survey_cols] = item_ratings.sub(item_ratings.mean(axis=1), axis=0)

In [None]:
# Preview the mean-centered item columns.
ID_dat[survey_cols].head()

Check for missing data

In [None]:
# Total number of missing cells across all item columns of this survey.
ID_dat[survey_cols].isna().sum().sum()

##### SDO-7(s)

In [None]:
survey_basename = 'SDO'
survey_dtype = int
survey_cols = []
for match in re.finditer(survey_basename,colnames):
    survey_cols.append(colnames[match.span()[0]:match.span()[1]+3].strip(','))
[print(survey_cols[i], end = '\t') for i in range(len(survey_cols))];

In [None]:
survey_dat = ID_dat[survey_cols].copy()

Set dtype or map data

In [None]:
response_mapping = {'Strongly Oppose':1,
                    'Somewhat Oppose':2,
                    'Slightly Oppose':3,
                    'Neutral':4,
                    'Slightly Favor':5,
                    'Somewhat Favor':6,
                    'Strongly Favor':7}
survey_dat = survey_dat.applymap(lambda x: response_mapping[x])

In [None]:
ID_dat[survey_cols] = survey_dat[survey_cols]

Check for missing data

In [None]:
survey_dat.isna().sum().sum()

Compute survey totals

In [None]:
reverse = [2,3,6,7]
survey_dat.iloc[:,reverse] = 8 - survey_dat.iloc[:,reverse]

In [None]:
subscale_items = {'SDO_dominance':[0,1,2,3],
                 'SDO_antiegalitarianism':[4,5,6,7]}
survey_dat['SDO_total'] = survey_dat.sum(axis=1)
for subscale, subscale_cols in subscale_items.items():
    print(subscale, subscale_cols)
    survey_dat[subscale] = survey_dat.iloc[:,subscale_cols].sum(axis=1)
survey_dat['SubID'] = ID_dat['SubID']
survey_dat.head()

Add to ID_dat

In [None]:
ID_dat = ID_dat.merge(survey_dat[[
    'SubID','SDO_total','SDO_dominance',
    'SDO_antiegalitarianism']], on = 'SubID')

## Parse cognitive surveys

##### Need for closure (NFC) (short version)

In [None]:
survey_basename = 'NFC'
survey_dtype = int
survey_cols = []
for match in re.finditer(survey_basename,colnames):
    survey_cols.append(colnames[match.span()[0]:match.span()[1]+3].strip(','))
[print(survey_cols[i], end = '\t') for i in range(len(survey_cols))];

In [None]:
survey_dat = ID_dat[survey_cols].copy()

In [None]:
np.unique(survey_dat.values)

In [None]:
# Response labels -> 1..6 scores; an unlisted label raises KeyError.
response_mapping = {'Completely Disagree':1,
                    'Moderately Disagree':2,
                    'Slightly Disagree':3,
                    'Slightly Agree':4,
                    'Moderately Agree':5,
                    'Completely Agree':6}
survey_dat = survey_dat.applymap(lambda x: response_mapping[x])
# Write the numeric responses back to ID_dat, as the other mapped surveys do;
# without this the NFC item columns would be saved as raw text labels.
ID_dat[survey_cols] = survey_dat[survey_cols]

Check for missing data

In [None]:
print(survey_dat.shape)
print(survey_dat.isna().sum().sum())

Compute survey totals

In [None]:
survey_dat['NFC_mean'] = survey_dat.mean(axis=1,skipna=True)
survey_dat['SubID'] = ID_dat['SubID']
survey_dat.head()

In [None]:
sns.distplot(survey_dat['NFC_mean'], rug=True)

Add to ID_dat

In [None]:
ID_dat = ID_dat.merge(survey_dat[[
    'SubID','NFC_mean']], on = 'SubID')

In [None]:
sns.scatterplot(data = ID_dat, x = 'IdeologyScale_1', y = 'NFC_mean')

##### Intolerance of uncertainty (IUS)

In [None]:
survey_basename = 'IUS'
survey_dtype = int
survey_cols = []
for match in re.finditer(survey_basename,colnames):
    survey_cols.append(colnames[match.span()[0]:match.span()[1]+3].strip(','))
[print(survey_cols[i], end = '\t') for i in range(len(survey_cols))];

In [None]:
survey_dat = ID_dat[survey_cols].copy()

In [None]:
response_mapping = {'1 (Not at all characteristic of me)':1,
                    '2':2,
                    '3 (Somewhat characteristic of me)':3,
                    '4':4,
                    '5 (Entirely characteristic of me)':5,
                    'Prefer Not to Respond':np.nan}
survey_dat = survey_dat.applymap(lambda x: response_mapping[x])

In [None]:
ID_dat[survey_cols] = survey_dat[survey_cols]

Check for missing data

In [None]:
survey_dat.isna().sum().sum()

Compute survey totals

In [None]:
survey_dat['IUS_mean'] = survey_dat.mean(axis=1,skipna=True)
survey_dat['IUS_sum'] = survey_dat['IUS_mean'] * 27
survey_dat['SubID'] = ID_dat['SubID']
survey_dat.head()

In [None]:
sns.distplot(survey_dat['IUS_mean'], rug=True)

Add to ID_dat

In [None]:
ID_dat = ID_dat.merge(survey_dat[[
    'SubID','IUS_mean','IUS_sum']], on = 'SubID')

In [None]:
sns.scatterplot(data = ID_dat, x = 'IdeologyScale_1', y = 'IUS_sum')

##### Correlate NFC and IUS

In [None]:
scipy.stats.pearsonr(ID_dat['NFC_mean'],ID_dat['IUS_mean'])

In [None]:
sns.scatterplot(data = ID_dat, x = 'NFC_mean', y = 'IUS_mean')

## Interpersonal reactivity index

In [None]:
survey_basename = 'IRI'
survey_dtype = int
survey_cols = []
for match in re.finditer(survey_basename,colnames):
    survey_cols.append(colnames[match.span()[0]:match.span()[1]+3].strip(','))
[print(survey_cols[i], end = '\t') for i in range(len(survey_cols))];

In [None]:
survey_dat = ID_dat[survey_cols].copy()

Set dtype or map data

In [None]:
# Response labels -> 0..4 scores; an unlisted label raises KeyError.
response_mapping = {'Does not describe me at all':0,
                    'Does not describe me':1,
                    'Describes me somewhat':2,
                    'Describes me well':3,
                    'Describes me very well':4}
# Missing responses stay missing. They are detected with pd.isna rather than
# looked up under an np.nan dictionary key: NaN never compares equal to itself,
# so such a key only matches when the value is the very same NaN object.
survey_dat = survey_dat.applymap(
    lambda x: np.nan if pd.isna(x) else response_mapping[x])

In [None]:
# Store the numeric responses in ID_dat.
ID_dat[survey_cols] = survey_dat[survey_cols]

Check for missing data

In [None]:
# Total number of missing cells across all item columns of this survey.
survey_dat.isna().sum().sum()

Compute survey totals

In [None]:
# Reverse-coded items are flipped as 4 - x.
# NOTE(review): the author marked this list "UPDATE!" -- the reverse-coded item
# positions have not been finalized, so IRI_total should be treated as
# provisional until they are verified against the questionnaire.
reverse = [0,1,16,17,18,19] ## UPDATE!
survey_dat.iloc[:,reverse] = 4 - survey_dat.iloc[:,reverse]

In [None]:
# TODO: subscale definitions are not filled in yet; only the total is computed.
# subscale_items = 
# skipna=False: a subject with any missing item gets a missing (NaN) total.
survey_dat['IRI_total'] = survey_dat.sum(axis=1, skipna=False)
# for subscale, subscale_cols in subscale_items.items():
#     print(subscale, subscale_cols)
#     survey_dat[subscale] = survey_dat.iloc[:,subscale_cols].sum(axis=1)
survey_dat['SubID'] = ID_dat['SubID']
survey_dat.head()

Add to ID_dat

In [None]:
# Attach the total score to ID_dat by subject ID.
ID_dat = ID_dat.merge(survey_dat[[
    'SubID','IRI_total']], on = 'SubID')

## Add labels for student participants

In [None]:
Brown_undergrad_dat = pd.read_csv(data_dir +'/Cleaned/Surveys/Brown_undergrad_data.csv')
display(Brown_undergrad_dat.head())
print(Brown_undergrad_dat.shape)

In [None]:
scan_day_dat = pd.read_csv(data_dir +'/Cleaned/Surveys/Scan_day_data.csv')
display(scan_day_dat.head())
print(scan_day_dat.shape)

In [None]:
len(Brown_undergrad_dat['SubID'].unique())

In [None]:
len(ID_dat['SubID'].unique())

In [None]:
print(ID_dat.shape)
ID_dat = ID_dat.merge(Brown_undergrad_dat, on = 'SubID').merge(scan_day_dat, on = 'SubID')
print(ID_dat.shape)

## Save

In [None]:
# Write the combined table to the cleaned-survey folder. No `index` argument is
# passed, so pandas' default applies and the row index is written as the first
# (unnamed) column.
ID_dat.to_csv(data_dir +'/Cleaned/Surveys/ID_dat.csv')