In [None]:
import os
import numpy as np
import pandas as pd
import scipy as sp
import scipy.spatial
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Project root, resolved three directory levels above the current working
# directory (so the result depends on where the notebook kernel was started).
base_dir = os.path.realpath(os.path.join('..', '..', '..'))
print(base_dir)
# os.path.join avoids a doubled separator when base_dir already ends in one
# (e.g. the filesystem root) and uses the platform's separator.
data_dir = os.path.join(base_dir, 'Data')

In [None]:
# Load the cleaned survey table; the first CSV column becomes the row index.
ID_dat = pd.read_csv(
    filepath_or_buffer=data_dir + '/Cleaned/Surveys/ID_dat.csv',
    index_col=0,
)

In [None]:
# Show every column name on one line, each followed by two tabs.
print(''.join(str(column_name) + '\t\t' for column_name in ID_dat.columns), end='')

In [None]:
# Display the names of all columns that contain 'IUS'.
list(filter(lambda column_name: 'IUS' in column_name, ID_dat.columns))

In [None]:
# Columns of ID_dat that are kept for building the pairwise predictors.
cols = [
    # Subject identifier
    'SubID',
    # Variables behind the predictors of interest
    'IdeologyScale_1',
    'pole',
    'activism_score',
    'NFC_mean',
    'IUS_mean',
    'PCA_comp1',
    'PCA_comp2',
    # Variables behind the control predictors
    'Scan_day',
    'Age',
    'Gender',
    'Undergrad',
    'Brown_community',
]

In [None]:
# IUS columns used for the multivariate IUS predictor: every column whose name
# contains 'IUS', minus the last two.
# NOTE(review): presumably the two dropped columns are summary scores
# (e.g. 'IUS_mean') rather than single items — confirm against ID_dat.csv.
# This must run while ID_dat still holds all original columns, i.e. before the
# cell that reduces ID_dat to `cols`.
ius_cols = [i for i in ID_dat.columns if 'IUS' in i][:-2]
# Add only the names not already present, so re-running this cell does not
# duplicate entries in `cols` (duplicates would yield duplicate columns when
# ID_dat is indexed with `cols`).
cols.extend(col for col in ius_cols if col not in cols)

In [None]:
# Every column whose name contains 'consume', followed by the subject identifier.
news_cols = [column_name for column_name in ID_dat.columns if 'consume' in column_name] + ['SubID']

In [None]:
# Independent copy of the 'consume' columns (plus SubID), taken from the full table.
news_dat = ID_dat.loc[:, news_cols].copy()

In [None]:
# Reduce ID_dat to the selected columns. This rebinds ID_dat, so the full table
# is no longer available under this name afterwards.
ID_dat = ID_dat.loc[:, cols]

In [None]:
# Preview the first five rows of the reduced table.
ID_dat.head(n=5)

## Make predictor RDMs

##### Predictors of interest
- scale_ideology_similarity
- ideology_pair (within conservatives, within liberals, between groups)
- ideology_same (within, across)
- joint_activism
(The two activism scores multiplied and normalized to be between 0 and 1 => low scores indicate neither subject has high activism, the highest scores indicate both subjects have high activism)
- joint_NFC
(The two NFC scores multiplied and normalized to be between 0 and 1 => low scores indicate neither subject has high NFC, the highest scores indicate both subjects have high NFC)
- joint_IUS
(The two IUS scores multiplied and normalized to be between 0 and 1 => low scores indicate neither subject has high IUS, the highest scores indicate both subjects have high IUS)
- sum_IUS
(The two IUS scores added together and normalized to be between 0 and 1 => low scores indicate neither subject has high IUS, the highest scores indicate both subjects have high IUS)
- IUS_sim_univariate
(1 minus the absolute difference between the two subjects' mean IUS scores, expressed as a fraction of the maximal possible difference)
- IUS_sim_multivariate
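(Pearson correlation between the two subjects' item-level IUS responses, computed over the items both subjects answered; undefined (NaN) if a subject gave the same score on every item)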
- scale_PCA1_similarity, scale_PCA2_similarity
(First and second principal components from the PCA on the political individual difference measures (social and economic conservatism scale, social dominance orientation, right-wing authoritarianism, left-wing authoritarianism, Schwartz’s Short Value Survey))

##### Controls
- scale_age_distance
- scale_scan_day_distance
- same_gender
- same_undergrad
- same_community

In [None]:
# Look at the first five rows again before building the pairwise predictors.
ID_dat.head(n=5)

In [None]:
def _ius_profile_correlation(sub1dat, sub2dat, ius_cols):
    """Pearson correlation between two subjects' responses on `ius_cols`.

    Only the columns that are non-missing for both subjects enter the
    correlation. Returns NaN when fewer than two such columns remain
    (pearsonr needs at least two points). The result can also be NaN when a
    subject gave the same score on every shared column.
    """
    # Cast to float so pearsonr gets numeric arrays even when the subject row
    # is an object-dtype Series (it mixes string and numeric columns).
    dat1 = sub1dat[ius_cols].astype(float).values
    dat2 = sub2dat[ius_cols].astype(float).values
    nonan = ~np.isnan(dat1) & ~np.isnan(dat2)  # Drop nans
    if nonan.sum() < 2:
        return np.nan
    return scipy.stats.pearsonr(dat1[nonan], dat2[nonan])[0]


def _pair_predictors(sub1, sub2, sub1dat, sub2dat, ius_cols):
    """Build one row of pairwise predictors for the ordered pair (sub1, sub2).

    `sub1dat` / `sub2dat` are the ID_dat rows of the two subjects. The returned
    dict maps predictor name to value; key order defines the column order.
    """
    row = {'SubID1': sub1, 'SubID2': sub2}
    # Ideology: 100 minus the absolute difference on the ideology scale.
    # NOTE(review): assumes the scale spans 100 points — confirm.
    row['ideology_similarity'] = 100 - np.abs(sub1dat['IdeologyScale_1'] - sub2dat['IdeologyScale_1'])
    # Party (con/lib): anything other than L/L or C/C is labelled as between.
    if (sub1dat['pole'] == 'L') and (sub2dat['pole'] == 'L'):
        row['ideology_pair'] = 'Within_lib'
        row['ideology_same'] = 'Within_group'
    elif (sub1dat['pole'] == 'C') and (sub2dat['pole'] == 'C'):
        row['ideology_pair'] = 'Within_con'
        row['ideology_same'] = 'Within_group'
    else:
        row['ideology_pair'] = 'Between'
        row['ideology_same'] = 'Between_group'
    # Joint scores: product of the two subjects' scores divided by a constant.
    # NOTE(review): the divisors 7, 6 and 5 are presumably the maximum possible
    # activism, NFC and IUS scores — confirm against the questionnaires.
    row['joint_activism'] = (sub1dat['activism_score'] * sub2dat['activism_score']) / (7 * 7)
    row['joint_NFC'] = (sub1dat['NFC_mean'] * sub2dat['NFC_mean']) / (6 * 6)
    row['joint_IUS'] = (sub1dat['IUS_mean'] * sub2dat['IUS_mean']) / (5 * 5)
    # Sum IUS
    row['sum_IUS'] = (sub1dat['IUS_mean'] + sub2dat['IUS_mean']) / (5 + 5)
    # IUS_sim_univariate: 1 minus the absolute difference divided by 4.
    # NOTE(review): 4 is presumably the largest possible difference (5 - 1) — confirm.
    row['IUS_sim_univariate'] = 1 - np.abs(sub1dat['IUS_mean'] - sub2dat['IUS_mean']) / 4.
    # IUS_sim_multivariate
    row['IUS_sim_multivariate'] = _ius_profile_correlation(sub1dat, sub2dat, ius_cols)
    # PCA: 16 minus the absolute difference on each component.
    # NOTE(review): the offset 16 is not explained in this notebook — confirm its origin.
    row['PCA1_similarity'] = 16 - np.abs(sub1dat['PCA_comp1'] - sub2dat['PCA_comp1'])
    row['PCA2_similarity'] = 16 - np.abs(sub1dat['PCA_comp2'] - sub2dat['PCA_comp2'])
    # Controls
    row['scan_day_distance'] = np.abs(sub1dat['Scan_day'] - sub2dat['Scan_day'])
    row['age_distance'] = np.abs(sub1dat['Age'] - sub2dat['Age'])
    row['same_gender'] = sub1dat['Gender'] == sub2dat['Gender']
    row['same_undergrad'] = sub1dat['Undergrad'] == sub2dat['Undergrad']
    row['same_community'] = sub1dat['Brown_community'] == sub2dat['Brown_community']
    return row


# Build the long-format predictor table: one row per ordered pair of distinct
# subjects, so every unordered pair appears twice (once in each order).
k_lim = 2000  # Safety cap: stop as soon as more than k_lim rows were produced.

_subject_ids = ID_dat['SubID'].unique()
# Look each subject's row up once (first matching row) instead of once per pair.
_subject_rows = {sub: ID_dat.loc[ID_dat['SubID'] == sub].iloc[0, :] for sub in _subject_ids}
_n_pairs_expected = len(_subject_ids) * (len(_subject_ids) - 1)

# Rows are collected in a list and turned into a DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0 and grew the frame quadratically.
_records = []
for sub1 in _subject_ids:
    print('sub-%03d ' % sub1, end='')
    for sub2 in _subject_ids:
        print('.', end='')
        if sub1 != sub2:
            _records.append(
                _pair_predictors(sub1, sub2, _subject_rows[sub1], _subject_rows[sub2], ius_cols)
            )
        if len(_records) > k_lim:
            break
    print('')
    if len(_records) > k_lim:
        break

k = len(_records)
RDMs = pd.DataFrame(_records)

# Report completion based on the number of pairs actually produced, and make
# truncation by k_lim visible instead of silent.
if k == _n_pairs_expected:
    print('All subject pairs completed.')
else:
    print('WARNING: stopped after %d of %d subject pairs (k_lim = %d); RDMs is incomplete.'
          % (k, _n_pairs_expected, k_lim))

In [None]:
# Spot-check: show every row in which subject 34 is the first member of the pair.
RDMs.query(expr='SubID1 == 34')

In [None]:
# (number of pair rows, number of predictor columns)
RDMs.shape

In [None]:
# Write the pairwise predictor table to disk without the row index.
# `index` is a boolean flag, so pass False explicitly rather than None.
RDMs.to_csv(data_dir + '/Cleaned/Surveys/predictor_RDMs_4.csv', index=False)