In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import scale
from tqdm import tqdm
from scipy.stats.distributions import halfcauchy
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import matplotlib
import matplotlib.font_manager as font_manager
# import seaborn as sns
from sklearn.model_selection import train_test_split
import joblib

from facsimile.eval import FACSIMILEOptimiser

In [29]:
# Record the current working directory; every path below is relative to it.
import os
cwd = os.getcwd()
cwd  # not echoed: only a cell's final expression is displayed by default

##### PLOT STYLING SHARED BY ALL FIGURES ######
matplotlib.rcParams.update({
    'font.weight': 'light',
    'axes.facecolor': '#fbfbfb',
})

pal = ['#4f4f4f', '#B80044', '#0e79b2']

### SPECIFY LOAD AND SAVE FILES
from utils import get_timestamp

# Labels that end up in the output file names.
namenote = 'preprocessed_FACSIMILE'
exp_date = '7-6'  # earlier runs used '29B', '5-25-28' and '6-19'

# Survey export to process. Inputs used for other runs, kept for provenance:
#   results/surveys/20230530034621_survey_responses_diff_effs_combined.csv
#   results/surveys/20230623183156_survey_responses_diff_effs_combined.csv
#   results/surveys/20240214133158_survey_responses_diff_effs_combined_22A.csv
#   results/surveys/20240214133504_survey_responses_diff_effs_combined_22B.csv
#   results/surveys/20240214133618_survey_responses_diff_effs_combined_29A.csv
#   results/surveys/20240214133648_survey_responses_diff_effs_combined_29B.csv
data_file = 'results/surveys/20230710105652_survey_responses_diff_effs_combined.csv'

# Destination of the factor scores: timestamp, experiment label and note are
# substituted into the file name, in that order.
save_file = 'results/transdiagnostic_scores/%s_transdiagnostic_scores_diff_effs_%s_%s.csv' % (
    get_timestamp(),
    exp_date,
    namenote,
)


In [30]:
# Read the survey export selected in the setup cell; the first CSV column
# becomes the row index.
# (An earlier version read 'results/surveys/20230527050343_survey_responses_diff_effs.csv'.)
transQs = pd.read_csv(filepath_or_buffer=data_file, index_col=0)

# Peek at a single item column as a quick check of the load.
transQs.loc[:, 'SDS_18']

5c9d88bb2b3c77001744ec1c    2
62dc6fd89dd8473b3671357e    0
5c41e9263be7b70001fcf4ab    2
5f4412624052727181839e3a    1
5d0c90dc3382180015624ad9    0
                           ..
641441b04f9c12eac8c1be7b    2
55da1c4669dbc30010b67569    0
5df50a49a98a7a3924e4137e    2
640785ce903930be7597e7c1    2
613dd5fb512aabe8d5d32393    1
Name: SDS_18, Length: 80, dtype: int64

### Preprocessing steps
As described in GitHub issue: https://github.com/alexKhopkins/optimising-transdiagnostic-measurement/issues/1

In [31]:
### STEP 1: MOVE ZUNG 15 to ZUNG 16
# The responses stored under SDS_15 are copied into SDS_16, and SDS_15 is then
# zeroed. All-zero columns are skipped by the `.any()` checks in steps 3 and 4.
transQs['SDS_16'] = transQs['SDS_15'].values
transQs['SDS_15'] = 0

### STEP 2: REVERSE CODING FOR SDS, STAI, BIS, AES
# Item numbers, per measure, whose responses are flipped.
reversed_items = {
    'SDS': [11, 12, 14, 16, 17, 18, 20],
    'STAI': [1, 3, 7, 10, 13, 16, 19],
    'BIS': [9, 13, 20],
    'AES': [1, 2, 7, 8, 16, 17, 18],
}

# Number of response options per measure; responses at this point run from 0
# to (options - 1), which the assertion below enforces for the observed maximum.
number_of_options = {
    'SDS': 4,
    'STAI': 4,
    'BIS': 4,
    'AES': 4,
}

for questionnaire, item_numbers in reversed_items.items():
    top_score = number_of_options[questionnaire] - 1
    for item_number in item_numbers:
        column = f"{questionnaire}_{item_number}"
        observed_max = transQs[column].max()
        # Guard against a different response coding (or a re-run of this cell):
        # the flip below is only valid when responses span 0..top_score.
        assert observed_max == top_score, "Unexpected maximum choice %f for measure %s %s! Double-check" %(observed_max, questionnaire, item_number)
        transQs[column] = top_score - transQs[column]

### STEP 3: subtract 1 for OCI-R & LSAS across all items [i.e. scores start at 0 rather than 1]

## First shift every column that holds any non-zero response up by one...
# NOTE(review): a collected item whose responses are all 0 at this point is
# indistinguishable from an uncollected item and is left unshifted.
collected = [name for name in transQs.columns if transQs[name].any()]
transQs[collected] = transQs[collected] + 1

## ...then undo that shift for the measures that should start at 0.
measures_to_subtract = ['OCI', 'LSAS']

for tag in measures_to_subtract:
    for name in transQs.columns:
        if tag not in name or not transQs[name].any():
            continue
        transQs[name] = transQs[name] - 1

### STEP 4: EAT has a particular scoring system, such that rating 1 (always) should be coded as "3", rating 2 (usually) as "2", rating 3 (often) as "1", and ratings 4:6 (sometimes to never) as "0".
# Equivalent to max(4 - rating, 0) applied to every collected EAT response.
eat_columns = [name for name in transQs.columns if 'EAT' in name and transQs[name].any()]
for name in eat_columns:
    transQs[name] = (4 - transQs[name]).clip(lower=0)

# Persist the preprocessed responses under a timestamped file name.
preprocessed_name = '%s_survey_responses_preprocessed_%s.csv' %(get_timestamp(), exp_date)
transQs.to_csv(os.path.join('results', 'surveys', preprocessed_name))

### Continuing with remaining categorization: applying the item weights to compute factor scores

In [32]:
# Load the weights table, using the first CSV column as the row index.
weights = pd.read_csv(
    filepath_or_buffer="weights_for_nura_for71itemsCollected.csv",
    index_col=0,
)

weights

Unnamed: 0,AD,Compul,SW
SDS_10,0.023366,0.061498,0.003532
SDS_11,0.025924,0.022927,-0.005279
SDS_12,0.024973,-0.008134,-0.002747
SDS_13,0.007590,0.092179,-0.001233
SDS_14,0.048909,-0.031658,0.004617
...,...,...,...
AES_8,0.048456,-0.005335,-0.022130
AES_16,0.058373,-0.011705,-0.006310
AES_17,0.062410,-0.035699,0.004229
AES_18,0.082899,-0.003842,-0.013277


In [33]:
# Display the response table as a visual check before the weights are applied.
transQs

Unnamed: 0,SDS_1,SDS_2,SDS_3,SDS_4,SDS_5,SDS_6,SDS_7,SDS_8,SDS_9,SDS_10,...,AES_9,AES_10,AES_11,AES_12,AES_13,AES_14,AES_15,AES_16,AES_17,AES_18
5c9d88bb2b3c77001744ec1c,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,2,3,2
62dc6fd89dd8473b3671357e,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,4,4,4
5c41e9263be7b70001fcf4ab,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,2,1,2
5f4412624052727181839e3a,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,2,2,2
5d0c90dc3382180015624ad9,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641441b04f9c12eac8c1be7b,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,2,1,1
55da1c4669dbc30010b67569,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,3,3,3
5df50a49a98a7a3924e4137e,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,1
640785ce903930be7597e7c1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,3,1,3


In [34]:
## LINEAR PREDICTION OF THE FACTOR SCORES FROM THE WEIGHTS
# Every row of `weights` except the last is used as a per-item coefficient, and
# its row label selects the matching column of transQs. The last row is added as
# a constant offset to every participant — presumably the intercept; TODO confirm
# against the weights file.
item_names = weights.index[:-1]
coefficients = weights.values[:-1]
offset = weights.values[-1]

X = transQs[item_names]

factor_score_pred = X.dot(coefficients) + offset
factor_score_pred.columns = ['AD', 'Compul', 'SW']
factor_score_pred

Unnamed: 0,AD,Compul,SW
5c9d88bb2b3c77001744ec1c,-0.045044,-0.910653,-0.117039
62dc6fd89dd8473b3671357e,2.190002,-1.258477,-1.651439
5c41e9263be7b70001fcf4ab,-0.268940,-0.708605,-1.108053
5f4412624052727181839e3a,-0.325557,-0.638963,0.481617
5d0c90dc3382180015624ad9,2.190002,-1.258477,-1.651439
...,...,...,...
641441b04f9c12eac8c1be7b,-1.044494,-1.041249,-0.470772
55da1c4669dbc30010b67569,1.641487,0.769038,1.081201
5df50a49a98a7a3924e4137e,-1.776493,-0.229570,-0.080461
640785ce903930be7597e7c1,-0.499722,-0.215601,0.135432


In [35]:
# Make sure the scores are a DataFrame with the three factor columns, labelled
# by the same row index as the survey responses, then preview the first rows.
factor_score_pred = pd.DataFrame(
    data=factor_score_pred,
    columns=['AD', 'Compul', 'SW'],
)
factor_score_pred.index = transQs.index
factor_score_pred.head(5)

Unnamed: 0,AD,Compul,SW
5c9d88bb2b3c77001744ec1c,-0.045044,-0.910653,-0.117039
62dc6fd89dd8473b3671357e,2.190002,-1.258477,-1.651439
5c41e9263be7b70001fcf4ab,-0.26894,-0.708605,-1.108053
5f4412624052727181839e3a,-0.325557,-0.638963,0.481617
5d0c90dc3382180015624ad9,2.190002,-1.258477,-1.651439


In [36]:
# Write the factor scores to the timestamped path built in the setup cell.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists first (a fresh checkout may not have it yet).
output_dir = os.path.dirname(save_file)
if output_dir:  # dirname is '' for a bare file name; makedirs('') would raise
    os.makedirs(output_dir, exist_ok=True)

factor_score_pred.to_csv(save_file)
