In [1]:
import numpy as np
import mat73
import scipy
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import fcluster
from matplotlib import pyplot as plt
from scipy.spatial.distance import pdist
import torch
import pandas as pd
import pickle
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet

In [2]:
# Prefix prepended to every CSV path read or written further down;
# an empty string means paths are resolved against the working directory.
my_directory = ""

In [3]:
# Load the MATLAB workspace file. Despite its name, `df` is indexed by
# workspace variable name later in this file (df['vars'], df['sex12'],
# df['age1'], df['IDPs1'], ...), i.e. it is used as a mapping rather than as a
# pandas DataFrame.
df = mat73.loadmat('workspace13d.mat')

In [4]:
# Design matrix: the raw variables in df['vars'] followed by two extra
# columns, df['sex12'] and the element-wise square of df['age1'].
sex_column = df['sex12'].reshape(len(df['sex12']), 1)
age_squared_column = df['age1'].reshape(len(df['age1']), 1) ** 2
X_nIDPs = np.concatenate([df['vars'], sex_column, age_squared_column], axis=1)

In [5]:
# Column codes for X_nIDPs: the codes stored in the workspace plus the two
# synthetic codes used for the appended sex and squared-age columns.
appended_codes = np.array(['0-0.0', '1-0.0'])
nIDP_numbers = np.concatenate([np.array(df['varsVARS']), appended_codes])

In [6]:
# Human-readable column headers for X_nIDPs, extended with labels for the two
# appended columns.
appended_names = np.array(['sex', 'age_squared'])
nIDP_names = np.concatenate([np.array(df['varsHeader']), appended_names])

In [7]:
# Target: natural log of the IDP column whose name matches the string below.
target_column_mask = (
    np.array(df['IDP_names']) == 'IDP_T1_SIENAX_grey_normalised_volume'
).reshape(-1)
y_IDP = np.log(np.array(df['IDPs1'])[:, target_column_mask])

# True where the (log) target is NaN.
y_missing_inds = np.isnan(y_IDP)

# Flat vector holding only the non-missing target values.
y_IDP_comp = y_IDP[~y_missing_inds]

In [8]:
# Keep only the rows of the design matrix whose target value is present.
rows_with_target = ~y_missing_inds.reshape(-1)
X_nIDPs_comp = X_nIDPs[rows_with_target, :]

In [9]:
# Maps a selected variable code (as a string) to the column name used in the
# exported tables. Codes are looked up with nums_to_names_dict[str(code)].
#
# NOTE: the original literal listed key '23334' twice ('femur_total_area_r'
# and, later, 'femur_total_area_right'). Python keeps the last value for a
# repeated key, so the first entry was dead. Only the effective value
# ('femur_total_area_right') is kept here so exported column names do not
# change.
nums_to_names_dict = {
    '0': 'sex',
    '1': 'age_sq',
    '137': 'number_of_treatments',
    '1558': 'alc_intake_freq',
    '1568': 'avg_weekly_rw',
    '1588': 'avg_weekly_beer_cider',
    '1797': 'father_still_alive',
    '20002-1220.0': 'diabetes_self_rep',
    '20002-1065.0': 'hypertension_self_rp',
    '1835': 'mother_still_alive',
    '20009': 'age_first_non_cancer_disease',
    '20116': 'smoking_status',
    '22671': 'mean_carotid_IMT_120',
    '22674': 'mean_carotid_IMT_150',
    '23101': 'body_fat_free_mass',
    '23106': 'impedance_whole_body',
    '23109': 'impedance_right_arm',
    '23113': 'leg_fat_free_mass_r',
    '23121': 'arm_fat_free_mass_r',
    '23202': 'L1_L4_average_width',
    '23213': 'femur_MBD_T_score_r',
    '23293': 'femur_MBD_T_score_l',
    '23301': 'femur_wards_MBD_T_score_l',
    '23305': 'head_bone_area_bone_size',
    '23306': 'head_BMC',
    '23323': 'num_digit_matches_attp',
    '23325': 'femur_neck_bone_area_size_l',
    '23326': 'femur_neck_bone_area_size_r',
    '23342': 'femur_wards_bone_area_r',
    '2443': 'diabetes_diagnosis',
    '23333': 'femur_total_area_l',
    '23334': 'femur_total_area_right',
    '2345': 'ever_bowel_cancer_screening',
    '30720': 'cystatin_c',
    '30750': 'glycated_haemoglobin',
    '4100': 'ankle_spacing_width_l',
    '50': 'standing_height',
    '6150-4.0': 'high_blood_pressure',
    '4537': 'job_satisfaction',
    '48': 'waist_circumference',
    '699': 'length_current_address',
    '4080': 'systolic_BP',
    '20003-1140868226.0': 'treatment_aspirin',
    '20003-1140884600.0': 'treatment_metformin',
    '41270-I10': 'ICD10_hypertension_primary',
    '41270-E119': 'ICD10_diabetes_primary',
    '41270-Z864': 'ICD10_psychoactive_SA_primary',
    '41204-I10.secondary': 'ICD10_hypertension_secondary',
    '41204-Z864.secondary': 'ICD10_psychoactive_SA_secondary',
    '41204-Z921.secondary': 'ICD10_antineoplastic_chemotherapy_secondary',
    '12143': 'weight_pre_imaging',
    '12144': 'height_pre_imaging',
}

In [10]:
# Variable codes that are matched verbatim against nIDP_numbers. Codes not in
# this list get the suffixes '-2.0', '-1.0' and '-0.0' appended before lookup.
# The original list repeated '20002-1220.0' and '20002-1065.0'; the repeats
# had no effect on membership tests and are removed.
exceptions_list = [
    '20002-1220.0',
    '20002-1065.0',
    '6150-4.0',
    '20003-1140868226.0',
    '20003-1140884600.0',
    '41270-I10',
    '41270-E119',
    '41270-Z864',
    '41204-I10.secondary',
    '41204-Z864.secondary',
    '41204-Z921.secondary',
]

In [91]:
# Build and export one table per imputation method.
#
# For each (n_vars, method) pair:
#   1. read the selected variable codes from
#      selected_var_numbers_unique_<method>_LASSO_<n_vars>.csv (second CSV
#      column; the first column is skipped),
#   2. create a frame with the log target, the Townsend index and one column
#      per selected variable,
#   3. fill each variable column from X_nIDPs_comp. Codes in exceptions_list
#      are matched verbatim; every other code is tried with the suffixes
#      '-2.0', '-1.0', '-0.0' in that order, and each later suffix only fills
#      rows that are still NaN,
#   4. print the per-column fraction of missing values and write
#      final_df_<method>_<n_vars>.csv.
for n_vars in [15]:
    for method in ['CompVars', 'mean', 'SoftImpute', 'IterativeImputer']:
        selection_path = (my_directory + "selected_var_numbers_unique_" + method
                          + "_LASSO_" + str(n_vars) + ".csv")
        my_vars = pd.read_csv(selection_path).to_numpy()[:, 1].reshape(-1)
        var_codes = [str(v) for v in my_vars]

        my_cols = (['normalised_grey_matter_log', 'townsend_indx']
                   + [nums_to_names_dict[code] for code in var_codes])
        # The frame layout relies on exactly n_vars selected variables; fail
        # with an explicit message instead of a pandas shape error.
        if len(var_codes) != n_vars:
            raise ValueError(
                'expected ' + str(n_vars) + ' selected variables in '
                + selection_path + ', found ' + str(len(var_codes))
            )

        n_rows = y_IDP_comp.shape[0]
        my_df = pd.DataFrame(np.full((n_rows, len(my_cols)), np.nan), columns=my_cols)
        my_df['normalised_grey_matter_log'] = y_IDP_comp
        my_df['townsend_indx'] = X_nIDPs_comp[
            :, nIDP_names == 'Townsend deprivation index at recruitment (0.0)'
        ].reshape(-1)

        for offset, code in enumerate(var_codes):
            col_pos = offset + 2  # columns 0 and 1 hold the target and the Townsend index

            if code in exceptions_list:
                candidates = [code]
            else:
                candidates = [code + ending for ending in ['-2.0', '-1.0', '-0.0']]

            # Coalesce the candidate source columns in a plain NumPy buffer so
            # no pandas Series is ever used as a positional (.iloc) mask.
            merged = np.full(n_rows, np.nan)
            found_source = False
            for candidate in candidates:
                hits = np.flatnonzero(nIDP_numbers == candidate)
                if hits.size == 0:
                    continue
                found_source = True
                # If a code appears more than once, use its first column.
                source = X_nIDPs_comp[:, hits[0]]
                take = np.isnan(merged) & ~np.isnan(source)
                merged[take] = source[take]

            if not found_source:
                # Previously this case either produced a silent all-NaN column
                # or failed with an unrelated shape error.
                print('WARNING: no source column found for variable ' + code
                      + ' (' + my_cols[col_pos] + '); column left as NaN')

            my_df.iloc[:, col_pos] = merged

        print(my_df.isna().mean(axis=0))
        my_df.to_csv(my_directory + 'final_df_' + method + '_' + str(n_vars) + '.csv')




        

normalised_grey_matter_log                     0.000000
townsend_indx                                  0.000928
sex                                            0.000000
age_sq                                         0.000000
weight_pre_imaging                             0.000000
height_pre_imaging                             0.000000
hypertension_self_rp                           0.000000
diabetes_self_rep                              0.000000
treatment_aspirin                              0.000000
treatment_metformin                            0.000000
ICD10_hypertension_secondary                   0.000000
ICD10_psychoactive_SA_secondary                0.000000
ICD10_antineoplastic_chemotherapy_secondary    0.000000
ICD10_diabetes_primary                         0.000000
ICD10_hypertension_primary                     0.000000
ICD10_psychoactive_SA_primary                  0.000000
high_blood_pressure                            0.000000
dtype: float64
normalised_grey_matter_log     0.