In [1]:
# import pytorch libraries
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from torch_geometric.deprecation import deprecated

import os
import csv
import copy
import math
import time
import random
import argparse
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse
from decimal import Decimal
from scipy import stats
from scipy.stats import spearmanr, rankdata, norm
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt

In [2]:
# Concatenate the genotype PCs and trailing covariates from every per-tissue
# '*.v8.covariates.txt' file into one table (rows: covariates, columns: subjects).
data_dir = '/project/pi_rachel_melamed_uml_edu/Jianfeng/Allen/src/Pytorch/02162024/eqtl_analysis/GTEx_Analysis_v8_eQTL_covariates/'
data_dict = {}
# os.listdir() returns entries in arbitrary order. Sort them so the order of the
# per-tissue frames -- and therefore which copy of a subject that appears in
# several tissues survives the de-duplication below -- is identical on every run.
files = sorted(os.listdir(data_dir))
# read all the covariate files from all the tissues
for file in files:
    if file.endswith('.v8.covariates.txt'):
        # tissue name = file name without the covariate suffix
        prefix = file.split('.v8.covariates.txt')[0]
        df = pd.read_csv(os.path.join(data_dir, file), low_memory=False, index_col=0, sep="\t")
        # first five rows (expected to be PC1..PC5)
        PCs = df.iloc[:5]
        # last three rows (expected to be pcr, platform, sex)
        others = df.iloc[-3:]
        df_sub = pd.concat([PCs, others])
        data_dict[prefix] = df_sub
# build a matrix with all the subjects
combined_df = pd.concat(data_dict.values(), axis=1)
# a subject sampled in several tissues shows up once per tissue; keep the first copy
combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]
# order the subject columns alphabetically
combined_df = combined_df.reindex(sorted(combined_df.columns), axis=1)

In [3]:
# Show the combined covariate table (rows: covariates, columns: subject IDs).
combined_df

Unnamed: 0_level_0,GTEX-1117F,GTEX-111CU,GTEX-111FC,GTEX-111VG,GTEX-111YS,GTEX-1122O,GTEX-1128S,GTEX-113IC,GTEX-113JC,GTEX-117XS,...,GTEX-ZYFC,GTEX-ZYFD,GTEX-ZYFG,GTEX-ZYT6,GTEX-ZYVF,GTEX-ZYW4,GTEX-ZYY3,GTEX-ZZ64,GTEX-ZZPT,GTEX-ZZPU
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PC1,-0.0867,0.0107,0.0099,0.0144,0.0154,0.0139,0.0145,-0.0728,0.0106,0.0139,...,-0.0934,0.0134,0.0147,0.0134,0.0118,-0.0245,0.0111,0.0137,0.0158,0.0135
PC2,-0.0132,-0.0026,-0.005,-0.0081,-0.0093,-0.0097,-0.0093,-0.0077,0.0037,-0.0056,...,-0.0143,-0.0047,-0.0026,-0.0088,0.0005,0.2918,-0.0048,-0.0048,-0.0043,-0.0053
PC3,-0.0062,0.0004,0.0304,0.0133,0.0107,0.0067,0.0327,-0.0044,-0.0116,-0.0096,...,0.0089,-0.0111,0.0054,0.0044,-0.0678,0.0227,-0.0795,0.0171,0.0446,-0.0058
PC4,0.0046,-0.0015,0.0076,0.0035,-0.0038,-0.0144,0.0056,-0.0146,-0.0239,-0.0022,...,0.0064,-0.0098,0.0112,0.0052,0.0063,0.121,-0.0113,-0.0016,0.0277,0.0014
PC5,0.0054,0.0112,0.0347,0.0018,-0.0083,0.0296,0.051,0.0063,-0.0036,0.0177,...,0.001,0.0208,0.0151,0.0148,0.0326,0.0376,0.01,-0.0138,-0.0995,0.0087
pcr,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
platform,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
sex,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,...,1.0,1.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0


In [4]:
# Brain regions to process; each entry is the region part of the covariate file names.
region_pick = [
    'Amygdala',
    'Anterior_cingulate_cortex_BA24',
    'Caudate_basal_ganglia',
    'Cerebellar_Hemisphere',
    'Frontal_Cortex_BA9',
    'Hippocampus',
    'Hypothalamus',
    'Nucleus_accumbens_basal_ganglia',
    'Putamen_basal_ganglia',
    'Substantia_nigra',
]

# Both directories are the same for every region, so they are set once up front.
data_dir = '/project/pi_rachel_melamed_uml_edu/Jianfeng/Allen/src/Pytorch/02162024/eqtl_analysis/PEER_result/'
old_dir = '/project/pi_rachel_melamed_uml_edu/Jianfeng/Allen/src/Pytorch/02162024/eqtl_analysis/GTEx_Analysis_v8_eQTL_covariates/'

# For each region, report how many columns of the new PEER covariate file and of
# the original covariate file are also present as columns of combined_df.
for region in region_pick:
    filename = region+'.PEER_covariates.txt'
    old_filename = 'Brain_'+region+'.v8.covariates.txt'
    df = pd.read_csv(os.path.join(data_dir, filename), sep='\t')
    old_df = pd.read_csv(os.path.join(old_dir, old_filename), sep='\t')
    # original file: matched columns vs. all columns minus one
    # (presumably the leading ID column, which is not a subject -- verify)
    x1 = sum(col in combined_df.columns for col in old_df.columns)
    y1 = len(old_df.columns)-1
    # new PEER file: matched columns vs. all columns
    x2 = sum(col in combined_df.columns for col in df.columns)
    y2 = len(df.columns)
    print(f'{region}, subjects with PCs/subjects with tpm = {x2}/{y2}; origin : {x1}/{y1}')

# check tpm count file
# data_dir = '/project/pi_rachel_melamed_uml_edu/Jianfeng/Allen/src/Pytorch/12052023/'
# gt = pd.read_csv(data_dir+"new_normed_gtex_gtex_allen_gene.txt", low_memory=False, index_col=0, sep="\t")
# len(set(gt.loc['subject'].tolist()))

Amygdala, subjects with PCs/subjects with tpm = 198/198; origin : 129/129
Anterior_cingulate_cortex_BA24, subjects with PCs/subjects with tpm = 210/210; origin : 147/147
Caudate_basal_ganglia, subjects with PCs/subjects with tpm = 218/218; origin : 194/194
Cerebellar_Hemisphere, subjects with PCs/subjects with tpm = 214/214; origin : 175/175
Frontal_Cortex_BA9, subjects with PCs/subjects with tpm = 211/211; origin : 175/175
Hippocampus, subjects with PCs/subjects with tpm = 209/209; origin : 165/165
Hypothalamus, subjects with PCs/subjects with tpm = 209/209; origin : 170/170
Nucleus_accumbens_basal_ganglia, subjects with PCs/subjects with tpm = 217/217; origin : 202/202
Putamen_basal_ganglia, subjects with PCs/subjects with tpm = 204/204; origin : 170/170
Substantia_nigra, subjects with PCs/subjects with tpm = 204/204; origin : 114/114


In [31]:
# build the new covariate files for more subjects
# Each output file stacks, per region: the five genotype PCs, the PEER factors,
# and the pcr/platform/sex rows, with a leading 'ID' column naming every row.
data_dir = '/project/pi_rachel_melamed_uml_edu/Jianfeng/Allen/src/Pytorch/02162024/eqtl_analysis/PEER_result/'
save_dir = '/project/pi_rachel_melamed_uml_edu/Jianfeng/Allen/src/Pytorch/02162024/eqtl_analysis/new_covariates_file/'
# make sure the output directory exists before any file is written
os.makedirs(save_dir, exist_ok=True)

# Row labels taken from combined_df. Selecting them by label (not by position)
# guarantees that the IDs written to the file match the values next to them.
pc_ids = [f"PC{i}" for i in range(1, 6)]
tail_ids = ['pcr', 'platform', 'sex']

for region in region_pick:
    filename = region+'.PEER_covariates.txt'
    cov_mat = pd.read_csv(data_dir+filename, sep='\t')
    # same subjects, in the same column order, as the PEER covariate matrix;
    # raises KeyError if a subject is missing from combined_df
    other_info = combined_df[cov_mat.columns]
    # number the PEER factors from the actual row count instead of assuming 30
    ID_list = pc_ids + [f"InferredCov{i}" for i in range(1, len(cov_mat) + 1)] + tail_ids
    # add the PCs and other factor
    df = np.vstack((
        other_info.loc[pc_ids].to_numpy(),
        cov_mat.to_numpy(),
        other_info.loc[tail_ids].to_numpy(),
    ))
    df = pd.DataFrame(df, columns=cov_mat.columns)
    # build the final mat: 'ID' goes first, followed by the subject columns
    df.insert(0, 'ID', ID_list)
    df.to_csv(save_dir+f'{region}_new_covariates.txt', sep='\t', index=False)

In [32]:
# Load the covariate file written for the first region in region_pick to inspect it.
save_dir = '/project/pi_rachel_melamed_uml_edu/Jianfeng/Allen/src/Pytorch/02162024/eqtl_analysis/new_covariates_file/'
region = region_pick[0]
filename = region + '_new_covariates.txt'
# save_dir already ends with '/', so joining yields the same path as plain concatenation
df = pd.read_csv(os.path.join(save_dir, filename), sep='\t')