In [7]:
import pandas as pd
import numpy as np

## Reference Metadata 

In [2]:
meta = pd.read_csv('/home/vrasik35/group/personal/rasika/ROSMAP_TWAS/Metadata/ROSMAP_biospecimen_metadata.csv', sep = ',')
meta[['individualID', 'specimenID']]

Unnamed: 0,individualID,specimenID
0,R1571846,Microglia_MO_AD1
1,R9188267,Microglia_MO_AD2
2,R3728445,Microglia_MO_AD3
3,R1246326,Microglia_MO_AD4
4,R9443041,Microglia_MO_AD5
...,...,...
12272,R2757104,DUKE-08135
12273,R9354381,DUKE-08136
12274,R2711188,DUKE-08137
12275,R9047934,DUKE-08138


## Genomic Data

In [18]:
array_data = pd.read_csv('Genomics/ROSMAP_arrayGenotype_geno0.01_maf0.01_hwe-8_prunedextract.fam', sep = '\t', header = None)
array_data[['Study', 'projid']] = array_data[1].str.split(r'ROS|MAP', expand = True)
array_data['projid'] = array_data['projid'].astype(int)
array_data['Study'] = array_data[1].str[:3]
array_data

Unnamed: 0,0,1,2,3,4,5,Study,projid
0,KronosII_P01_6.0_A01_ROS20275399.CEL,ROS20275399,0,0,2,-9,ROS,20275399
1,KronosII_P01_6.0_A02_ROS10442701.CEL,ROS10442701,0,0,1,-9,ROS,10442701
2,KronosII_P01_6.0_A03_ROS20152393.CEL,ROS20152393,0,0,2,-9,ROS,20152393
3,KronosII_P01_6.0_A04_ROS20626558.CEL,ROS20626558,0,0,2,-9,ROS,20626558
4,KronosII_P01_6.0_A05_ROS15176592.CEL,ROS15176592,0,0,1,-9,ROS,15176592
...,...,...,...,...,...,...,...,...
1703,PT-BY8W,MAP50408491,0,0,2,-9,MAP,50408491
1704,PT-BY87,MAP42063693,0,0,2,-9,MAP,42063693
1705,PT-BYK9,MAP50405330,0,0,1,-9,MAP,50405330
1706,PT-C13X,ROS20254452,0,0,2,-9,ROS,20254452


In [4]:
array_data[1]

0       ROS20275399
1       ROS10442701
2       ROS20152393
3       ROS20626558
4       ROS15176592
           ...     
1703    MAP50408491
1704    MAP42063693
1705    MAP50405330
1706    ROS20254452
1707    ROS20701008
Name: 1, Length: 1708, dtype: object

In [31]:
biomap = array_data.merge(meta[['individualID', 'specimenID']], left_on = 0, right_on = 'specimenID')

Unnamed: 0,0,1,2,3,4,5,Study,projid,individualID,specimenID


In [14]:
clinmeta = pd.read_csv('/home/vrasik35/group/datasets/rosmap/Metadata/ROSMAP_clinical.csv')

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [34]:
mappings = array_data.merge(clinmeta, how = 'outer').dropna(subset = 1)[[1, 'individualID']]
mappings.rename(columns = {1:'specimenID'}, inplace = True)
mappings.to_csv('Genomics/specimentoindiv_mapping.csv', index = False)

In [43]:
array_data.merge(mappings, left_on = 1, right_on = 'specimenID')[[0, 'individualID', 2,3,4,5]].dropna().to_csv('Genomics/individualID_mapped/ROSMAP_arrayGenotype_indiv.fam', sep = '\t', 
                                                                                                      header = None, index = None)

## Transcriptomic Data

In [45]:
rnaseq = pd.read_csv('/home/vrasik35/group/personal/rasika/ROSMAP_TWAS/RNAseq/ROSMAP_RNAseq_FPKM_individualids.tsv', sep = '\t')
rnaseq = rnaseq.dropna().set_index('individualID')

In [46]:
# Function to convert FPKM to TPM
def fpkm_to_tpm(fpkm):
    return np.exp(np.log(fpkm) - np.log(np.sum(fpkm)) + np.log(1e6))

rnaseq_tpm = rnaseq.astype('float').apply(fpkm_to_tpm)
rnaseq_tpm

  result = getattr(ufunc, method)(*inputs, **kwargs)
  return np.exp(np.log(fpkm) - np.log(np.sum(fpkm)) + np.log(1e6))


Unnamed: 0_level_0,ENSG00000167578,ENSG00000242268,ENSG00000078237,ENSG00000263642,ENSG00000225275,ENSG00000060642,ENSG00000201788,ENSG00000263089,ENSG00000172137,ENSG00000240423,...,ENSG00000232668,ENSG00000089177,ENSG00000216352,ENSG00000267117,ENSG00000148943,ENSG00000265520,ENSG00000231119,ENSG00000105063,ENSG00000123685,ENSG00000181518
individualID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R9976232,1886.964096,1174.10265,1882.974810,,,2206.525893,,0.0,904.418827,523.491690,...,0.0,1896.589339,,0.0,1792.860063,,2852.162890,1720.592672,1889.586318,0.0
R9936070,1192.970950,1509.56055,1103.645904,,,1397.055500,,0.0,956.519764,1308.729224,...,0.0,1073.376482,,0.0,1362.786030,,1901.441927,1058.111030,1367.373372,0.0
R9907075,2014.159857,1174.10265,1137.529769,,,1376.510566,,0.0,2655.811860,1439.602146,...,0.0,1687.150025,,0.0,1281.372839,,1426.081445,2028.820973,1216.206467,0.0
R9905342,1573.468762,3522.30795,1941.061436,,,1844.935057,,0.0,1902.352157,1439.602146,...,0.0,1751.145371,,0.0,1555.699897,,3010.616384,1245.465487,1779.646750,0.0
R9904978,1706.928982,4528.68165,929.386024,,,1253.240963,,0.0,2646.460409,2486.585525,...,0.0,2548.178314,,0.0,1387.563958,,1980.668674,1592.280438,2820.637031,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R1360065,1343.045606,1174.10265,1950.742541,,,2424.302191,,0.0,651.929672,916.110457,...,0.0,954.112428,,0.0,1530.921969,,1584.534939,858.669188,803.933088,0.0
R1327471,843.250699,1341.83160,2420.276105,,,2128.455144,,0.0,1612.457200,1046.983379,...,0.0,2106.028652,,0.0,3065.383642,,1346.854698,1793.116978,1257.433804,0.0
R1224782,1011.573997,0.00000,2420.276105,,,1733.992415,,0.0,3610.995702,0.000000,...,0.0,1986.764599,,0.0,2090.195197,,1188.401204,1100.881775,1460.134882,0.0
R1133959,1389.620499,0.00000,2497.724940,,,1988.749594,,0.0,932.473178,1308.729224,...,0.0,1890.771580,,0.0,2637.079461,,2218.348915,1033.936261,924.179490,0.0


In [48]:
coding_list = set(pd.read_csv('/home/vrasik35/group/personal/rasika/ROSMAP_TWAS/human_protein_coding.txt', sep= '\t')['Gene stable ID'])
keep_coding = list(coding_list.intersection(rnaseq_tpm.columns))

# of the 22.5k coding genes, there are 18952 coding genes quantified in ROSMAP RNAseq
rnaseq_tpm_coding = rnaseq_tpm[keep_coding]

In [50]:
rnaseq_tpm_coding.to_csv('Transcriptomics/ROSMAP_RNAseq_FPKM_individualids_codinggenes.tsv', sep = '\t')

In [51]:
## Filter genes w/ minimal expression: 5 TPM
gene_filt = []
for gene in rnaseq_tpm_coding.columns:
    if (rnaseq_tpm_coding[gene].median() > 5):
        gene_filt.append(gene)
rnaseq_tpm_coding_tpm5 = rnaseq_tpm_coding[gene_filt]

## log2 transform - first replace any values below 1 to 1
rnaseq_tpm_coding_tpm5_log2 = rnaseq_tpm_coding_tpm5.apply(lambda x: np.where(x < 1, 1.00, x))
rnaseq_tpm_coding_tpm5_log2 = np.log2(rnaseq_tpm_coding_tpm5_log2.astype('float'))
# rnaseq_gene_tpm_log2

## Normalize by mean of log transformed
# # merged_gene_exp_norm = (merged_gene_exp_filt-merged_gene_exp_filt.mean())/merged_gene_exp_filt.std()
# # merged_gene_exp_norm_df = pd.DataFrame(merged_gene_exp_norm, columns=merged_gene_exp.columns)
# # merged_gene_exp_norm = StandardScaler().fit_transform(merged_gene_exp_filt)  
# # merged_gene_exp_norm_df = pd.DataFrame(merged_gene_exp_norm, columns=merged_gene_exp_filt.columns)
# # merged_gene_exp_norm_df

rnaseq_gene_exp_norm_dict = {}
for i, j in rnaseq_tpm_coding_tpm5_log2.items():
    rnaseq_gene_exp_norm_dict[i] = j/rnaseq_tpm_coding_tpm5_log2[i].mean()
    
rnaseq_gene_exp_norm = pd.DataFrame.from_dict(rnaseq_gene_exp_norm_dict)
rnaseq_gene_exp_norm

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

Unnamed: 0_level_0,ENSG00000204272,ENSG00000112852,ENSG00000102678,ENSG00000112531,ENSG00000185386,ENSG00000164128,ENSG00000165457,ENSG00000100060,ENSG00000139163,ENSG00000183718,...,ENSG00000205838,ENSG00000141027,ENSG00000172568,ENSG00000170837,ENSG00000069345,ENSG00000044459,ENSG00000187323,ENSG00000114127,ENSG00000164600,ENSG00000237489
individualID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R9976232,1.047697,1.040724,1.071657,1.020772,0.979168,1.044808,1.086860,1.135251,1.092700,1.042382,...,0.955895,0.986228,1.110088,1.020366,1.031209,1.041792,1.008793,1.030783,0.937878,0.948803
R9936070,0.972089,1.045903,0.972021,1.006092,0.962130,0.974430,0.907814,0.947872,0.989352,0.965295,...,0.802631,0.987134,0.882113,0.943877,0.935575,1.005036,0.842237,0.946073,0.866274,0.948803
R9907075,1.019208,1.047587,1.029024,0.927839,1.038134,1.000642,1.004631,0.984556,0.933884,1.006428,...,0.949088,1.033208,0.991970,1.010588,0.996236,1.020128,0.995497,0.967888,1.023185,1.051918
R9905342,1.032022,0.979919,1.011148,1.037932,0.957708,0.975628,1.120568,1.044768,1.002193,1.036552,...,1.062219,0.977527,1.023559,0.972760,1.005094,1.034640,0.938933,1.010815,0.848973,0.940471
R9904978,1.007231,0.901482,0.999027,0.893218,1.028558,1.010864,0.809837,0.961546,0.991134,0.971413,...,0.917958,1.040732,1.010391,0.979799,0.941828,1.018071,1.140784,0.960109,0.944659,1.046069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R1360065,1.045191,0.915160,0.985285,1.133737,1.017960,1.030889,1.084925,1.046944,1.005425,1.010088,...,1.119813,0.879188,0.987073,1.046874,1.011966,0.954618,0.938933,1.061133,0.984713,0.940471
R1327471,1.012148,0.942646,0.979756,1.054786,0.975904,1.106007,0.899490,0.869701,0.988452,1.063296,...,0.980218,1.068093,1.046381,1.082800,1.086636,1.056637,0.952229,1.045373,1.105024,0.900935
R1224782,1.024036,1.029749,1.051705,1.003681,0.983624,1.073402,0.912357,0.960541,1.000548,1.043330,...,0.990955,1.044262,1.149087,1.063547,1.082744,1.013863,1.052061,1.055931,1.107277,0.931602
R1133959,1.041993,1.004797,1.080757,1.041866,0.948135,1.097568,0.850020,0.809716,1.034189,1.046597,...,1.090214,1.046700,1.055008,1.092921,1.103465,1.048589,1.082569,1.023702,1.095128,1.022202


In [52]:
rnaseq_gene_exp_norm.to_csv('Transcriptomics/ROSMAP_RNAseq_TPM5_log2norm_individualids_codinggenes.tsv', sep = '\t')

## Methylomics Data

In [57]:
methylation = pd.read_csv('~/gMethylomics/ROSMAP_arrayMethylation_imputed.tsv.gz', sep = '\t')
methyl_meta = pd.read_csv('Methylomics/ROSMAP_arrayMethylation_metaData.tsv', sep = '\t')

In [65]:
keep_transcripts = list(methyl_meta.dropna(subset = 'RefGene')['TargetID'])
methylation_refgene = methylation[methylation['TargetID'].isin(keep_transcripts)].T
methylation_refgene.columns = [methylation_refgene.iloc[0]]
methylation_refgene = methylation_refgene[1:]

In [79]:
colnames = methylation_refgene.columns 

In [85]:
methylation_refgene_individualid = methylation_refgene.merge(meta[['individualID', 'specimenID']], left_index=True, right_on = 'specimenID').set_index('individualID').drop(columns = ['specimenID'])
methylation_refgene_individualid.columns = colnames

In [88]:
methylation_refgene_individualid

TargetID,cg00000957,cg00001349,cg00001364,cg00001446,cg00001534,cg00001583,cg00002028,cg00002593,cg00002646,cg00002719,...,ch.22.78028F,ch.22.163059F,ch.22.427671F,ch.22.439136F,ch.22.441164F,ch.22.533187F,ch.22.740407F,ch.22.757911F,ch.22.772318F,ch.22.909671F
individualID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R3978789,0.77622,0.865488,0.772899,0.842025,0.88392,0.099888,0.026821,0.736051,0.889046,0.002725,...,0.085649,0.187368,0.120821,0.066196,0.083472,0.128748,0.139123,0.089335,0.233366,0.409177
R8140052,0.818279,0.91957,0.711099,0.831457,0.877727,0.07624,0.035839,0.728072,0.888627,0.011835,...,0.092679,0.125934,0.107421,0.074446,0.069204,0.12509,0.12784,0.089504,0.19837,0.463476
R7881801,0.630316,0.882256,0.694639,0.835075,0.843842,0.075861,0.020016,0.674905,0.84912,0.008358,...,0.059054,0.164964,0.096951,0.083344,0.147787,0.09544,0.11876,0.059411,0.176775,0.307656
R6108690,0.849261,0.902912,0.651986,0.834157,0.917063,0.101837,0.014101,0.698835,0.877378,0.0,...,0.093534,0.095124,0.115163,0.051879,0.018833,0.119385,0.091408,0.074204,0.141996,0.302488
R9662437,0.861136,0.90761,0.748538,0.839395,0.863272,0.071947,0.03139,0.764737,0.908272,0.0,...,0.07029,0.099851,0.110676,0.046465,0.120507,0.120238,0.108485,0.06273,0.16761,0.426867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R6016948,0.692638,0.904996,0.733207,0.852642,0.898319,0.059607,0.019535,0.742575,0.887808,0.000781,...,0.099152,0.067795,0.100549,0.092159,0.151308,0.123356,0.134127,0.057825,0.193311,0.472419
R5259690,0.785597,0.927201,0.750224,0.833056,0.855475,0.116837,0.011352,0.720815,0.897804,0.00103,...,0.090359,0.141899,0.108123,0.064988,0.158463,0.113744,0.116528,0.049111,0.149942,0.340013
R1977848,0.712386,0.856637,0.737402,0.842108,0.895146,0.069391,0.020888,0.696251,0.878899,0.0,...,0.072494,0.112585,0.129393,0.07438,0.081349,0.144721,0.125282,0.08227,0.135414,0.436682
R6536689,0.839168,0.917827,0.685959,0.841747,0.850789,0.061016,0.035047,0.738012,0.916524,0.009139,...,0.09315,0.205678,0.09869,0.047593,0.001287,0.088929,0.104598,0.101021,0.123727,0.413712


In [89]:
methylation_refgene_individualid.to_csv('Methylomics/Methylation_refgene_individualid.tsv', sep = '\t')

KeyboardInterrupt: 

In [None]:
methylation_individualid = methylation.merge(meta[['individualID', 'specimenID']], left_index=True, right_on = 'specimenID').set_index('individualID').drop(columns = ['specimenID'])
methylation_individualid.columns = colnames
methylation_individualid.to_csv('Methylomics/Methylation_alltranscripts_individualid.tsv', sep = '\t')

In [None]:
pd.read_csv('Methylomics/ROSMAP_arrayMethylation_imputed.tsv.gz', sep = '\t')