# import modules

In [None]:
import pandas as pd

In [None]:
import re

In [None]:
import numpy as np

In [None]:
import warnings

In [None]:
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# read in input files

In [None]:
# Load the biospecimen metadata table and preview its first rows.
biospec = pd.read_csv(
    filepath_or_buffer='rosmap/Metadata/ROSMAP_biospecimen_metadata.csv',
)
biospec.head(5)

In [None]:
# Load the RNA-seq assay metadata table and preview its first rows.
rnaseq_meta = pd.read_csv(
    filepath_or_buffer='rosmap/Metadata/ROSMAP_assay_rnaSeq_metadata.csv',
)
rnaseq_meta.head(5)

In [None]:
# Load the tab-separated gene-level FPKM matrix and preview its first rows.
raw_rna = pd.read_csv(
    'rosmap/Gene_Expression/Gene Expression (RNA seq)/Gene Expression (RNA seq - bulk brain)/ROSMAP_RNAseq_FPKM_gene.tsv',
    delimiter='\t',
)
raw_rna.head(5)

In [None]:
# Load the previously processed (tab-separated) expression table and preview it.
rna_clean = pd.read_csv(
    'rosmap/ROSMAP_RNAseq.TPM5_log2norm.individualids.codinggenes.pathway_genes.ensembl_ids.tsv',
    delimiter='\t',
)
rna_clean.head(5)

# parse rnaseq metadata

## extract ids from raw rnaseq file

In [None]:
# Build a small ID table (one row per column label of raw_rna) holding the
# full label ('rawID') and the label reduced to its first two '_'-separated
# parts ('rawID_no_batch').
#
# Only the column labels are needed, so they are read directly instead of
# transposing the whole expression matrix. The first two labels are skipped,
# matching the original `.iloc[2:]` on the transposed frame
# (presumably the two non-sample annotation columns — confirm against the file).
sample_labels = raw_rna.columns[2:]
raw_rna_transposed = pd.DataFrame({'rawID': sample_labels}, index=sample_labels)

# Split once and join the first two parts; unlike assigning the split to a
# fixed set of three columns, this does not fail when a label has a different
# number of '_' separators.
id_parts = raw_rna_transposed['rawID'].str.split('_', expand=True)
raw_rna_transposed['rawID_no_batch'] = id_parts[0] + '_' + id_parts[1]

print(len(raw_rna_transposed.index))
raw_rna_transposed.head()

## extract ids from rnaseq meta file

In [None]:
# Row count of the assay metadata, followed by a preview.
print(rnaseq_meta.shape[0])
rnaseq_meta.head(5)

In [None]:
# Keep only assay-metadata rows whose specimenID matches a batch-stripped ID
# taken from the raw expression matrix.
rnaseq_meta_filt = rnaseq_meta.loc[
    rnaseq_meta['specimenID'].isin(raw_rna_transposed['rawID_no_batch'])
]

# Report how many rows survived and which distinct values the batch-related
# columns take.
print(rnaseq_meta_filt.shape[0])
print(pd.unique(rnaseq_meta_filt['notes']))
print(pd.unique(rnaseq_meta_filt['libraryBatch']))
print(pd.unique(rnaseq_meta_filt['sequencingBatch']))
rnaseq_meta_filt.head(5)

## extract IDs from biospec file

In [None]:
# Keep biospecimen rows that (a) match a batch-stripped ID from the raw
# expression matrix and (b) are annotated with the 'rnaSeq' assay.
biospec_filt = biospec.loc[
    biospec['specimenID'].isin(raw_rna_transposed['rawID_no_batch'])
    & biospec['assay'].isin(['rnaSeq'])
]

# Row count, number of distinct specimen IDs, and the tissues represented.
print(biospec_filt.shape[0])
print(biospec_filt['specimenID'].nunique(dropna=False))
print(pd.unique(biospec_filt['tissue']))
biospec_filt.head(5)

In [None]:
# Show the filtered biospecimen rows for one specific specimen ID.
biospec_filt.loc[biospec_filt['specimenID'] == '510_120515']

In [None]:
# Show the assay-metadata rows whose notes mark them as contribution batch 2.
rnaseq_meta.loc[rnaseq_meta['notes'] == 'data contribution batch 2']

In [None]:
# Show the assay-metadata rows whose notes mark them as contribution batch 3.
rnaseq_meta.loc[rnaseq_meta['notes'] == 'data contribution batch 3']

# select one of duplicate ids

In [None]:
# Turn the raw matrix on its side, label the columns with the values of the
# second row, and expose the row labels as a leading 'ID' column.
dup = raw_rna.T
dup.columns = dup.iloc[1]
dup.insert(loc=0, column='ID', value=dup.index)

# Keep only the three rows carrying these specific IDs.
dup = dup.loc[dup['ID'].isin(['492_120515_0', '492_120515_6', '492_120515_7'])]
dup.head(5)

In [None]:
# Strip everything from the first '.' onwards in each column label of `dup`
# (e.g. the version suffix of a versioned gene ID); labels without a '.'
# such as 'ID' are left unchanged.
raw_gene_list = dup.columns.tolist()
clean_gene_list = [gene.split('.')[0] for gene in raw_gene_list]

In [None]:
# Relabel the columns of `dup` with the cleaned labels, then preview.
dup = dup.set_axis(labels=clean_gene_list, axis='columns')
dup.head(5)

In [None]:
# Pull the processed-table rows belonging to one specific individual.
rna_clean_dup = rna_clean.loc[rna_clean['individualID'] == 'R5693901']
rna_clean_dup.head(5)

In [None]:
# Column labels of the processed table with 'individualID' swapped out for a
# leading 'ID' entry, so the list can be used to select columns from `dup`.
rna_clean_dup_col_list = list(rna_clean_dup.columns)
rna_clean_dup_col_list.remove('individualID')
rna_clean_dup_col_list[:0] = ['ID']

In [None]:
# Restrict `dup` to the columns listed in rna_clean_dup_col_list, in that order.
dup_pathway = dup.loc[:, rna_clean_dup_col_list]
dup_pathway.head(5)

# convert fpkm to tpm

In [None]:
# Function to convert FPKM to TPM
def fpkm_to_tpm(fpkm):
    """Rescale a vector of FPKM values so that it sums to one million.

    Parameters
    ----------
    fpkm : array-like of numbers
        FPKM values to rescale together (list, ``numpy`` array or ``pandas``
        Series).

    Returns
    -------
    Same container kind NumPy returns for the input (``ndarray`` for
    lists/arrays, ``Series`` for a Series), holding
    ``fpkm / sum(fpkm) * 1e6``.

    Notes
    -----
    The value is computed by direct division rather than through an
    ``exp(log(...))`` round-trip, so zero entries map to exactly ``0``
    without taking ``log(0)``. If the values sum to zero the result is
    ``NaN`` (0/0).
    """
    return np.divide(fpkm, np.sum(fpkm)) * 1e6

# Convert every row of dup_pathway from FPKM to TPM.
#
# * The rows of dup_pathway are samples and the columns are genes, and TPM
#   rescales the genes *within* one sample, so the conversion is applied
#   row-wise (axis=1). The default axis=0 would instead rescale each gene
#   across the samples.
# * The 'ID' column holds string labels, not expression values, so it is
#   dropped before the numeric cast; the same labels remain available as the
#   row index.
# NOTE(review): the total here is taken over the selected columns only, not
# over every gene in the raw matrix — confirm that is the intended denominator.
dup_tpm = (
    dup_pathway
    .drop(columns='ID')
    .astype('float')
    .apply(fpkm_to_tpm, axis=1)
)
dup_tpm.head()

In [None]:
# Second name for the same DataFrame object (no copy is made), used by the
# filtering / normalisation step below.
rnaseq_tpm_coding = dup_tpm

# remove genes with median TPM <= 5, then log2-transform and mean-normalize

In [None]:
## Keep only the columns whose median TPM is strictly greater than 5
gene_filt = [
    gene
    for gene in rnaseq_tpm_coding.columns
    if rnaseq_tpm_coding[gene].median() > 5
]
rnaseq_tpm_coding_tpm5 = rnaseq_tpm_coding[gene_filt]

## log2 transform - values below 1 are first raised to 1, so no log2 value
## can be negative
rnaseq_tpm_coding_tpm5_log2 = rnaseq_tpm_coding_tpm5.mask(rnaseq_tpm_coding_tpm5 < 1, 1.00)
rnaseq_tpm_coding_tpm5_log2 = np.log2(rnaseq_tpm_coding_tpm5_log2.astype('float'))

## Normalize: divide every column by the mean of its own log2 values
rnaseq_gene_exp_norm_dict = {
    gene: log2_values / rnaseq_tpm_coding_tpm5_log2[gene].mean()
    for gene, log2_values in rnaseq_tpm_coding_tpm5_log2.items()
}

rnaseq_gene_exp_norm = pd.DataFrame.from_dict(rnaseq_gene_exp_norm_dict)
print(rnaseq_gene_exp_norm.shape[1])
rnaseq_gene_exp_norm

## export ID list

In [None]:
# Row count of the filtered biospecimen table, followed by a preview.
print(biospec_filt.shape[0])
biospec_filt.head(5)

In [None]:
# Two-column individualID -> specimenID mapping taken from the filtered table.
biospec_filt_id = biospec_filt.loc[:, ['individualID', 'specimenID']]

In [None]:
# Write the ID mapping as a tab-separated file without the row index.
# `index=False` is the documented way to omit the index; `index=None` only
# worked because None happens to be falsy.
biospec_filt_id.to_csv(
    'rosmap/ID_mapping/ROSMAP_RNAseq.batch1.DLPFC.ID_map.txt',
    sep='\t',
    index=False,
)