# load packages

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
from scipy.stats import zscore

# read in input files

In [None]:
# ROSMAP clinical table; later cells read its individualID, projid, cogdx and dcfdx_lv columns
id_map = pd.read_csv(
    'rosmap/Metadata/ROSMAP_clinical.csv',
)
id_map.head(n = 5)

In [None]:
# proteomics assay metadata: report (rows, columns), then preview the first rows
metadata = pd.read_csv(
    'rosmap/Metadata/ROSMAP_assay_proteomics_metadata.csv',
)
print(metadata.shape)
metadata.head(n = 5)

In [None]:
# biospecimen metadata; path corrected from 'osmap/...' to 'rosmap/...' so it sits
# under the same root directory as every other input read in this notebook
biospec = pd.read_csv('rosmap/Metadata/ROSMAP_biospecimen_metadata.csv')
biospec.head()

In [None]:
# tab-separated ADSP <-> ROSMAP phenotype ID map
adsp_rosmap_id_map = pd.read_csv(
    'rosmap/ID_mapping/ROSMAP.ADSP_PHENO.ID_map.txt',
    sep = '\t',
)
adsp_rosmap_id_map.head(n = 5)

In [None]:
# protein-level table; joined to the sample id map on projid_visit further down
somoscan = pd.read_csv(
    'rosmap/Proteomics/Proteomics_SomoScan/OhNM2025_ROSMAP_plasma_Soma7k_protein_level_ANML_log10.csv',
)
somoscan.head(n = 5)

In [None]:
# per-protein annotation; SeqId, EntrezGeneSymbol and UniProt are used below
somoscan_protein_meta = pd.read_csv(
    'rosmap/Proteomics/Proteomics_SomoScan/OhNM2025_ROSMAP_plasma_Soma7k_protein_metadata.csv',
)
somoscan_protein_meta.head(n = 5)

In [None]:
# per-sample annotation; projid, projid_visit, age_at_visit and Diagnosis are used below
somoscan_indiv_meta = pd.read_csv(
    'rosmap/Proteomics/Proteomics_SomoScan/OhNM2025_ROSMAP_plasma_Soma7k_sample_metadata.csv',
)
somoscan_indiv_meta.head(n = 5)

In [None]:
# tab-separated cross-cohort map; supplies CommonID, individualID and SampleID
common_id_map = pd.read_csv(
    'id_map/ADSP.ROSMAP.MSBB.id_map.txt',
    sep = '\t',
)
common_id_map.head(n = 5)

# clean somoscan data

## identify duplicate IDs

In [None]:
# every sample row whose projid appears more than once, ordered by projid
somoscan_indiv_meta_dup = (
    somoscan_indiv_meta
    .loc[somoscan_indiv_meta['projid'].duplicated(keep = False)]
    .sort_values(by = 'projid')
)
# row count, number of distinct projids, and the diagnoses seen among them
print(somoscan_indiv_meta_dup.shape[0])
print(somoscan_indiv_meta_dup['projid'].nunique(dropna = False))
print(somoscan_indiv_meta_dup['Diagnosis'].unique())
somoscan_indiv_meta_dup.head(n = 5)

## fix dups by picking latest dx

In [None]:
# order descending by projid, then age_at_visit, so the oldest-age visit of each
# projid comes first within its group
somoscan_indiv_meta_dup = somoscan_indiv_meta_dup.sort_values(
    by = ['projid', 'age_at_visit'],
    ascending = False,
)
somoscan_indiv_meta_dup.head(n = 5)

In [None]:
# keep only the first row per projid (relies on the ordering applied just above)
somoscan_indiv_meta_dup_fixed = somoscan_indiv_meta_dup.loc[
    ~somoscan_indiv_meta_dup.duplicated(subset = 'projid', keep = 'first')
]
print(somoscan_indiv_meta_dup_fixed.shape[0])
somoscan_indiv_meta_dup_fixed.head(n = 5)

## check dup dx in rosmap clinical data

In [None]:
# clinical rows for the duplicated samples, restricted to the diagnosis columns.
# A single .loc selection followed by .copy() gives an independent frame, so adding
# Final_Dx below does not raise SettingWithCopyWarning.
somoscan_indiv_meta_dup_pheno = id_map.loc[
    id_map['individualID'].isin(somoscan_indiv_meta_dup['projid']),
    ['individualID', 'cogdx', 'dcfdx_lv'],
].copy()
# Final_Dx: use cogdx when present, otherwise fall back to dcfdx_lv
somoscan_indiv_meta_dup_pheno['Final_Dx'] = np.where(
    somoscan_indiv_meta_dup_pheno['cogdx'].notna(),
    somoscan_indiv_meta_dup_pheno['cogdx'],
    somoscan_indiv_meta_dup_pheno['dcfdx_lv'],
)
print(len(somoscan_indiv_meta_dup_pheno.index))
somoscan_indiv_meta_dup_pheno.head()

In [None]:
# duplicated-sample rows whose projid has no match among the clinical individualIDs
somoscan_indiv_meta_dup.loc[
    ~somoscan_indiv_meta_dup['projid'].isin(id_map['individualID'])
]

In [None]:
# look up three specific numeric projids in the clinical table
id_map.loc[id_map['projid'].isin([14410843, 54712250, 22894364])]

In [None]:
# total clinical rows vs. number of distinct individualIDs
print(id_map.shape[0])
print(id_map['individualID'].nunique(dropna = False))

In [None]:
# encode the text Diagnosis as floats 1.0-6.0 (in the listed order) so it can be
# matched against the numeric Final_Dx from the clinical table; unlisted labels become NaN
somoscan_indiv_meta_dup['Final_Dx'] = somoscan_indiv_meta_dup['Diagnosis'].map(
    {
        label: float(code)
        for code, label in enumerate(['NCI', 'MCI', 'MCI+', 'AD', 'AD+', 'OtherDem'], start = 1)
    }
)
print(somoscan_indiv_meta_dup['Final_Dx'].unique())
somoscan_indiv_meta_dup.head(n = 5)

In [None]:
# two-column view of the clinical diagnoses keyed like the sample metadata (projid).
# rename() returns a new frame here; renaming in place on a column-subset slice
# raises SettingWithCopyWarning.
somoscan_indiv_meta_dup_pheno_for_merge = (
    somoscan_indiv_meta_dup_pheno[['individualID', 'Final_Dx']]
    .rename(columns = {'individualID' : 'projid'})
)

In [None]:
# rows where the sample-level diagnosis code equals the clinical Final_Dx for the same projid
merge = pd.merge(
    somoscan_indiv_meta_dup,
    somoscan_indiv_meta_dup_pheno_for_merge,
    how = 'inner',
    on = ['projid', 'Final_Dx'],
)
print(merge.shape[0])
merge.head(n = 5)

In [None]:
# inspect the duplicated rows for one specific ID
somoscan_indiv_meta_dup.loc[somoscan_indiv_meta_dup['projid'].isin(['R1807730'])]

## create df without dups

In [None]:
# sample rows whose projid occurs exactly once, ordered by projid
somoscan_indiv_meta_no_dup = (
    somoscan_indiv_meta
    .loc[~somoscan_indiv_meta['projid'].duplicated(keep = False)]
    .sort_values(by = 'projid')
)
print(somoscan_indiv_meta_no_dup.shape[0])

## concatenate no dup + dup fixed

In [None]:
# stack the never-duplicated samples on top of the one-row-per-projid resolved duplicates
somoscan_indiv_meta_id_fix = pd.concat(
    objs = [somoscan_indiv_meta_no_dup, somoscan_indiv_meta_dup_fixed],
    axis = 'index',
)
print(somoscan_indiv_meta_id_fix.shape[0])
somoscan_indiv_meta_id_fix.head(n = 5)

## create id map

In [None]:
# projid_visit -> individualID lookup built from the de-duplicated sample metadata.
# rename() returns a new frame here; renaming in place on a column-subset slice
# raises SettingWithCopyWarning.
somoscan_indiv_meta_id_map = (
    somoscan_indiv_meta_id_fix[['projid', 'projid_visit']]
    .rename(columns = {'projid' : 'individualID'})
)
somoscan_indiv_meta_id_map.head()

## add individual IDs to proteomics data and filter dup measurements

In [None]:
# inner join on projid_visit: only visits kept in the de-duplicated id map survive,
# and each surviving protein row gains its individualID
somoscan_id_map = pd.merge(
    somoscan_indiv_meta_id_map,
    somoscan,
    on = 'projid_visit',
    how = 'inner',
)
# sizes of the joined table, the raw protein table and the id map, for comparison
print(somoscan_id_map.shape[0])
print(somoscan.shape[0])
print(somoscan_indiv_meta_id_map.shape[0])
somoscan_id_map.head(n = 5)

## map proteins to gene names and uniprot ids

### subset map

In [None]:
# keep just the identifier columns needed to map SeqId to gene symbol / UniProt
somoscan_protein_meta_sub = somoscan_protein_meta[
    [
        'SeqId',
        'EntrezGeneSymbol',
        'UniProt',
    ]
]
somoscan_protein_meta_sub.head(n = 5)

### split duplicate genes

In [None]:
# number of annotation rows whose gene symbol contains a '|' vs. all annotation rows
print(
    somoscan_protein_meta_sub.loc[
        somoscan_protein_meta_sub['EntrezGeneSymbol'].str.contains(r'\|', regex = True, na = False)
    ].shape[0]
)
print(somoscan_protein_meta_sub.shape[0])

In [None]:
# Split '|'-joined gene symbols into lists, then explode to one row per gene.
# assign() rebinds the name to a new frame carrying GENE; setting the column directly
# on the column-subset slice raises SettingWithCopyWarning.
somoscan_protein_meta_sub = somoscan_protein_meta_sub.assign(
    GENE = somoscan_protein_meta_sub['EntrezGeneSymbol'].str.split('|')
)
somoscan_protein_meta_split = somoscan_protein_meta_sub.explode('GENE')
print(len(somoscan_protein_meta_split.index))
# sanity check: no exploded GENE value should still contain a '|'
print(len(somoscan_protein_meta_split[somoscan_protein_meta_split['GENE'].str.contains(r'\|', regex = True, na = False)].index))
somoscan_protein_meta_split.head()

In [None]:
# one-gene-per-row annotation without the original (possibly multi-gene) symbol column
somoscan_protein_meta_split_sub = somoscan_protein_meta_split[
    [
        'SeqId',
        'GENE',
        'UniProt',
    ]
]

### save duplicate genes

In [None]:
# annotation rows whose EntrezGeneSymbol lists several genes separated by '|'
somoscan_protein_dups_df = somoscan_protein_meta_sub.loc[
    somoscan_protein_meta_sub['EntrezGeneSymbol'].str.contains(r'\|', regex = True, na = False)
]
print(somoscan_protein_dups_df.shape[0])
somoscan_protein_dups_df.head(n = 5)

In [None]:
# spot-check two SeqId columns of the joined protein table
somoscan_id_map.loc[:, ['4179-57', '8901-40']]

### transpose

In [None]:
# flip to one row per original column; the former column names land in 'SeqId'
somoscan_id_map_transpose = (
    somoscan_id_map
    .T
    .reset_index()
    .rename(columns = {'index' : 'SeqId'})
)
somoscan_id_map_transpose.head(n = 5)

### extract ID rows

In [None]:
# Pull out the two identifier rows (individualID, projid_visit) by name rather than by
# position, so the result stays correct even if the column order of the joined table
# changes. These rows have no SeqId in the protein annotation and are dropped by the
# inner merge below, so they are kept aside to be re-attached afterwards.
id_rows = somoscan_id_map_transpose[
    somoscan_id_map_transpose['SeqId'].isin(['individualID', 'projid_visit'])
]
id_rows

### merge

In [None]:
# attach GENE / UniProt to each measured SeqId row (default inner join on SeqId)
somoscan_id_map_merge = pd.merge(
    somoscan_protein_meta_split_sub,
    somoscan_id_map_transpose,
    on = 'SeqId',
)
# merged size vs. the two inputs
print(somoscan_id_map_merge.shape[0])
print(somoscan_protein_meta_split_sub.shape[0])
print(somoscan_id_map_transpose.shape[0])
somoscan_id_map_merge.head(n = 5)

## check for duplicates

In [None]:
# among rows with a gene name, list every row whose GENE appears more than once
no_na = somoscan_id_map_merge.dropna(subset = ['GENE'])
gene_dup = no_na.loc[no_na['GENE'].duplicated(keep = False)].sort_values(by = 'GENE')
print(gene_dup.shape[0])
gene_dup

In [None]:
# full annotation for the SeqIds that share a gene name, ordered by EntrezGeneID
somoscan_protein_meta.loc[
    somoscan_protein_meta['SeqId'].isin(gene_dup['SeqId'])
].sort_values(by = 'EntrezGeneID')

In [None]:
# lift the column display limit so every annotation column is shown, then inspect YWHAG
pd.options.display.max_columns = None
somoscan_protein_meta.loc[somoscan_protein_meta['EntrezGeneSymbol'].isin(['YWHAG'])]

### create gene and protein lists

In [None]:
# distinct, non-missing gene names as a one-column frame
somoscan_gene_list = somoscan_id_map_merge[['GENE']].drop_duplicates()
somoscan_gene_list = somoscan_gene_list.loc[somoscan_gene_list['GENE'].notna()]
print(somoscan_gene_list.shape[0])

### drop extra columns and rename columns

In [None]:
# drop the protein identifiers and relabel GENE as 'SeqId' so the frame lines up
# column-for-column with the identifier rows it is concatenated with next
somoscan_gene_sub = (
    somoscan_id_map_merge
    .drop(columns = ['SeqId', 'UniProt'])
    .rename(columns = {'GENE' : 'SeqId'})
)

### add id back in

In [None]:
# put the identifier rows back on top of the gene-labelled measurement rows
somoscan_gene_sub_id = pd.concat(
    objs = [id_rows, somoscan_gene_sub],
    axis = 'index',
)
somoscan_gene_sub_id.head(n = 5)

### remove missing gene names

In [None]:
# discard rows that have no label in 'SeqId' (i.e. no gene name); print before/after counts
somoscan_gene_no_missing = somoscan_gene_sub_id.loc[somoscan_gene_sub_id['SeqId'].notna()]
print(somoscan_gene_sub_id.shape[0])
print(somoscan_gene_no_missing.shape[0])

### remake index

In [None]:
# Use the gene / identifier labels as the row index so that they become column names
# after the transpose. Rebinding the result avoids the SettingWithCopyWarning that an
# in-place set_index raises on a boolean-filtered slice.
somoscan_gene_no_missing = somoscan_gene_no_missing.set_index('SeqId')
somoscan_gene_no_missing.head()

### transpose back

In [None]:
# flip back so each row is a sample and each column is an identifier or gene;
# print the shape before and after the flip
somoscan_gene = somoscan_gene_no_missing.T
print(somoscan_gene_no_missing.shape)
print(somoscan_gene.shape)
somoscan_gene.head(n = 5)

### add common id and keep genes as columns

In [None]:
# the cross-cohort map without its SampleID column
common_id_map_sub = common_id_map.drop(labels = ['SampleID'], axis = 'columns')

In [None]:
# samples that have an individualID; note this rebinds the name that earlier held the
# gene-indexed (pre-transpose) table
somoscan_gene_no_missing = somoscan_gene.loc[somoscan_gene['individualID'].notna()]

In [None]:
# attach the cross-cohort ID columns; inner join keeps only individualIDs present in both tables
somoscan_gene_common_id = pd.merge(
    common_id_map_sub,
    somoscan_gene_no_missing,
    how = 'inner',
    on = 'individualID',
)
# sample count before vs. after the join
print(somoscan_gene_no_missing.shape[0])
print(somoscan_gene_common_id.shape[0])
somoscan_gene_common_id.head(n = 5)

In [None]:
# remove the cohort-specific identifiers now that CommonID is attached
somoscan_gene_common_id_sub = somoscan_gene_common_id.drop(
    labels = ['individualID', 'projid_visit'],
    axis = 'columns',
)

In [None]:
# index rows by CommonID (the column itself is removed from the data)
somoscan_gene_common_id_sub = somoscan_gene_common_id_sub.set_index('CommonID')

### transpose common id file so genes are row names and ids are column names

In [None]:
# flip so the former column labels become the row index and CommonIDs become columns,
# then copy that index into a leading 'GENE' column so it survives an index-free export
somoscan_gene_common_id_transpose = somoscan_gene_common_id_sub.T
somoscan_gene_common_id_transpose.insert(
    loc = 0,
    column = 'GENE',
    value = somoscan_gene_common_id_transpose.index,
)
print(somoscan_gene_common_id_transpose.shape)
somoscan_gene_common_id_transpose.head(n = 5)

# export

In [None]:
# NOTE(review): `srm_round1_id` is not assigned in any visible cell of this notebook, so
# running top to bottom raises NameError here — confirm where it is defined or drop this cell.
# Writes the frame as a tab-separated file without the row index.
srm_round1_id.to_csv('rosmap/ROSMAP.proteomics.LC_SRM.round1.individualID.txt',
                     sep = '\t',
                     index = None)

In [None]:
# sample-by-gene table (with individualID / projid_visit columns), tab-separated, no row index
somoscan_gene.to_csv(
    'rosmap/ROSMAP.proteomics.somoscan.individualID.entrez_gene_symbol.txt',
    sep = '\t',
    index = False,
)

In [None]:
# gene-by-CommonID table with its leading GENE column, gzip-compressed, tab-separated, no row index
somoscan_gene_common_id_transpose.to_csv(
    'rosmap/ROSMAP.proteomics.somoscan.individualID.entrez_gene_symbol.common_id.transpose.txt.gz',
    sep = '\t',
    index = False,
    compression = 'gzip',
)

In [None]:
# one-column list of distinct gene names, tab-separated, no row index
somoscan_gene_list.to_csv(
    'rosmap/ROSMAP.proteomics.somoscan.individualID.entrez_gene_symbol.gene_list.txt',
    sep = '\t',
    index = False,
)