# load packages

In [None]:
import pandas as pd

In [None]:
import numpy as np

# read in input files

In [None]:
# Load the ROSMAP clinical table; later cells read its individualID, projid,
# cogdx and dcfdx_lv columns.
id_map = pd.read_csv('rosmap/Metadata/ROSMAP_clinical.csv')
id_map.head()

In [None]:
# Load the proteomics assay metadata file and report its (rows, columns) shape.
metadata = pd.read_csv('rosmap/Metadata/ROSMAP_assay_proteomics_metadata.csv')
print(metadata.shape)
metadata.head()

In [None]:
# Load the biospecimen metadata file and preview it.
biospec = pd.read_csv('rosmap/Metadata/ROSMAP_biospecimen_metadata.csv')
biospec.head()

In [None]:
# Load the tab-separated ROSMAP/ADSP phenotype ID map and preview it.
adsp_rosmap_id_map = pd.read_csv('pathway_score/rosmap/ID_mapping/ROSMAP.ADSP_PHENO.ID_map.txt',
                                 sep = '\t')
adsp_rosmap_id_map.head()

In [None]:
# Load the protein-level measurement table; it is joined to the sample
# metadata on projid_visit further down.
somoscan = pd.read_csv('rosmap/Proteomics/Proteomics_SomoScan/OhNM2025_ROSMAP_plasma_Soma7k_protein_level_ANML_log10.csv')
somoscan.head()

In [None]:
# Load the per-protein metadata; later cells use its SeqId, EntrezGeneSymbol,
# UniProt and ColCheck columns.
somoscan_protein_meta = pd.read_csv('rosmap/Proteomics/Proteomics_SomoScan/OhNM2025_ROSMAP_plasma_Soma7k_protein_metadata.csv')
somoscan_protein_meta.head()

In [None]:
# Load the per-sample metadata; later cells use its projid, projid_visit,
# Diagnosis and age_at_visit columns.
somoscan_indiv_meta = pd.read_csv('rosmap/Proteomics/Proteomics_SomoScan/OhNM2025_ROSMAP_plasma_Soma7k_sample_metadata.csv')
somoscan_indiv_meta.head()

In [None]:
# Load the tab-separated cross-cohort ID map; later cells use its SampleID,
# individualID and CommonID columns.
common_id_map = pd.read_csv('pathway_score/id_map/ADSP.ROSMAP.MSBB.id_map.txt',
                            sep = '\t')
common_id_map.head()

# clean somoscan data

## identify duplicate IDs

In [None]:
# Keep every sample row whose projid appears more than once, ordered by projid.
somoscan_indiv_meta_dup = somoscan_indiv_meta[
    somoscan_indiv_meta['projid'].duplicated(keep = False)
].sort_values(by = 'projid')
# Row count, number of distinct projids, and the diagnoses seen among them.
print(somoscan_indiv_meta_dup.shape[0])
print(len(somoscan_indiv_meta_dup['projid'].unique()))
print(somoscan_indiv_meta_dup['Diagnosis'].unique())
somoscan_indiv_meta_dup.head()

## fix dups by picking latest dx

In [None]:
# Order descending by projid, then age_at_visit, so the first row of each
# projid is the one with the highest age_at_visit.
somoscan_indiv_meta_dup = somoscan_indiv_meta_dup.sort_values(
    by = ['projid', 'age_at_visit'],
    ascending = False,
)
somoscan_indiv_meta_dup.head()

In [None]:
# One row per projid: the first row in the current sort order is retained.
somoscan_indiv_meta_dup_fixed = somoscan_indiv_meta_dup.drop_duplicates(
    subset = ['projid'],
    keep = 'first',
)
print(somoscan_indiv_meta_dup_fixed.shape[0])
somoscan_indiv_meta_dup_fixed.head()

## check dup dx in rosmap clinical data

In [None]:
# Clinical rows for the individuals that have repeated measurements.
# .copy() makes this an independent frame, so adding Final_Dx below does not
# write into a slice of id_map (which triggers SettingWithCopyWarning).
somoscan_indiv_meta_dup_pheno = id_map.loc[
    id_map['individualID'].isin(somoscan_indiv_meta_dup['projid']),
    ['individualID', 'cogdx', 'dcfdx_lv'],
].copy()
# Final_Dx takes cogdx when present and falls back to dcfdx_lv otherwise.
somoscan_indiv_meta_dup_pheno['Final_Dx'] = np.where(
    somoscan_indiv_meta_dup_pheno['cogdx'].notna(),
    somoscan_indiv_meta_dup_pheno['cogdx'],
    somoscan_indiv_meta_dup_pheno['dcfdx_lv'],
)
print(len(somoscan_indiv_meta_dup_pheno.index))
somoscan_indiv_meta_dup_pheno.head()

In [None]:
# Show duplicated-sample rows whose projid has no match in id_map['individualID'].
somoscan_indiv_meta_dup[~somoscan_indiv_meta_dup['projid'].isin(id_map['individualID'])]

In [None]:
# Look up three specific projid values in the clinical table.
id_map[id_map['projid'].isin([14410843, 54712250, 22894364])]

In [None]:
# Compare total clinical rows with the number of distinct individualID values.
print(len(id_map.index))
print(len(id_map['individualID'].unique()))

In [None]:
# Encode the Diagnosis labels as numeric codes so they can be matched against
# the Final_Dx values derived from the clinical table; labels not listed in
# the mapping become NaN.
somoscan_indiv_meta_dup['Final_Dx'] = somoscan_indiv_meta_dup['Diagnosis'].map({
    'NCI' : 1.0,
    'MCI' : 2.0,
    'MCI+' : 3.0,
    'AD' : 4.0,
    'AD+' : 5.0,
    'OtherDem' : 6.0})
print(somoscan_indiv_meta_dup['Final_Dx'].unique())
somoscan_indiv_meta_dup.head()

In [None]:
# Two-column view of the clinical diagnoses, keyed as 'projid' so it can be
# merged with the sample metadata. rename() returns a new frame, which avoids
# an in-place rename on a slice (SettingWithCopyWarning).
somoscan_indiv_meta_dup_pheno_for_merge = (
    somoscan_indiv_meta_dup_pheno[['individualID', 'Final_Dx']]
    .rename(columns = {'individualID' : 'projid'})
)

In [None]:
# Keep duplicated-sample rows whose (projid, Final_Dx) pair also appears in
# the clinical-derived table.
merge = pd.merge(
    somoscan_indiv_meta_dup,
    somoscan_indiv_meta_dup_pheno_for_merge,
    how = 'inner',
    on = ['projid', 'Final_Dx'],
)
print(merge.shape[0])
merge.head()

In [None]:
# Inspect the duplicated rows for a single projid.
somoscan_indiv_meta_dup[somoscan_indiv_meta_dup['projid'].isin(['R1807730'])]

## create df without dups

In [None]:
# Sample rows whose projid occurs exactly once, ordered by projid.
somoscan_indiv_meta_no_dup = somoscan_indiv_meta[
    ~somoscan_indiv_meta['projid'].duplicated(keep = False)
].sort_values(by = 'projid')
print(somoscan_indiv_meta_no_dup.shape[0])

## concatenate no dup + dup fixed

In [None]:
# Stack the never-duplicated rows on top of the de-duplicated ones.
somoscan_indiv_meta_id_fix = pd.concat(
    [somoscan_indiv_meta_no_dup, somoscan_indiv_meta_dup_fixed],
    axis = 'index',
)
print(somoscan_indiv_meta_id_fix.shape[0])
somoscan_indiv_meta_id_fix.head()

## create id map

In [None]:
# Map each retained projid_visit to its individual, with projid exposed under
# the name 'individualID' that the downstream merges expect.
# The original rename dict was missing the opening quote on 'individualID'
# (a SyntaxError); chaining rename() also avoids an in-place rename on a slice.
somoscan_indiv_meta_id_map = (
    somoscan_indiv_meta_id_fix[['projid', 'projid_visit']]
    .rename(columns = {'projid' : 'individualID'})
)
somoscan_indiv_meta_id_map.head()

## add individual IDs to proteomics data and filter dup measurements

In [None]:
# Attach individualID to each measurement row; the inner join drops
# measurements whose projid_visit was not retained in the ID map.
somoscan_id_map = pd.merge(
    somoscan_indiv_meta_id_map,
    somoscan,
    on = 'projid_visit',
    how = 'inner',
)
# Row counts: merged result, raw measurements, ID map.
print(somoscan_id_map.shape[0])
print(somoscan.shape[0])
print(somoscan_indiv_meta_id_map.shape[0])
somoscan_id_map.head()

## map proteins to gene names and uniprot ids

### subset map

In [None]:
# Keep only the identifier columns needed to map SeqIds to genes/proteins.
somoscan_protein_meta_sub = somoscan_protein_meta[['SeqId', 'EntrezGeneSymbol', 'UniProt']]
somoscan_protein_meta_sub.head()

### split duplicate genes

In [None]:
# Number of entries whose gene symbol contains a '|' separator, then the
# total number of entries.
print(somoscan_protein_meta_sub['EntrezGeneSymbol'].str.contains(r'\|', regex = True, na = False).sum())
print(somoscan_protein_meta_sub.shape[0])

In [None]:
# Split '|'-separated gene symbols into lists. assign() returns a new frame,
# so the GENE column is not written into a slice of somoscan_protein_meta
# (which triggers SettingWithCopyWarning).
somoscan_protein_meta_sub = somoscan_protein_meta_sub.assign(
    GENE = somoscan_protein_meta_sub['EntrezGeneSymbol'].str.split('|')
)
# One row per (SeqId, gene symbol).
somoscan_protein_meta_split = somoscan_protein_meta_sub.explode('GENE')
print(len(somoscan_protein_meta_split.index))
# Sanity check: no GENE value should still contain a '|'.
print(len(somoscan_protein_meta_split[somoscan_protein_meta_split['GENE'].str.contains(r'\|', regex = True,  na = False)].index))
somoscan_protein_meta_split.head()

In [None]:
# Drop the original multi-symbol column; keep the per-gene mapping only.
somoscan_protein_meta_split_sub = somoscan_protein_meta_split[['SeqId', 'GENE', 'UniProt']]

### save duplicate genes

In [None]:
# Entries whose EntrezGeneSymbol lists more than one symbol ('|'-separated).
somoscan_protein_dups_df = somoscan_protein_meta_sub[somoscan_protein_meta_sub['EntrezGeneSymbol'].str.contains(r'\|', regex = True,  na = False)]
print(len(somoscan_protein_dups_df.index))
somoscan_protein_dups_df.head()

In [None]:
# Spot-check two measurement columns (presumably SeqIds - confirm against the protein metadata).
somoscan_id_map[['4179-57', '8901-40']]

### transpose

In [None]:
# Flip the table so each original column becomes a row, and expose the former
# column labels as a 'SeqId' column for joining with the protein metadata.
somoscan_id_map_transpose = (
    somoscan_id_map.T
    .reset_index()
    .rename(columns = {'index' : 'SeqId'})
)
somoscan_id_map_transpose.head()

### extract ID rows

In [None]:
# Save the first two rows of the transposed table so they can be re-attached
# after the per-gene averaging.
# NOTE(review): assumes these two rows are the individualID and projid_visit
# rows, i.e. that those were the first two columns of somoscan_id_map - confirm.
id_rows = somoscan_id_map_transpose.head(2)
id_rows

### merge

In [None]:
# Annotate every measurement row with its gene symbol(s) and UniProt ID.
somoscan_id_map_merge = pd.merge(
    somoscan_protein_meta_split_sub,
    somoscan_id_map_transpose,
    on = 'SeqId',
)
# Row counts: merged result, gene mapping, transposed measurements.
print(somoscan_id_map_merge.shape[0])
print(somoscan_protein_meta_split_sub.shape[0])
print(somoscan_id_map_transpose.shape[0])
somoscan_id_map_merge.head()

## remove flagged sequences

In [None]:
# Protein-metadata rows whose ColCheck value is 'FLAG'.
flagged = somoscan_protein_meta.loc[somoscan_protein_meta['ColCheck'] == 'FLAG']
print(flagged.shape[0])

In [None]:
# Row count before and after discarding the flagged SeqIds.
print(somoscan_id_map_merge.shape[0])
somoscan_id_map_merge = somoscan_id_map_merge.loc[
    ~somoscan_id_map_merge['SeqId'].isin(flagged['SeqId'])
]
print(somoscan_id_map_merge.shape[0])

## check for duplicate genes

In [None]:
# Among rows with a gene symbol, collect those that share a GENE value and
# those that share a UniProt value with another row.
no_na = somoscan_id_map_merge.dropna(subset = ['GENE'])
gene_dup = no_na.loc[no_na['GENE'].duplicated(keep = False)].sort_values(by = 'GENE')
print(gene_dup.shape[0])
protein_dup = no_na.loc[no_na['UniProt'].duplicated(keep = False)].sort_values(by = 'UniProt')
print(protein_dup.shape[0])
gene_dup

### investigate duplicates

In [None]:
# List the columns available in the protein metadata.
somoscan_protein_meta.columns

In [None]:
# Show all columns when displaying wide frames in the cells below.
pd.set_option('display.max_columns', None)

In [None]:
# Full metadata for SeqIds that share a gene with another SeqId (first rows by EntrezGeneID).
somoscan_protein_meta[somoscan_protein_meta['SeqId'].isin(gene_dup['SeqId'])].sort_values(by = 'EntrezGeneID').head()

In [None]:
# Same selection as the previous cell, showing the last rows by EntrezGeneID.
somoscan_protein_meta[somoscan_protein_meta['SeqId'].isin(gene_dup['SeqId'])].sort_values(by = 'EntrezGeneID').tail()

In [None]:
# Inspect the duplicated-gene rows for one UniProt accession.
gene_dup[gene_dup['UniProt'].isin(['O95150'])]

In [None]:
# Distinct values present in the ColCheck column.
print(somoscan_protein_meta['ColCheck'].unique())

In [None]:
# Frequency of each Dilution value, counting missing values as well.
print(somoscan_protein_meta['Dilution'].value_counts(dropna = False))

## calculate weight
- 1 / # genes

In [None]:
# Count how many distinct GENE entries each SeqId maps to.
# The output columns are named explicitly: a bare reset_index() on
# value_counts() only yields 'SeqId'/'count' on pandas >= 2.0, while older
# versions produce 'index'/'SeqId' and break the 'count' lookups below.
gene_count = (
    somoscan_id_map_merge[['SeqId', 'GENE']]
    .drop_duplicates()['SeqId']
    .value_counts()
    .rename_axis('SeqId')
    .reset_index(name = 'count')
)
print(gene_count['count'].describe())
gene_count.head()

In [None]:
# Attach the per-SeqId gene count and derive weight = 1 / count.
somoscan_gene_weight = pd.merge(somoscan_id_map_merge, gene_count, on = 'SeqId')
somoscan_gene_weight['weight'] = 1 / somoscan_gene_weight['count']
# The merge should not change the number of rows.
print(somoscan_id_map_merge.shape[0])
print(somoscan_gene_weight.shape[0])
somoscan_gene_weight.head()

## remove missing

In [None]:
# Row count and distinct-gene count before and after dropping rows that have
# no gene symbol.
print(somoscan_gene_weight.shape[0])
print(len(somoscan_gene_weight['GENE'].unique()))
somoscan_gene_weight = somoscan_gene_weight.dropna(subset = ['GENE'])
print(somoscan_gene_weight.shape[0])
print(len(somoscan_gene_weight['GENE'].unique()))

## compute weighted average

In [None]:
def weighted_average(values, weights):
    """Return the weighted mean of *values*, ignoring missing entries.

    Only positions where both the value and its weight are present contribute
    to the normalising weight total. NaN is returned when that total is zero
    (for example when every value is missing).
    """
    usable = values.notna() & weights.notna()
    total_weight = weights[usable].sum()
    if total_weight == 0:
        return np.nan
    weighted_sum = (values * weights).sum(skipna = True)
    return weighted_sum / total_weight

In [None]:
# Columns to average: everything except the annotation and weighting columns.
avg_columns = [
    col for col in somoscan_gene_weight.columns.tolist()
    if col not in ('SeqId', 'GENE', 'UniProt', 'weight', 'count')
]

# One row per gene: each remaining column is collapsed to the weighted average
# of that gene's rows.
somoscan_avg = (
    somoscan_gene_weight
    .groupby('GENE')
    .apply(lambda g: pd.Series({col: weighted_average(g[col], g['weight']) for col in avg_columns}))
    .reset_index()
)

In [None]:
# Compare shapes before/after averaging and confirm one row per distinct gene.
print(somoscan_gene_weight.shape)
print(somoscan_avg.shape)
print(len(somoscan_avg['GENE'].unique()))
somoscan_avg.head()

## create gene and protein lists

In [None]:
# Distinct gene symbols present in the averaged table, with missing symbols
# removed; the count is printed before and after that removal.
somoscan_gene_list = somoscan_avg[['GENE']].drop_duplicates()
print(somoscan_gene_list.shape[0])
somoscan_gene_list = somoscan_gene_list[somoscan_gene_list['GENE'].notna()]
print(somoscan_gene_list.shape[0])

## add id back in

In [None]:
# Rename GENE to SeqId so the averaged rows line up with the saved ID rows,
# then put the ID rows back on top.
somoscan_avg = somoscan_avg.rename(columns = {'GENE' : 'SeqId'})
somoscan_gene_sub_id = pd.concat(
    [id_rows, somoscan_avg],
    axis = 'index',
)
somoscan_gene_sub_id.head()

## remake index

In [None]:
# Use the SeqId column (ID labels and gene symbols) as the row index.
somoscan_gene_sub_id = somoscan_gene_sub_id.set_index('SeqId')
somoscan_gene_sub_id.head()

## transpose back

In [None]:
# Flip back so the ID labels and gene symbols become the columns.
somoscan_gene = somoscan_gene_sub_id.T
print(somoscan_gene_sub_id.shape)
print(somoscan_gene.shape)
somoscan_gene.head()

## add common id and keep genes as columns

In [None]:
# SampleID is not needed for the join on individualID below.
common_id_map_sub = common_id_map.drop(columns = ['SampleID'])

In [None]:
# Keep only rows that have an individualID.
somoscan_gene_no_missing = somoscan_gene[somoscan_gene['individualID'].notna()]

In [None]:
# Attach the common-ID columns; the inner join keeps only individuals present
# in both tables.
somoscan_gene_common_id = pd.merge(
    common_id_map_sub,
    somoscan_gene_no_missing,
    how = 'inner',
    on = 'individualID',
)
# Row counts before and after the join.
print(somoscan_gene_no_missing.shape[0])
print(somoscan_gene_common_id.shape[0])
somoscan_gene_common_id.head()

In [None]:
# Drop the cohort-specific identifiers; CommonID remains as the sample key.
somoscan_gene_common_id_sub = somoscan_gene_common_id.drop(columns = ['individualID', 'projid_visit'])

In [None]:
# Index the rows by CommonID (the column itself is removed from the frame).
somoscan_gene_common_id_sub = somoscan_gene_common_id_sub.set_index('CommonID', drop = True)

## transpose common id file so genes are column names and ids are row names

In [None]:
# Flip the CommonID-indexed table and copy the resulting row labels into a
# leading 'GENE' column so they survive an index-less export.
somoscan_gene_common_id_transpose = somoscan_gene_common_id_sub.T
somoscan_gene_common_id_transpose.insert(
    0,
    'GENE',
    somoscan_gene_common_id_transpose.index,
)
print(somoscan_gene_common_id_transpose.shape)
somoscan_gene_common_id_transpose.head()

# export

In [None]:
# Write the individualID-keyed gene-level table as tab-separated text, without the row index.
somoscan_gene.to_csv('pathway_score/rosmap/ROSMAP.proteomics.somoscan.individualID.entrez_gene_symbol.txt',
                     sep = '\t',
                     index = None)

In [None]:
# Write the CommonID-keyed, transposed table as gzip-compressed tab-separated
# text; the row labels are already stored in the 'GENE' column, so the index is omitted.
somoscan_gene_common_id_transpose.to_csv('pathway_score/rosmap/ROSMAP.proteomics.somoscan.individualID.entrez_gene_symbol.common_id.transpose.txt.gz',
                                         sep = '\t',
                                         index = None,
                                         compression = 'gzip')

In [None]:
# Write the list of gene symbols (single 'GENE' column) as tab-separated text.
somoscan_gene_list.to_csv('pathway_score/rosmap/ROSMAP.proteomics.somoscan.individualID.entrez_gene_symbol.gene_list.txt',
                     sep = '\t',
                     index = None)