# load packages

In [None]:
import pandas as pd

In [None]:
import numpy as np

# read in input files

In [None]:
# Load the merged gene-score table. The file is tab-separated; pandas infers
# gzip compression from the ".gz" suffix.
gene_score = pd.read_csv(
    '/project/ritchie/projects/AD_KMI/common_var_gene_score/igap_adsp_gene_score/merged_outputs/AOU_ALL.UKBB.metasoft.ADSP.all.VEP_v113.gene_by_position.r2_0.1_clump_variants_excluded.RE_pval_threshold_0.05.gene_symbol.average_gene_score.merged.txt.gz',
    sep='\t',
)
# Preview the first rows as the cell output.
gene_score.head()

In [None]:
# Load the MSBB ID map (comma-separated, which is read_csv's default delimiter).
msbb_id_map = pd.read_csv(
    '/project/ritchie/projects/AD_KMI/pathway_score/msbb/MSBB.ADSP.ID_map.csv',
)
# Preview the first rows as the cell output.
msbb_id_map.head()

In [None]:
# Load the ROSMAP ID map (tab-separated text file).
rosmap_id_map = pd.read_csv(
    '/project/ritchie/projects/AD_KMI/pathway_score/rosmap/ID_mapping/ROSMAP.ADSP_PHENO.ID_map.txt',
    sep='\t',
)
# Preview the first rows as the cell output.
rosmap_id_map.head()

# clean ID maps

## subset

In [None]:
# Build the combined key "<SampleID>:<individualID>" on the full ROSMAP map.
rosmap_id_map['ID'] = (
    rosmap_id_map['SampleID'] + ':' + rosmap_id_map['individualID']
)
# Keep only the three identifier columns for downstream merging.
# NOTE(review): unlike the MSBB map, duplicate rows are not dropped here --
# confirm the ROSMAP map has one row per SampleID/individualID pair.
rosmap_id_map_sub = rosmap_id_map.loc[:, ['SampleID', 'individualID', 'ID']]
rosmap_id_map_sub.head()

In [None]:
# Build the combined key "<SampleID>:<individualID>" on the full MSBB map.
msbb_id_map['ID'] = msbb_id_map['SampleID'] + ':' + msbb_id_map['individualID']
# Keep only the identifier columns and drop repeated rows in one chained,
# non-inplace step. Calling drop_duplicates(inplace=True) on a frame derived
# from msbb_id_map triggers SettingWithCopyWarning; chaining produces an
# independent frame with the same contents.
msbb_id_map_sub = msbb_id_map[['SampleID', 'individualID', 'ID']].drop_duplicates()
msbb_id_map_sub.head()

## merge

In [None]:
# Stack the MSBB and ROSMAP identifier tables row-wise (MSBB rows first).
all_id_map = pd.concat([msbb_id_map_sub, rosmap_id_map_sub])
# Report row counts, one per line: ROSMAP, MSBB, then the combined table.
print(
    len(rosmap_id_map_sub),
    len(msbb_id_map_sub),
    len(all_id_map),
    sep='\n',
)
all_id_map.head()

# map gene scores to MSBB and ROSMAP IDs

In [None]:
# Rename the gene-score key column so it matches the ID maps' 'SampleID' column.
gene_score = gene_score.rename(columns={'ID': 'SampleID'})

In [None]:
# Attach gene scores to every ID-map row with a matching SampleID (inner join),
# then index the result by the combined "<SampleID>:<individualID>" key.
gene_score_map = pd.merge(
    all_id_map,
    gene_score,
    how='inner',
    on='SampleID',
).set_index('ID')
# Independent working copy used by the later concatenation step.
gene_score_map_sub = gene_score_map.copy()
print(gene_score_map_sub.shape[0])
gene_score_map_sub.head()

# create common ID for samples that don't map

In [None]:
# Gene-score rows whose SampleID did not appear in the mapped table.
# Take an explicit copy: assigning a new column onto a boolean-filtered slice
# of gene_score raises SettingWithCopyWarning and writes to an implicit copy.
gene_score_no_map = gene_score[
    ~gene_score['SampleID'].isin(gene_score_map['SampleID'])
].copy()
# Unmapped samples get a placeholder common ID of the form "<SampleID>:NA".
gene_score_no_map['ID'] = gene_score_no_map['SampleID'] + ':NA'
gene_score_no_map = gene_score_no_map.set_index('ID')
gene_score_no_map_sub = gene_score_no_map.copy()
# Insert an all-missing individualID column at position 1 (right after
# SampleID) so the column layout lines up with the mapped table.
gene_score_no_map_sub.insert(1, 'individualID', np.nan)
print(len(gene_score_no_map_sub.index))
gene_score_no_map_sub.head()

# concatenate with new IDs

In [None]:
# Stack the unmapped samples on top of the mapped ones; both tables are
# indexed by the common ID.
gene_score_cat = pd.concat([gene_score_no_map_sub, gene_score_map_sub])
print(gene_score_cat.shape[0])
gene_score_cat.head()

# make id map

In [None]:
# Start the ID map from the identifier columns of the concatenated score table.
# Copy explicitly so that adding a column below does not write onto a slice of
# gene_score_cat (which raises SettingWithCopyWarning).
id_map = gene_score_cat[['SampleID', 'individualID']].copy()
# The index of gene_score_cat holds the common ID; expose it as a column.
id_map['CommonID'] = id_map.index
# Report the size of the table just built (previously this printed the length
# of gene_score_cat, which has the same number of rows).
print(len(id_map.index))
id_map.head()

In [None]:
# Count ID-map rows whose combined ID is absent from the common-ID map:
# ROSMAP first, then MSBB.
print(int((~rosmap_id_map_sub['ID'].isin(id_map['CommonID'])).sum()))
print(int((~msbb_id_map_sub['ID'].isin(id_map['CommonID'])).sum()))

In [None]:
# ROSMAP ID-map rows with no counterpart in the common-ID map. The rename is
# chained (non-inplace) because renaming a filtered slice in place raises
# SettingWithCopyWarning.
rosmap_no_match = rosmap_id_map_sub[
    ~rosmap_id_map_sub['ID'].isin(id_map['CommonID'])
].rename(columns={'ID': 'CommonID'})
# Append those rows so the exported map also lists ROSMAP IDs without scores.
# NOTE(review): unmatched MSBB rows are counted in the previous cell but are
# not appended here -- confirm this is intended (e.g. the MSBB count is zero).
id_map = pd.concat([id_map, rosmap_no_match], axis=0)
print(len(id_map.index))
id_map.head()

# transpose

In [None]:
# Drop the identifier columns without mutating gene_score_cat, then transpose
# so the remaining columns become rows and the common IDs become columns.
# The original in-place drop made this cell (and the earlier "make id map"
# cell) fail with KeyError when re-run.
gene_score_transpose = gene_score_cat.drop(
    columns=['SampleID', 'individualID']
).transpose()
# Expose the row labels as a leading 'GENE' column, since the export below
# does not write the index.
gene_score_transpose.insert(0, 'GENE', gene_score_transpose.index)
gene_score_transpose.head()

# export

In [None]:
# Write the transposed score table as tab-separated text. `index` is a boolean
# parameter; pass False explicitly rather than relying on None being falsy.
# The row labels are already carried by the 'GENE' column.
gene_score_transpose.to_csv(
    '/project/ritchie/projects/AD_KMI/common_var_gene_score/igap_adsp_gene_score/merged_outputs/AOU_ALL.UKBB.metasoft.ADSP.all.VEP_v113.gene_by_position.r2_0.1_clump_variants_excluded.RE_pval_threshold_0.05.gene_symbol.average_gene_score.merged.common_id.transpose.txt',
    sep='\t',
    index=False,
)

In [None]:
# Write the ID map as tab-separated text. `index` is a boolean parameter; pass
# False explicitly rather than relying on None being falsy. The common ID is
# already stored in the 'CommonID' column.
id_map.to_csv(
    '/project/ritchie/projects/AD_KMI/pathway_score/id_map/ADSP.ROSMAP.MSBB.id_map.txt',
    sep='\t',
    index=False,
)