# load packages

In [None]:
import pandas as pd

In [None]:
import numpy as np

# read in input files

## scores

### pathway scores

In [None]:
# Load the pathway-average score table (tab-separated) from avg_pathways/.
pathway_average_path = 'avg_pathways/AOU_ALL.UKBB.metasoft.gene_score.ROSMAP.RNAseq.methylation.somoscan_proteomics.MSBB.RNAseq.methylation.tmt_proteomics.ADSP.pathway_average.pathway_scores.standard_scaled.go.keep_quest_comb.txt'
avg_pathway = pd.read_csv(pathway_average_path, sep='\t')

# Report (rows, columns), then preview the first rows as the cell output.
print(avg_pathway.shape)
avg_pathway.head()

In [None]:
# Load the gene-average score table (tab-separated) from avg_pathways/.
gene_average_path = 'avg_pathways/AOU_ALL.UKBB.metasoft.gene_score.ROSMAP.RNAseq.methylation.somoscan_proteomics.MSBB.RNAseq.methylation.tmt_proteomics.ADSP.gene_average.pathway_scores.standard_scaled.go.keep_quest_comb.txt'
avg_gene = pd.read_csv(gene_average_path, sep='\t')

# Report (rows, columns), then preview the first rows as the cell output.
print(avg_gene.shape)
avg_gene.head()

## pheno

In [None]:
# Load the ADSP phenotype table (tab-separated).
adsp_pheno_path = '/project/ritchie/projects/ADSP_Projects/QC_ADSPv11/ADSPphenotype_forAnalysis.txt'
adsp_pheno = pd.read_csv(adsp_pheno_path, sep='\t')

# Report the number of rows, then preview the first rows as the cell output.
print(len(adsp_pheno))
adsp_pheno.head()

## keep quest comb
- ADSP samples filtered down to only those representing AD phenotypes

In [None]:
# Load the keep-list CSV. The file has no header row, so columns are
# labelled by position (0, 1, ...).
keep_list_path = '/project/ritchie/projects/AD_KMI/adsp_filt_phenos/ADSPIntegratedPhenotypes_DS_2023.08.08.keep_quest_comb_cohorts_samples.csv'
keep_quest_comb = pd.read_csv(keep_list_path, header=None)

# Report the number of rows, then preview the first rows as the cell output.
print(len(keep_quest_comb))
keep_quest_comb.head()

## id map

In [None]:
# Load the sample ID mapping table (tab-separated) and preview it.
id_map_path = '/project/ritchie/projects/AD_KMI/pathway_score/id_map/ADSP.ROSMAP.MSBB.id_map.txt'
id_map = pd.read_csv(id_map_path, sep='\t')
id_map.head()

# transpose

In [None]:
# Flip the pathway-average table: PATHWAY_ID values become the column
# labels and the remaining original columns become the rows.
avg_pathway_transpose = avg_pathway.set_index('PATHWAY_ID').T

# Copy the row labels into a leading 'ID' column.
avg_pathway_transpose.insert(loc=0, column='ID', value=avg_pathway_transpose.index)

print(avg_pathway_transpose.shape)
avg_pathway_transpose.head()

In [None]:
# Flip the gene-average table: PATHWAY_ID values become the column
# labels and the remaining original columns become the rows.
avg_gene_transpose = avg_gene.set_index('PATHWAY_ID').T

# Copy the row labels into a leading 'ID' column.
avg_gene_transpose.insert(loc=0, column='ID', value=avg_gene_transpose.index)

print(avg_gene_transpose.shape)
avg_gene_transpose.head()

# clean adsp pheno

In [None]:
# Build the cleaned phenotype table plus its outcome and covariate views.

# Covariate columns required for every retained sample.
covar_cols = ['AGE', 'SEX', 'PC1', 'PC2', 'PC3', 'PC4']

# Give the ID map the same key name as the phenotype table so they can be joined.
id_map = id_map.rename(columns={'SampleID': 'IID'})

# Keep samples present in both the ID map and the phenotype table, then
# restrict to the IIDs listed in the first column of the keep list.
adsp_pheno_id = id_map.merge(adsp_pheno, on='IID', how='inner')
adsp_pheno_id = adsp_pheno_id[adsp_pheno_id['IID'].isin(keep_quest_comb[0])]

# Rename by assignment rather than inplace=True: the frame above is a
# boolean-filtered selection, and mutating it in place can raise
# SettingWithCopyWarning.
adsp_pheno_id = adsp_pheno_id.rename(columns={'CommonID': 'ID',
                                              'DX_harmonized': 'AD',
                                              'Age_harmonized': 'AGE',
                                              'Sex': 'SEX'})

# Drop any sample missing its ID, a covariate, or the outcome.
adsp_pheno_id = adsp_pheno_id.dropna(subset=['ID', *covar_cols, 'AD'])

# NOTE(review): 'ID' uniqueness is not checked here. Duplicate IDs would
# duplicate rows in the later merges on 'ID' -- confirm the ID map is one-to-one.
adsp_pheno_outcome = adsp_pheno_id[['ID', 'AD']]
adsp_pheno_covar = adsp_pheno_id[['ID', *covar_cols]]
print(len(adsp_pheno_id.index))

# filter pathway scores to pheno ids and attach covariates

In [None]:
# Inner-join the covariates onto the gene-average scores by 'ID'; rows
# whose ID has no covariate record are dropped.
avg_gene_transpose = pd.merge(avg_gene_transpose, adsp_pheno_covar,
                              how='inner', on='ID')
print(len(avg_gene_transpose))

In [None]:
# Inner-join the covariates onto the pathway-average scores by 'ID'; rows
# whose ID has no covariate record are dropped.
avg_pathway_transpose = pd.merge(avg_pathway_transpose, adsp_pheno_covar,
                                 how='inner', on='ID')
print(len(avg_pathway_transpose))

# split to 80% train / 20% test

## training split

### pathway scores

In [None]:
# Draw a reproducible random 80% of the gene-average rows (seed 7) as the
# training set.
avg_gene_sample = avg_gene_transpose.sample(random_state=7, frac=0.8)
print(len(avg_gene_sample))

In [None]:
# Draw a reproducible random 80% of the pathway-average rows (seed 7) as
# the training set.
avg_pathway_sample = avg_pathway_transpose.sample(random_state=7, frac=0.8)
print(len(avg_pathway_sample))

### pheno

In [None]:
# Outcome rows for the training samples, selected by the IDs drawn for the
# gene-average training set. Rows keep the order of adsp_pheno_outcome, not
# the shuffled order of the sampled score table.
# NOTE(review): the IDs come from the gene-average split only; this presumes
# the pathway-average split selects the same IDs -- TODO confirm.
train_ids = avg_gene_sample['ID']
adsp_pheno_sample = adsp_pheno_outcome.loc[adsp_pheno_outcome['ID'].isin(train_ids)]
print(len(adsp_pheno_sample))

## testing split

### pathway scores

In [None]:
# Test set: every gene-average row whose ID was not drawn for training.
gene_in_train = avg_gene_transpose['ID'].isin(avg_gene_sample['ID'])
avg_gene_test = avg_gene_transpose.loc[~gene_in_train]
print(len(avg_gene_test))

In [None]:
# Test set: every pathway-average row whose ID was not drawn for training.
pathway_in_train = avg_pathway_transpose['ID'].isin(avg_pathway_sample['ID'])
avg_pathway_test = avg_pathway_transpose.loc[~pathway_in_train]
print(len(avg_pathway_test))

### pheno

In [None]:
# Outcome rows for the held-out samples, selected by the IDs in the
# gene-average test set. Rows keep the order of adsp_pheno_outcome.
test_ids = avg_gene_test['ID']
adsp_pheno_test = adsp_pheno_outcome.loc[adsp_pheno_outcome['ID'].isin(test_ids)]
print(len(adsp_pheno_test))

# export

## scores

### full datasets

In [None]:
# Write the full gene-average table (scores plus covariates):
# space-delimited, no index column, missing values written as 'NaN'.
gene_full_out = 'input/AOU_ALL.UKBB.metasoft.gene_score.ROSMAP.RNAseq.methylation.somoscan_proteomics.MSBB.RNAseq.methylation.tmt_proteomics.ADSP.gene_average.pathway_scores.standard_scaled.go.keep_quest_comb.athena_input.txt'
avg_gene_transpose.to_csv(gene_full_out, sep=' ', index=False, na_rep='NaN')

In [None]:
# Write the full pathway-average table (scores plus covariates):
# space-delimited, no index column, missing values written as 'NaN'.
pathway_full_out = 'input/AOU_ALL.UKBB.metasoft.gene_score.ROSMAP.RNAseq.methylation.somoscan_proteomics.MSBB.RNAseq.methylation.tmt_proteomics.ADSP.pathway_average.pathway_scores.standard_scaled.go.keep_quest_comb.athena_input.txt'
avg_pathway_transpose.to_csv(pathway_full_out, sep=' ', index=False, na_rep='NaN')

### train

In [None]:
# Write the gene-average training split: space-delimited, no index
# column, missing values written as 'NaN'.
gene_train_out = 'input/AOU_ALL.UKBB.metasoft.gene_score.ROSMAP.RNAseq.methylation.somoscan_proteomics.MSBB.RNAseq.methylation.tmt_proteomics.ADSP.gene_average.pathway_scores.standard_scaled.go.keep_quest_comb.80%_train.athena_input.txt'
avg_gene_sample.to_csv(gene_train_out, sep=' ', index=False, na_rep='NaN')

In [None]:
# Write the pathway-average training split: space-delimited, no index
# column, missing values written as 'NaN'.
pathway_train_out = 'input/AOU_ALL.UKBB.metasoft.gene_score.ROSMAP.RNAseq.methylation.somoscan_proteomics.MSBB.RNAseq.methylation.tmt_proteomics.ADSP.pathway_average.pathway_scores.standard_scaled.go.keep_quest_comb.80%_train.athena_input.txt'
avg_pathway_sample.to_csv(pathway_train_out, sep=' ', index=False, na_rep='NaN')

### test

In [None]:
# Write the gene-average test split: space-delimited, no index column,
# missing values written as 'NaN'.
gene_test_out = 'input/AOU_ALL.UKBB.metasoft.gene_score.ROSMAP.RNAseq.methylation.somoscan_proteomics.MSBB.RNAseq.methylation.tmt_proteomics.ADSP.gene_average.pathway_scores.standard_scaled.go.keep_quest_comb.20%_test.athena_input.txt'
avg_gene_test.to_csv(gene_test_out, sep=' ', index=False, na_rep='NaN')

In [None]:
# Write the pathway-average test split: space-delimited, no index column,
# missing values written as 'NaN'.
pathway_test_out = 'input/AOU_ALL.UKBB.metasoft.gene_score.ROSMAP.RNAseq.methylation.somoscan_proteomics.MSBB.RNAseq.methylation.tmt_proteomics.ADSP.pathway_average.pathway_scores.standard_scaled.go.keep_quest_comb.20%_test.athena_input.txt'
avg_pathway_test.to_csv(pathway_test_out, sep=' ', index=False, na_rep='NaN')

## pheno

In [None]:
# Write the complete outcome table (ID, AD): space-delimited, no index
# column, missing values written as 'NaN'.
# NOTE(review): this table is not restricted to the IDs that survived the
# inner merges with the score tables -- confirm that is intended.
pheno_full_out = 'input/ADSP_phenotype.keep_quest_comb.txt'
adsp_pheno_outcome.to_csv(pheno_full_out, sep=' ', index=False, na_rep='NaN')

In [None]:
# Write the training-split outcome table: space-delimited, no index
# column, missing values written as 'NaN'.
pheno_train_out = 'input/ADSP_phenotype.keep_quest_comb.80%_train.txt'
adsp_pheno_sample.to_csv(pheno_train_out, sep=' ', index=False, na_rep='NaN')

In [None]:
# Write the test-split outcome table: space-delimited, no index column,
# missing values written as 'NaN'.
pheno_test_out = 'input/ADSP_phenotype.keep_quest_comb.20%_test.txt'
adsp_pheno_test.to_csv(pheno_test_out, sep=' ', index=False, na_rep='NaN')