# load packages

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, balanced_accuracy_score
import numpy as np
import xgboost as xgb

# read in input files

In [None]:
# Tab-separated, gzip-compressed gene-score table. The cleaning cell further
# down filters on its 'GENE' column and then transposes it.
gene_score_adsp = pd.read_csv(
    'common_var_gene_score/igap_adsp_gene_score/merged_outputs/AOU_ALL.UKBB.metasoft.ADSP.all.VEP_v113.gene_by_position.r2_0.1_clump_variants_excluded.RE_pval_threshold_0.05.gene_symbol.average_gene_score.merged.common_id.transpose.scaled.txt.gz',
    sep='\t',
    low_memory=False,
)

In [None]:
# Tab-separated ROSMAP RNA-seq table; the cleaning cell further down expects
# a 'GENE' column.
rnaseq_rosmap = pd.read_csv(
    'pathway_score/gene_mapping/ROSMAP.RNAseq.TPM5_log2norm.individualids.codinggenes.VEP_v113_genes.gene_symbol.common_id.duplicate_ids_removed.transpose.scaled.txt',
    sep='\t',
)

In [None]:
# Tab-separated MSBB RNA-seq table (BA_10 file); the cleaning cell further
# down expects a 'GENE' column.
rnaseq_10_msbb = pd.read_csv(
    'pathway_score/msbb/MSBB.RNAseq.BA_10.19batch.gene_symbol.individualID.mvalue_norm.tpm.log2.coding_genes_only.covar_corrected.common_id.transpose.scaled.txt',
    sep='\t',
)

In [None]:
# Tab-separated MSBB RNA-seq table (BA_22 file); the cleaning cell further
# down expects a 'GENE' column.
rnaseq_22_msbb = pd.read_csv(
    'pathway_score/msbb/MSBB.RNAseq.BA_22.19batch.gene_symbol.individualID.mvalue_norm.tpm.log2.coding_genes_only.covar_corrected.common_id.transpose.scaled.txt',
    sep='\t',
)

In [None]:
# Tab-separated MSBB RNA-seq table (BA_36 file); the cleaning cell further
# down expects a 'GENE' column.
rnaseq_36_msbb = pd.read_csv(
    'pathway_score/msbb/MSBB.RNAseq.BA_36.19batch.gene_symbol.individualID.mvalue_norm.tpm.log2.coding_genes_only.covar_corrected.common_id.transpose.scaled.txt',
    sep='\t',
)

In [None]:
# Tab-separated MSBB RNA-seq table (BA_44 file); the cleaning cell further
# down expects a 'GENE' column.
rnaseq_44_msbb = pd.read_csv(
    'pathway_score/msbb/MSBB.RNAseq.BA_44.19batch.gene_symbol.individualID.mvalue_norm.tpm.log2.coding_genes_only.covar_corrected.common_id.transpose.scaled.txt',
    sep='\t',
)

In [None]:
# Tab-separated ROSMAP methylation table; the cleaning cell further down
# expects a 'GENE' column.
methyl_rosmap = pd.read_csv(
    'pathway_score/rosmap/ROSMAP_arrayMethylation_imputed.gene_symbol.individualID.mvalue_norm.weighted_gene_average.common_id.transpose.scaled.txt',
    sep='\t',
)

In [None]:
# Tab-separated MSBB methylation table; the cleaning cell further down
# expects a 'GENE' column.
methyl_msbb = pd.read_csv(
    'pathway_score/msbb/MSBB.methylation_array.19batch.gene_symbol.individualID.mvalue_norm.weighted_gene_average.common_id.transpose.scaled.txt',
    sep='\t',
)

In [None]:
# Tab-separated ROSMAP proteomics table; the cleaning cell further down
# expects a 'GENE' column.
somoscan_rosmap = pd.read_csv(
    'pathway_score/rosmap/ROSMAP.proteomics.somoscan.individualID.entrez_gene_symbol.common_id.transpose.scaled.txt',
    sep='\t',
)

In [None]:
# Tab-separated MSBB TMT proteomics table; the cleaning cell further down
# expects a 'GENE' column.
tmt_msbb = pd.read_csv(
    'pathway_score/msbb/MSBB.TMT_proteomics.19batch.normalized.gene_symbol.individualID.log2_transformed.common_id.transpose.scaled.txt',
    sep='\t',
)

In [None]:
# Tab-separated gene-to-pathway mapping. The cleaning cells use its 'GENE'
# column as the list of genes to keep in every omics table.
all_path_map = pd.read_csv(
    'pathway_score/pathway_annotation/go/AD_KMI.ADSP.ROSMAP.all_omics.MSBB.all_omics.VEP_113.ref_gene.go.gene_to_pathway.no_duplicates.pathway_mapping.txt',
    sep='\t',
)

In [None]:
# Tab-separated ID map; its 'SampleID' column is used further down to decide
# which phenotype rows are kept.
id_map_keep_quest_comb = pd.read_csv(
    'pathway_score/id_map/ADSP.ROSMAP.MSBB.keep_quest_comb.id_map.txt',
    sep='\t',
)

In [None]:
# Tab-separated phenotype table; the cleaning cell renames its 'IID' column
# to 'SampleID' before merging.
# NOTE(review): this is an absolute cluster path, unlike the relative paths
# used for the other inputs — confirm it is reachable where this runs.
adsp_pheno = pd.read_csv(
    '/project/ritchie/projects/ADSP_Projects/QC_ADSPv11/ADSPphenotype_forAnalysis.txt',
    sep='\t',
)

In [None]:
# Tab-separated ID map; merged with the phenotype table on 'SampleID' in the
# cleaning cell.
id_map = pd.read_csv(
    'pathway_score/id_map/ADSP.ROSMAP.MSBB.id_map.txt',
    sep='\t',
)

In [None]:
# Comma-separated table of mean metrics (default separator applies); row 0 is
# read further down as the reference "best model". Preview the first rows.
best_model = pd.read_csv(
    'ML/statistical_models/xgboost_output/ALL_SPLITS.MEAN_METRICS.XGBoost.avg_pathway.standard_scaled.go.keep_quest_comb.csv'
)
best_model.head()

# clean files

In [None]:
# Build the phenotype/covariate table indexed by 'ID':
#   1. align the phenotype key name with the ID map and merge on it,
#   2. keep only samples listed in id_map_keep_quest_comb and only the
#      label/covariate columns, renamed to short names,
#   3. drop every row that has any missing value.
adsp_pheno = adsp_pheno.rename(columns={'IID': 'SampleID'})
adsp_pheno_id = id_map.merge(adsp_pheno, how='inner', on='SampleID')
adsp_pheno_id = (
    adsp_pheno_id.loc[
        adsp_pheno_id['SampleID'].isin(id_map_keep_quest_comb['SampleID']),
        ['CommonID', 'DX_harmonized', 'Age_harmonized', 'Sex', 'PC1', 'PC2', 'PC3', 'PC4'],
    ]
    .rename(columns={'CommonID': 'ID',
                     'DX_harmonized': 'AD',
                     'Age_harmonized': 'Age'})
    .set_index('ID', drop=True)
)
# Row count before and after removing incomplete rows.
print(len(adsp_pheno_id.index))
adsp_pheno_id = adsp_pheno_id.dropna()
print(len(adsp_pheno_id.index))
adsp_pheno_id.head()

In [None]:
# Drop the rows of the 'GENE' column that hold phenotype/covariate names,
# keep only genes present in the pathway map, make 'GENE' the index and
# transpose so genes become columns, inner-join the phenotype table on the
# row labels, then drop columns that are entirely NaN.
gene_score_adsp = (
    gene_score_adsp.loc[
        ~gene_score_adsp['GENE'].isin(['ALZ_STATUS', 'AGE', 'SEX', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8'])
        & gene_score_adsp['GENE'].isin(all_path_map['GENE'])
    ]
    .set_index('GENE')
    .T
    .join(adsp_pheno_id, how='inner')
    .dropna(axis=1, how='all')
)
print(gene_score_adsp.shape)
gene_score_adsp.head()

In [None]:
# Keep only genes present in the pathway map, make 'GENE' the index and
# transpose so genes become columns, inner-join the phenotype table on the
# row labels, then drop columns that are entirely NaN.
rnaseq_rosmap = (
    rnaseq_rosmap.loc[rnaseq_rosmap['GENE'].isin(all_path_map['GENE'])]
    .set_index('GENE')
    .T
    .join(adsp_pheno_id, how='inner')
    .dropna(axis=1, how='all')
)
print(rnaseq_rosmap.shape)
rnaseq_rosmap.head()

In [None]:
# Keep only genes present in the pathway map, make 'GENE' the index and
# transpose so genes become columns, inner-join the phenotype table on the
# row labels, then drop columns that are entirely NaN.
rnaseq_10_msbb = (
    rnaseq_10_msbb.loc[rnaseq_10_msbb['GENE'].isin(all_path_map['GENE'])]
    .set_index('GENE')
    .T
    .join(adsp_pheno_id, how='inner')
    .dropna(axis=1, how='all')
)
print(rnaseq_10_msbb.shape)
rnaseq_10_msbb.head()

In [None]:
# Keep only genes present in the pathway map, make 'GENE' the index and
# transpose so genes become columns, inner-join the phenotype table on the
# row labels, then drop columns that are entirely NaN.
rnaseq_22_msbb = (
    rnaseq_22_msbb.loc[rnaseq_22_msbb['GENE'].isin(all_path_map['GENE'])]
    .set_index('GENE')
    .T
    .join(adsp_pheno_id, how='inner')
    .dropna(axis=1, how='all')
)
print(rnaseq_22_msbb.shape)
rnaseq_22_msbb.head()

In [None]:
# Keep only genes present in the pathway map, make 'GENE' the index and
# transpose so genes become columns, inner-join the phenotype table on the
# row labels, then drop columns that are entirely NaN.
rnaseq_36_msbb = (
    rnaseq_36_msbb.loc[rnaseq_36_msbb['GENE'].isin(all_path_map['GENE'])]
    .set_index('GENE')
    .T
    .join(adsp_pheno_id, how='inner')
    .dropna(axis=1, how='all')
)
print(rnaseq_36_msbb.shape)
rnaseq_36_msbb.head()

In [None]:
# Keep only genes present in the pathway map, make 'GENE' the index and
# transpose so genes become columns, inner-join the phenotype table on the
# row labels, then drop columns that are entirely NaN.
rnaseq_44_msbb = (
    rnaseq_44_msbb.loc[rnaseq_44_msbb['GENE'].isin(all_path_map['GENE'])]
    .set_index('GENE')
    .T
    .join(adsp_pheno_id, how='inner')
    .dropna(axis=1, how='all')
)
print(rnaseq_44_msbb.shape)
rnaseq_44_msbb.head()

In [None]:
# Keep only genes present in the pathway map, make 'GENE' the index and
# transpose so genes become columns, inner-join the phenotype table on the
# row labels, then drop columns that are entirely NaN.
methyl_rosmap = (
    methyl_rosmap.loc[methyl_rosmap['GENE'].isin(all_path_map['GENE'])]
    .set_index('GENE')
    .T
    .join(adsp_pheno_id, how='inner')
    .dropna(axis=1, how='all')
)
print(methyl_rosmap.shape)
methyl_rosmap.head()

In [None]:
# Keep only genes present in the pathway map, make 'GENE' the index and
# transpose so genes become columns, inner-join the phenotype table on the
# row labels, then drop columns that are entirely NaN.
methyl_msbb = (
    methyl_msbb.loc[methyl_msbb['GENE'].isin(all_path_map['GENE'])]
    .set_index('GENE')
    .T
    .join(adsp_pheno_id, how='inner')
    .dropna(axis=1, how='all')
)
print(methyl_msbb.shape)
methyl_msbb.head()

In [None]:
# Keep only genes present in the pathway map, make 'GENE' the index and
# transpose so genes become columns, inner-join the phenotype table on the
# row labels, then drop columns that are entirely NaN.
somoscan_rosmap = (
    somoscan_rosmap.loc[somoscan_rosmap['GENE'].isin(all_path_map['GENE'])]
    .set_index('GENE')
    .T
    .join(adsp_pheno_id, how='inner')
    .dropna(axis=1, how='all')
)
print(somoscan_rosmap.shape)
somoscan_rosmap.head()

In [None]:
# Keep only genes present in the pathway map, make 'GENE' the index and
# transpose so genes become columns, inner-join the phenotype table on the
# row labels, then drop columns that are entirely NaN.
tmt_msbb = (
    tmt_msbb.loc[tmt_msbb['GENE'].isin(all_path_map['GENE'])]
    .set_index('GENE')
    .T
    .join(adsp_pheno_id, how='inner')
    .dropna(axis=1, how='all')
)
print(tmt_msbb.shape)
tmt_msbb.head()

# run tests

In [None]:
# Hold-out evaluation of an XGBoost classifier on gene_score_adsp: every
# column except 'AD' is a feature, 'AD' is the label.
df = gene_score_adsp.copy()

# Reproducible 70/30 split: sample the training rows, the rest (by index
# label) form the test set.
train = df.sample(frac=0.7, random_state=7)
test = df.drop(index=train.index)

x_train, y_train = train.drop(columns=['AD']), train[['AD']]
x_test, y_test = test.drop(columns=['AD']), test[['AD']]

model = xgb.XGBClassifier(objective='binary:logistic', random_state=7, n_jobs=-1)
model.fit(x_train, y_train)

# Hard labels feed F1 / balanced accuracy; the second predict_proba column
# feeds AUROC / AUPRC.
y_pred_bin = model.predict(x_test)
y_pred_cont = model.predict_proba(x_test)[:, 1]

gs_adsp_auroc = roc_auc_score(y_test, y_pred_cont)
gs_adsp_auprc = average_precision_score(y_test, y_pred_cont)
gs_adsp_f1 = f1_score(y_test, y_pred_bin)
gs_adsp_balanced_acc = balanced_accuracy_score(y_test, y_pred_bin)
# Sample count of the whole frame (train + test), not of the test set alone.
gs_adsp_num = len(df.index)

print(gs_adsp_auroc, gs_adsp_auprc, gs_adsp_f1, gs_adsp_balanced_acc, gs_adsp_num, sep='\n')

In [None]:
# Hold-out evaluation of an XGBoost classifier on rnaseq_rosmap: every
# column except 'AD' is a feature, 'AD' is the label.
df = rnaseq_rosmap.copy()

# Reproducible 70/30 split: sample the training rows, the rest (by index
# label) form the test set.
train = df.sample(frac=0.7, random_state=7)
test = df.drop(index=train.index)

x_train, y_train = train.drop(columns=['AD']), train[['AD']]
x_test, y_test = test.drop(columns=['AD']), test[['AD']]

model = xgb.XGBClassifier(objective='binary:logistic', random_state=7, n_jobs=-1)
model.fit(x_train, y_train)

# Hard labels feed F1 / balanced accuracy; the second predict_proba column
# feeds AUROC / AUPRC.
y_pred_bin = model.predict(x_test)
y_pred_cont = model.predict_proba(x_test)[:, 1]

rs_rosmap_auroc = roc_auc_score(y_test, y_pred_cont)
rs_rosmap_auprc = average_precision_score(y_test, y_pred_cont)
rs_rosmap_f1 = f1_score(y_test, y_pred_bin)
rs_rosmap_balanced_acc = balanced_accuracy_score(y_test, y_pred_bin)
# Sample count of the whole frame (train + test), not of the test set alone.
rs_rosmap_num = len(df.index)

print(rs_rosmap_auroc, rs_rosmap_auprc, rs_rosmap_f1, rs_rosmap_balanced_acc, rs_rosmap_num, sep='\n')

In [None]:
# Hold-out evaluation of an XGBoost classifier on rnaseq_10_msbb: every
# column except 'AD' is a feature, 'AD' is the label.
df = rnaseq_10_msbb.copy()

# Reproducible 70/30 split: sample the training rows, the rest (by index
# label) form the test set.
train = df.sample(frac=0.7, random_state=7)
test = df.drop(index=train.index)

x_train, y_train = train.drop(columns=['AD']), train[['AD']]
x_test, y_test = test.drop(columns=['AD']), test[['AD']]

model = xgb.XGBClassifier(objective='binary:logistic', random_state=7, n_jobs=-1)
model.fit(x_train, y_train)

# Hard labels feed F1 / balanced accuracy; the second predict_proba column
# feeds AUROC / AUPRC.
y_pred_bin = model.predict(x_test)
y_pred_cont = model.predict_proba(x_test)[:, 1]

rs_10_msbb_auroc = roc_auc_score(y_test, y_pred_cont)
rs_10_msbb_auprc = average_precision_score(y_test, y_pred_cont)
rs_10_msbb_f1 = f1_score(y_test, y_pred_bin)
rs_10_msbb_balanced_acc = balanced_accuracy_score(y_test, y_pred_bin)
# Sample count of the whole frame (train + test), not of the test set alone.
rs_10_msbb_num = len(df.index)

print(rs_10_msbb_auroc, rs_10_msbb_auprc, rs_10_msbb_f1, rs_10_msbb_balanced_acc, rs_10_msbb_num, sep='\n')

In [None]:
# Hold-out evaluation of an XGBoost classifier on rnaseq_22_msbb: every
# column except 'AD' is a feature, 'AD' is the label.
df = rnaseq_22_msbb.copy()

# Reproducible 70/30 split: sample the training rows, the rest (by index
# label) form the test set.
train = df.sample(frac=0.7, random_state=7)
test = df.drop(index=train.index)

x_train, y_train = train.drop(columns=['AD']), train[['AD']]
x_test, y_test = test.drop(columns=['AD']), test[['AD']]

model = xgb.XGBClassifier(objective='binary:logistic', random_state=7, n_jobs=-1)
model.fit(x_train, y_train)

# Hard labels feed F1 / balanced accuracy; the second predict_proba column
# feeds AUROC / AUPRC.
y_pred_bin = model.predict(x_test)
y_pred_cont = model.predict_proba(x_test)[:, 1]

rs_22_msbb_auroc = roc_auc_score(y_test, y_pred_cont)
rs_22_msbb_auprc = average_precision_score(y_test, y_pred_cont)
rs_22_msbb_f1 = f1_score(y_test, y_pred_bin)
rs_22_msbb_balanced_acc = balanced_accuracy_score(y_test, y_pred_bin)
# Sample count of the whole frame (train + test), not of the test set alone.
rs_22_msbb_num = len(df.index)

print(rs_22_msbb_auroc, rs_22_msbb_auprc, rs_22_msbb_f1, rs_22_msbb_balanced_acc, rs_22_msbb_num, sep='\n')

In [None]:
# Hold-out evaluation of an XGBoost classifier on rnaseq_36_msbb: every
# column except 'AD' is a feature, 'AD' is the label.
df = rnaseq_36_msbb.copy()

# Reproducible 70/30 split: sample the training rows, the rest (by index
# label) form the test set.
train = df.sample(frac=0.7, random_state=7)
test = df.drop(index=train.index)

x_train, y_train = train.drop(columns=['AD']), train[['AD']]
x_test, y_test = test.drop(columns=['AD']), test[['AD']]

model = xgb.XGBClassifier(objective='binary:logistic', random_state=7, n_jobs=-1)
model.fit(x_train, y_train)

# Hard labels feed F1 / balanced accuracy; the second predict_proba column
# feeds AUROC / AUPRC.
y_pred_bin = model.predict(x_test)
y_pred_cont = model.predict_proba(x_test)[:, 1]

rs_36_msbb_auroc = roc_auc_score(y_test, y_pred_cont)
rs_36_msbb_auprc = average_precision_score(y_test, y_pred_cont)
rs_36_msbb_f1 = f1_score(y_test, y_pred_bin)
rs_36_msbb_balanced_acc = balanced_accuracy_score(y_test, y_pred_bin)
# Sample count of the whole frame (train + test), not of the test set alone.
rs_36_msbb_num = len(df.index)

print(rs_36_msbb_auroc, rs_36_msbb_auprc, rs_36_msbb_f1, rs_36_msbb_balanced_acc, rs_36_msbb_num, sep='\n')

In [None]:
# Hold-out evaluation of an XGBoost classifier on rnaseq_44_msbb: every
# column except 'AD' is a feature, 'AD' is the label.
df = rnaseq_44_msbb.copy()

# Reproducible 70/30 split: sample the training rows, the rest (by index
# label) form the test set.
train = df.sample(frac=0.7, random_state=7)
test = df.drop(index=train.index)

x_train, y_train = train.drop(columns=['AD']), train[['AD']]
x_test, y_test = test.drop(columns=['AD']), test[['AD']]

model = xgb.XGBClassifier(objective='binary:logistic', random_state=7, n_jobs=-1)
model.fit(x_train, y_train)

# Hard labels feed F1 / balanced accuracy; the second predict_proba column
# feeds AUROC / AUPRC.
y_pred_bin = model.predict(x_test)
y_pred_cont = model.predict_proba(x_test)[:, 1]

rs_44_msbb_auroc = roc_auc_score(y_test, y_pred_cont)
rs_44_msbb_auprc = average_precision_score(y_test, y_pred_cont)
rs_44_msbb_f1 = f1_score(y_test, y_pred_bin)
rs_44_msbb_balanced_acc = balanced_accuracy_score(y_test, y_pred_bin)
# Sample count of the whole frame (train + test), not of the test set alone.
rs_44_msbb_num = len(df.index)

print(rs_44_msbb_auroc, rs_44_msbb_auprc, rs_44_msbb_f1, rs_44_msbb_balanced_acc, rs_44_msbb_num, sep='\n')

In [None]:
# Hold-out evaluation of an XGBoost classifier on methyl_rosmap: every
# column except 'AD' is a feature, 'AD' is the label.
df = methyl_rosmap.copy()

# Reproducible 70/30 split: sample the training rows, the rest (by index
# label) form the test set.
train = df.sample(frac=0.7, random_state=7)
test = df.drop(index=train.index)

x_train, y_train = train.drop(columns=['AD']), train[['AD']]
x_test, y_test = test.drop(columns=['AD']), test[['AD']]

model = xgb.XGBClassifier(objective='binary:logistic', random_state=7, n_jobs=-1)
model.fit(x_train, y_train)

# Hard labels feed F1 / balanced accuracy; the second predict_proba column
# feeds AUROC / AUPRC.
y_pred_bin = model.predict(x_test)
y_pred_cont = model.predict_proba(x_test)[:, 1]

methyl_rosmap_auroc = roc_auc_score(y_test, y_pred_cont)
methyl_rosmap_auprc = average_precision_score(y_test, y_pred_cont)
methyl_rosmap_f1 = f1_score(y_test, y_pred_bin)
methyl_rosmap_balanced_acc = balanced_accuracy_score(y_test, y_pred_bin)
# Sample count of the whole frame (train + test), not of the test set alone.
methyl_rosmap_num = len(df.index)

print(methyl_rosmap_auroc, methyl_rosmap_auprc, methyl_rosmap_f1, methyl_rosmap_balanced_acc, methyl_rosmap_num, sep='\n')

In [None]:
# Hold-out evaluation of an XGBoost classifier on methyl_msbb: every
# column except 'AD' is a feature, 'AD' is the label.
df = methyl_msbb.copy()

# Reproducible 70/30 split: sample the training rows, the rest (by index
# label) form the test set.
train = df.sample(frac=0.7, random_state=7)
test = df.drop(index=train.index)

x_train, y_train = train.drop(columns=['AD']), train[['AD']]
x_test, y_test = test.drop(columns=['AD']), test[['AD']]

model = xgb.XGBClassifier(objective='binary:logistic', random_state=7, n_jobs=-1)
model.fit(x_train, y_train)

# Hard labels feed F1 / balanced accuracy; the second predict_proba column
# feeds AUROC / AUPRC.
y_pred_bin = model.predict(x_test)
y_pred_cont = model.predict_proba(x_test)[:, 1]

methyl_msbb_auroc = roc_auc_score(y_test, y_pred_cont)
methyl_msbb_auprc = average_precision_score(y_test, y_pred_cont)
methyl_msbb_f1 = f1_score(y_test, y_pred_bin)
methyl_msbb_balanced_acc = balanced_accuracy_score(y_test, y_pred_bin)
# Sample count of the whole frame (train + test), not of the test set alone.
methyl_msbb_num = len(df.index)

print(methyl_msbb_auroc, methyl_msbb_auprc, methyl_msbb_f1, methyl_msbb_balanced_acc, methyl_msbb_num, sep='\n')

In [None]:
# Hold-out evaluation of an XGBoost classifier on somoscan_rosmap: every
# column except 'AD' is a feature, 'AD' is the label.
df = somoscan_rosmap.copy()

# Reproducible 70/30 split: sample the training rows, the rest (by index
# label) form the test set.
train = df.sample(frac=0.7, random_state=7)
test = df.drop(index=train.index)

x_train, y_train = train.drop(columns=['AD']), train[['AD']]
x_test, y_test = test.drop(columns=['AD']), test[['AD']]

model = xgb.XGBClassifier(objective='binary:logistic', random_state=7, n_jobs=-1)
model.fit(x_train, y_train)

# Hard labels feed F1 / balanced accuracy; the second predict_proba column
# feeds AUROC / AUPRC.
y_pred_bin = model.predict(x_test)
y_pred_cont = model.predict_proba(x_test)[:, 1]

somoscan_rosmap_auroc = roc_auc_score(y_test, y_pred_cont)
somoscan_rosmap_auprc = average_precision_score(y_test, y_pred_cont)
somoscan_rosmap_f1 = f1_score(y_test, y_pred_bin)
somoscan_rosmap_balanced_acc = balanced_accuracy_score(y_test, y_pred_bin)
# Sample count of the whole frame (train + test), not of the test set alone.
somoscan_rosmap_num = len(df.index)

print(somoscan_rosmap_auroc, somoscan_rosmap_auprc, somoscan_rosmap_f1, somoscan_rosmap_balanced_acc, somoscan_rosmap_num, sep='\n')

In [None]:
# Hold-out evaluation of an XGBoost classifier on tmt_msbb: every
# column except 'AD' is a feature, 'AD' is the label.
df = tmt_msbb.copy()

# Reproducible 70/30 split: sample the training rows, the rest (by index
# label) form the test set.
train = df.sample(frac=0.7, random_state=7)
test = df.drop(index=train.index)

x_train, y_train = train.drop(columns=['AD']), train[['AD']]
x_test, y_test = test.drop(columns=['AD']), test[['AD']]

model = xgb.XGBClassifier(objective='binary:logistic', random_state=7, n_jobs=-1)
model.fit(x_train, y_train)

# Hard labels feed F1 / balanced accuracy; the second predict_proba column
# feeds AUROC / AUPRC.
y_pred_bin = model.predict(x_test)
y_pred_cont = model.predict_proba(x_test)[:, 1]

tmt_msbb_auroc = roc_auc_score(y_test, y_pred_cont)
tmt_msbb_auprc = average_precision_score(y_test, y_pred_cont)
tmt_msbb_f1 = f1_score(y_test, y_pred_bin)
tmt_msbb_balanced_acc = balanced_accuracy_score(y_test, y_pred_bin)
# Sample count of the whole frame (train + test), not of the test set alone.
tmt_msbb_num = len(df.index)

print(tmt_msbb_auroc, tmt_msbb_auprc, tmt_msbb_f1, tmt_msbb_balanced_acc, tmt_msbb_num, sep='\n')

# extract best model metrics

In [None]:
# Read the four mean test metrics from the row labelled 0 of best_model.
# NOTE(review): assumes row 0 is the best-performing configuration —
# confirm the ordering of the CSV.
best_model_auroc, best_model_auprc, best_model_f1, best_model_balanced_acc = (
    best_model.loc[0, metric_column]
    for metric_column in (
        'TEST_AUROC_MEAN',
        'TEST_AUPRC_MEAN',
        'TEST_F1_MEAN',
        'TEST_BALANCED_ACCURACY_MEAN',
    )
)
# NOTE(review): hard-coded sample count; it is not derived from any table
# loaded in this notebook — verify against the run that produced best_model.
best_model_num = 17123

print(best_model_auroc, best_model_auprc, best_model_f1, best_model_balanced_acc, best_model_num, sep='\n')

# make combined df

In [None]:
# Summary table: one row per metric (plus the sample count), one column per
# model. The variable name (sic) is kept because the export cell reads it.
all_metics = pd.DataFrame({
    'METRIC': ['AUROC', 'AUPRC', 'F1_SCORE', 'BALANCED_ACCURACY', 'N_SAMPLES'],
    'BEST_PATHWAY_SCORE': [best_model_auroc, best_model_auprc, best_model_f1, best_model_balanced_acc, best_model_num],
    'GENE_SCORE_ADSP': [gs_adsp_auroc, gs_adsp_auprc, gs_adsp_f1, gs_adsp_balanced_acc, gs_adsp_num],
    'RNASEQ_ROSMAP': [rs_rosmap_auroc, rs_rosmap_auprc, rs_rosmap_f1, rs_rosmap_balanced_acc, rs_rosmap_num],
    'RNASEQ_BA10_MSBB': [rs_10_msbb_auroc, rs_10_msbb_auprc, rs_10_msbb_f1, rs_10_msbb_balanced_acc, rs_10_msbb_num],
    'RNASEQ_BA22_MSBB': [rs_22_msbb_auroc, rs_22_msbb_auprc, rs_22_msbb_f1, rs_22_msbb_balanced_acc, rs_22_msbb_num],
    'RNASEQ_BA36_MSBB': [rs_36_msbb_auroc, rs_36_msbb_auprc, rs_36_msbb_f1, rs_36_msbb_balanced_acc, rs_36_msbb_num],
    'RNASEQ_BA44_MSBB': [rs_44_msbb_auroc, rs_44_msbb_auprc, rs_44_msbb_f1, rs_44_msbb_balanced_acc, rs_44_msbb_num],
    'METHYL_ROSMAP': [methyl_rosmap_auroc, methyl_rosmap_auprc, methyl_rosmap_f1, methyl_rosmap_balanced_acc, methyl_rosmap_num],
    'METHYL_MSBB': [methyl_msbb_auroc, methyl_msbb_auprc, methyl_msbb_f1, methyl_msbb_balanced_acc, methyl_msbb_num],
    'SOMOSCAN_ROSMAP': [somoscan_rosmap_auroc, somoscan_rosmap_auprc, somoscan_rosmap_f1, somoscan_rosmap_balanced_acc, somoscan_rosmap_num],
    'TMT_MSBB': [tmt_msbb_auroc, tmt_msbb_auprc, tmt_msbb_f1, tmt_msbb_balanced_acc, tmt_msbb_num],
})
all_metics

# identify overlapping samples

## make id lists

In [None]:
# Row labels of gene_score_adsp (exposed by reset_index as the 'index'
# column), tagged with the data source they came from.
gs_adsp_unique_id = gene_score_adsp.reset_index()[['index']].assign(SOURCE='GS_ADSP')
gs_adsp_unique_id.head()

In [None]:
# Row labels of rnaseq_rosmap (exposed by reset_index as the 'index'
# column), tagged with the data source they came from.
rs_rosmap_unique_id = rnaseq_rosmap.reset_index()[['index']].assign(SOURCE='RS_ROSMAP')
rs_rosmap_unique_id.head()

In [None]:
# Row labels of rnaseq_10_msbb (exposed by reset_index as the 'index'
# column), tagged with the data source they came from.
rs_10_msbb_unique_id = rnaseq_10_msbb.reset_index()[['index']].assign(SOURCE='RS_10_MSBB')
rs_10_msbb_unique_id.head()

In [None]:
# Row labels of rnaseq_22_msbb (exposed by reset_index as the 'index'
# column), tagged with the data source they came from.
rs_22_msbb_unique_id = rnaseq_22_msbb.reset_index()[['index']].assign(SOURCE='RS_22_MSBB')
rs_22_msbb_unique_id.head()

In [None]:
# Row labels of rnaseq_36_msbb (exposed by reset_index as the 'index'
# column), tagged with the data source they came from.
rs_36_msbb_unique_id = rnaseq_36_msbb.reset_index()[['index']].assign(SOURCE='RS_36_MSBB')
rs_36_msbb_unique_id.head()

In [None]:
# Row labels of rnaseq_44_msbb (exposed by reset_index as the 'index'
# column), tagged with the data source they came from.
rs_44_msbb_unique_id = rnaseq_44_msbb.reset_index()[['index']].assign(SOURCE='RS_44_MSBB')
rs_44_msbb_unique_id.head()

In [None]:
# Row labels of methyl_rosmap (exposed by reset_index as the 'index'
# column), tagged with the data source they came from.
ma_rosmap_unique_id = methyl_rosmap.reset_index()[['index']].assign(SOURCE='MA_ROSMAP')
ma_rosmap_unique_id.head()

In [None]:
# Row labels of methyl_msbb (exposed by reset_index as the 'index'
# column), tagged with the data source they came from.
ma_msbb_unique_id = methyl_msbb.reset_index()[['index']].assign(SOURCE='MA_MSBB')
ma_msbb_unique_id.head()

In [None]:
# Row labels of somoscan_rosmap (exposed by reset_index as the 'index'
# column), tagged with the data source they came from.
sp_rosmap_unique_id = somoscan_rosmap.reset_index()[['index']].assign(SOURCE='SP_ROSMAP')
sp_rosmap_unique_id.head()

In [None]:
# Row labels of tmt_msbb (exposed by reset_index as the 'index'
# column), tagged with the data source they came from.
tp_msbb_unique_id = tmt_msbb.reset_index()[['index']].assign(SOURCE='TP_MSBB')
tp_msbb_unique_id.head()

## concatenate

In [None]:
# Stack the per-source ID lists row-wise, then report the total number of
# (ID, source) rows followed by the number of distinct IDs.
all_ids = pd.concat(
    [
        gs_adsp_unique_id,
        rs_rosmap_unique_id,
        rs_10_msbb_unique_id,
        rs_22_msbb_unique_id,
        rs_36_msbb_unique_id,
        rs_44_msbb_unique_id,
        ma_rosmap_unique_id,
        ma_msbb_unique_id,
        sp_rosmap_unique_id,
        tp_msbb_unique_id,
    ],
    axis=0,
)
print(len(all_ids))
print(len(pd.unique(all_ids['index'])))

## extract duplicate ids

In [None]:
# Keep every row whose ID occurs more than once (keep=False marks all
# occurrences), then report row count, distinct-ID count and the per-source
# breakdown of those rows.
dup_ids = all_ids.loc[all_ids['index'].duplicated(keep=False)]
print(len(dup_ids))
print(len(pd.unique(dup_ids['index'])))
dup_ids['SOURCE'].value_counts(dropna=False)

# export

In [None]:
# Write the metric summary table to CSV without the row index.
# `index` is documented as a bool, so pass False explicitly rather than
# relying on None being treated as falsy.
all_metics.to_csv(
    'ML/statistical_models/indiv_omics/AD.all_omics.standard_scaled.model_prediction.csv',
    index=False,
)