# load packages

In [None]:
import pandas as pd

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt

In [None]:
import numpy as np

In [None]:
import sys

# read in input files

In [None]:
# Pathway-by-pathway similarity matrix; the first CSV column is used as the row index.
_go_matrix_file = 'pathway_score/pathway_annotation/go/go_pathways.GRCh38.113.refseq.exp_validated.jaccard_similarity_matrix.csv'
go = pd.read_csv(_go_matrix_file, index_col=0)
print(go.shape)
go.head()

In [None]:
# Table of known pathways (later cells read its GO_ID column).
known_path = pd.read_csv(filepath_or_buffer='go_ad_pathways.csv')
print(known_path.shape[0])
known_path.head()

In [None]:
# Tab-separated gene-to-pathway mapping.
# NOTE(review): the name `map` shadows the Python builtin; it is kept because later cells use it.
_mapping_file = 'pathway_score/pathway_annotation/go/AD_KMI.ADSP.ROSMAP.all_omics.MSBB.all_omics.GRCh38.113.refseq.exp_validated.go.gene_to_pathway.no_duplicates.pathway_mapping.txt'
map = pd.read_csv(_mapping_file, sep='\t')
print(map.shape[0])
print(map['PATHWAY_ID'].nunique(dropna=False))
map.head()

# extract pathway sources

## extract

In [None]:
# Build a PATHWAY_ID -> SOURCE (namespace) table from the OBO file.
#
# A namespace line is attached to the most recent id that has no namespace yet.
# A stanza without a namespace therefore gets a missing SOURCE instead of
# shifting every later id/namespace pairing (or raising on unequal list lengths).
_ID_TAG = 'id: '
_NAMESPACE_TAG = 'namespace: '

pathway_ids = []
sources = []

with open('pathway_score/pathway_annotation/raw_databases/go-basic.obo.1', "r") as f:
    for line in f:
        if line.startswith(_ID_TAG):
            # the previous id never received a namespace: record it as missing
            if len(sources) < len(pathway_ids):
                sources.append(None)
            # strip only the leading tag and the line break
            pathway_ids.append(line[len(_ID_TAG):].replace('\n', ''))
        elif line.startswith(_NAMESPACE_TAG) and len(sources) < len(pathway_ids):
            sources.append(line[len(_NAMESPACE_TAG):].replace('\n', ''))

# the last stanza may also lack a namespace
if len(sources) < len(pathway_ids):
    sources.append(None)

go_source = pd.DataFrame({'PATHWAY_ID' : pathway_ids, 'SOURCE' : sources})
print(len(go_source.index))
print(len(go_source['PATHWAY_ID'].unique()))
print(go_source['SOURCE'].unique())
go_source.head()

## separate into different pathway types

In [None]:
# Pathways whose namespace is biological_process.
go_bio_path = go_source.loc[go_source['SOURCE'] == 'biological_process']
print(go_bio_path.shape[0])

In [None]:
# Pathways whose namespace is molecular_function.
go_mole_path = go_source.loc[go_source['SOURCE'] == 'molecular_function']
print(go_mole_path.shape[0])

In [None]:
# Pathways whose namespace is cellular_component.
go_cell_path = go_source.loc[go_source['SOURCE'] == 'cellular_component']
print(go_cell_path.shape[0])

In [None]:
# Entries whose namespace is external.
go_ex_path = go_source.loc[go_source['SOURCE'] == 'external']
print(go_ex_path.shape[0])

# convert to long form

In [None]:
# Reshape the square matrix into one row per (PATHWAY_1, PATHWAY_2) pair.
go_long = pd.melt(
    go.reset_index(),
    id_vars='PATHWAY_ID',
    var_name="PATHWAY_2",
    value_name="jaccard_similarity",
).rename(columns={'PATHWAY_ID': 'PATHWAY_1'})
print(go_long.shape)
go_long.head()

# remove self-pairs (rows where both pathways are the same)

In [None]:
# Keep only rows that pair two different pathways.
_is_distinct_pair = go_long['PATHWAY_1'].ne(go_long['PATHWAY_2'])
go_clean = go_long.loc[_is_distinct_pair]
print(go_clean.shape[0])
go_clean.head()

# filter to rows with jaccard similarity > 0

In [None]:
# Pairs with a strictly positive similarity.
go_positive = go_clean.loc[go_clean['jaccard_similarity'].gt(0)]
print(go_positive.shape[0])

# sensitivity analysis - different pathway data types

In [None]:
# One row per distinct PATHWAY_1 value (first occurrence kept).
_first_seen = ~go_clean['PATHWAY_1'].duplicated()
unique_path = go_clean.loc[_first_seen, ['PATHWAY_1']]
print(unique_path.shape[0])
print(unique_path['PATHWAY_1'].nunique(dropna=False))

In [None]:
# Split the distinct pathways by GO namespace and report the size of each subset.
_unique_ids = unique_path['PATHWAY_1']

ad_go_bio = unique_path.loc[_unique_ids.isin(go_bio_path['PATHWAY_ID'])]
print(ad_go_bio.shape[0])

ad_go_mole = unique_path.loc[_unique_ids.isin(go_mole_path['PATHWAY_ID'])]
print(ad_go_mole.shape[0])

ad_go_cell = unique_path.loc[_unique_ids.isin(go_cell_path['PATHWAY_ID'])]
print(ad_go_cell.shape[0])

ad_go_ex = unique_path.loc[_unique_ids.isin(go_ex_path['PATHWAY_ID'])]
print(ad_go_ex.shape[0])

In [None]:
# Number of pathways in each namespace subset that appear in the known-pathway list.
for _subset in (ad_go_bio, ad_go_mole, ad_go_cell):
    _in_known = _subset['PATHWAY_1'].isin(known_path['GO_ID'])
    print(int(_in_known.sum()))

In [None]:
# Number of pathways in each namespace subset that appear in the gene-to-pathway mapping.
for _subset in (ad_go_bio, ad_go_mole, ad_go_cell):
    _in_mapping = _subset['PATHWAY_1'].isin(map['PATHWAY_ID'])
    print(int(_in_mapping.sum()))

# do some sensitivity analyses for pathway similarity

In [None]:
# Distinct PATHWAY_1 count overall, then at each similarity cutoff from 0.9 down to 0.1.
print(go_clean['PATHWAY_1'].nunique(dropna=False))
for _cutoff in (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1):
    _at_or_above = go_clean['jaccard_similarity'] >= _cutoff
    print(go_clean.loc[_at_or_above, 'PATHWAY_1'].nunique(dropna=False))

In [None]:
# Restrict to pairs where both pathways are biological_process, then repeat the cutoff sweep.
_bio_ids = go_bio_path['PATHWAY_ID']
go_clean_bio = go_clean.loc[go_clean['PATHWAY_1'].isin(_bio_ids) & go_clean['PATHWAY_2'].isin(_bio_ids)]
print(go_clean_bio['PATHWAY_1'].nunique(dropna=False))
for _cutoff in (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1):
    _at_or_above = go_clean_bio['jaccard_similarity'] >= _cutoff
    print(go_clean_bio.loc[_at_or_above, 'PATHWAY_1'].nunique(dropna=False))

In [None]:
# Restrict to pairs where both pathways are molecular_function, then repeat the cutoff sweep.
_mole_ids = go_mole_path['PATHWAY_ID']
go_clean_mole = go_clean.loc[go_clean['PATHWAY_1'].isin(_mole_ids) & go_clean['PATHWAY_2'].isin(_mole_ids)]
print(go_clean_mole['PATHWAY_1'].nunique(dropna=False))
for _cutoff in (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1):
    _at_or_above = go_clean_mole['jaccard_similarity'] >= _cutoff
    print(go_clean_mole.loc[_at_or_above, 'PATHWAY_1'].nunique(dropna=False))

In [None]:
# Restrict to pairs where both pathways are cellular_component, then repeat the cutoff sweep.
_cell_ids = go_cell_path['PATHWAY_ID']
go_clean_cell = go_clean.loc[go_clean['PATHWAY_1'].isin(_cell_ids) & go_clean['PATHWAY_2'].isin(_cell_ids)]
print(go_clean_cell['PATHWAY_1'].nunique(dropna=False))
for _cutoff in (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1):
    _at_or_above = go_clean_cell['jaccard_similarity'] >= _cutoff
    print(go_clean_cell.loc[_at_or_above, 'PATHWAY_1'].nunique(dropna=False))

# filter to pathways with similarity >= 80%

In [None]:
# Pairs with similarity of at least 0.8, with row and distinct-pathway counts.
go_similar = go_clean.loc[go_clean['jaccard_similarity'].ge(0.8)]
print(go_similar.shape[0])
for _side in ('PATHWAY_1', 'PATHWAY_2'):
    print(go_similar[_side].nunique(dropna=False))
# NOTE(review): 4072 is hard-coded, presumably a count copied from an earlier run - confirm
print(4072 * 2)
go_similar['jaccard_similarity'].describe().map('{:,.2f}'.format)

# filter to pathways with >= 70% similarity

In [None]:
# Pairs with similarity of at least 0.7, with row and distinct-pathway counts.
go_similar = go_clean.loc[go_clean['jaccard_similarity'].ge(0.7)]
print(go_similar.shape[0])
for _side in ('PATHWAY_1', 'PATHWAY_2'):
    print(go_similar[_side].nunique(dropna=False))
# NOTE(review): 4498 is hard-coded, presumably a count copied from an earlier run - confirm
print(4498 * 2)
go_similar['jaccard_similarity'].describe().map('{:,.2f}'.format)

# filter to go bio pathways with >= 80% similarity

In [None]:
# Pairs with similarity of at least 0.8 where both pathways are biological_process.
_bio_ids = go_bio_path['PATHWAY_ID']
_keep_pair = (
    go_clean['jaccard_similarity'].ge(0.8)
    & go_clean['PATHWAY_1'].isin(_bio_ids)
    & go_clean['PATHWAY_2'].isin(_bio_ids)
)
go_similar = go_clean.loc[_keep_pair]
print(go_similar.shape[0])
for _side in ('PATHWAY_1', 'PATHWAY_2'):
    print(go_similar[_side].nunique(dropna=False))
# NOTE(review): 4072 is hard-coded, presumably a count copied from an earlier run - confirm
print(4072 * 2)
go_similar['jaccard_similarity'].describe().map('{:,.2f}'.format)

# remove duplicate pairs

In [None]:
# Collapse (A, B) and (B, A) into a single row by keying each row on the
# order-independent pair of pathway ids and keeping the first occurrence.
#
# go_similar is a filtered slice of go_clean, so work on an explicit copy;
# assigning a column on the slice triggers SettingWithCopyWarning.
go_similar = go_similar.copy()
# building the keys with zip avoids the slow row-wise apply and also works on an empty frame
go_similar['pair'] = [
    tuple(sorted(ids))
    for ids in zip(go_similar['PATHWAY_1'], go_similar['PATHWAY_2'])
]
go_similar_unique = go_similar.drop_duplicates(subset = 'pair').drop(columns = 'pair')
print(len(go_similar_unique.index))
print(len(go_similar_unique['PATHWAY_1'].unique()))
print(len(go_similar_unique['PATHWAY_2'].unique()))
go_similar_unique.head()

# see how many overlap w known pathways and different go sources

## known AD paths

In [None]:
# Distinct known pathways on each side of the de-duplicated pairs.
for _side in ('PATHWAY_1', 'PATHWAY_2'):
    _ids = go_similar_unique[_side]
    print(_ids[_ids.isin(known_path['GO_ID'])].nunique(dropna=False))

## go bio

In [None]:
# Distinct biological_process pathways on each side of the de-duplicated pairs.
for _side in ('PATHWAY_1', 'PATHWAY_2'):
    _ids = go_similar_unique[_side]
    print(_ids[_ids.isin(go_bio_path['PATHWAY_ID'])].nunique(dropna=False))

## go molecular

In [None]:
# Distinct molecular_function pathways on each side of the de-duplicated pairs.
for _side in ('PATHWAY_1', 'PATHWAY_2'):
    _ids = go_similar_unique[_side]
    print(_ids[_ids.isin(go_mole_path['PATHWAY_ID'])].nunique(dropna=False))

## go cell

In [None]:
# Distinct cellular_component pathways on each side of the de-duplicated pairs.
for _side in ('PATHWAY_1', 'PATHWAY_2'):
    _ids = go_similar_unique[_side]
    print(_ids[_ids.isin(go_cell_path['PATHWAY_ID'])].nunique(dropna=False))

## external

In [None]:
# Distinct external-namespace entries on each side of the de-duplicated pairs.
for _side in ('PATHWAY_1', 'PATHWAY_2'):
    _ids = go_similar_unique[_side]
    print(_ids[_ids.isin(go_ex_path['PATHWAY_ID'])].nunique(dropna=False))

# add pathway source to file

In [None]:
# Attach the namespace of each pathway in a pair via two left joins against go_source.
# The row count is printed before and after each join so accidental row duplication is visible.
print(go_similar_unique.shape[0])

_first_source = go_source.rename(columns={'PATHWAY_ID': 'PATHWAY_1', 'SOURCE': 'PATHWAY_1_SOURCE'})
go_similar_source = go_similar_unique.merge(_first_source, how='left', on='PATHWAY_1')
print(go_similar_source.shape[0])

_second_source = go_source.rename(columns={'PATHWAY_ID': 'PATHWAY_2', 'SOURCE': 'PATHWAY_2_SOURCE'})
go_similar_source = go_similar_source.merge(_second_source, how='left', on='PATHWAY_2')
print(go_similar_source.shape[0])

for _source_col in ('PATHWAY_1_SOURCE', 'PATHWAY_2_SOURCE'):
    print(go_similar_source[_source_col].unique())
go_similar_source.head()

# add known path to file

In [None]:
# Distinct known GO ids, each flagged with KNOWN_PATH = True for the joins that follow.
known_path_sub = known_path[['GO_ID']].drop_duplicates().assign(KNOWN_PATH=True)
print(known_path_sub.shape[0])
print(known_path_sub['GO_ID'].nunique(dropna=False))
known_path_sub.head()

In [None]:
# Flag, for each side of a pair, whether the pathway is in the known-pathway list.
# The row count is printed before and after each left join so accidental row
# duplication is visible.
print(len(go_similar_source.index))

_known_first = known_path_sub.rename(columns = {'GO_ID' : 'PATHWAY_1', 'KNOWN_PATH' : 'PATHWAY_1_KNOWN_PATH'})
go_similar_known = go_similar_source.merge(_known_first, on = 'PATHWAY_1', how = 'left')
print(len(go_similar_known.index))

_known_second = known_path_sub.rename(columns = {'GO_ID' : 'PATHWAY_2', 'KNOWN_PATH' : 'PATHWAY_2_KNOWN_PATH'})
go_similar_known = go_similar_known.merge(_known_second, on = 'PATHWAY_2', how = 'left')
print(len(go_similar_known.index))

# Unmatched rows are NaN after the left join. Comparing against True yields a
# plain boolean column without relying on fillna's deprecated silent
# downcasting of object columns.
go_similar_known['PATHWAY_1_KNOWN_PATH'] = go_similar_known['PATHWAY_1_KNOWN_PATH'].eq(True)
go_similar_known['PATHWAY_2_KNOWN_PATH'] = go_similar_known['PATHWAY_2_KNOWN_PATH'].eq(True)
go_similar_known.head()

In [None]:
# Distinct pathway ids on each side of the annotated pairs.
for _side in ('PATHWAY_1', 'PATHWAY_2'):
    print(go_similar_known[_side].nunique(dropna=False))

# investigate missing sources

## check out numbers

In [None]:
# Rows with no namespace for pathway 1, for pathway 2, and for both.
_first_missing = go_similar_known['PATHWAY_1_SOURCE'].isna()
_second_missing = go_similar_known['PATHWAY_2_SOURCE'].isna()
print(int(_first_missing.sum()))
print(int(_second_missing.sum()))
print(int((_first_missing & _second_missing).sum()))

## make list of unique pathways to look up

In [None]:
# Start with an empty collection of pathway ids that have no namespace.
missing_sources = list()

In [None]:
# Append the distinct PATHWAY_1 ids whose namespace is missing.
_first_without_source = go_similar_known.loc[go_similar_known['PATHWAY_1_SOURCE'].isna(), 'PATHWAY_1']
missing_sources = [*missing_sources, *_first_without_source.unique().tolist()]

In [None]:
# Append the distinct PATHWAY_2 ids whose namespace is missing.
_second_without_source = go_similar_known.loc[go_similar_known['PATHWAY_2_SOURCE'].isna(), 'PATHWAY_2']
missing_sources = [*missing_sources, *_second_without_source.unique().tolist()]

In [None]:
# Drop repeated ids (the resulting order is arbitrary) and show what is left to look up.
missing_sources = [*set(missing_sources)]
print(len(missing_sources))
missing_sources

# select pathways to prioritize
- known > bio > mole > cell

In [None]:
# Pick one representative pathway for every similar pair.
#
# Preference order within a pair:
#   1. a known pathway over an unknown one;
#   2. otherwise biological_process > molecular_function > cellular_component;
#   3. on a tie, PATHWAY_1.
# A pathway is recorded as SELECTED_PATHWAY only the first time it is preferred.
# Later rows that prefer an already-selected pathway get NaN.
#
# Outputs consumed by later cells:
#   new_rows          - one Series per input row, extended with SELECTED_PATHWAY
#   selected_pathways - the SELECTED_PATHWAY value of every row, in row order

# lower rank = higher priority; any other source (including missing) is unranked
_SOURCE_RANK = {'biological_process': 0, 'molecular_function': 1, 'cellular_component': 2}
_UNRANKED = len(_SOURCE_RANK)


def _preferred_pathway(row):
    """Return the pathway id of the pair in *row* that has priority.

    Exits via ``sys.exit`` (after printing the row) when the known-status does
    not decide and neither pathway has a ranked source.
    """
    first_known = row['PATHWAY_1_KNOWN_PATH']
    second_known = row['PATHWAY_2_KNOWN_PATH']

    # explicit comparisons: only literal True/False flags decide here,
    # anything else (e.g. NaN) falls through to the source ranking
    if (first_known == True) and (second_known == False):
        return row['PATHWAY_1']
    if (second_known == True) and (first_known == False):
        return row['PATHWAY_2']

    first_rank = _SOURCE_RANK.get(row['PATHWAY_1_SOURCE'], _UNRANKED)
    second_rank = _SOURCE_RANK.get(row['PATHWAY_2_SOURCE'], _UNRANKED)
    if first_rank == _UNRANKED and second_rank == _UNRANKED:
        print(row)
        sys.exit('error with cellular component statements')

    # ties go to PATHWAY_1
    return row['PATHWAY_1'] if first_rank <= second_rank else row['PATHWAY_2']


new_rows = []
selected_pathways = []
# set mirror of the selected ids for constant-time membership tests
_already_selected = set()

for _, row in go_similar_known.iterrows():
    candidate = _preferred_pathway(row)
    if candidate in _already_selected:
        row['SELECTED_PATHWAY'] = np.nan
    else:
        row['SELECTED_PATHWAY'] = candidate
        _already_selected.add(candidate)

    # append to lists
    selected_pathways.append(row['SELECTED_PATHWAY'])
    new_rows.append(row)

In [None]:
# Assemble the per-row results into one frame and report row and distinct-selection counts.
go_selected = pd.DataFrame(data=new_rows)
print(go_selected.shape[0])
print(go_selected['SELECTED_PATHWAY'].nunique(dropna=False))
go_selected

In [None]:
# SELECTED_PATHWAY values that occur on more than one row.
_selection = go_selected['SELECTED_PATHWAY']
_selection[_selection.duplicated(keep=False)].unique()

# filter map

## remove pathways w any similarity

In [None]:
# Keep mapping rows whose pathway is biological_process and appears on neither side of any similar pair.
_mapped_ids = map['PATHWAY_ID']
print(map.shape[0])

_keep_row = ~_mapped_ids.isin(go_selected['PATHWAY_1']) & _mapped_ids.isin(go_bio_path['PATHWAY_ID'])
print(int(_keep_row.sum()))

_keep_row = _keep_row & ~_mapped_ids.isin(go_selected['PATHWAY_2'])
map_no_similar = map.loc[_keep_row]
print(map_no_similar.shape[0])

print(_mapped_ids.nunique(dropna=False))
print(map_no_similar['PATHWAY_ID'].nunique(dropna=False))

## add those back in

In [None]:
# Mapping rows for the pathways chosen as representatives.
print(map.shape[0])
map_similar = map.loc[map['PATHWAY_ID'].isin(go_selected['SELECTED_PATHWAY'])]
print(map_similar.shape[0])
print(map['PATHWAY_ID'].nunique(dropna=False))
print(map_similar['PATHWAY_ID'].nunique(dropna=False))

In [None]:
# Stack the representative rows on top of the rows that had no similar partner.
_mapping_parts = [map_similar, map_no_similar]
map_fixed = pd.concat(_mapping_parts, axis=0)
print(map_fixed.shape[0])
print(map_fixed['PATHWAY_ID'].nunique(dropna=False))

## filter map to go bio pathways
- when not using a jaccard similarity threshold

In [None]:
# Alternative map_fixed: every biological_process row of the mapping.
# NOTE(review): this overwrites any map_fixed built by the concat cell - presumably only one of the two is run.
print(map.shape[0])
map_fixed = map.loc[map['PATHWAY_ID'].isin(go_bio_path['PATHWAY_ID'])]
print(map_fixed.shape[0])
print(map['PATHWAY_ID'].nunique(dropna=False))
print(map_fixed['PATHWAY_ID'].nunique(dropna=False))

## remove pathways w no reference genes

In [None]:
# Keep only mapping rows whose pathway id appears in the row index of the similarity matrix.
_in_matrix = map_fixed['PATHWAY_ID'].isin(go.index)
map_ref = map_fixed.loc[_in_matrix]
print(map_fixed['PATHWAY_ID'].nunique(dropna=False))
print(map_ref['PATHWAY_ID'].nunique(dropna=False))
print(map_ref['GENE'].nunique(dropna=False))
map_ref.head()

# export

## with jaccard similarity threshold

In [None]:
# Write the current map_ref as a tab-separated file without the index.
_export_file = 'pathway_score/pathway_annotation/go/AD_KMI.ADSP.ROSMAP.all_omics.MSBB.all_omics.GRCh38.113.refseq.exp_validated.jaccard_similarity_less_70.go.gene_to_pathway.no_duplicates.pathway_mapping.txt'
map_ref.to_csv(path_or_buf=_export_file, sep='\t', index=None)

## go bio, no jaccard similarity threshold

In [None]:
# Write the current map_ref as a tab-separated file without the index.
_export_file = 'pathway_score/pathway_annotation/go/AD_KMI.ADSP.ROSMAP.all_omics.MSBB.all_omics.GRCh38.113.refseq.exp_validated.go_bio.gene_to_pathway.no_duplicates.pathway_mapping.txt'
map_ref.to_csv(path_or_buf=_export_file, sep='\t', index=None)

## go bio, jaccard similarity threshold

In [None]:
# Write the current map_ref as a tab-separated file without the index.
_export_file = 'pathway_score/pathway_annotation/go/AD_KMI.ADSP.ROSMAP.all_omics.MSBB.all_omics.GRCh38.113.refseq.exp_validated.jaccard_similarity_less_80.go_bio.gene_to_pathway.no_duplicates.pathway_mapping.txt'
map_ref.to_csv(path_or_buf=_export_file, sep='\t', index=None)