In [9]:
## find all parents of GO:0010898, and the lineage all the way to the root 
# load a sif network, where showing child-parent relationship
import pandas as pd

def load_ont(filename):
    data = []
    with open(filename, 'r') as f:
        for line in f:
            columns = line.strip().split('\t')  # tab-separated values
            if len(columns) >= 3:
                parent = columns[0] # based on collapsed_go.symbol, the first column is parent
                child = columns[1] # second column is child
                interaction = columns[2] # third column is interaction type
                if interaction == 'default':
                    data.append({'child': child, 'parent': parent}) # Add child and parent to a dataframe
    df = pd.DataFrame(data)
    return df
             

def get_ancestors(child_node, edges):
    ancestors = []
    # Find the parents of the current node. In the 'edges' DataFrame, 
    # this is done by looking up rows where the 'child' column matches 
    # the current node, and then getting the corresponding values in the 'parent' column.
    parents = edges.loc[edges['child'] == child_node, 'parent'].values

    for parent in parents:
        if parent not in ancestors:  # Check if the parent is already in the ancestors list
            ancestors.append(parent)  # If not, add it to the list
            ancestors.extend(get_ancestors(parent, edges))  # Recursively find the parent's ancestors

    return ancestors


if __name__ == '__main__':
    GO_ID = 'GO:0019433'
    go_network = load_ont('./data/GO_BP/collapsed_go.symbol')
    ancestors = get_ancestors(GO_ID, go_network)
    print(ancestors)
    print(len(ancestors))
    filtered_network = go_network[(go_network['child'].isin(ancestors+[GO_ID]))]
    GO_ID_name = GO_ID.replace(':', '_')
    filtered_network.to_csv(f'./data/GO_term_analysis/{GO_ID_name}_subhierarchy.txt', sep= '\t', index=False)


['GO:0006641', 'GO:0006639', 'GO:0006638', 'GO:0044255', 'GO:0044237', 'GO:0009987', 'GO:0008150', 'GO:0008152', 'GO:0008150', 'GO:0006629', 'GO:0044238', 'GO:0008152', 'GO:0008150', 'GO:0071704', 'GO:0008152', 'GO:0008150', 'GO:0046486', 'GO:0044255', 'GO:0044237', 'GO:0009987', 'GO:0008150', 'GO:0008152', 'GO:0008150', 'GO:0006629', 'GO:0044238', 'GO:0008152', 'GO:0008150', 'GO:0071704', 'GO:0008152', 'GO:0008150', 'GO:0046464', 'GO:0006638', 'GO:0044255', 'GO:0044237', 'GO:0009987', 'GO:0008150', 'GO:0008152', 'GO:0008150', 'GO:0006629', 'GO:0044238', 'GO:0008152', 'GO:0008150', 'GO:0071704', 'GO:0008152', 'GO:0008150', 'GO:0006639', 'GO:0006638', 'GO:0044255', 'GO:0044237', 'GO:0009987', 'GO:0008150', 'GO:0008152', 'GO:0008150', 'GO:0006629', 'GO:0044238', 'GO:0008152', 'GO:0008150', 'GO:0071704', 'GO:0008152', 'GO:0008150', 'GO:0046486', 'GO:0044255', 'GO:0044237', 'GO:0009987', 'GO:0008150', 'GO:0008152', 'GO:0008150', 'GO:0006629', 'GO:0044238', 'GO:0008152', 'GO:0008150', 'GO:0

In [10]:
print(len(filtered_network), len(set(ancestors)))

33 19


In [11]:
# make a nodes attribute table for all nodes in the subnetwork
GO_ID = 'GO:0019433'
GO_ID_name = GO_ID.replace(':', '_')
# load the GO term annotation file
import pandas as pd
df_csv = pd.read_csv('./data/go_terms.csv', index_col=0)

terms_in_subnetwork = set(filtered_network['child'].tolist() + filtered_network['parent'].tolist())

sub_nodes = df_csv.loc[df_csv['GO'].isin(terms_in_subnetwork), :]

# sub_nodes.head()
print(sub_nodes.shape)

sub_nodes.to_csv(f'./data/GO_term_analysis/{GO_ID_name}_subhierarchy_nodes.txt', sep='\t', index=False)

(20, 4)


In [12]:
sub_nodes.head()

Unnamed: 0,GO,Genes,Gene_Count,Term_Description
547,GO:0006639,PNPLA3 APOA2 FGF21 LPGAT1 CYP2E1 INSIG2 FITM2 ...,130,acylglycerol metabolic process
1220,GO:0071704,ALG14 PTPN1 FIGNL1 SNRPB OCRL USP16 CUL1 MYLK2...,10633,organic substance metabolic process
1327,GO:0019433,PNPLA3 FGF21 PNPLA4 PNPLA1 PLIN5 CPS1 APOA4 PN...,31,triglyceride catabolic process
1463,GO:0006629,ALG14 ACOT7 SERINC2 MBOAT1 OCRL LEPR LPIN1 PTP...,1396,lipid metabolic process
2025,GO:0046486,ABHD8 INPP5J SERINC2 CYP2E1 MBOAT1 OCRL RAB38 ...,396,glycerolipid metabolic process
