## Creating Datasets from a SARS Hierarchy and interactome.
- The goal is to generate datasets in various formats based on the assemblies in a hierarchical model.
- This involves:
    - filtering the assemblies on assembly names, min size, and max size
    - filtering the data by columns and by row values
    - changing column names
    - generating/adding experiment, assembly, and content descriptions
    - cleaning the data, such as non-numeric values
    - limiting the precision of numeric values
    - optionally saving the datasets to the database
    - optionally adding interaction data
    - optionally adding information from other sources, such as genecards
- The Dengue data is on the STRING+diffusion based interactome
- We can also get the data directly from the dengue_with_uniprot.csv
- Laura Martin-Sancho is most interested in the assemblies listed in the interesting_dengue_communities.xlsx spreadsheet

In [12]:
import sys
import os

# Make the parent of the current working directory importable so that the
# project packages imported below (models, services, app) can be resolved.
cwd = os.getcwd()
dirname = os.path.dirname(cwd)
print(cwd)
print(dirname)
# Re-running this cell used to append the same directory again each time
# (the path ended up listed several times); only add it when it is missing.
if dirname not in sys.path:
    sys.path.append(dirname)

print(sys.path)

from models.analysis_plan import AnalysisPlan
from services.analysisrunner import AnalysisRunner
from models.review_plan import ReviewPlan
from services.reviewrunner import ReviewRunner
from app.sqlite_database import SqliteDatabase
from app.config import load_database_config

# Read the database connection settings from the default configuration.
# A specific file can be selected instead, e.g.
#   load_database_config(path='~/ae_config/test_config.ini')
# The call yields (db_type, uri, user, password); only the URI is used here.
_db_type, database_uri, _user, _password = load_database_config()
db = SqliteDatabase(database_uri)

/Users/idekeradmin/Dropbox/GitHub/agent_evaluation/notebooks
/Users/idekeradmin/Dropbox/GitHub/agent_evaluation
['/opt/anaconda3/envs/ae2/lib/python311.zip', '/opt/anaconda3/envs/ae2/lib/python3.11', '/opt/anaconda3/envs/ae2/lib/python3.11/lib-dynload', '', '/Users/idekeradmin/.local/lib/python3.11/site-packages', '/opt/anaconda3/envs/ae2/lib/python3.11/site-packages', '/Users/idekeradmin/Dropbox/GitHub/agent_evaluation', '/Users/idekeradmin/Dropbox/GitHub/agent_evaluation', '/Users/idekeradmin/Dropbox/GitHub/agent_evaluation']


## Get the model and the interactome from NDEx in CX2

In [13]:
from models.hierarchy import Hierarchy
import json
import ndex2 
from ndex2.cx2 import RawCX2NetworkFactory

# NDEx UUIDs of the two networks used in this notebook.
# Interactome: "SARS Interactome from STRING diffused from ISG hits".
SARS_INTERACTOME_UUID = '2cc33901-5f18-11ef-a7fd-005056ae23aa'
# Hierarchy: "SARS Model from STRING diffused from ISG hits".
# NOTE(review): an earlier note paired this title with
# 3a3457fe-5e78-11ef-a7fd-005056ae23aa; the download has always used the
# UUID below -- confirm which one is intended.
SARS_HIERARCHY_UUID = 'bf7defc5-5f18-11ef-a7fd-005056ae23aa'

# Other networks looked at previously, kept for reference only:
#   BioGRID: Protein-Protein Interactions (SARS-CoV)
#     https://www.ndexbio.org/viewer/networks/669f30a3-cee6-11ea-aaef-0ac135e8bacf
#   3a3457fe-5e78-11ef-a7fd-005056ae23aa -- SARS interactome network found in
#     an old session file; it is not clear what its annotations mean.

# NDEx client plus the factory that turns parsed CX2 JSON into CX2Network objects.
client = ndex2.client.Ndex2()
factory = RawCX2NetworkFactory()

# Fetch the interactome and convert it to a CX2Network object.
client_resp = client.get_network_as_cx2_stream(SARS_INTERACTOME_UUID)
interactome = factory.get_cx2network(json.loads(client_resp.content))

# Fetch the hierarchy and convert it to a CX2Network object.
client_resp = client.get_network_as_cx2_stream(SARS_HIERARCHY_UUID)
hierarchy = factory.get_cx2network(json.loads(client_resp.content))

# Summarise the hierarchy network: its name, node count and edge count.
# f-strings are used so that a missing (None) name is printed rather than
# raising on string concatenation.
print(f'Name: {hierarchy.get_name()}')
print(f'Number of nodes: {len(hierarchy.get_nodes())}')
# The edge count was previously printed under the label "Number of nodes".
print(f'Number of edges: {len(hierarchy.get_edges())}')

# Same summary for the interactome network.
print(f'Name: {interactome.get_name()}')
print(f'Number of nodes: {len(interactome.get_nodes())}')
print(f'Number of edges: {len(interactome.get_edges())}')

# Short description of the dataset columns. It is kept brief on purpose so
# that it helps keep the context small for fast operation.
brief_sars_dataset_description = """
The dataset includes the following for genes/proteins: 

"binds_sars_protein": yes = One or more SARS virus proteins bind the human protein,
"Inhibits_SARS": yes = cDNA overexpression inhibits SARS replication,


"""
      
# Attach the description to the hierarchy network under the
# "experiment_description" attribute.
hierarchy.add_network_attribute("experiment_description", brief_sars_dataset_description)

# Build the project's Hierarchy model from the hierarchy network and its interactome.
sars_hierarchy = Hierarchy(hierarchy, interactome)

# Last expression of the cell, so the notebook displays the object's repr.
sars_hierarchy

Name: hidefv1.1beta_(none)_STRING v12.0: Human Protein Links - High Confidence (Score >= 0.7)(3)
Number of nodes: 186
Number of nodes: 218
Name: STRING v12.0: Human Protein Links - High Confidence (Score >= 0.7)(3)
Number of nodes: 993
Number of nodes: 7453


<models.hierarchy.Hierarchy at 0x129c61b90>

In [14]:
from models.hierarchy import dataset_from_assembly
import models.dataset

# Interactome attribute name -> column name used in the assembly data.
# Only "name" is actually renamed (to "Gene Symbol"); the two SARS columns
# keep their names.
sars_column_name_mapping = {"name": "Gene Symbol",
                            "binds_sars_protein": "binds_sars_protein",
                            "Inhibits_SARS": "Inhibits_SARS"}

# Restrict the work to assemblies with at most 20 members.
small_assembly_filter = {"max_size": 20}
assemblies = sars_hierarchy.get_assemblies(small_assembly_filter)
print(f'{len(assemblies)} matching assemblies found')
sars_hierarchy.add_data_from_interactome(filter=small_assembly_filter,
                                         columns=sars_column_name_mapping)

# Spot check: print the data of every assembly that contains BST2.
for assembly in assemblies:
    # CD_MemberList is a space-separated string of gene symbols. Compare
    # whole tokens: a plain substring test would also match any longer
    # symbol that merely contains "BST2".
    members = assembly['v']['CD_MemberList'].split()
    if "BST2" in members:
        json_data = json.loads(assembly['v']['data'])
        print(json_data)


138 matching assemblies found
{'GLMN': {'Gene Symbol': 'GLMN'}, 'BST2': {'Gene Symbol': 'BST2', 'Inhibits_SARS': 'yes'}, 'APOBEC3G': {'Gene Symbol': 'APOBEC3G'}, 'AGRN': {'Gene Symbol': 'AGRN'}, 'PRAME': {'Gene Symbol': 'PRAME'}, 'ZYG11B': {'Gene Symbol': 'ZYG11B', 'binds_sars_protein': 'yes'}, 'TCEB2': {'Gene Symbol': 'TCEB2'}, 'KLHDC10': {'Gene Symbol': 'KLHDC10'}}
{'USP15': {'Gene Symbol': 'USP15'}, 'USP4': {'Gene Symbol': 'USP4'}, 'SMURF2': {'Gene Symbol': 'SMURF2'}, 'SMURF1': {'Gene Symbol': 'SMURF1'}, 'BTRC': {'Gene Symbol': 'BTRC'}, 'BST2': {'Gene Symbol': 'BST2', 'Inhibits_SARS': 'yes'}, 'SMAD4': {'Gene Symbol': 'SMAD4'}, 'AGRN': {'Gene Symbol': 'AGRN'}}


In [15]:
def score_assembly(assembly):
    """Score an assembly by the SARS evidence carried by its proteins.

    ``assembly['v']['data']`` must be a JSON string encoding a mapping of
    protein name -> attribute dict. A protein counts as a binder when its
    ``binds_sars_protein`` attribute equals ``"yes"`` and as an inhibitor
    when its ``Inhibits_SARS`` attribute equals ``"yes"``.

    Args:
        assembly: dict with a ``'v'`` dict that contains the ``'data'`` string.

    Returns:
        int: number of binders plus number of inhibitors when the assembly
        has at least one of each; otherwise 0.

    Raises:
        KeyError: if ``assembly['v']['data']`` is missing.
    """
    # The previous version also read assembly['v']['CD_MemberList_Size'] into
    # an unused variable, which failed for assemblies lacking that key even
    # though the size never took part in the score.
    proteins = json.loads(assembly['v']['data'])

    binds = sum(1 for attributes in proteins.values()
                if attributes.get('binds_sars_protein') == "yes")
    inhibits = sum(1 for attributes in proteins.values()
                   if attributes.get('Inhibits_SARS') == "yes")

    # Only assemblies showing both kinds of evidence receive a non-zero score.
    if inhibits > 0 and binds > 0:
        return inhibits + binds
    return 0

# Give a score (stored under the 'interesting' key) to every assembly in
# which at least one protein binds a SARS protein or inhibits SARS.
for assembly in assemblies:
    protein_attributes = json.loads(assembly['v']['data'])
    has_sars_evidence = any(
        attrs.get('binds_sars_protein') == "yes" or attrs.get('Inhibits_SARS') == "yes"
        for attrs in protein_attributes.values()
    )
    if has_sars_evidence:
        assembly['interesting'] = score_assembly(assembly)

# Keep the assemblies that carry a numeric score and rank them, highest
# first. The sort is stable, so equal scores keep their original order.
# NOTE(review): assemblies scored 0 are kept as well, so they can appear
# among the top entries -- confirm that this is intended.
sorted_assemblies = [
    candidate for candidate in assemblies
    if 'interesting' in candidate and isinstance(candidate['interesting'], (int, float))
]
sorted_assemblies.sort(key=lambda candidate: candidate['interesting'], reverse=True)

top_n = sorted_assemblies[:10]

# Report each of the top assemblies and create a dataset from it.
for assembly in top_n:
    name = assembly["v"]["name"]
    community = assembly["v"].get("CD_CommunityName")
    print(f'Assembly: {name} = {community} Interesting: {assembly["interesting"]}')
    dataset = dataset_from_assembly(db, assembly, decimal_places=0, columns=sars_column_name_mapping, experiment_description=sars_hierarchy.get_experiment_description())

Assembly: C64290 = FFAT motif binding Interesting: 5
Assembly: C64285 = SNAP receptor activity Interesting: 2
Assembly: C64289 = protein folding chaperone complex Interesting: 2
Assembly: C64292 = SELK multiprotein complex Interesting: 2
Assembly: C64324 = SELK multiprotein complex Interesting: 2
Assembly: C64339 = NADPH oxidase complex Interesting: 2
Assembly: C64353 = Cul2-RING ubiquitin ligase complex Interesting: 2
Assembly: C64364 = Mitophagy Interesting: 2
Assembly: C64372 = (none) Interesting: 2
Assembly: C64286 = small-subunit processome Interesting: 0


In [16]:
# Display the selected top-ranked assemblies (the raw list of assembly dicts).
top_n

[{'id': 1632995,
  'v': {'CD_Labeled': True,
   'CD_AnnotatedAlgorithm': 'Annotated by gProfiler [Docker: coleslawndex/cdgprofilergenestoterm:0.3.0] {{--organism=hsapiens, --maxpval=0.00001, --minoverlap=0.05, --maxgenelistsize=50}} via CyCommunityDetection Cytoscape App (1.12.1)',
   'name': 'C64290',
   'CD_MemberList': 'AANAT ASB10 C9orf72 CHMP2B FUS IFITM2 IFITM3 LY6E MOSPD2 OPTN PPM1L RAB8A SOD1 TBC1D17 TBK1 TMEM106B TRAF3IP1 VAPA VAPB',
   'CD_AnnotatedMembers_Size': 3,
   'CD_AnnotatedMembers_Pvalue': 3.828388004920102e-09,
   'CD_AnnotatedMembers': 'MOSPD2 VAPA VAPB',
   'CD_CommunityName': 'FFAT motif binding',
   'CD_AnnotatedMembers_Overlap': 0.158,
   'HiDeF_persistence': 13,
   'CD_AnnotatedMembers_SourceTerm': 'GO:0033149',
   'CD_MemberList_LogSize': 4.248,
   'CD_NonAnnotatedMembers': 'AANAT ASB10 C9orf72 CHMP2B FUS IFITM2 IFITM3 LY6E OPTN PPM1L RAB8A SOD1 TBC1D17 TBK1 TMEM106B TRAF3IP1',
   'CD_AnnotatedMembers_SourceDB': 'GO:MF',
   'CD_MemberList_Size': 19,
   'data'