In [1]:
import sys, csv, os
# if running from clone of the git repo
sys.path.append('../src')

# import the main NPLinker class. normally this all that's required to work
# with NPLinker in a notebook environment
from nplinker.nplinker import NPLinker

In [2]:
# the standard method of loading a dataset configuration is to pass the filename
# of a TOML configuration file to the NPLinker constructor. 
npl = NPLinker('nplinker_demo1.toml')
# loading the actual data files can take some time depending on the dataset,
# so this is done separately by calling the load_data method.
#
# During the loading process, logging messages will be printed to stdout. This
# can be useful for debugging problems with files not being discovered or parsed
# correctly. You can control the verbosity of these messages in the configuration
# file if required, and/or redirect them to a file instead of stdout. 
npl.load_data()

10:23:47 [INFO] config.py:157, Selected platform project ID MSV000079284
10:23:47 [INFO] downloader.py:187, Downloader for MSV000079284, caching to /home/hechth/nplinker_data/pairedomics
10:23:47 [INFO] downloader.py:195, Using existing copy of platform project data
10:23:47 [DEBUG] downloader.py:211, platform_id MSV000079284 matched to pairedomics_id 4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4
10:23:47 [INFO] downloader.py:222, Found project, retrieving JSON data...
10:23:47 [DEBUG] downloader.py:820, Downloaded https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4 to /home/hechth/nplinker_data/pairedomics/MSV000079284.json
10:23:47 [DEBUG] loader.py:190, DatasetLoader(platform:MSV000079284, MSV000079284, True)
10:23:47 [DEBUG] nplinker.py:140, Enabled scoring method: metcalf
10:23:47 [DEBUG] nplinker.py:140, Enabled scoring method: rosetta
10:23:47 [DEBUG] nplinker.py:140, Enabled scoring method: npclassscore
10:23:47 [DEBUG] nplinker.py:265, load_d

  counts_df = pd.DataFrame.from_dict(counts, dtype=int)


10:24:02 [DEBUG] nplinker.py:315, load_data: completed


True

In [3]:
# Basic functionality
# ===================
#
# Once you have an NPLinker object with all data loaded, there are a collection of simple
# methods and properties you can use to access objects and metadata. Some examples are 
# given below, see https://nplinker.readthedocs.io/en/latest/ for a complete API description.

# configuration/dataset metadata
# - a copy of the configuration as parsed from the .toml file (dict)
print(npl.config) 
# - the path to the directory where various nplinker data files are located (e.g. the 
#   default configuration file template) (str)
print(npl.data_dir)
# - a dataset ID, derived from the path for local datasets or the paired platform ID
#   for datasets loaded from that source (str)
print(npl.dataset_id)
# - the root directory for the current dataset (str)
print(npl.root_dir)

# objects
# - you can directly access lists of each of the 4 object types:
print('BGCs:', len(npl.bgcs))
print('GCFs:', len(npl.gcfs)) # contains GCF objects
print('Spectra:', len(npl.spectra)) # contains Spectrum objects
print('Molecular Families:', len(npl.molfams)) # contains MolecularFamily objects

{'loglevel': 'DEBUG', 'logfile': '', 'log_to_stdout': True, 'repro_file': '', 'dataset': {'root': 'platform:MSV000079284', 'platform_id': 'MSV000079284'}, 'antismash': {'antismash_format': 'default', 'ignore_spaces': False}, 'docker': {'run_bigscape': True, 'run_canopus': False, 'extra_canopus_parameters': '--maxmz 600 formula zodiac structure canopus'}, 'webapp': {'tables_metcalf_score': 3.0}, 'scoring': {'rosetta': {}}}
/home/hechth/dev/git/hechth/nplinker/src/nplinker/data
MSV000079284
/home/hechth/nplinker_data/pairedomics/extracted/MSV000079284
BGCs: 387
GCFs: 128
Spectra: 25935
Molecular Families: 25769


In [4]:
# Scoring functionality - part 1
# ==============================
# (again see https://nplinker.readthedocs.io/en/latest/ for API documentation)

# NPLinker provides a set of scoring methods that can be used individually or 
# in combination to find interesting links in the current dataset. To get a
# get a list of the names of the available scoring methods:
print('Available scoring methods:')
for m in npl.scoring_methods:
    print(' - {}'.format(m))
    
# The first step in running a scoring operation is to get an instance of the
# method(s) you want to use by calling scoring_method():
mc = npl.scoring_method('metcalf')

# Now mc is an instance of the class that implements Metcalf scoring. Once
# you have such an instance, you may change any of the parameters it exposes.
# In the case of Metcalf scoring, the following parameters are currently exposed:
# - cutoff (float): the scoring threshold. Links with scores less than this are excluded
# - standardised (bool): set to True to use standardised scores (default), False for regular
mc.cutoff = 3.5
mc.standardised = True

Available scoring methods:
 - metcalf
 - rosetta
 - npclassscore
10:24:03 [INFO] metcalf_scoring.py:37, MetcalfScoring.setup (bgcs=387, gcfs=128, spectra=25935, molfams=25769, strains=14)
10:24:03 [INFO] metcalf_scoring.py:76, MetcalfScoring.setup preprocessing dataset (this may take some time)
10:24:03 [DEBUG] data_linking.py:93, Create mappings between spectra, gcfs, and strains.
10:24:03 [DEBUG] data_linking.py:97, Create co-occurence matrices: spectra<->strains + and gcfs<->strains.
10:24:03 [DEBUG] data_linking.py:105, Create correlation matrices: spectra<->gcfs.
10:24:03 [DEBUG] data_linking.py:384, Calculating correlation matrices of type: spec-gcf
10:24:03 [DEBUG] data_linking.py:107, Create correlation matrices: mol-families<->gcfs.
10:24:03 [DEBUG] data_linking.py:384, Calculating correlation matrices of type: fam-gcf
10:24:03 [DEBUG] metcalf_scoring.py:90, MetcalfScoring.setup caching results
10:24:03 [INFO] metcalf_scoring.py:94, MetcalfScoring.setup completed


In [5]:
# Scoring functionality - part 2
# ==============================

# After creating and optionally configuring a scoring method, you need to call
# get_links() to perform the operation on a selected set of objects. This method
# takes 2-3 parameters, the third being optional:
#  - a list of objects to find links from (or a list of lists of objects)
#  - a list of scoring methods, or a single method as shorthand for a 1-element list
#  - (optional) a boolean indicating if results from multiple methods should be 
#     ANDed together to produce the final results. If set to False, the results will
#     contain links found by any method rather than all methods. 
# 
# This first example shows the simplest case: 1 set of objects and 1 scoring method. 
# If the and_mode parameter is not given it defaults to True, but the value doesn't 
# matter here because only one method is being used. 
results = npl.get_links(npl.gcfs, mc, and_mode=True) 

# get_links returns an instance of a class called LinkCollection. This provides a wrapper
# around the results of the scoring operation and has various useful properties/methods:
#
# - len(results) or .source_count will tell you how many of the input_objects were found to have links
print('Number of results: {}'.format(len(results)))
# - .sources is a list of those objects
objects_with_links = results.sources
# - .links is a dict with structure {input_object: {linked_object: ObjectLink}} 
objects_and_link_info = results.links
# - .get_all_targets() will return a flat list of *all* the linked objects (for all sources)
all_targets = results.get_all_targets() 
# - .methods is a list of the scoring methods passed to get_links
methods = results.methods

10:24:03 [DEBUG] nplinker.py:372, get_links: 1 object sets, 1 methods
10:24:03 [DEBUG] nplinker.py:406, Calling scoring method metcalf on 128 objects
10:24:03 [DEBUG] metcalf_scoring.py:206, MetcalfScoring: standardised = True
10:24:04 [DEBUG] metcalf_scoring.py:142, Postprocessing results for standardised Metcalf scores (gen input)
10:24:11 [DEBUG] metcalf_scoring.py:240, MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, inputs=128, results=(3, 63892)
10:24:12 [DEBUG] metcalf_scoring.py:313, MetcalfScoring found 115 results
10:24:12 [DEBUG] metcalf_scoring.py:316, MetcalfScoring: completed
10:24:12 [DEBUG] nplinker.py:412, Creating internal datalinks object
10:24:12 [DEBUG] nplinker.py:415, Created internal datalinks object
10:24:12 [DEBUG] nplinker.py:423, Calculating shared strain information...


IndexError: index 25768 is out of bounds for axis 0 with size 29

In [None]:
# Scoring functionality - part 3
# ==============================
# 
# The link data inside the LinkCollection object is itself stored in ObjectLink objects.
# Each instance of an ObjectLink represents a link between a given pair of objects as
# determined by 1 or more scoring methods. 
#
# ObjectLinks have the following basic attributes:
# - .source: the input object provided to the method
# - .target: the linked object
# - .methods: a list of the methods that found this link
# - .shared_strains: a list of Strain objects (possibly empty) shared between .source and .target
# - .data(<method_object>): return the output of <method_object> for this link (e.g. any score values)
# 
# You can also retrieve any method-specific info for a link by subscripting these objects with 
# the appropriate method object, e.g. metcalf_link_data = object_link[mc] 

# This shows how to iterate over the link information from result.links. In the body of the loop
# <obj> will be one of  the original objects supplied to get_links and <result> will be a dict 
# with structure {linked_object: ObjectLink} (indicating <obj> is linked to <linked_object> according to
# the information stored in the ObjectLink)
for obj, result in results.links.items():
    # display the object, the number of links it has, and the number of methods that were used to get them
    print('Results for object: {}, {} total links, {} methods used'.format(obj, len(result), results.method_count))
    
    # sorting is method-dependent since they might have very different "scores", so you should
    # use the original object to do this. For Metcalf scoring, this will return the ObjectLinks sorted
    # by their Metcalf scores. 
    sorted_links = results.get_sorted_links(mc, obj)
    # or if you wanted them in the reverse order:
    # sorted_links = results.get_sorted_links(mc, obj, reverse=True)
    
    # Now display some link information for each link associated with <obj>.
    # link_data[<method_object>] will return the per-link data generated by that 
    # method. Here the metcalf method simply returns the link score as a floating point value,
    # but other methods may return more complex objects. 
    # 
    # Each scoring method also has a format_data method which should provide a relatively short 
    # human-readable summary of the data, as a quick way to print and examine results. 
    for link_data in sorted_links:
        print('  --> [{}] {} | {} | shared strains = {}'.format(','.join(method.name for method in link_data.methods), 
                                                                link_data.target, 
                                                                mc.format_data(link_data[mc]), 
                                                                len(link_data.shared_strains)))
        
    # alternatively, if you don't care about ordering, you can just iterate directly over the 
    # linked objects like this:
    # for link_target, link_data in result.items():
    #    print(link_target, link_data)
    

Results for object: GCF(id=18, class=Others, gcf_id=2096, strains=1), 477 total links, 1 methods used
  --> [metcalf] Spectrum(id=367, spectrum_id=368, strains=1) | 3.6056 | shared strains = 1
  --> [metcalf] Spectrum(id=1127, spectrum_id=1128, strains=1) | 3.6056 | shared strains = 1
  --> [metcalf] Spectrum(id=1128, spectrum_id=1129, strains=1) | 3.6056 | shared strains = 1
  --> [metcalf] Spectrum(id=1129, spectrum_id=1130, strains=1) | 3.6056 | shared strains = 1
  --> [metcalf] Spectrum(id=1130, spectrum_id=1131, strains=1) | 3.6056 | shared strains = 1
  --> [metcalf] Spectrum(id=1131, spectrum_id=1132, strains=1) | 3.6056 | shared strains = 1
  --> [metcalf] Spectrum(id=1132, spectrum_id=1133, strains=1) | 3.6056 | shared strains = 1
  --> [metcalf] Spectrum(id=1133, spectrum_id=1134, strains=1) | 3.6056 | shared strains = 1
  --> [metcalf] Spectrum(id=1134, spectrum_id=1135, strains=1) | 3.6056 | shared strains = 1
  --> [metcalf] Spectrum(id=1135, spectrum_id=1136, strains=1) 

In [None]:
# Scoring functionality - part 4
# ==============================
#
# The LinkCollection object supports performing various types of filtering on the original set of
# results contained within it:
# - .filter_no_shared_strains(): remove any links where the linked objects do not share strains
# - .filter_sources(callable), .filter_targets(callable), .filter_links(callable): each of these
#     simply execute callable(object) and filter out objects for which the return value is False/0. 
#     The <objects> in each case are respectively: the original input objects (sources), 
#     their linked objects (targets), and the ObjectLink objects (links).
#
# NOTE:
# - these methods all modify the original LinkCollection in-place
# - they will automatically remove any original results for which no links exist after filtering. For
#    example, if there is a source object which starts off with 2 links, but has 0 after a filter is
#    run, this object will not appear in the LinkCollection afterwards.
#
# Examples:
# - exclude any sources for which an arbitrary function is false (sources are GCFs in this example)
results.filter_sources(lambda gcf: gcf.id % 2 == 0)
# - exclude any linked objects for which an arbitrary function is false (targets are Spectrum objects here)
results.filter_targets(lambda spec: spec.id % 1 == 0)
# - exclude any links for which an arbitrary function is false (<link> is an ObjectLink)
results.filter_links(lambda link: link[mc] > 3.6)

10:09:47 [DEBUG] link_collection.py:115, filter_sources: 116 => 59


In [None]:
# Scoring functionality - part 5
# ==============================
# 
# The get_links method can be passed more complex parameters types than the above example which 
# used a flat list of input objects and a single scoring method instance. 

ts = npl.scoring_method('testscore') # copy of Metcalf method, only for debug use

# You can use the same set of objects with two different methods, and AND the results
# together so that objects will only be returned which have links according to 
# BOTH of the supplied methods (if you provide 2 or more scoring methods but only a single 
# set of objects, that set will be used as input to every method).
results = npl.get_links(npl.gcfs[:10], [mc, ts], and_mode=True)

10:09:49 [DEBUG] nplinker.py:372, get_links: 1 object sets, 2 methods
10:09:49 [DEBUG] nplinker.py:389, Duplicating input object set
10:09:49 [DEBUG] nplinker.py:392, Duplicating input object set
10:09:49 [DEBUG] nplinker.py:406, Calling scoring method metcalf on 10 objects
10:09:49 [DEBUG] metcalf_scoring.py:206, MetcalfScoring: standardised = True
10:09:49 [DEBUG] metcalf_scoring.py:142, Postprocessing results for standardised Metcalf scores (gen input)
10:09:50 [DEBUG] metcalf_scoring.py:240, MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, inputs=10, results=(3, 3674)
10:09:50 [DEBUG] metcalf_scoring.py:313, MetcalfScoring found 8 results
10:09:50 [DEBUG] metcalf_scoring.py:316, MetcalfScoring: completed


AttributeError: 'NoneType' object has no attribute 'name'