Merge pull request #17 from jonathanking/add_to_pypi
Add to PyPI/pip; Implement scn.create
jonathanking committed Mar 17, 2021
2 parents b2f2bfd + bce9190 commit 518762b
Showing 6 changed files with 130 additions and 59 deletions.
37 changes: 24 additions & 13 deletions setup.py
@@ -1,7 +1,4 @@
"""
SideChainNet
A protein structure prediction data set that includes side chain information. A direct extension of ProteinNet by Mohammed AlQuraishi.
"""
"""Tools and data for all-atom protein structure prediction via machine learning."""
import sys
from setuptools import setup, find_packages
import versioneer
@@ -12,11 +9,8 @@
needs_pytest = {'pytest', 'test', 'ptr'}.intersection(sys.argv)
pytest_runner = ['pytest-runner'] if needs_pytest else []

try:
with open("README.md", "r") as handle:
long_description = handle.read()
except:
long_description = "\n".join(short_description[2:])
with open("README.md", "r") as handle:
long_description = handle.read()

setup(
# Self-descriptive entries which should always be present
@@ -39,19 +33,36 @@
# Customize MANIFEST.in if the general case does not suit your needs
# Comment out this line to prevent the files from being packaged with your software
include_package_data=True,
package_data={
"astral_data": ["resources/astral_data.txt"],
"full_protein_dssp": ["resources/full_protein_dssp_annotations.json"],
"single_domain_dssp": ["resources/single_domain_dssp_annotations.json"]
},

# Allows `setup.py test` to work correctly with pytest
setup_requires=[] + pytest_runner,

# Additional entries you may want simply uncomment the lines you want and fill in the data
# url='http://www.my_package.com', # Website
# install_requires=[], # Required packages, pulls from pip if needed; do not use for Conda deployment
url='https://github.com/jonathanking/sidechainnet', # Website
install_requires=[
'ProDy>=2.0', 'numpy', 'scipy', 'torch>=1.7', 'biopython', 'tqdm', 'py3Dmol',
'requests', 'setuptools'
], # Required packages, pulls from pip if needed; do not use for Conda deployment
tests_require=['pytest'],
# platforms=['Linux',
# 'Mac OS-X',
# 'Unix',
# 'Windows'], # Valid platforms your code works on, adjust to your flavor
# python_requires=">=3.5", # Python version restrictions
python_requires=">=3.5", # Python version restrictions

# Manual control if final package is compressible or not, set False to prevent the .egg from being made
# zip_safe=False,
)
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: BSD License',
'Programming Language :: Python :: 3',
'Operating System :: POSIX :: Linux',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Scientific/Engineering :: Bio-Informatics'
])
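With the package metadata, dependencies, data files, and classifiers declared above, installation from PyPI should reduce to a single command (assuming the published distribution name matches the repository name):

pip install sidechainnet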
22 changes: 6 additions & 16 deletions sidechainnet/__init__.py
@@ -6,32 +6,22 @@

import os

_ROOT = os.path.abspath(os.path.dirname(__file__))


def get_data(path):
"""Return absolute path to specified package resource.
Args:
path (str): Filename of resource, e.g. "astral_data.txt".
Returns:
str: Path to requested resource.
"""
return os.path.join(_ROOT, 'resources', path)

# Handle versioneer
from ._version import get_versions

# Add imports here
from .structure.StructureBuilder import StructureBuilder
from .structure.BatchedStructureBuilder import BatchedStructureBuilder
from .utils.load import load

from .utils.download import VALID_SPLITS, DATA_SPLITS

from .create import create
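With create now exposed at the top level alongside load and the structure builders, typical usage might look like the sketch below (the load keyword names are assumptions for illustration, not confirmed by this diff):

import sidechainnet as scn

# Load a preprocessed SidechainNet dataset; the keyword names here are assumed.
data = scn.load(casp_version=12, thinning=30)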

versions = get_versions()
__version__ = versions['version']
__git_revision__ = versions['full-revisionid']
del get_versions, versions

__author__ = "Jonathan King"
__credits__ = ("Carnegie Mellon University–"
"University of Pittsburgh Joint PhD Program in Computational Biology\n"
"David Koes, PhD, Advisor.")
89 changes: 72 additions & 17 deletions sidechainnet/create.py
@@ -22,6 +22,7 @@
"""

import argparse
from collections import namedtuple
import os
import re
from multiprocessing import Pool, cpu_count
@@ -43,6 +44,10 @@
pr.confProDy(verbosity="none")
pr.confProDy(auto_secondary=False)

ArgsTuple = namedtuple(
"ArgsTuple", "casp_version training_set proteinnet_in proteinnet_out "
"sidechainnet_out regenerate_scdata limit")


def combine(pn_entry, sc_entry, aligner, pnid):
"""Supplements one entry in ProteinNet with sidechain information.
@@ -155,7 +160,45 @@ def format_sidechainnet_path(casp_version, training_split):
return f"sidechainnet_casp{casp_version}_{training_split}.pkl"


def create():
def create(casp_version=12,
training_set=30,
proteinnet_in=None,
proteinnet_out="data/proteinnet/",
sidechainnet_out="data/sidechainnet/",
regenerate_scdata=False,
limit=None):
"""Generate the requested SidechainNet dataset and save pickled result files.
This function replicates the behavior of calling `python sidechainnet/create.py` on the command line.
Args:
casp_version (int, optional): CASP dataset version (7-12). Defaults to 12.
training_set (int, optional): Training set thinning (30, 50, 70, 90, 95, 100
where 100 means 100% of the training set is kept). Defaults to 30.
proteinnet_in (str, optional): Path to raw ProteinNet text files, previously
downloaded by the user. Defaults to None.
proteinnet_out (str, optional): Path for saving processed ProteinNet records.
Defaults to "data/proteinnet/".
sidechainnet_out (str, optional): Path for saving processed SidechainNet records.
Defaults to "data/sidechainnet/".
regenerate_scdata (bool, optional): If True, regenerate raw sidechain-applicable
data instead of searching for data that has already been preprocessed.
Defaults to False.
limit (int, optional): The upper limit on the number of proteins to process,
useful when debugging. Defaults to None.
Raises:
ValueError: When ProteinNet data paths are nonexistent or not as expected.
"""
if proteinnet_in is None:
raise ValueError("Please provide a value for proteinnet_in that "
"points to the directory where raw ProteinNet files are stored.")
args = ArgsTuple(casp_version, training_set, proteinnet_in, proteinnet_out,
sidechainnet_out, regenerate_scdata, limit)
main(args)
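A minimal programmatic call, per the signature above, might look like this sketch (the input path is hypothetical; note that main() below requires it to contain the CASP version, e.g. 'casp12'):

import sidechainnet as scn

scn.create(casp_version=12,
           training_set=30,
           proteinnet_in="data/proteinnet/casp12/")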


def _create(args):
"""Generate SidechainNet for a single CASP thinning."""
# First, parse raw proteinnet files into Python dictionaries for convenience
pnids = parse_raw_proteinnet(args.proteinnet_in, args.proteinnet_out,
@@ -182,7 +225,7 @@ def create():
print(f"SidechainNet for CASP {args.casp_version} written to {sidechainnet_outfile}.")


def create_all():
def _create_all(args):
"""Generate all thinnings of a particular CASP dataset, starting with the largest."""
# First, parse raw proteinnet files into Python dictionaries for convenience
pnids = parse_raw_proteinnet(args.proteinnet_in, args.proteinnet_out, 100)
@@ -219,6 +262,29 @@ def create_all():
f"({training_set}% thinning) written to {sc_outfile}.")


def main(args_tuple):
"""Run _create or _create_all using the arguments provided by the namedtuple."""
if args_tuple.training_set != 'all':
args_tuple = args_tuple._replace(training_set=int(args_tuple.training_set))

match = re.search(r"casp(\d+)", args_tuple.proteinnet_in, re.IGNORECASE)
if not match:
raise ValueError("The proteinnet_in path does not contain 'caspX'. "
"Please ensure the raw files are enclosed "
"in a path that contains the CASP version, "
"e.g., 'casp12'.")
args_tuple = args_tuple._replace(casp_version=match.group(1))

# Initialize DSSP data
from sidechainnet.utils.download import _init_dssp_data
_init_dssp_data()

if args_tuple.training_set == 'all':
_create_all(args_tuple)
else:
_create(args_tuple)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Constructs SidechainNet.")
parser.add_argument('proteinnet_in',
@@ -254,18 +320,7 @@ def create_all():
help=('If True, then regenerate the sidechain-only data even if it already exists'
' locally.'))
args = parser.parse_args()
if args.training_set != 'all':
args.training_set = int(args.training_set)

match = re.search(r"casp(\d+)", args.proteinnet_in, re.IGNORECASE)
if not match:
raise parser.error("The input_dir does not contain 'caspX'. "
"Please ensure the raw files are enclosed "
"in a path that contains the CASP version"
" i.e. 'casp12'.")
args.casp_version = match.group(1)

if args.training_set == 'all':
create_all()
else:
create()
args_tuple = ArgsTuple(args.casp_version, args.training_set, args.proteinnet_in,
args.proteinnet_out, args.sidechainnet_out,
args.regenerate_scdata, args.limit)
main(args_tuple)
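For comparison, the equivalent command-line invocation per the argparse definitions above might be (path hypothetical; --training_set is defined in an elided portion of the parser):

python sidechainnet/create.py data/proteinnet/casp12/ --training_set 30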
27 changes: 21 additions & 6 deletions sidechainnet/utils/download.py
@@ -2,13 +2,13 @@

import multiprocessing
import os
import pkg_resources
from glob import glob
import requests

import prody as pr
import tqdm

from sidechainnet import get_data
import sidechainnet.utils.errors as errors
from sidechainnet.utils.measure import get_seq_coords_and_angles, no_nans_infs_allzeros
from sidechainnet.utils.parse import get_chain_from_astral_id, parse_astral_summary_file, parse_dssp_file
@@ -17,14 +17,28 @@
VALID_SPLITS_INTS = [10, 20, 30, 40, 50, 70, 90]
VALID_SPLITS = [f'valid-{s}' for s in VALID_SPLITS_INTS]
DATA_SPLITS = ['train', 'test'] + VALID_SPLITS
with open(get_data("astral_data.txt"), "r") as astral_file:
ASTRAL_ID_MAPPING = parse_astral_summary_file(astral_file.read().splitlines())
D_AMINO_ACID_CODES = [
"DAL", "DSN", "DTH", "DCY", "DVA", "DLE", "DIL", "MED", "DPR", "DPN", "DTY", "DTR",
"DSP", "DGL", "DSG", "DGN", "DHI", "DLY", "DAR"
]
PROTEIN_DSSP_DATA = parse_dssp_file(get_data("full_protein_dssp_annotations.json"))
PROTEIN_DSSP_DATA.update(parse_dssp_file(get_data("single_domain_dssp_annotations.json")))
ASTRAL_ID_MAPPING = None
PROTEIN_DSSP_DATA = None


def _init_dssp_data():
global PROTEIN_DSSP_DATA
global ASTRAL_ID_MAPPING
PROTEIN_DSSP_DATA = parse_dssp_file(
pkg_resources.resource_filename("sidechainnet",
"resources/full_protein_dssp_annotations.json"))
PROTEIN_DSSP_DATA.update(
parse_dssp_file(
pkg_resources.resource_filename(
"sidechainnet", "resources/single_domain_dssp_annotations.json")))
with open(
pkg_resources.resource_filename("sidechainnet", "resources/astral_data.txt"),
"r") as astral_file:
ASTRAL_ID_MAPPING = parse_astral_summary_file(astral_file.read().splitlines())
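Because the DSSP and ASTRAL lookup tables are now built lazily rather than at import time, _init_dssp_data() must run before any function that reads PROTEIN_DSSP_DATA or ASTRAL_ID_MAPPING, as main() in create.py does. A sketch (the ProteinNet ID is hypothetical):

from sidechainnet.utils.download import _init_dssp_data, process_id

_init_dssp_data()  # populates module-level PROTEIN_DSSP_DATA and ASTRAL_ID_MAPPING
result = process_id("1A2B_1_A")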


def download_sidechain_data(pnids,
Expand Down Expand Up @@ -160,6 +174,7 @@ def process_id(pnid):

# If we've made it this far, we can unpack the data and return it
dihedrals, coords, sequence = dihedrals_coords_sequence

if "#" not in pnid:
try:
dssp = PROTEIN_DSSP_DATA[pnid]
Expand Down Expand Up @@ -259,7 +274,7 @@ def get_chain_from_trainid(pnid):
# For now, if the requested coordinate set doesn't exist, then we will
# default to using the only (first) available coordinate set
struct = pr.parsePDB(pdbid, chain=chid) if use_pdb else pr.parseMMCIF(pdbid,
chain=chid)
chain=chid)
if struct and chnum > 1:
try:
chain = pr.parsePDB(pdbid, chain=chid, model=1)
2 changes: 1 addition & 1 deletion sidechainnet/utils/parse.py
@@ -148,7 +148,7 @@ def process_file(input_filename_out_dir, return_ids=False):


def parse_raw_proteinnet(proteinnet_in_dir, proteinnet_out_dir, training_set):
"""Extracts and saves information for a single ProteinNet dataset.
"""Extract and save information for a single ProteinNet dataset.
Preprocesses raw ProteinNet records by reading them and transforming them
into PyTorch-saved dictionaries. Files are kept separate due to file size.
12 changes: 6 additions & 6 deletions versioneer.py
@@ -1038,8 +1038,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
"""Get version from 'git describe' in the root of the source tree.
This only gets called if the git-archive 'subst' keywords were *not*
expanded, and _version.py hasn't already been rewritten with a short
version string, meaning we're inside a checked out source tree.
expanded, and _version.py hasn't already been rewritten with a short version
string, meaning we're inside a checked out source tree.
"""
GITS = ["git"]
if sys.platform == "win32":
@@ -1125,8 +1125,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
def do_vcs_install(manifest_in, versionfile_source, ipy):
"""Git-specific installation logic for Versioneer.
For Git, this means creating/changing .gitattributes to mark _version.py
for export-subst keyword substitution.
For Git, this means creating/changing .gitattributes to mark _version.py for
export-subst keyword substitution.
"""
GITS = ["git"]
if sys.platform == "win32":
@@ -1164,8 +1164,8 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
"""Try to determine the version from the parent directory name.
Source tarballs conventionally unpack into a directory that includes both
the project name and a version string. We will also support searching up
two directory levels for an appropriately named parent directory
the project name and a version string. We will also support searching up two
directory levels for an appropriately named parent directory
"""
rootdirs = []

