Merge pull request #17 from jonathanking/add_to_pypi
Add to PyPI/pip; Implement scn.create
jonathanking committed Mar 17, 2021
2 parents b2f2bfd + bce9190 commit 518762b
Showing 6 changed files with 130 additions and 59 deletions.
37 changes: 24 additions & 13 deletions setup.py
@@ -1,7 +1,4 @@
"""
SideChainNet
A protein structure prediction data set that includes side chain information. A direct extension of ProteinNet by Mohammed AlQuraishi.
"""
"""Tools and data for all-atom protein structure prediction via machine learning."""
import sys
from setuptools import setup, find_packages
import versioneer
@@ -12,11 +9,8 @@
needs_pytest = {'pytest', 'test', 'ptr'}.intersection(sys.argv)
pytest_runner = ['pytest-runner'] if needs_pytest else []

try:
with open("README.md", "r") as handle:
long_description = handle.read()
except:
long_description = "\n".join(short_description[2:])
with open("README.md", "r") as handle:
long_description = handle.read()

setup(
# Self-descriptive entries which should always be present
@@ -39,19 +33,36 @@
# Customize MANIFEST.in if the general case does not suit your needs
# Comment out this line to prevent the files from being packaged with your software
include_package_data=True,
package_data={
"astral_data": ["resources/astral_data.txt"],
"full_protein_dssp": ["resources/full_protein_dssp_annotations.json"],
"single_domain_dssp": ["resources/single_domain_dssp_annotations.json"]
},

# Allows `setup.py test` to work correctly with pytest
setup_requires=[] + pytest_runner,

# Additional entries you may want simply uncomment the lines you want and fill in the data
# url='http://www.my_package.com', # Website
# install_requires=[], # Required packages, pulls from pip if needed; do not use for Conda deployment
url='https://github.com/jonathanking/sidechainnet', # Website
install_requires=[
'ProDy>=2.0', 'numpy', 'scipy', 'torch>=1.7', 'biopython', 'tqdm', 'py3Dmol',
'requests', 'setuptools'
], # Required packages, pulls from pip if needed; do not use for Conda deployment
tests_require=['pytest'],
# platforms=['Linux',
# 'Mac OS-X',
# 'Unix',
# 'Windows'], # Valid platforms your code works on, adjust to your flavor
# python_requires=">=3.5", # Python version restrictions
python_requires=">=3.5", # Python version restrictions

# Manual control if final package is compressible or not, set False to prevent the .egg from being made
# zip_safe=False,
)
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: BSD License',
'Programming Language :: Python :: 3',
'Operating System :: POSIX :: Linux',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Scientific/Engineering :: Bio-Informatics'
])
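With the package metadata, dependencies, data files, and classifiers declared above, installation from PyPI should reduce to a single command (assuming the published distribution name matches the repository name):

pip install sidechainnet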
22 changes: 6 additions & 16 deletions sidechainnet/__init__.py
@@ -6,32 +6,22 @@

import os

_ROOT = os.path.abspath(os.path.dirname(__file__))


def get_data(path):
"""Return absolute path to specified package resource.
Args:
path (str): Filename of resource, e.g. "astral_data.txt".
Returns:
str: Path to requested resource.
"""
return os.path.join(_ROOT, 'resources', path)

# Handle versioneer
from ._version import get_versions

# Add imports here
from .structure.StructureBuilder import StructureBuilder
from .structure.BatchedStructureBuilder import BatchedStructureBuilder
from .utils.load import load

from .utils.download import VALID_SPLITS, DATA_SPLITS

from .create import create
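With create now exposed at the top level alongside load and the structure builders, typical usage might look like the sketch below (the load keyword names are assumptions for illustration, not confirmed by this diff):

import sidechainnet as scn

# Load a preprocessed SidechainNet dataset; the keyword names here are assumed.
data = scn.load(casp_version=12, thinning=30)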

versions = get_versions()
__version__ = versions['version']
__git_revision__ = versions['full-revisionid']
del get_versions, versions

__author__ = "Jonathan King"
__credits__ = ("Carnegie Mellon University–"
"University of Pittsburgh Joint PhD Program in Computational Biology\n"
"David Koes, PhD, Advisor.")
89 changes: 72 additions & 17 deletions sidechainnet/create.py
@@ -22,6 +22,7 @@
"""

import argparse
from collections import namedtuple
import os
import re
from multiprocessing import Pool, cpu_count
@@ -43,6 +44,10 @@
pr.confProDy(verbosity="none")
pr.confProDy(auto_secondary=False)

ArgsTuple = namedtuple(
"ArgsTuple", "casp_version training_set proteinnet_in proteinnet_out "
"sidechainnet_out regenerate_scdata limit")


def combine(pn_entry, sc_entry, aligner, pnid):
"""Supplements one entry in ProteinNet with sidechain information.
@@ -155,7 +160,45 @@ def format_sidechainnet_path(casp_version, training_split):
return f"sidechainnet_casp{casp_version}_{training_split}.pkl"


def create():
def create(casp_version=12,
training_set=30,
proteinnet_in=None,
proteinnet_out="data/proteinnet/",
sidechainnet_out="data/sidechainnet/",
regenerate_scdata=False,
limit=None):
"""Generate the requested SidechainNet dataset and save pickled result files.
This function replicates the behavior of calling `python sidechainnet/create.py` on the command line.
Args:
casp_version (int, optional): CASP dataset version (7-12). Defaults to 12.
training_set (int, optional): Training set thinning (30, 50, 70, 90, 95, 100
where 100 means 100% of the training set is kept). Defaults to 30.
proteinnet_in (str, optional): Path to raw ProteinNet text files, previously
downloaded by the user. Defaults to None.
proteinnet_out (str, optional): Path for saving processed ProteinNet records.
Defaults to "data/proteinnet/".
sidechainnet_out (str, optional): Path for saving processed SidechainNet records.
Defaults to "data/sidechainnet/".
regenerate_scdata (bool, optional): If True, regenerate raw sidechain-applicable
data instead of searching for data that has already been preprocessed.
Defaults to False.
limit (int, optional): The upper limit on the number of proteins to process,
useful when debugging. Defaults to None.
Raises:
ValueError: When ProteinNet data paths are nonexistent or not as expected.
"""
if proteinnet_in is None:
raise ValueError("Please provide a value for proteinnet_in that "
"points to the directory where raw ProteinNet files are stored.")
args = ArgsTuple(casp_version, training_set, proteinnet_in, proteinnet_out,
sidechainnet_out, regenerate_scdata, limit)
main(args)
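A minimal programmatic call, per the signature above, might look like this sketch (the input path is hypothetical; note that main() below requires it to contain the CASP version, e.g. 'casp12'):

import sidechainnet as scn

scn.create(casp_version=12,
           training_set=30,
           proteinnet_in="data/proteinnet/casp12/")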


def _create(args):
"""Generate SidechainNet for a single CASP thinning."""
# First, parse raw proteinnet files into Python dictionaries for convenience
pnids = parse_raw_proteinnet(args.proteinnet_in, args.proteinnet_out,
@@ -182,7 +225,7 @@ def create():
print(f"SidechainNet for CASP {args.casp_version} written to {sidechainnet_outfile}.")


def create_all():
def _create_all(args):
"""Generate all thinnings of a particular CASP dataset, starting with the largest."""
# First, parse raw proteinnet files into Python dictionaries for convenience
pnids = parse_raw_proteinnet(args.proteinnet_in, args.proteinnet_out, 100)
@@ -219,6 +262,29 @@ def create_all():
f"({training_set}% thinning) written to {sc_outfile}.")


def main(args_tuple):
"""Run _create or _create_all using the arguments provided by the namedtuple."""
if args_tuple.training_set != 'all':
args_tuple = args_tuple._replace(training_set=int(args_tuple.training_set))

match = re.search(r"casp(\d+)", args_tuple.proteinnet_in, re.IGNORECASE)
if not match:
raise ValueError("The proteinnet_in path does not contain 'caspX'. "
"Please ensure the raw files are enclosed "
"in a path that contains the CASP version, "
"e.g., 'casp12'.")
args_tuple = args_tuple._replace(casp_version=match.group(1))

# Initialize DSSP data
from sidechainnet.utils.download import _init_dssp_data
_init_dssp_data()

if args_tuple.training_set == 'all':
_create_all(args_tuple)
else:
_create(args_tuple)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Constructs SidechainNet.")
parser.add_argument('proteinnet_in',
@@ -254,18 +320,7 @@ def create_all():
help=('If True, then regenerate the sidechain-only data even if it already exists'
' locally.'))
args = parser.parse_args()
if args.training_set != 'all':
args.training_set = int(args.training_set)

match = re.search(r"casp(\d+)", args.proteinnet_in, re.IGNORECASE)
if not match:
raise parser.error("The input_dir does not contain 'caspX'. "
"Please ensure the raw files are enclosed "
"in a path that contains the CASP version"
" i.e. 'casp12'.")
args.casp_version = match.group(1)

if args.training_set == 'all':
create_all()
else:
create()
args_tuple = ArgsTuple(args.casp_version, args.training_set, args.proteinnet_in,
args.proteinnet_out, args.sidechainnet_out,
args.regenerate_scdata, args.limit)
main(args_tuple)
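For comparison, the equivalent command-line invocation per the argparse definitions above might be (path hypothetical; --training_set is defined in an elided portion of the parser):

python sidechainnet/create.py data/proteinnet/casp12/ --training_set 30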
27 changes: 21 additions & 6 deletions sidechainnet/utils/download.py
@@ -2,13 +2,13 @@

import multiprocessing
import os
import pkg_resources
from glob import glob
import requests

import prody as pr
import tqdm

from sidechainnet import get_data
import sidechainnet.utils.errors as errors
from sidechainnet.utils.measure import get_seq_coords_and_angles, no_nans_infs_allzeros
from sidechainnet.utils.parse import get_chain_from_astral_id, parse_astral_summary_file, parse_dssp_file
@@ -17,14 +17,28 @@
VALID_SPLITS_INTS = [10, 20, 30, 40, 50, 70, 90]
VALID_SPLITS = [f'valid-{s}' for s in VALID_SPLITS_INTS]
DATA_SPLITS = ['train', 'test'] + VALID_SPLITS
with open(get_data("astral_data.txt"), "r") as astral_file:
ASTRAL_ID_MAPPING = parse_astral_summary_file(astral_file.read().splitlines())
D_AMINO_ACID_CODES = [
"DAL", "DSN", "DTH", "DCY", "DVA", "DLE", "DIL", "MED", "DPR", "DPN", "DTY", "DTR",
"DSP", "DGL", "DSG", "DGN", "DHI", "DLY", "DAR"
]
PROTEIN_DSSP_DATA = parse_dssp_file(get_data("full_protein_dssp_annotations.json"))
PROTEIN_DSSP_DATA.update(parse_dssp_file(get_data("single_domain_dssp_annotations.json")))
ASTRAL_ID_MAPPING = None
PROTEIN_DSSP_DATA = None


def _init_dssp_data():
global PROTEIN_DSSP_DATA
global ASTRAL_ID_MAPPING
PROTEIN_DSSP_DATA = parse_dssp_file(
pkg_resources.resource_filename("sidechainnet",
"resources/full_protein_dssp_annotations.json"))
PROTEIN_DSSP_DATA.update(
parse_dssp_file(
pkg_resources.resource_filename(
"sidechainnet", "resources/single_domain_dssp_annotations.json")))
with open(
pkg_resources.resource_filename("sidechainnet", "resources/astral_data.txt"),
"r") as astral_file:
ASTRAL_ID_MAPPING = parse_astral_summary_file(astral_file.read().splitlines())
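Because the DSSP and ASTRAL lookup tables are now built lazily rather than at import time, _init_dssp_data() must run before any function that reads PROTEIN_DSSP_DATA or ASTRAL_ID_MAPPING, as main() in create.py does. A sketch (the ProteinNet ID is hypothetical):

from sidechainnet.utils.download import _init_dssp_data, process_id

_init_dssp_data()  # populates module-level PROTEIN_DSSP_DATA and ASTRAL_ID_MAPPING
result = process_id("1A2B_1_A")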


def download_sidechain_data(pnids,
Expand Down Expand Up @@ -160,6 +174,7 @@ def process_id(pnid):

# If we've made it this far, we can unpack the data and return it
dihedrals, coords, sequence = dihedrals_coords_sequence

if "#" not in pnid:
try:
dssp = PROTEIN_DSSP_DATA[pnid]
Expand Down Expand Up @@ -259,7 +274,7 @@ def get_chain_from_trainid(pnid):
# For now, if the requested coordinate set doesn't exist, then we will
# default to using the only (first) available coordinate set
struct = pr.parsePDB(pdbid, chain=chid) if use_pdb else pr.parseMMCIF(pdbid,
chain=chid)
chain=chid)
if struct and chnum > 1:
try:
chain = pr.parsePDB(pdbid, chain=chid, model=1)
2 changes: 1 addition & 1 deletion sidechainnet/utils/parse.py
@@ -148,7 +148,7 @@ def process_file(input_filename_out_dir, return_ids=False):


def parse_raw_proteinnet(proteinnet_in_dir, proteinnet_out_dir, training_set):
"""Extracts and saves information for a single ProteinNet dataset.
"""Extract and save information for a single ProteinNet dataset.
Preprocesses raw ProteinNet records by reading them and transforming them
into PyTorch-saved dictionaries. Files are kept separate due to file size.
12 changes: 6 additions & 6 deletions versioneer.py
@@ -1038,8 +1038,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
"""Get version from 'git describe' in the root of the source tree.
This only gets called if the git-archive 'subst' keywords were *not*
expanded, and _version.py hasn't already been rewritten with a short
version string, meaning we're inside a checked out source tree.
expanded, and _version.py hasn't already been rewritten with a short version
string, meaning we're inside a checked out source tree.
"""
GITS = ["git"]
if sys.platform == "win32":
@@ -1125,8 +1125,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
def do_vcs_install(manifest_in, versionfile_source, ipy):
"""Git-specific installation logic for Versioneer.
For Git, this means creating/changing .gitattributes to mark _version.py
for export-subst keyword substitution.
For Git, this means creating/changing .gitattributes to mark _version.py for
export-subst keyword substitution.
"""
GITS = ["git"]
if sys.platform == "win32":
@@ -1164,8 +1164,8 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
"""Try to determine the version from the parent directory name.
Source tarballs conventionally unpack into a directory that includes both
the project name and a version string. We will also support searching up
two directory levels for an appropriately named parent directory
the project name and a version string. We will also support searching up two
directory levels for an appropriately named parent directory
"""
rootdirs = []

