Merge pull request #171 from hammerlab/pyensembl-logger
one logger per module
julia326 committed Oct 10, 2016
2 parents 98b61b2 + 75dccf3 commit 12f2fec
Showing 13 changed files with 111 additions and 30 deletions.
8 changes: 8 additions & 0 deletions RELEASING.md
@@ -0,0 +1,8 @@
# Releasing Pyensembl

This document explains what to do once your [Pull Request](https://www.atlassian.com/git/tutorials/making-a-pull-request/) has been reviewed and all final changes have been applied. At that point you're ready to merge your branch into master and release it to the world:

0. Make sure that you have `pandoc` and `pypandoc` installed: this is needed for readme markdown on PyPI. (See [here](http://pandoc.org/installing.html) and [here](https://pypi.python.org/pypi/pypandoc), respectively, for instructions.)
1. Bump the [version](http://semver.org/) in `__init__.py`, as part of the PR you want to release.
2. Merge your branch into master.
3. Run `python setup.py sdist upload`, which pushes the newest release to PyPI.
2 changes: 1 addition & 1 deletion pyensembl/__init__.py
@@ -35,7 +35,7 @@
)
from .transcript import Transcript

-__version__ = '1.0.1'
+__version__ = '1.0.2'

def cached_release(release, species="human"):
"""
13 changes: 7 additions & 6 deletions pyensembl/database.py
@@ -28,6 +28,10 @@
# any time we update the database schema, increment this version number
DATABASE_SCHEMA_VERSION = 2

+
+logger = logging.getLogger(__name__)
+
+
class Database(object):
"""
Wrapper around sqlite3 database so that the rest of the
@@ -51,9 +55,6 @@ def __init__(self, gtf, install_string):
self.install_string = install_string
self._connection = None

-self.logger = logging.getLogger()
-self.logger.setLevel(logging.INFO)
-
def __eq__(self, other):
return (
other.__class__ is Database and
@@ -114,7 +115,7 @@ def _all_possible_indices(self, column_names):
# are not available in all releases of Ensembl (or
# other GTFs)
if column_name not in column_set:
-logging.info(
+logger.info(
"Skipping database index for {%s}",
", ".join(column_group))
skip = True
@@ -187,7 +188,7 @@ def create(self, overwrite=False):
str(self))

db_path = self.local_db_path()
print("Creating database: %s" % (db_path,))
logger.info("Creating database: %s", db_path)
df = self.gtf.dataframe()
all_index_groups = self._all_possible_indices(df.columns)

@@ -396,7 +397,7 @@ def run_sql_query(self, sql, required=False, query_params=[]):
cursor = self.connection.execute(sql, query_params)
except sqlite3.OperationalError as e:
error_message = e.message if hasattr(e, 'message') else str(e)
-logging.warn(
+logger.warn(
"Encountered error \"%s\" from query \"%s\" with parameters %s",
error_message,
sql,
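The change repeated throughout this commit is the standard per-module logger idiom, replacing both root-logger calls (`logging.info(...)`) and bare `print()` statements. A minimal sketch of the pattern (module and message names are illustrative, not taken from this diff):

```python
import logging

# One logger per module, named after the module's dotted import path
# (e.g. "pyensembl.database"). Because logger names form a hierarchy,
# a single config entry for "pyensembl" controls the whole package.
logger = logging.getLogger(__name__)

def load(path):
    # Arguments are passed separately rather than %-interpolated into
    # the string: formatting is deferred until a handler actually
    # accepts the record, so suppressed messages cost almost nothing.
    logger.info("Loading %s", path)
```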
7 changes: 5 additions & 2 deletions pyensembl/download_cache.py
@@ -19,6 +19,9 @@

import datacache

+
+logger = logging.getLogger(__name__)
+
CACHE_BASE_SUBDIR = "pyensembl"
CACHE_DIR_ENV_KEY = "PYENSEMBL_CACHE_DIR"

@@ -208,7 +211,7 @@ def _download_if_necessary(self, url, download_if_missing, overwrite):
cached_path = self.cached_path(url)
missing = not exists(cached_path)
if (missing or overwrite) and download_if_missing:
logging.info("Fetching %s from URL %s", cached_path, url)
logger.info("Fetching %s from URL %s", cached_path, url)
local_filename = split(cached_path)[1]
datacache.download._download(
filename=local_filename,
@@ -304,7 +307,7 @@ def delete_cached_files(self, prefixes=[], suffixes=[]):
any(filename.startswith(pre) for pre in prefixes))
if delete:
path = join(self.cache_directory_path, filename)
print("Deleting %s" % path)
logger.info("Deleting %s", path)
remove(path)

def delete_cache_directory(self):
8 changes: 6 additions & 2 deletions pyensembl/fasta.py
@@ -24,6 +24,10 @@

from six import binary_type, PY3

+
+logger = logging.getLogger(__name__)
+
+
def _parse_header_id(line):
"""
Pull the transcript or protein identifier from the header line
@@ -117,7 +121,7 @@ def _current_entry(self):
# entry of the file then put the last one in the dictionary
if self.current_id:
if len(self.current_lines) == 0:
logging.warn("No sequence data for '%s'" % self.current_id)
logger.warn("No sequence data for '%s'", self.current_id)
else:
sequence = b"".join(self.current_lines)
if PY3:
@@ -134,7 +138,7 @@ def _read_header(self, line):
self.current_id = _parse_header_id(line)

if len(self.current_id) == 0:
logging.warn("Unable to parse ID from header line: %s" % line)
logger.warn("Unable to parse ID from header line: %s", line)

self.current_lines = []
return previous_entry
4 changes: 0 additions & 4 deletions pyensembl/genome.py
@@ -18,7 +18,6 @@
"""

from __future__ import print_function, division, absolute_import
-import logging
from os import remove
from os.path import exists

@@ -112,9 +111,6 @@ def __init__(
self.has_gtf = self._gtf_path_or_url is not None
self.has_transcript_fasta = self._transcript_fasta_path_or_url is not None
self.has_protein_fasta = self._protein_fasta_path_or_url is not None
-
-self.logger = logging.getLogger()
-self.logger.setLevel(logging.INFO)
self.memory_cache = MemoryCache()

self._init_lazy_fields()
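With the `getLogger()`/`setLevel` calls gone from `Genome.__init__`, constructing library objects no longer reconfigures the root logger as a side effect. A sketch of how a downstream application would now opt in to pyensembl's log output (the handler setup here is an assumption, not part of this diff):

```python
import logging

# Send all records at INFO and above to stderr...
logging.basicConfig(level=logging.INFO)

# ...or turn up only pyensembl's loggers and leave everything else alone.
logging.getLogger("pyensembl").setLevel(logging.DEBUG)
```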
12 changes: 11 additions & 1 deletion pyensembl/gtf.py
@@ -13,6 +13,7 @@
# limitations under the License.

from __future__ import print_function, division, absolute_import
+import logging
from os.path import split, abspath, join, exists, splitext
import pandas as pd

@@ -22,6 +23,10 @@
from .normalization import normalize_chromosome, normalize_strand
from .memory_cache import MemoryCache

+
+logger = logging.getLogger(__name__)
+
+
class GTF(object):
"""
Parse a GTF gene annotation file from a given local path.
@@ -131,7 +136,7 @@ def _load_full_dataframe_from_gtf(self):
"""
Parse this genome source's GTF file and load it as a Pandas DataFrame
"""
print("Reading GTF from %s" % self.gtf_path)
logger.info("Reading GTF from %s", self.gtf_path)
df = read_gtf_as_dataframe(
self.gtf_path,
column_converters={
@@ -149,6 +154,7 @@ def _load_full_dataframe_from_gtf(self):
# if we have to reconstruct gene feature rows then
# fill in values for 'gene_name' and 'gene_biotype'
# but only if they're actually present in the GTF
logger.info("Creating missing gene features...")
df = create_missing_features(
dataframe=df,
unique_keys={"gene": "gene_id"},
@@ -159,8 +165,10 @@
}.intersection(column_names),
},
missing_value="")
logger.info("Done.")

if "transcript" not in features:
logger.info("Creating missing transcript features...")
df = create_missing_features(
dataframe=df,
unique_keys={"transcript": "transcript_id"},
@@ -175,6 +183,8 @@
}.intersection(column_names)
},
missing_value="")
logger.info("Done.")

return df

def dataframe(
42 changes: 42 additions & 0 deletions pyensembl/logging.conf
@@ -0,0 +1,42 @@
[loggers]
keys=root,pyensembl,datacache

[formatters]
keys=simpleFormatter

[handlers]
keys=consoleHandler,consoleHandlerCritical

[logger_root]
level=INFO
handlers=consoleHandlerCritical

[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=simpleFormatter
args=(sys.stdout,)

[handler_consoleHandlerCritical] # only for root logger: essentially silent
class=StreamHandler
level=CRITICAL
formatter=simpleFormatter
args=(sys.stdout,)

[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
datefmt=

# pyensembl

[logger_pyensembl]
level=DEBUG
qualname=pyensembl
handlers=consoleHandler

# datacache

[logger_datacache]
level=DEBUG
qualname=datacache
handlers=consoleHandler
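
Read together: the root logger's only handler passes nothing below CRITICAL, so unconfigured loggers stay effectively silent, while anything under the `pyensembl` or `datacache` hierarchies prints INFO and above to stdout. A rough sketch of the resulting behavior once the file is loaded (the logger names are illustrative; the relative path mirrors the `fileConfig` call added to shell.py below):

```python
import logging.config

# Assumes the working directory contains the pyensembl package,
# as with the call added to pyensembl/shell.py below.
logging.config.fileConfig('pyensembl/logging.conf')

logging.getLogger('pyensembl.database').info("printed to stdout")
logging.getLogger('somelib').info("dropped: root handler only emits CRITICAL")
```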
10 changes: 7 additions & 3 deletions pyensembl/memory_cache.py
@@ -32,6 +32,10 @@

from .common import load_pickle, dump_pickle

+
+logger = logging.getLogger(__name__)
+
+
class MemoryCache(object):
"""
In-memory and on-disk caching of long-running queries and computations.
@@ -44,7 +48,7 @@ def is_empty(self, filename):

def delete_file(self, path):
if exists(path):
logging.info("Deleting cached file %s" % path)
logger.info("Deleting cached file %s", path)
remove(path)

def remove_from_cache(self, key):
@@ -58,7 +62,7 @@ def clear_cached_objects(self):
self._memory_cache.clear()

def _read_csv(self, csv_path):
print("Reading Dataframe from %s" % csv_path)
logger.info("Reading Dataframe from %s", csv_path)
df = pd.read_csv(csv_path)
if 'seqname' in df:
# by default, Pandas will infer the type as int,
@@ -79,7 +83,7 @@ def _write_csv(self, df, csv_path, chunksize=10**5):
Number of rows to write at a time. Helps to limit memory
consumption while writing a CSV.
"""
print("Saving DataFrame to %s" % csv_path)
logger.info("Saving DataFrame to %s", csv_path)
df.to_csv(csv_path, index=False, chunksize=chunksize)

def cached_dataframe(self, csv_path, compute_fn):
18 changes: 11 additions & 7 deletions pyensembl/sequence_data.py
@@ -26,6 +26,10 @@
)
from .fasta import parse_fasta_dictionary

+
+logger = logging.getLogger(__name__)
+
+
class SequenceData(object):
"""
Container for reference nucleotide and amino acid sequences.
@@ -87,19 +91,19 @@ def _load_or_create_fasta_dictionary_pickle(self):
try:
self._fasta_dictionary = load_pickle(
self.fasta_dictionary_pickle_path)
-logging.info(
-"Loaded sequence dictionary from %s" % self.fasta_dictionary_pickle_path)
+logger.info(
+"Loaded sequence dictionary from %s", self.fasta_dictionary_pickle_path)
return
except (pickle.UnpicklingError, AttributeError):
# catch either an UnpicklingError or an AttributeError
# resulting from pickled objects referring to classes
# that no longer exist
-logging.warn(
-"Failed to load %s, attempting to read FASTA directly" % (
-self.fasta_dictionary_pickle_path,))
-logging.info("Parsing sequences from FASTA file at %s" % self.fasta_path)
+logger.warn(
+"Failed to load %s, attempting to read FASTA directly",
+self.fasta_dictionary_pickle_path)
+logger.info("Parsing sequences from FASTA file at %s", self.fasta_path)
self._fasta_dictionary = parse_fasta_dictionary(self.fasta_path)
logging.info("Saving sequence dictionary to %s" % self.fasta_dictionary_pickle_path)
logger.info("Saving sequence dictionary to %s", self.fasta_dictionary_pickle_path)
dump_pickle(self._fasta_dictionary, self.fasta_dictionary_pickle_path)

def index(self, overwrite=False):
11 changes: 9 additions & 2 deletions pyensembl/shell.py
@@ -43,10 +43,17 @@

from __future__ import absolute_import
import argparse
+import logging
+import logging.config

from .ensembl_release import EnsemblRelease
from .genome import Genome

+
+logging.config.fileConfig('pyensembl/logging.conf')
+logger = logging.getLogger(__name__)
+
+
def run():
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
@@ -134,11 +141,11 @@ def run():
EnsemblRelease(version, species=args.species))

if len(genomes) == 0:
print("ERROR: No genomes selected!\n")
logger.error("ERROR: No genomes selected!")
parser.print_help()

for genome in genomes:
print("-- Running '%s' for %s" % (args.action, genome))
logger.info("Running '%s' for %s", args.action, genome)
if args.action == "delete-all-files":
genome.download_cache.delete_cache_directory()
elif args.action == "delete-index-files":
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
numpy>=1.7
pandas>=0.13.1
-datacache>=0.4.16
+datacache>=0.4.19
memoized-property>=1.0.2
nose>=1.3.3
tinytimer>=0.0.0
4 changes: 3 additions & 1 deletion setup.py
@@ -74,12 +74,14 @@
"typechecks>=0.0.2",
"numpy>=1.7",
"pandas>=0.15",
"datacache>=0.4.16",
"datacache>=0.4.19",
"memoized-property>=1.0.2",
"six>=1.9.0",
"gtfparse>=0.0.3",
"serializable",
"tinytimer",
],
long_description=readme,
packages=['pyensembl'],
+package_data={'pyensembl': ['logging.conf']},
)
