Merge pull request #171 from hammerlab/pyensembl-logger
one logger per module
julia326 committed Oct 10, 2016
2 parents 98b61b2 + 75dccf3 commit 12f2fec
Showing 13 changed files with 111 additions and 30 deletions.
8 changes: 8 additions & 0 deletions RELEASING.md
@@ -0,0 +1,8 @@
# Releasing Pyensembl

This document explains what to do once your [Pull Request](https://www.atlassian.com/git/tutorials/making-a-pull-request/) has been reviewed and all final changes have been applied. At that point you're ready to merge your branch into master and release it to the world:

0. Make sure that you have `pandoc` and `pypandoc` installed: this is needed for readme markdown on PyPI. (See [here](http://pandoc.org/installing.html) and [here](https://pypi.python.org/pypi/pypandoc), respectively, for instructions.)
1. Bump the [version](http://semver.org/) in `__init__.py`, as part of the PR you want to release.
2. Merge your branch into master.
3. Run `python setup.py sdist upload`, which pushes the newest release to PyPI.
2 changes: 1 addition & 1 deletion pyensembl/__init__.py
@@ -35,7 +35,7 @@
)
from .transcript import Transcript

-__version__ = '1.0.1'
+__version__ = '1.0.2'

def cached_release(release, species="human"):
"""
13 changes: 7 additions & 6 deletions pyensembl/database.py
@@ -28,6 +28,10 @@
# any time we update the database schema, increment this version number
DATABASE_SCHEMA_VERSION = 2

+
+logger = logging.getLogger(__name__)
+
+
class Database(object):
"""
Wrapper around sqlite3 database so that the rest of the
@@ -51,9 +55,6 @@ def __init__(self, gtf, install_string):
self.install_string = install_string
self._connection = None

-self.logger = logging.getLogger()
-self.logger.setLevel(logging.INFO)
-
def __eq__(self, other):
return (
other.__class__ is Database and
@@ -114,7 +115,7 @@ def _all_possible_indices(self, column_names):
# are not available in all releases of Ensembl (or
# other GTFs)
if column_name not in column_set:
-logging.info(
+logger.info(
"Skipping database index for {%s}",
", ".join(column_group))
skip = True
@@ -187,7 +188,7 @@ def create(self, overwrite=False):
str(self))

db_path = self.local_db_path()
print("Creating database: %s" % (db_path,))
logger.info("Creating database: %s", db_path)
df = self.gtf.dataframe()
all_index_groups = self._all_possible_indices(df.columns)

@@ -396,7 +397,7 @@ def run_sql_query(self, sql, required=False, query_params=[]):
cursor = self.connection.execute(sql, query_params)
except sqlite3.OperationalError as e:
error_message = e.message if hasattr(e, 'message') else str(e)
-logging.warn(
+logger.warn(
"Encountered error \"%s\" from query \"%s\" with parameters %s",
error_message,
sql,
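The change repeated throughout this commit is the standard per-module logger idiom, replacing both root-logger calls (`logging.info(...)`) and bare `print()` statements. A minimal sketch of the pattern (module and message names are illustrative, not taken from this diff):

```python
import logging

# One logger per module, named after the module's dotted import path
# (e.g. "pyensembl.database"). Because logger names form a hierarchy,
# a single config entry for "pyensembl" controls the whole package.
logger = logging.getLogger(__name__)

def load(path):
    # Arguments are passed separately rather than %-interpolated into
    # the string: formatting is deferred until a handler actually
    # accepts the record, so suppressed messages cost almost nothing.
    logger.info("Loading %s", path)
```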
7 changes: 5 additions & 2 deletions pyensembl/download_cache.py
@@ -19,6 +19,9 @@

import datacache

+
+logger = logging.getLogger(__name__)
+
CACHE_BASE_SUBDIR = "pyensembl"
CACHE_DIR_ENV_KEY = "PYENSEMBL_CACHE_DIR"

@@ -208,7 +211,7 @@ def _download_if_necessary(self, url, download_if_missing, overwrite):
cached_path = self.cached_path(url)
missing = not exists(cached_path)
if (missing or overwrite) and download_if_missing:
logging.info("Fetching %s from URL %s", cached_path, url)
logger.info("Fetching %s from URL %s", cached_path, url)
local_filename = split(cached_path)[1]
datacache.download._download(
filename=local_filename,
@@ -304,7 +307,7 @@ def delete_cached_files(self, prefixes=[], suffixes=[]):
any(filename.startswith(pre) for pre in prefixes))
if delete:
path = join(self.cache_directory_path, filename)
print("Deleting %s" % path)
logger.info("Deleting %s", path)
remove(path)

def delete_cache_directory(self):
8 changes: 6 additions & 2 deletions pyensembl/fasta.py
@@ -24,6 +24,10 @@

from six import binary_type, PY3

+
+logger = logging.getLogger(__name__)
+
+
def _parse_header_id(line):
"""
Pull the transcript or protein identifier from the header line
@@ -117,7 +121,7 @@ def _current_entry(self):
# entry of the file then put the last one in the dictionary
if self.current_id:
if len(self.current_lines) == 0:
logging.warn("No sequence data for '%s'" % self.current_id)
logger.warn("No sequence data for '%s'", self.current_id)
else:
sequence = b"".join(self.current_lines)
if PY3:
@@ -134,7 +138,7 @@ def _read_header(self, line):
self.current_id = _parse_header_id(line)

if len(self.current_id) == 0:
logging.warn("Unable to parse ID from header line: %s" % line)
logger.warn("Unable to parse ID from header line: %s", line)

self.current_lines = []
return previous_entry
4 changes: 0 additions & 4 deletions pyensembl/genome.py
@@ -18,7 +18,6 @@
"""

from __future__ import print_function, division, absolute_import
-import logging
from os import remove
from os.path import exists

@@ -112,9 +111,6 @@ def __init__(
self.has_gtf = self._gtf_path_or_url is not None
self.has_transcript_fasta = self._transcript_fasta_path_or_url is not None
self.has_protein_fasta = self._protein_fasta_path_or_url is not None
-
-self.logger = logging.getLogger()
-self.logger.setLevel(logging.INFO)
self.memory_cache = MemoryCache()

self._init_lazy_fields()
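With the `getLogger()`/`setLevel` calls gone from `Genome.__init__`, constructing library objects no longer reconfigures the root logger as a side effect. A sketch of how a downstream application would now opt in to pyensembl's log output (the handler setup here is an assumption, not part of this diff):

```python
import logging

# Send all records at INFO and above to stderr...
logging.basicConfig(level=logging.INFO)

# ...or turn up only pyensembl's loggers and leave everything else alone.
logging.getLogger("pyensembl").setLevel(logging.DEBUG)
```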
12 changes: 11 additions & 1 deletion pyensembl/gtf.py
@@ -13,6 +13,7 @@
# limitations under the License.

from __future__ import print_function, division, absolute_import
+import logging
from os.path import split, abspath, join, exists, splitext
import pandas as pd

@@ -22,6 +23,10 @@
from .normalization import normalize_chromosome, normalize_strand
from .memory_cache import MemoryCache

+
+logger = logging.getLogger(__name__)
+
+
class GTF(object):
"""
Parse a GTF gene annotation file from a given local path.
@@ -131,7 +136,7 @@ def _load_full_dataframe_from_gtf(self):
"""
Parse this genome source's GTF file and load it as a Pandas DataFrame
"""
print("Reading GTF from %s" % self.gtf_path)
logger.info("Reading GTF from %s", self.gtf_path)
df = read_gtf_as_dataframe(
self.gtf_path,
column_converters={
@@ -149,6 +154,7 @@ def _load_full_dataframe_from_gtf(self):
# if we have to reconstruct gene feature rows then
# fill in values for 'gene_name' and 'gene_biotype'
# but only if they're actually present in the GTF
logger.info("Creating missing gene features...")
df = create_missing_features(
dataframe=df,
unique_keys={"gene": "gene_id"},
@@ -159,8 +165,10 @@
}.intersection(column_names),
},
missing_value="")
logger.info("Done.")

if "transcript" not in features:
logger.info("Creating missing transcript features...")
df = create_missing_features(
dataframe=df,
unique_keys={"transcript": "transcript_id"},
@@ -175,6 +183,8 @@
}.intersection(column_names)
},
missing_value="")
logger.info("Done.")

return df

def dataframe(
42 changes: 42 additions & 0 deletions pyensembl/logging.conf
@@ -0,0 +1,42 @@
[loggers]
keys=root,pyensembl,datacache

[formatters]
keys=simpleFormatter

[handlers]
keys=consoleHandler,consoleHandlerCritical

[logger_root]
level=INFO
handlers=consoleHandlerCritical

[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=simpleFormatter
args=(sys.stdout,)

[handler_consoleHandlerCritical] # only for root logger: essentially silent
class=StreamHandler
level=CRITICAL
formatter=simpleFormatter
args=(sys.stdout,)

[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
datefmt=

# pyensembl

[logger_pyensembl]
level=DEBUG
qualname=pyensembl
handlers=consoleHandler

# datacache

[logger_datacache]
level=DEBUG
qualname=datacache
handlers=consoleHandler
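
Read together: the root logger's only handler passes nothing below CRITICAL, so unconfigured loggers stay effectively silent, while anything under the `pyensembl` or `datacache` hierarchies prints INFO and above to stdout. A rough sketch of the resulting behavior once the file is loaded (the logger names are illustrative; the relative path mirrors the `fileConfig` call added to shell.py below):

```python
import logging.config

# Assumes the working directory contains the pyensembl package,
# as with the call added to pyensembl/shell.py below.
logging.config.fileConfig('pyensembl/logging.conf')

logging.getLogger('pyensembl.database').info("printed to stdout")
logging.getLogger('somelib').info("dropped: root handler only emits CRITICAL")
```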
10 changes: 7 additions & 3 deletions pyensembl/memory_cache.py
@@ -32,6 +32,10 @@

from .common import load_pickle, dump_pickle

+
+logger = logging.getLogger(__name__)
+
+
class MemoryCache(object):
"""
In-memory and on-disk caching of long-running queries and computations.
@@ -44,7 +48,7 @@ def is_empty(self, filename):

def delete_file(self, path):
if exists(path):
logging.info("Deleting cached file %s" % path)
logger.info("Deleting cached file %s", path)
remove(path)

def remove_from_cache(self, key):
@@ -58,7 +62,7 @@ def clear_cached_objects(self):
self._memory_cache.clear()

def _read_csv(self, csv_path):
print("Reading Dataframe from %s" % csv_path)
logger.info("Reading Dataframe from %s", csv_path)
df = pd.read_csv(csv_path)
if 'seqname' in df:
# by default, Pandas will infer the type as int,
@@ -79,7 +83,7 @@ def _write_csv(self, df, csv_path, chunksize=10**5):
Number of rows to write at a time. Helps to limit memory
consumption while writing a CSV.
"""
print("Saving DataFrame to %s" % csv_path)
logger.info("Saving DataFrame to %s", csv_path)
df.to_csv(csv_path, index=False, chunksize=chunksize)

def cached_dataframe(self, csv_path, compute_fn):
18 changes: 11 additions & 7 deletions pyensembl/sequence_data.py
@@ -26,6 +26,10 @@
)
from .fasta import parse_fasta_dictionary

+
+logger = logging.getLogger(__name__)
+
+
class SequenceData(object):
"""
Container for reference nucleotide and amino acid sequences.
@@ -87,19 +91,19 @@ def _load_or_create_fasta_dictionary_pickle(self):
try:
self._fasta_dictionary = load_pickle(
self.fasta_dictionary_pickle_path)
-logging.info(
-"Loaded sequence dictionary from %s" % self.fasta_dictionary_pickle_path)
+logger.info(
+"Loaded sequence dictionary from %s", self.fasta_dictionary_pickle_path)
return
except (pickle.UnpicklingError, AttributeError):
# catch either an UnpicklingError or an AttributeError
# resulting from pickled objects referring to classes
# that no longer exist
-logging.warn(
-"Failed to load %s, attempting to read FASTA directly" % (
-self.fasta_dictionary_pickle_path,))
-logging.info("Parsing sequences from FASTA file at %s" % self.fasta_path)
+logger.warn(
+"Failed to load %s, attempting to read FASTA directly",
+self.fasta_dictionary_pickle_path)
+logger.info("Parsing sequences from FASTA file at %s", self.fasta_path)
self._fasta_dictionary = parse_fasta_dictionary(self.fasta_path)
logging.info("Saving sequence dictionary to %s" % self.fasta_dictionary_pickle_path)
logger.info("Saving sequence dictionary to %s", self.fasta_dictionary_pickle_path)
dump_pickle(self._fasta_dictionary, self.fasta_dictionary_pickle_path)

def index(self, overwrite=False):
11 changes: 9 additions & 2 deletions pyensembl/shell.py
@@ -43,10 +43,17 @@

from __future__ import absolute_import
import argparse
+import logging
+import logging.config

from .ensembl_release import EnsemblRelease
from .genome import Genome

+
+logging.config.fileConfig('pyensembl/logging.conf')
+logger = logging.getLogger(__name__)
+
+
def run():
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
@@ -134,11 +141,11 @@ def run():
EnsemblRelease(version, species=args.species))

if len(genomes) == 0:
print("ERROR: No genomes selected!\n")
logger.error("ERROR: No genomes selected!")
parser.print_help()

for genome in genomes:
print("-- Running '%s' for %s" % (args.action, genome))
logger.info("Running '%s' for %s", args.action, genome)
if args.action == "delete-all-files":
genome.download_cache.delete_cache_directory()
elif args.action == "delete-index-files":
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
numpy>=1.7
pandas>=0.13.1
-datacache>=0.4.16
+datacache>=0.4.19
memoized-property>=1.0.2
nose>=1.3.3
tinytimer>=0.0.0
4 changes: 3 additions & 1 deletion setup.py
@@ -74,12 +74,14 @@
"typechecks>=0.0.2",
"numpy>=1.7",
"pandas>=0.15",
"datacache>=0.4.16",
"datacache>=0.4.19",
"memoized-property>=1.0.2",
"six>=1.9.0",
"gtfparse>=0.0.3",
"serializable",
"tinytimer",
],
long_description=readme,
packages=['pyensembl'],
+package_data={'pyensembl': ['logging.conf']},
)
