Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[hail] grep return strings #7608

Merged
merged 2 commits into from Nov 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
78 changes: 45 additions & 33 deletions hail/python/hail/methods/impex.py
Expand Up @@ -887,41 +887,53 @@ def import_fam(path, quant_pheno=False, delimiter=r'\\s+', missing='NA') -> Tabl

@typecheck(regex=str,
path=oneof(str, sequenceof(str)),
max_count=int)
def grep(regex, path, max_count=100):
max_count=int,
show=bool)
def grep(regex, path, max_count=100, *, show=True):
r"""Searches given paths for all lines containing regex matches.

Examples
--------

Print all lines containing the string ``hello`` in *file.txt*:

>>> hl.grep('hello','data/file.txt')

Print all lines containing digits in *file1.txt* and *file2.txt*:

>>> hl.grep('\d', ['data/file1.txt','data/file2.txt'])

Notes
-----
:func:`.grep` mimics the basic functionality of Unix ``grep`` in
parallel, printing results to the screen. This command is provided as a
convenience to those in the statistical genetics community who often
search enormous text files like VCFs. Hail uses `Java regular expression
patterns
<https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html>`__.
The `RegExr sandbox <http://regexr.com/>`__ may be helpful.

Parameters
----------
regex : :obj:`str`
The regular expression to match.
path : :obj:`str` or :obj:`list` of :obj:`str`
The files to search.
max_count : :obj:`int`
The maximum number of matches to return
"""
Env.hc()._jhc.grep(regex, jindexed_seq_args(path), max_count)
Examples
--------

Print all lines containing the string ``hello`` in *file.txt*:

>>> hl.grep('hello','data/file.txt')

Print all lines containing digits in *file1.txt* and *file2.txt*:

>>> hl.grep('\d', ['data/file1.txt','data/file2.txt'])

Notes
-----
:func:`.grep` mimics the basic functionality of Unix ``grep`` in
parallel, printing results to the screen. This command is provided as a
convenience to those in the statistical genetics community who often
search enormous text files like VCFs. Hail uses `Java regular expression
patterns
<https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html>`__.
The `RegExr sandbox <http://regexr.com/>`__ may be helpful.

Parameters
----------
regex : :obj:`str`
The regular expression to match.
path : :obj:`str` or :obj:`list` of :obj:`str`
The files to search.
max_count : :obj:`int`
The maximum number of matches to return
show : :obj:`bool`
When `True`, show the values on stdout. When `False`, return a
dictionary mapping file names to lines.

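Returns
-------
:obj:`dict` of :obj:`str` to :obj:`list` of :obj:`str`, or ``None``
The matching lines keyed by file name when `show` is `False`; nothing is
returned when `show` is `True`.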
"""
if show:
Env.hc()._jhc.grepPrint(regex, jindexed_seq_args(path), max_count)
else:
jarr = Env.hc()._jhc.grepReturn(regex, jindexed_seq_args(path), max_count)
return {x._1(): list(x._2()) for x in jarr}


@typecheck(path=oneof(str, sequenceof(str)),
Expand Down
16 changes: 15 additions & 1 deletion hail/python/test/hail/methods/test_impex.py
Expand Up @@ -1742,4 +1742,18 @@ def test_read_write_identity_keyed(self):
def test_import_same(self):
ht = hl.import_table(resource('sampleAnnotations.tsv'))
ht2 = hl.import_table(resource('sampleAnnotations.tsv'))
assert ht._same(ht2)
assert ht._same(ht2)


class GrepTests(unittest.TestCase):
    """Tests for ``hl.grep``."""

    def test_grep_show_false(self):
        """With ``show=False`` the matches come back as a dict keyed by file name."""
        annotations_hits = ['HG00120\tCASE\t19599', 'HG00121\tCASE\t4832']
        rename_hits = ['HG00120\tB_HG00120', 'HG00121\tB_HG00121']
        annotations2_hits = [
            'HG00120\t3919.8\t19589',
            'HG00121\t966.4\t4822',
            'HG00120_B\t3919.8\t19589',
            'HG00121_B\t966.4\t4822',
            'HG00120_B_B\t3919.8\t19589',
            'HG00121_B_B\t966.4\t4822',
        ]
        expected = {
            'sampleAnnotations.tsv': annotations_hits,
            'sample2_rename.tsv': rename_hits,
            'sampleAnnotations2.tsv': annotations2_hits,
        }

        # The glob is searched as a whole; only files with a match appear as keys.
        actual = hl.grep('HG0012[0-1]', resource('*.tsv'), show=False)
        assert actual == expected
26 changes: 18 additions & 8 deletions hail/src/main/scala/is/hail/HailContext.scala
Expand Up @@ -637,22 +637,32 @@ class HailContext private(

def version: String = is.hail.HAIL_PRETTY_VERSION

def grep(regex: String, files: Seq[String], maxLines: Int = 100) {
private[this] def fileAndLineCounts(
regex: String,
files: Seq[String],
maxLines: Int
): Map[String, Array[WithContext[String]]] = {
val regexp = regex.r
sc.textFilesLines(sFS.globAll(files))
.filter(line => regexp.findFirstIn(line.value).isDefined)
.take(maxLines)
.groupBy(_.source.asInstanceOf[Context].file)
.foreach { case (file, lines) =>
info(s"$file: ${ lines.length } ${ plural(lines.length, "match", "matches") }:")
lines.map(_.value).foreach { line =>
val (screen, logged) = line.truncatable().strings
log.info("\t" + logged)
println(s"\t$screen")
}
}

/** Prints every line matching `regex` in `files`, grouped by the file it came from.
  *
  * For each file returned by `fileAndLineCounts`, a header with the file name and its
  * match count is reported through `info`; each matching line is then written to the
  * log and to standard output, prefixed with a tab.
  *
  * @param regex regular expression to search for
  * @param files paths to search, forwarded unchanged to `fileAndLineCounts`
  * @param maxLines limit on matches, forwarded unchanged to `fileAndLineCounts`
  */
def grepPrint(regex: String, files: Seq[String], maxLines: Int): Unit = {
  fileAndLineCounts(regex, files, maxLines).foreach { case (file, lines) =>
    // Computed once; used for both the count and the singular/plural choice.
    val nMatches = lines.length
    info(s"$file: ${ nMatches } ${ plural(nMatches, "match", "matches") }:")
    lines.foreach { line =>
      // The pair from `truncatable().strings` is split into the text printed to
      // stdout and the text written to the log.
      val (screen, logged) = line.value.truncatable().strings
      log.info("\t" + logged)
      println(s"\t$screen")
    }
  }
}

/** Collects the lines matching `regex` in `files`, grouped by the file they came from.
  *
  * @param regex regular expression to search for
  * @param files paths to search, forwarded unchanged to `fileAndLineCounts`
  * @param maxLines limit on matches, forwarded unchanged to `fileAndLineCounts`
  * @return one `(file, matching lines)` pair per entry produced by `fileAndLineCounts`
  */
def grepReturn(regex: String, files: Seq[String], maxLines: Int): Array[(String, Array[String])] = {
  val matchesByFile = fileAndLineCounts(regex, files, maxLines)
  // Materialise first, then strip the context wrapper from each matched line.
  matchesByFile.toArray.map { case (file, matches) =>
    (file, matches.map(_.value))
  }
}

def getTemporaryFile(nChar: Int = 10, prefix: Option[String] = None, suffix: Option[String] = None): String =
sFS.getTemporaryFile(tmpDir, nChar, prefix, suffix)

Expand Down