Merge branch 'master' of https://github.com/cltk/cltk

kylepjohnson · Aug 18, 2018 · 8f1d351 · 8f1d351
2 parents 8078766 + 5feef59
commit 8f1d351
Show file tree

Hide file tree

Showing 75 changed files with 5,415 additions and 585 deletions.
diff --git a/cltk/corpus/akkadian/cdli_corpus.py b/cltk/corpus/akkadian/cdli_corpus.py
@@ -0,0 +1,151 @@
+"""
+The Importer feature sets up the ability to work with cuneiform text(s)
+one-on-one, whether it is the Code of Hammurabi, a collection of texts such as
+ARM01, or whatever your research desires.
+
+This cdli_corpus module is for working with text files having already been read
+by file_importer. The file_lines required by CDLICorpus are taken from prior
+use of FileImport(text_file).read_file().
+
+e.g.:
+    # FileImport takes a txt file and reads it; this becomes file_lines.
+        text_path = os.path.join('texts', 'ARM01_texts.txt')
+        f_i = FileImport(text_path)
+        f_i.read_file()
+        ARM01 = f_i.file_lines
+    # CDLICorpus takes file_lines and uses it to work:
+        cdli = CDLICorpus()
+        cdli.parse_file(ARM01)
+        cdli.print_catalog()
+
+The output of CDLICorpus can be further utilized by the feature
+ATFConverter and its subsequent classes: Tokenizer, ATFConverter, Lemmatizer,
+and PPrint.
+"""
+
+import re
+
+__author__ = ['Andrew Deloucas <ADeloucas@g.harvard.com>']
+__license__ = 'MIT License. See LICENSE.'
+
+
+class CDLICorpus(object):
+    """
+    Takes file_lines, prepares and organizes data.
+
+    ``chunks`` holds the lines of each text found by the latest call to
+    parse_file; ``catalog`` maps each P-number to its parsed text and is added
+    to (never cleared) by every call to parse_file.
+    """
+
+    def __init__(self):
+        """
+        Starts with no chunks and an empty catalog.
+        """
+        self.chunks = []
+        self.catalog = {}
+
+    def parse_file(self, file_lines):
+        """
+        Parses lines of file into a dictionary of texts.
+        Nothing is returned; the result is stored in self.chunks and
+        self.catalog. Empty input leaves the catalog untouched.
+        :param file_lines: file_importer.file_lines
+        :return: None. Each text is added to self.catalog in the form:
+            Pnum: {'metadata': List of lines of metadata,
+                   'pnum': P-number,
+                   'edition': Bibliographic edition ('' if the header line
+                              has no '='),
+                   'raw_text': Raw lines of ATF text,
+                   'transliteration': lines of transliteration,
+                   'normalization': lines of normalization (if present),
+                   'translation': lines of translation (if present)}
+        """
+        if not file_lines:
+            # nothing to split; indexing file_lines[0] below would fail
+            self.chunks = []
+            return
+        # check to see what format the corpus is in, we assume that the headers are the same for all
+        # texts in the file... (maybe not safe?)
+        if file_lines[0].startswith('Primary publication:'):
+            header = re.compile('Primary publication:')
+        else:
+            header = re.compile(r'&?P\d{6}')
+        # separate the file into chunks of text; blank lines are dropped
+        chunks, chunk = [], []
+        for line in file_lines:
+            if header.match(line):
+                if chunk:
+                    chunks.append(chunk)
+                chunk = [line]
+            elif line:
+                chunk.append(line)
+        if chunk:  # an all-blank file must not yield an empty chunk
+            chunks.append(chunk)
+        self.chunks = chunks
+        # create a rich catalog from the chunks
+        re_translit = re.compile(r'(\d+\'?\.) ?(.*)')
+        re_normaliz = re.compile(r'(#tr\.ts:) ?(.*)')
+        re_translat = re.compile(r'(#tr\.en:) ?(.*)')
+        for chunk in self.chunks:
+            if chunk[0].startswith('Primary publication:'):
+                # we've got full metadata, add additional parsing later.
+                # The first 25 non-blank lines are metadata and the 26th is
+                # skipped. NOTE(review): presumably a section label - confirm.
+                metadata = chunk[:25]
+                text = chunk[26:]
+            else:  # no metadata
+                metadata = []
+                text = chunk
+            if not text:
+                # truncated metadata block without any ATF lines
+                continue
+            # header line looks like '&P000001 = Edition'
+            id_part, *rest = text[0].split('=')
+            pnum = id_part.replace('&', '').rstrip()
+            edition = rest[0].lstrip() if rest else ''
+            # drop the header line and the two lines following it
+            text = text[3:]
+            translit, normaliz, translat = [], [], []
+            sorters = ((re_translit, translit),
+                       (re_normaliz, normaliz),
+                       (re_translat, translat))
+            for line in text:
+                for pattern, bucket in sorters:
+                    found = pattern.match(line)
+                    if found:
+                        # group 2 is the content after the line label
+                        bucket.append(found.group(2))
+            self.catalog[pnum] = {'metadata': metadata,
+                                  'pnum': pnum,
+                                  'edition': edition,
+                                  'raw_text': text,
+                                  'transliteration': translit,
+                                  'normalization': normaliz,
+                                  'translation': translat}
+
+    def toc(self):
+        """
+        Returns a rich list of texts in the catalog, sorted by Pnum.
+        """
+        return [
+            f"Pnum: {key}, Edition: {self.catalog[key]['edition']}, "
+            f"length: {len(self.catalog[key]['transliteration'])} line(s)"
+            for key in sorted(self.catalog)]
+
+    def list_pnums(self):
+        """
+        Lists all Pnums in the catalog, sorted.
+        """
+        return sorted(self.catalog)
+
+    def list_editions(self):
+        """
+        Lists all text editions in the catalog, sorted.
+        """
+        return sorted(entry['edition'] for entry in self.catalog.values())
+
+    def print_catalog(self, catalog_filter=None):
+        """
+        Prints out a catalog of all the texts in the corpus.  Can be filtered by passing
+        a list of keys; a text is printed (once) when at least one of those
+        keys holds a non-empty value for it.
+        :param: catalog_filter = list of keys to filter by: pnum, edition,
+        metadata, raw_text, transliteration, normalization, or translation.
+        None or an empty list prints every text.
+        :raises KeyError: if a filter key is not one of the catalog fields.
+        """
+        keys = sorted(self.catalog)
+        if catalog_filter:
+            # any() keeps each Pnum at most once, however many keys match
+            keys = [key for key in keys
+                    if any(len(self.catalog[key][f]) > 0
+                           for f in catalog_filter)]
+        for key in keys:
+            print(f"Pnum: {self.catalog[key]['pnum']}")
+            print(f"Edition: {self.catalog[key]['edition']}")
+            print(f"Metadata: {len(self.catalog[key]['metadata']) > 0}")
+            print(f"Transliteration: {len(self.catalog[key]['transliteration']) > 0}")
+            print(f"Normalization: {len(self.catalog[key]['normalization']) > 0}")
+            print(f"Translation: {len(self.catalog[key]['translation']) > 0}")
+            print()
diff --git a/cltk/corpus/akkadian/file_importer.py b/cltk/corpus/akkadian/file_importer.py
@@ -0,0 +1,49 @@
+"""
+The Importer feature sets up the ability to work with cuneiform text(s)
+one-on-one, whether it is the Code of Hammurabi, a collection of texts such as
+ARM01, or whatever your research desires.
+
+This file_importer module is for importing text files. Currently, it is
+made for the purpose of reading files produced by the CDLI's "download all text"
+option: (https://cdli.ucla.edu/search/download_data_new.php?data_type=all).
+
+From this link, one can produce either a single text (e.g. Code of Hammurabi:
+https://cdli.ucla.edu/search/search_results.php?ObjectID=P249253)
+or a variety of texts through a search function (e.g. ARM 01 publication:
+https://cdli.ucla.edu/search/search_results.php?PrimaryPublication=ARM+01).
+"""
+
+import os
+
+__author__ = ['Andrew Deloucas <ADeloucas@g.harvard.com>']
+__license__ = 'MIT License. See LICENSE.'
+
+
+class FileImport(object):
+    """
+    Takes a text file and prepares it in two ways: as a whole (raw_file) and as
+    a list of strings denoting the text line by line (file_lines).
+    """
+    def __init__(self, filename):
+        """
+        :param filename: name of any downloaded file, ideally from CDLI as
+        discussed in the module docstring.
+        """
+        self.filename = filename
+        # filled in by read_file() / file_catalog(); None until then
+        self.raw_file = None
+        self.file_lines = None
+        self.catalog = None
+
+    def read_file(self):
+        """
+        Reads filename as UTF-8 text.
+        :return: None. Sets raw_file = unaltered text and file_lines = text
+        split by lines with trailing whitespace stripped from each line.
+        """
+        # read-only mode: 'r+' would needlessly require write permission
+        with open(self.filename, mode='r', encoding='utf8') as text_file:
+            self.raw_file = text_file.read()
+        self.file_lines = [x.rstrip() for x in self.raw_file.splitlines()]
+
+    def file_catalog(self):
+        """
+        Looks at the folder filename is in and lists the files in that folder.
+        :return: None. Sets catalog = sorted list of the folder's entries.
+        """
+        # a bare file name has no directory part; fall back to the current
+        # directory because os.listdir('') raises FileNotFoundError
+        folder = os.path.dirname(self.filename) or os.curdir
+        self.catalog = sorted(os.listdir(folder))
diff --git a/cltk/corpus/akkadian/pretty_print.py b/cltk/corpus/akkadian/pretty_print.py
@@ -0,0 +1,144 @@
+"""
+This module is for printing texts in Markdown or HTML.
+"""
+
+__author__ = ['Andrew Deloucas <ADeloucas@g.harvard.com>']
+__license__ = 'MIT License. See LICENSE.'
+
+class PrettyPrint(object):
+    """
+    Prints texts in markdown or in HTML.
+
+    Each method stores its latest output on the instance: markdown_text,
+    html_file (last page written by html_print_file) and html_single.
+    """
+
+    # One complete HTML page per text; filled in by _render_html.
+    _HTML_TEMPLATE = """<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<title>{edition}</title>
+</head>
+<body><table cellpadding="10"; border="1">
+<tr><th>
+<h2>{edition}<br>{pnum}</h2>
+</th><th>
+<h3>transliteration</h3>
+</th><th>
+<h3>normalization</h3>
+</th><th>
+<h3>translation</h3>
+</tr><tr><td>
+{metadata}</td><td>
+<p>{trans}
+</td><td>
+<p>{norm}
+</td><td>
+<font size='2'>
+{translation}
+</font></td></tr>
+
+</table>
+<br>
+</body>
+</html>"""
+
+    def __init__(self):
+        """
+        Starts with no rendered output.
+        """
+        self.markdown_text = None
+        self.html_file = None
+        self.html_single = None
+
+    def _render_html(self, entry):
+        """
+        Builds the HTML page for one catalog entry.
+        :param entry: one value of CDLICorpus().catalog
+        :return: the page as a string
+        """
+        # NOTE(review): values are inserted without HTML escaping, so text
+        # containing '<', '>' or '&' is emitted as-is - confirm this is wanted.
+        return self._HTML_TEMPLATE.format(
+            pnum=entry['pnum'],
+            edition=entry['edition'],
+            metadata='<br>\n'.join(entry['metadata']),
+            trans='<br>\n'.join(entry['transliteration']),
+            norm='<br>\n'.join(entry['normalization']),
+            translation='<br>\n'.join(entry['translation']))
+
+    def markdown_single_text(self, catalog, cdli_number):
+        """
+        Renders a single text of the catalog in markdown.
+        :param catalog: text ingested by cdli_corpus
+        :param cdli_number: text you wish to print
+        :return: None. The markdown is stored in self.markdown_text; nothing
+        happens if cdli_number is not in catalog.
+        """
+        if cdli_number not in catalog:
+            return
+        entry = catalog[cdli_number]
+        # continuation lines are tab-indented so they stay in the code block
+        metadata = '\n\t'.join(entry['metadata'])
+        transliteration = '\n\t'.join(entry['transliteration'])
+        normalization = '\n\t'.join(entry['normalization'])
+        translation = '\n\t'.join(entry['translation'])
+        self.markdown_text = """{edition}
+{pnum}
+---
+### metadata
+    {metadata}
+### transliteration
+    {trans}
+### normalization
+    {norm}
+### translation
+    {translation}  
+""".format(pnum=entry['pnum'], edition=entry['edition'],
+           metadata=metadata, trans=transliteration, norm=normalization,
+           translation=translation)
+
+    def html_print_file(self, catalog, destination):
+        """
+        Writes every text of the catalog to destination in html, one complete
+        HTML page after another.
+        :param catalog: CDLICorpus().catalog you wish to pretty print
+        :param destination: where you wish to save the HTML data; created if
+        missing, overwritten if present
+        :return: None. self.html_file holds the last page written.
+        """
+        # 'w' creates or truncates the file; 'r+' failed on a missing file
+        # and left stale bytes behind when the old content was longer
+        with open(destination, mode='w', encoding='utf8') as t_f:
+            for text in catalog:
+                self.html_file = self._render_html(catalog[text])
+                t_f.write(self.html_file)
+
+    def html_print_single_text(self, catalog, cdli_number, destination):
+        """
+        Writes a single text of the catalog to destination in html.
+        :param catalog: CDLICorpus().catalog
+        :param cdli_number: which text you want printed
+        :param destination: where you wish to save the HTML data; created if
+        missing, overwritten if present
+        :return: None. The page is stored in self.html_single; nothing is
+        written if cdli_number is not in catalog.
+        """
+        if cdli_number not in catalog:
+            return
+        self.html_single = self._render_html(catalog[cdli_number])
+        # 'w' rather than 'r+': see html_print_file
+        with open(destination, mode='w', encoding='utf8') as t_f:
+            t_f.write(self.html_single)