
Commit

Merge branch 'master' of https://github.com/cltk/cltk
Travis CI committed Aug 18, 2018
2 parents 8078766 + 5feef59 commit 8f1d351
Showing 75 changed files with 5,415 additions and 585 deletions.
151 changes: 151 additions & 0 deletions cltk/corpus/akkadian/cdli_corpus.py
@@ -0,0 +1,151 @@
"""
The Importer feature sets up the ability to work with cuneiform text(s)
one-on-one, whether it is the Code of Hammurabi, a collection of texts such as
ARM01, or whatever your research desires.
This cdli_corpus module is for working with text files that have already been
read by file_importer. The file_lines required by CDLICorpus come from a prior
call to FileImport(text_file).read_file().
e.g.:
# FileImport takes a txt file and reads it; this becomes file_lines.
text_path = os.path.join('texts', 'ARM01_texts.txt')
f_i = FileImport(text_path)
f_i.read_file()
ARM01 = f_i.file_lines
# CDLICorpus takes file_lines and uses it to work:
cdli = CDLICorpus()
cdli.parse_file(ARM01)
cdli.print_catalog()
The output of CDLICorpus can be further utilized by the ATFConverter feature
and its related classes: Tokenizer, Lemmatizer, and PPrint.
"""

import re

__author__ = ['Andrew Deloucas <ADeloucas@g.harvard.com>']
__license__ = 'MIT License. See LICENSE.'


class CDLICorpus(object):
"""
Takes file_lines, prepares and organizes data.
"""

def __init__(self):
"""
Sets up an empty list of chunks and an empty catalog.
"""
self.chunks = []
self.catalog = {}

def parse_file(self, file_lines):
"""
Parses lines of file into a dictionary of texts.
:param file_lines: file_importer.file_lines
:return: Each text in the form:
Pnum: {'metadata': List of lines of metadata,
'pnum': P-number,
'edition': Bibliographic edition,
'raw_text': Raw lines of ATF text,
'transliteration': lines of transliteration,
'normalization': lines of normalization (if present),
'translation': lines of translation (if present)}
"""
# separate the file into chunks of text
chunks, chunk = [], []
# check which format the corpus is in; we assume that the headers are the same
# for all texts in the file (which may not always be safe)
if re.match('Primary publication:', file_lines[0]):
header = re.compile('Primary publication:')
else:
header = re.compile(r'&?P\d{6}')
for line in file_lines:
if header.match(line):
if len(chunk) > 0: # pylint: disable=len-as-condition
chunks.append(chunk)
chunk = [line]
else:
if len(line) > 0: # pylint: disable=len-as-condition
chunk.append(line)
chunks.append(chunk)
self.chunks = chunks
# create a rich catalog from the chunks
re_translit = re.compile(r'(\d+\'?\.) ?(.*)')
re_normaliz = re.compile(r'(#tr\.ts:) ?(.*)')
re_translat = re.compile(r'(#tr\.en:) ?(.*)')
for chunk in self.chunks:
text = chunk
if chunk[0].startswith('Primary publication:'):
# we've got full metadata, add additional parsing later
metadata = chunk[:25]
text = chunk[26:]
else: # no metadata
metadata = []
pnum = ''.join([c for c in text[0].split('=')[0] if c != '&']).rstrip()
edition = text[0].split('=')[1].lstrip()
text = text[3:]
translit = []
normaliz = []
translat = []
for line in text:
if re.match(r'\d+\'?\.', line):
translit.append(re_translit.match(line).groups()[1])
if line.startswith('#tr.ts:'):
normaliz.append(re_normaliz.match(line).groups()[1])
if line.startswith('#tr.en:'):
translat.append(re_translat.match(line).groups()[1])
self.catalog[pnum] = {'metadata': metadata,
'pnum': pnum,
'edition': edition,
'raw_text': text,
'transliteration': translit,
'normalization': normaliz,
'translation': translat}

def toc(self):
"""
Returns a summary line for each text in the catalog (P-number, edition, and transliteration length).
"""
return [
f"Pnum: {key}, Edition: {self.catalog[key]['edition']}, "
f"length: {len(self.catalog[key]['transliteration'])} line(s)"
for key in sorted(self.catalog.keys())]

def list_pnums(self):
"""
Lists all Pnums in the catalog.
"""
return sorted([key for key in self.catalog])

def list_editions(self):
"""
Lists all text editions in the catalog.
"""
return sorted([self.catalog[key]['edition'] for key in self.catalog])

def print_catalog(self, catalog_filter=None):
"""
Prints a catalog of all the texts in the corpus. The output can be
filtered by passing a list of keys that must be non-empty in a text.
:param catalog_filter: optional list of keys to filter on; valid keys are
pnum, edition, metadata, transliteration, normalization, or translation.
"""
keys = sorted(self.catalog.keys())
if catalog_filter:
keys = [key for key in keys if any(
len(self.catalog[key][f]) > 0 for f in catalog_filter)]
for key in keys:
print(f"Pnum: {self.catalog[key]['pnum']}")
print(f"Edition: {self.catalog[key]['edition']}")
print(f"Metadata: {len(self.catalog[key]['metadata']) > 0}")
print(f"Transliteration: {len(self.catalog[key]['transliteration']) > 0}")
print(f"Normalization: {len(self.catalog[key]['normalization']) > 0}")
print(f"Translation: {len(self.catalog[key]['translation']) > 0}")
print()
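For orientation, a minimal usage sketch of the catalog helpers defined above. The file path and the P-numbers it would print are hypothetical; the import paths follow the file locations in this commit, and the input is assumed to be a CDLI download already read by FileImport as described in the module docstring.

import os

from cltk.corpus.akkadian.file_importer import FileImport
from cltk.corpus.akkadian.cdli_corpus import CDLICorpus

# Hypothetical path to a CDLI "download all text" file.
text_path = os.path.join('texts', 'ARM01_texts.txt')
f_i = FileImport(text_path)
f_i.read_file()

cdli = CDLICorpus()
cdli.parse_file(f_i.file_lines)

print(cdli.toc())            # one summary line per text (Pnum, edition, length)
print(cdli.list_pnums())     # sorted P-numbers
print(cdli.list_editions())  # sorted bibliographic editions
cdli.print_catalog(catalog_filter=['translation'])  # only texts with a translation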
49 changes: 49 additions & 0 deletions cltk/corpus/akkadian/file_importer.py
@@ -0,0 +1,49 @@
"""
The Importer feature sets up the ability to work with cuneiform text(s)
one-on-one, whether it is the Code of Hammurabi, a collection of texts such as
ARM01, or whatever your research desires.
This file_importer module is for importing text files. Currently, it is
designed to read files produced by the CDLI's "download all text" option
(https://cdli.ucla.edu/search/download_data_new.php?data_type=all).
From that page, one can download either a single text (e.g. Code of Hammurabi:
https://cdli.ucla.edu/search/search_results.php?ObjectID=P249253)
or a set of texts found through the search function (e.g. the ARM 01 publication:
https://cdli.ucla.edu/search/search_results.php?PrimaryPublication=ARM+01).
"""

import os

__author__ = ['Andrew Deloucas <ADeloucas@g.harvard.com>']
__license__ = 'MIT License. See LICENSE.'


class FileImport(object):
"""
Takes a text file and prepares it in two ways: as a whole (raw_file) and as
a list of strings, one per line of text (file_lines).
"""
def __init__(self, filename):
"""
:param filename: name of any downloaded file, ideally from CDLI as
discussed in the method docstring.
"""
self.filename = filename

def read_file(self):
"""
Reads the file at filename.
:return: raw_file = unaltered text; file_lines = text split by lines.
"""
with open(self.filename, mode='r', encoding='utf8') as text_file:
self.raw_file = text_file.read() # pylint: disable= attribute-defined-outside-init
self.file_lines = [x.rstrip() for x in self.raw_file.splitlines()] # pylint: disable= attribute-defined-outside-init

def file_catalog(self):
"""
Lists the other files in the folder that contains filename.
:return: sorted list of files, stored in self.catalog.
"""
pathway = os.path.split(self.filename)
self.catalog = sorted(os.listdir(pathway[0])) # pylint: disable= attribute-defined-outside-init
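A minimal sketch of FileImport in use (the path is hypothetical): read_file() populates raw_file and file_lines, and file_catalog() lists the other files sitting in the same folder.

import os

from cltk.corpus.akkadian.file_importer import FileImport

text_path = os.path.join('texts', 'ARM01_texts.txt')  # hypothetical CDLI download
f_i = FileImport(text_path)
f_i.read_file()
print(f_i.file_lines[:5])  # first few lines of the text file
f_i.file_catalog()
print(f_i.catalog)         # other files in the 'texts' folder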
144 changes: 144 additions & 0 deletions cltk/corpus/akkadian/pretty_print.py
@@ -0,0 +1,144 @@
"""
This module is for printing texts in Markdown or HTML.
"""

__author__ = ['Andrew Deloucas <ADeloucas@g.harvard.com>']
__license__ = 'MIT License. See LICENSE.'

class PrettyPrint(object):
"""
Prints texts in markdown or in HTML.
"""
def __init__(self):
"""
Empty.
"""

def markdown_single_text(self, catalog, cdli_number):
"""
Formats a single text from the catalog as Markdown.
:param catalog: text ingested by cdli_corpus
:param cdli_number: which text you wish to format
:return: Markdown string, stored in self.markdown_text
"""
if cdli_number in catalog:
pnum = catalog[cdli_number]['pnum']
edition = catalog[cdli_number]['edition']
metadata = '\n\t'.join(catalog[cdli_number]['metadata'])
transliteration = '\n\t'.join(catalog[cdli_number]['transliteration'])
normalization = '\n\t'.join(catalog[cdli_number]['normalization'])
translation = '\n\t'.join(catalog[cdli_number]['translation'])
m_d = """{edition}
{pnum}
---
### metadata
{metadata}
### transliteration
{trans}
### normalization
{norm}
### translation
{translation}
""".format(pnum=pnum, edition=edition, metadata=metadata,
trans=transliteration, norm=normalization,
translation=translation)
self.markdown_text = m_d # pylint: disable=attribute-defined-outside-init

def html_print_file(self, catalog, destination):
"""
Prints the whole catalog as HTML.
:param catalog: CDLICorpus().catalog
:param destination: where you wish to save the HTML data
:return: HTML written to destination
"""
with open(destination, mode='w', encoding='utf8') as t_f:
for text in catalog:
pnum = catalog[text]['pnum']
edition = catalog[text]['edition']
metadata = '<br>\n'.join(catalog[text]['metadata'])
transliteration = '<br>\n'.join(catalog[text]['transliteration'])
normalization = '<br>\n'.join(catalog[text]['normalization'])
translation = '<br>\n'.join(catalog[text]['translation'])
self.html_file = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{edition}</title>
</head>
<body><table cellpadding="10" border="1">
<tr><th>
<h2>{edition}<br>{pnum}</h2>
</th><th>
<h3>transliteration</h3>
</th><th>
<h3>normalization</h3>
</th><th>
<h3>translation</h3>
</th></tr><tr><td>
{metadata}</td><td>
<p>{trans}
</td><td>
<p>{norm}
</td><td>
<font size='2'>
{translation}
</font></td></tr>
</table>
<br>
</body>
</html>""".format(
pnum=pnum, edition=edition, metadata=metadata,
trans=transliteration, norm=normalization,
translation=translation)
t_f.write(self.html_file)

def html_print_single_text(self, catalog, cdli_number, destination):
"""
Prints a single text from the catalog as HTML.
:param catalog: CDLICorpus().catalog
:param cdli_number: which text you want printed
:param destination: where you wish to save the HTML data
:return: HTML written to destination
"""
if cdli_number in catalog:
pnum = catalog[cdli_number]['pnum']
edition = catalog[cdli_number]['edition']
metadata = '<br>\n'.join(catalog[cdli_number]['metadata'])
transliteration = '<br>\n'.join(catalog[cdli_number]['transliteration'])
normalization = '<br>\n'.join(catalog[cdli_number]['normalization'])
translation = '<br>\n'.join(catalog[cdli_number]['translation'])
self.html_single = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{edition}</title>
</head>
<body><table cellpadding="10" border="1">
<tr><th>
<h2>{edition}<br>{pnum}</h2>
</th><th>
<h3>transliteration</h3>
</th><th>
<h3>normalization</h3>
</th><th>
<h3>translation</h3>
</th></tr><tr><td>
{metadata}</td><td>
<p>{trans}
</td><td>
<p>{norm}
</td><td>
<font size='2'>
{translation}
</font></td></tr>
</table>
<br>
</body>
</html>""".format(
pnum=pnum, edition=edition, metadata=metadata,
trans=transliteration, norm=normalization,
translation=translation)
with open(destination, mode='w', encoding='utf8') as t_f:
t_f.write(self.html_single)
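A minimal end-to-end sketch of PrettyPrint driven by a CDLICorpus catalog (the paths and the P-number are hypothetical): one text rendered to Markdown, then the whole catalog written to a single HTML file.

import os

from cltk.corpus.akkadian.file_importer import FileImport
from cltk.corpus.akkadian.cdli_corpus import CDLICorpus
from cltk.corpus.akkadian.pretty_print import PrettyPrint

f_i = FileImport(os.path.join('texts', 'ARM01_texts.txt'))  # hypothetical path
f_i.read_file()
cdli = CDLICorpus()
cdli.parse_file(f_i.file_lines)

p_p = PrettyPrint()
p_p.markdown_single_text(cdli.catalog, 'P254202')  # hypothetical P-number
with open('P254202.md', mode='w', encoding='utf8') as out:
    out.write(p_p.markdown_text)

p_p.html_print_file(cdli.catalog, 'ARM01_texts.html')  # writes all texts as HTML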
