forked from cltk/cltk
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of https://github.com/cltk/cltk
- Loading branch information
Showing
75 changed files
with
5,415 additions
and
585 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
""" | ||
The Importer feature sets up the ability to work with cuneiform text(s) | ||
one-on-one, whether it is the Code of Hammurabi, a collection of texts such as | ||
ARM01, or whatever your research desires. | ||
This cdli_corpus module is for working with text files having already been read | ||
by file_importer. The file_lines required by CDLICorpus are taken from prior | ||
use of FileImport(text_file).read_file(). | ||
e.g.: | ||
# FileImport takes a txt file and reads it; this becomes file_lines. | ||
text_path = os.path.join('texts', 'ARM01_texts.txt') | ||
f_i = FileImport(text_path) | ||
f_i.read_file() | ||
ARM01 = f_i.file_lines | ||
# CDLICorpus takes file_lines and uses it to work: | ||
cdli = CDLICorpus() | ||
cdli.parse_file(ARM01) | ||
cdli.print_catalog() | ||
The output of CDLICorpus can be further utilized by the feature | ||
ATFConverter and its subsequent classes: Tokenizer, ATFConverter, Lemmatizer, | ||
and PPrint. | ||
""" | ||
|
||
import re | ||
|
||
__author__ = ['Andrew Deloucas <ADeloucas@g.harvard.com>'] | ||
__license__ = 'MIT License. See LICENSE.' | ||
|
||
|
||
class CDLICorpus(object):
    """
    Takes file_lines, prepares and organizes data.

    Attributes:
        chunks: list of chunks produced by the most recent parse_file call;
            each chunk is the list of non-empty lines belonging to one text.
        catalog: dict mapping a P-number to the parsed record of that text
            (see parse_file for the record layout).
    """

    # Line prefixes recognised inside the ATF body of a text.
    _RE_TRANSLIT = re.compile(r'(\d+\'?\.) ?(.*)')
    _RE_NORMALIZ = re.compile(r'(#tr\.ts:) ?(.*)')
    _RE_TRANSLAT = re.compile(r'(#tr\.en:) ?(.*)')

    def __init__(self):
        """
        Starts with no chunks and an empty catalog.
        """
        self.chunks = []
        self.catalog = {}

    def parse_file(self, file_lines):
        """
        Parses lines of file into a dictionary of texts.

        The result is stored in self.catalog (nothing is returned); entries
        from earlier calls are kept unless overwritten by the same P-number.
        An empty file_lines, or one holding only empty lines, leaves the
        catalog untouched.

        :param file_lines: file_importer.file_lines
        :return: None. Each text is stored in self.catalog in the form:
            Pnum: {'metadata': List of lines of metadata,
                   'pnum': P-number,
                   'edition': Bibliographic edition,
                   'raw_text': Raw lines of ATF text,
                   'transliteration': lines of transliteration,
                   'normalization': lines of normalization (if present),
                   'translation': lines of translation (if present)}
        """
        if not file_lines:
            self.chunks = []
            return
        # Check to see what format the corpus is in; we assume that the
        # headers are the same for all texts in the file.
        if re.match('Primary publication:', file_lines[0]):
            header = re.compile('Primary publication:')
        else:
            header = re.compile(r'&?P\d{6}')
        # Separate the file into chunks of text: a header line opens a new
        # chunk, empty lines are dropped.
        chunks, chunk = [], []
        for line in file_lines:
            if header.match(line):
                if chunk:
                    chunks.append(chunk)
                chunk = [line]
            elif line:
                chunk.append(line)
        # Only keep the trailing chunk when it actually holds lines, so that
        # all-blank input does not produce an unparsable empty chunk.
        if chunk:
            chunks.append(chunk)
        self.chunks = chunks
        # Create a rich catalog from the chunks.
        for chunk in self.chunks:
            if chunk[0].startswith('Primary publication:'):
                # We've got full metadata, add additional parsing later.
                metadata = chunk[:25]
                # NOTE(review): chunk[25] is skipped here; presumably it is a
                # separator line between metadata and ATF -- confirm.
                text = chunk[26:]
            else:  # no metadata
                metadata = []
                text = chunk
            # The first ATF line reads "&Pnnnnnn = edition".
            pnum = text[0].split('=')[0].replace('&', '').rstrip()
            edition = text[0].split('=')[1].lstrip()
            # Drop the first three ATF lines (the "&P... = ..." line and the
            # two lines that follow it) before collecting the body.
            text = text[3:]
            translit = []
            normaliz = []
            translat = []
            for line in text:
                numbered = self._RE_TRANSLIT.match(line)
                if numbered:
                    translit.append(numbered.group(2))
                if line.startswith('#tr.ts:'):
                    normaliz.append(self._RE_NORMALIZ.match(line).group(2))
                if line.startswith('#tr.en:'):
                    translat.append(self._RE_TRANSLAT.match(line).group(2))
            self.catalog[pnum] = {'metadata': metadata,
                                  'pnum': pnum,
                                  'edition': edition,
                                  'raw_text': text,
                                  'transliteration': translit,
                                  'normalization': normaliz,
                                  'translation': translat}

    def toc(self):
        """
        Returns a rich list of texts in the catalog.

        :return: one summary string per text, ordered by P-number.
        """
        return [
            f"Pnum: {key}, Edition: {self.catalog[key]['edition']}, "
            f"length: {len(self.catalog[key]['transliteration'])} line(s)"
            for key in sorted(self.catalog)]

    def list_pnums(self):
        """
        Lists all Pnums in the catalog.

        :return: sorted list of P-numbers.
        """
        return sorted(self.catalog)

    def list_editions(self):
        """
        Lists all text editions in the catalog.

        :return: sorted list of edition strings.
        """
        return sorted(record['edition'] for record in self.catalog.values())

    def print_catalog(self, catalog_filter=None):
        """
        Prints out a catalog of all the texts in the corpus. Can be filtered by passing
        a list of keys you want present in the texts.

        A text is printed (once) when at least one of the given keys holds a
        non-empty value for it.

        :param: catalog_filter = If you wish to filter the list, use the keys pnum,
            edition, metadata, transliteration, normalization, or translation.
            None or an empty list prints every text.
        """
        keys = sorted(self.catalog)
        if catalog_filter:
            # any() keeps each key at most once, even if several filters match.
            keys = [key for key in keys
                    if any(self.catalog[key][f] for f in catalog_filter)]
        for key in keys:
            record = self.catalog[key]
            print(f"Pnum: {record['pnum']}")
            print(f"Edition: {record['edition']}")
            print(f"Metadata: {len(record['metadata']) > 0}")
            print(f"Transliteration: {len(record['transliteration']) > 0}")
            print(f"Normalization: {len(record['normalization']) > 0}")
            print(f"Translation: {len(record['translation']) > 0}")
            print()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
""" | ||
The Importer feature sets up the ability to work with cuneiform text(s) | ||
one-on-one, whether it is the Code of Hammurabi, a collection of texts such as | ||
ARM01, or whatever your research desires. | ||
This file_importer module is for importing text files. Currently, it is | ||
designed to read files produced by the CDLI's "download all text" option | ||
(https://cdli.ucla.edu/search/download_data_new.php?data_type=all). | ||
Through it, one can obtain either a single text (e.g. Code of Hammurabi: | ||
https://cdli.ucla.edu/search/search_results.php?ObjectID=P249253) | ||
or a variety of texts through a search function (e.g. ARM 01 publication: | ||
https://cdli.ucla.edu/search/search_results.php?PrimaryPublication=ARM+01). | ||
""" | ||
|
||
import os | ||
|
||
__author__ = ['Andrew Deloucas <ADeloucas@g.harvard.com>'] | ||
__license__ = 'MIT License. See LICENSE.' | ||
|
||
|
||
class FileImport(object):
    """
    Takes a text file and prepares it in two ways: as a whole (raw_file) and as
    a list of strings denoting the text line by line.

    Attributes:
        filename: path of the file to read.
        raw_file: unaltered file content; None until read_file is called.
        file_lines: content split into lines with trailing whitespace
            removed; None until read_file is called.
        catalog: sorted names of the entries in the file's folder; None until
            file_catalog is called.
    """

    def __init__(self, filename):
        """
        :param filename: name of any downloaded file, ideally from CDLI as
        discussed in the module docstring.
        """
        self.filename = filename
        self.raw_file = None
        self.file_lines = None
        self.catalog = None

    def read_file(self):
        """
        Reads filename as UTF-8 text and stores the result on the instance.

        Sets raw_file (unaltered text) and file_lines (text split by lines,
        each line stripped of trailing whitespace). Returns nothing.
        """
        # Plain read mode: the file is never written to, so it must not
        # require write permission.
        with open(self.filename, mode='r', encoding='utf8') as text_file:
            self.raw_file = text_file.read()
        self.file_lines = [x.rstrip() for x in self.raw_file.splitlines()]

    def file_catalog(self):
        """
        Looks at the folder filename is in and lists the entries of that folder.

        Sets catalog to the sorted list of entry names. Returns nothing.
        """
        # A bare filename has an empty directory part; list the current
        # directory in that case instead of passing '' to os.listdir.
        folder = os.path.split(self.filename)[0] or '.'
        self.catalog = sorted(os.listdir(folder))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
""" | ||
This module is for printing texts in Markdown or HTML. | ||
""" | ||
|
||
__author__ = ['Andrew Deloucas <ADeloucas@g.harvard.com>'] | ||
__license__ = 'MIT License. See LICENSE.' | ||
|
||
class PrettyPrint(object):
    """
    Prints texts in markdown or in HTML.

    Attributes:
        markdown_text: markdown produced by the last successful
            markdown_single_text call; None before that.
        html_file: HTML of the last text written by html_print_file; None
            before that.
        html_single: HTML produced by the last successful
            html_print_single_text call; None before that.
    """

    # Markdown layout of a single text; filled in with str.format.
    _MARKDOWN_TEMPLATE = """{edition}
{pnum}
---
### metadata
{metadata}
### transliteration
{trans}
### normalization
{norm}
### translation
{translation}
"""

    # HTML layout of a single text, shared by both HTML writers.
    # NOTE(review): values are inserted without HTML escaping -- confirm
    # whether catalog lines can contain '<', '>' or '&'.
    _HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{edition}</title>
</head>
<body><table cellpadding="10"; border="1">
<tr><th>
<h2>{edition}<br>{pnum}</h2>
</th><th>
<h3>transliteration</h3>
</th><th>
<h3>normalization</h3>
</th><th>
<h3>translation</h3>
</tr><tr><td>
{metadata}</td><td>
<p>{trans}
</td><td>
<p>{norm}
</td><td>
<font size='2'>
{translation}
</font></td></tr>
</table>
<br>
</body>
</html>"""

    def __init__(self):
        """
        Starts with no rendered output.
        """
        self.markdown_text = None
        self.html_file = None
        self.html_single = None

    @staticmethod
    def _render(template, record, separator):
        """
        Fills template with one catalog record, joining each list field
        (metadata, transliteration, normalization, translation) with separator.
        """
        return template.format(
            pnum=record['pnum'],
            edition=record['edition'],
            metadata=separator.join(record['metadata']),
            trans=separator.join(record['transliteration']),
            norm=separator.join(record['normalization']),
            translation=separator.join(record['translation']))

    def markdown_single_text(self, catalog, cdli_number):
        """
        Renders a single text of the catalog in markdown.

        The result is stored in self.markdown_text; nothing is written to
        disk and nothing is returned. If cdli_number is not in catalog,
        nothing happens.

        :param catalog: text ingested by cdli_corpus
        :param cdli_number: text you wish to print
        """
        if cdli_number in catalog:
            self.markdown_text = self._render(
                self._MARKDOWN_TEMPLATE, catalog[cdli_number], '\n\t')

    def html_print_file(self, catalog, destination):
        """
        Writes every text of the catalog to destination as HTML.

        One complete HTML document is written per text, back to back, in the
        catalog's iteration order. self.html_file holds the document of the
        last text written. The destination is created if missing and
        overwritten if present.

        :param catalog: text file you wish to pretty print
        :param destination: where you wish to save the HTML data
        """
        # 'w' creates the file when absent and truncates old content, so no
        # stale bytes survive past the end of the new output.
        with open(destination, mode='w', encoding='utf8') as t_f:
            for text in catalog:
                self.html_file = self._render(
                    self._HTML_TEMPLATE, catalog[text], '<br>\n')
                t_f.write(self.html_file)

    def html_print_single_text(self, catalog, cdli_number, destination):
        """
        Writes a single text of the catalog to destination as HTML.

        The document is also stored in self.html_single. If cdli_number is
        not in catalog, nothing is written and destination is not touched.
        The destination is created if missing and overwritten if present.

        :param catalog: CDLICorpus().catalog
        :param cdli_number: which text you want printed
        :param destination: where you wish to save the HTML data
        """
        if cdli_number in catalog:
            self.html_single = self._render(
                self._HTML_TEMPLATE, catalog[cdli_number], '<br>\n')
            with open(destination, mode='w', encoding='utf8') as t_f:
                t_f.write(self.html_single)
Oops, something went wrong.