In [100]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [101]:
#!pip install gensim

In [102]:
#!pip install underthesea

In [103]:
import glob
import os
import re

import unidecode

import tmdglobals

In [104]:
tmdglobals.text_paths.keys()

dict_keys(['am_iish', 'el_mod', 'en_aveling', 'en_fowkes', 'en_reitter', 'de_tge', 'kk_prog', 'ru_morg', 'vi_cw', 'zh_morg'])

In [105]:
# Choose the two translations to work on and show the language code that
# tmdglobals derives from each document key.
doc1, doc2 = 'de_tge', 'en_reitter'
lang1 = tmdglobals.get_lang_code(doc1)
print(lang1)
lang2 = tmdglobals.get_lang_code(doc2)
print(lang2)

de
en


In [106]:
def clean_chapters(input_path, verbose=False):
    """Clean and sentence-tokenize every ``*.txt`` chapter directly inside ``input_path``.

    Each chapter is loaded with ``load_chapter``, split into sentences with
    ``tokenize_chapter`` and written by ``output_cleaned_tokens`` to
    ``<input_path>/cleaned/<fname_prefix>_clean.<tr_code>.txt``. The
    ``cleaned`` sub-directory is created when it does not exist yet.

    Args:
        input_path: Directory that holds the raw chapter ``.txt`` files.
        verbose: Forwarded to ``load_chapter`` and ``tokenize_chapter``. The
            per-file progress lines printed by this function are always shown.

    Raises:
        FileNotFoundError: If ``input_path`` is not an existing directory.
    """
    # Fail early with a clear message instead of creating directories for a
    # mistyped path.
    if not os.path.isdir(input_path):
        raise FileNotFoundError(f"Chapter directory not found: {input_path}")
    # The path where the cleaned versions of each chapter will be saved.
    # exist_ok=True avoids the check-then-create race of isdir() + mkdir().
    cleaned_path = os.path.join(input_path, "cleaned")
    os.makedirs(cleaned_path, exist_ok=True)
    # glob makes no ordering guarantee; sort so that file numbering and
    # processing order are the same on every run and every platform.
    all_fpaths = sorted(glob.glob(os.path.join(input_path, "*.txt")))
    for fpath_num, cur_fpath in enumerate(all_fpaths):
        # Filename parts come from the project helper; the chapter name it
        # also returns is not needed here.
        cur_fname, fname_prefix, tr_code, _ch_name = tmdglobals.get_file_info(cur_fpath)
        print(f"Processing file #{fpath_num}: {cur_fname}")
        ch_text = load_chapter(cur_fpath, verbose=verbose)
        # The tokenizer is selected by language, so reduce the full
        # translation code (e.g. "de_tge") to its language code first.
        lang_code = tmdglobals.get_lang_code(tr_code)
        print(f"Extracted lang_code: {lang_code}")
        tokens = tokenize_chapter(ch_text, lang_code, verbose=verbose)
        print(f"Num sentences: {len(tokens)}")
        # Write the cleaned sentences next to the originals, under "cleaned".
        cleaned_fname = f"{fname_prefix}_clean.{tr_code}.txt"
        cleaned_fpath = os.path.join(cleaned_path, cleaned_fname)
        print(f"Outputting to {cleaned_fpath}")
        output_cleaned_tokens(tokens, cleaned_fpath)

In [107]:
def load_chapter(chapter_fpath, verbose=False):
    """Read a UTF-8 chapter file and return it as one line of text.

    Every newline and carriage-return character in the file content is
    replaced by a single space.

    Args:
        chapter_fpath: Path of the chapter file to read.
        verbose: Accepted for signature parity with the other helpers; unused.

    Returns:
        The file content as a ``str`` with line breaks turned into spaces.
    """
    with open(chapter_fpath, "r", encoding="utf-8") as chapter_file:
        flattened = chapter_file.read()
    for line_break in ("\n", "\r"):
        flattened = flattened.replace(line_break, " ")
    return flattened

In [108]:
def output_cleaned_tokens(tokens, output_filename):
    """Write ``tokens`` to ``output_filename`` as one space-separated string.

    The file is opened for writing as UTF-8, so existing content is replaced.

    Args:
        tokens: Iterable of strings (the sentences of one chapter).
        output_filename: Destination path of the cleaned chapter file.
    """
    with open(output_filename, mode="w", encoding="utf-8") as cleaned_file:
        cleaned_text = " ".join(tokens)
        cleaned_file.write(cleaned_text)

In [109]:
def tokenize_chapter(fulltext, lang, verbose=False):
    """Strip footnotes from a chapter and split it into sentences.

    German (``lang == "de"``) gets two extra steps: the text is passed through
    ``unidecode`` before tokenizing, and afterwards every "sentence" whose
    start matches ``tmdglobals.nonsent_reg_de`` is dropped, because the
    tokenizer mistakes some footnotes and section numbers for sentences.

    Args:
        fulltext: Full text of the chapter as a single string.
        lang: Language code used to select footnote removal and the tokenizer.
        verbose: When true, print the language being processed.

    Returns:
        The sentences produced by ``tmdglobals.sent_tokenize``; for German, a
        list with the non-sentence entries filtered out.
    """
    if verbose:
        print(f"tokenize_chapter(): lang = {lang}")
    is_german = lang == "de"
    # Footnotes go first so they never reach the sentence tokenizer.
    text_without_notes = tmdglobals.remove_footnotes(fulltext, lang)
    if is_german:
        text_without_notes = unidecode.unidecode(text_without_notes)
    # tmdglobals picks the tokenizer that fits the language.
    sentences = tmdglobals.sent_tokenize(text_without_notes, lang)
    if not is_german:
        return sentences
    nonsentence_pattern = tmdglobals.nonsent_reg_de
    return [
        sentence
        for sentence in sentences
        if re.match(nonsentence_pattern, sentence) is None
    ]

In [110]:
# Run the cleaning pipeline over both selected documents: doc1 first, then doc2.
for doc_key in (doc1, doc2):
    clean_chapters(tmdglobals.text_paths[doc_key], verbose=True)

Processing file #0: ch01.de_tge.txt
Extracted lang_code: de
tokenize_chapter(): lang = de
Num sentences: 686
Outputting to ..\Texts\ThirdGermanEdition\cleaned\ch01_clean.de_tge.txt
Processing file #1: ch02.de_tge.txt
Extracted lang_code: de
tokenize_chapter(): lang = de
Num sentences: 122
Outputting to ..\Texts\ThirdGermanEdition\cleaned\ch02_clean.de_tge.txt
Processing file #2: ch03.de_tge.txt
Extracted lang_code: de
tokenize_chapter(): lang = de
Num sentences: 681
Outputting to ..\Texts\ThirdGermanEdition\cleaned\ch03_clean.de_tge.txt
Processing file #3: ch04.de_tge.txt
Extracted lang_code: de
tokenize_chapter(): lang = de
Num sentences: 465
Outputting to ..\Texts\ThirdGermanEdition\cleaned\ch04_clean.de_tge.txt
Processing file #4: ch05.de_tge.txt
Extracted lang_code: de
tokenize_chapter(): lang = de
Num sentences: 167
Outputting to ..\Texts\ThirdGermanEdition\cleaned\ch05_clean.de_tge.txt
Processing file #5: ch06.de_tge.txt
Extracted lang_code: de
tokenize_chapter(): lang = de
Num s

In [111]:
# Sanity check: read one cleaned chapter back in and inspect its text

In [112]:
# Spot-check a single cleaned chapter of the second document (doc2).
doc_id = doc2
doc_path = tmdglobals.text_paths[doc_id]
# Build the path of the "ch14" file inside the document's "cleaned" subfolder.
doc_fpath = os.path.join(doc_path, "cleaned", f"ch14_clean.{doc_id}.txt")
# Read the whole file into `text` as one UTF-8 string.
with open(doc_fpath, 'r', encoding='utf-8') as infile:
    text = infile.read()

In [113]:
text

' Chapter 12  The Division of Labor and the Manufacturing System  1. The Double Origin of the Manufacturing System  Cooperation based on the division of labor found its classic form in the manufacturing system. As the characteristic form of the capitalist production process, such cooperation predominated during the era of manufacturing proper, which lasted from around 1550 to the last third of the eighteenth century. The manufacturing system arose in two ways. 1. A single capitalist assembles under his command different types of independent craftsmen in one workshop, where a product has to pass through the hands of each type of worker in order to reach the point of completion. For example, a carriage was once the product of many different craftsmen working on their own: wheelwrights, harness-makers, tailors, locksmiths, upholsterers, turners, fringe-makers, glaziers, painters, polishers, gilders, and so on. But when carriages are made in the manufacturing workshop, these craftsmen are 