In [73]:
import glob
import os
import re

# Matches one HTML tag: "<", then one or more characters from a whitelist of
# what appears inside tags on these pages (Latin and Cyrillic letters, digits,
# quotes, common punctuation, newlines), then ">".
# The ranges А-Я / а-я do not contain Ё / ё (U+0401 / U+0451), so both are
# listed explicitly; otherwise a tag whose attribute text contains "ё" would
# not be stripped.
# Thanks https://pythex.org/
html_reg = re.compile(r'<[a-zA-Z0-9А-Яа-яЁё !="\-,/\.\';_#:%\(\)\n]+>', re.MULTILINE | re.UNICODE)
# Footnote markers in full-width brackets, e.g. 【12】
zh_fn_reg = re.compile(r'【[0-9]+】')
# Footnote markers in square brackets, e.g. [12]
fn_reg = re.compile(r'\[[0-9]+\]')

# Codec names accepted by the `encoding=` argument of open() are listed here:
# https://docs.python.org/3.7/library/codecs.html#standard-encodings

In [121]:
# Custom cleaning: literal fragments that are deleted from the page text.
# Order is kept as-is because they are removed one after another.
header_strs = [
    "MIA главная страница",
    "Глав. стр. Иноязычной секции",
    "Глав. стр. Русской секции",
    "Маркс Энгельс архив",
    "|",
    "Оригинал находится на странице",
    "http://www.esperanto.mv.ru/Marksismo/Kapital1/index.html",
    "Последнее обновление Февраль 2012 г.",
    "Последнее обновление Декабрь 2011г.",
    "Карл Маркс",
]

In [533]:
# See e.g. https://www.ou.edu/research/electron/internet/special.shtml
html_code_map = {
    "&laquo;": "«", "&#171;": "«",
    "&raquo;": "»", "&#187;": "»",
    "&mdash;": "-", "&ndash;": "-",
    "&nbsp;": " ", "&hellip;": "...",
    "&deg;": "°",
    "&times;": "×", "&frac12;": "½", "&frac14;": "¼", "&frac34;": "¾", "&#x2153;": "⅓",
    "&acirc;": "â", "&agrave;": "à", "&auml;": "ä",
    "&Eacute;": "É", "&eacute;": "é", "&euml;": "ë", "&egrave;": "è", "&ecirc;": "ê",
    "&ucirc;": "û", "&uuml;": "ü",
    "&szlig;": "ß",
    # Greek
    "&#x3ac;": "ά", # alpha with tonos https://www.compart.com/en/unicode/U+03AC
    "&alpha;": "α", "&beta;": "β", "&gamma;": "γ", "&Delta;": "Δ", "&delta;": "δ",
    "&epsilon;": "ε", "&eta;": "η",
    "&iota;": "ι", "&kappa;": "κ",
    "&lambda;": "λ", "&mu;": "μ",
    "&nu;": "ν", "&omicron;": "ο",
    "&pi;": "π", "&rho;": "ρ",
    "&sigma;": "σ", "&tau;": "τ", "&upsilon;": "υ",
    "&chi;": "χ",
    "&#x3cd;": "ύ", # https://www.compart.com/en/unicode/U+03cd
    "&#x3cb;": "ϋ", # upsilon with dialytika https://www.compart.com/en/unicode/U+03CB
    "&#x3cc;": "ό", # Omicron with tonos
    "&#x3af;": "ί", # iota with tonos https://www.compart.com/en/unicode/U+03AF
    "&sigmaf;": "ς",
    "&#x384;": "΄", # Greek tonos https://www.compart.com/en/unicode/U+0384
    "&#x301;": "◌́", # https://www.compart.com/en/unicode/U+0301
    "&#x3ad;": "έ", # https://www.compart.com/en/unicode/U+03ad
    "&#x3ae;": "ή", # eta with tonos https://www.compart.com/en/unicode/U+03ae
}

In [534]:
# Directory holding the raw Russian HTML pages
ru_path = "../../Translation_Documents/Raw_Capital/html/RU_marxists.org"
ru_glob_str = os.path.join(ru_path, "*.htm")
# glob.glob() returns matches in arbitrary order; sort them so the
# listing is the same on every run and platform.
ru_fpaths = sorted(glob.glob(ru_glob_str))
ru_fpaths

['../../Translation_Documents/Raw_Capital/html/RU_marxists.org\\01.htm',
 '../../Translation_Documents/Raw_Capital/html/RU_marxists.org\\02.htm',
 '../../Translation_Documents/Raw_Capital/html/RU_marxists.org\\03.htm',
 '../../Translation_Documents/Raw_Capital/html/RU_marxists.org\\04.htm',
 '../../Translation_Documents/Raw_Capital/html/RU_marxists.org\\05.htm',
 '../../Translation_Documents/Raw_Capital/html/RU_marxists.org\\06.htm',
 '../../Translation_Documents/Raw_Capital/html/RU_marxists.org\\07.htm',
 '../../Translation_Documents/Raw_Capital/html/RU_marxists.org\\08.htm',
 '../../Translation_Documents/Raw_Capital/html/RU_marxists.org\\09.htm',
 '../../Translation_Documents/Raw_Capital/html/RU_marxists.org\\10.htm',
 '../../Translation_Documents/Raw_Capital/html/RU_marxists.org\\11.htm',
 '../../Translation_Documents/Raw_Capital/html/RU_marxists.org\\12.htm',
 '../../Translation_Documents/Raw_Capital/html/RU_marxists.org\\13.htm',
 '../../Translation_Documents/Raw_Capital/html/RU_m

In [535]:
# Directory that receives one cleaned text file per source page
output_path = "../../Translation_Documents/Raw_Capital/txt/RU_clean"
# makedirs also creates missing parent directories (os.mkdir raises
# FileNotFoundError if the parent is absent), and exist_ok=True makes
# re-running the cell a no-op without a separate existence check.
os.makedirs(output_path, exist_ok=True)

In [536]:
# Inclusive bounds of the numbered source pages to process
tnum_start, tnum_end = 1, 66

In [545]:
# Clean every numbered page, write it out on its own, and collect the results
clean_texts = []
for cur_tnum in range(tnum_start, tnum_end + 1):
    # Input and output files share the same zero-padded two-digit number
    tnum_str = f"{cur_tnum:02d}"
    cur_fpath = os.path.join(ru_path, tnum_str + ".htm")
    cur_output_fpath = os.path.join(output_path, tnum_str + "_morg.txt")

    with open(cur_fpath, 'r', encoding='iso8859_5', errors='replace') as src:
        text = src.read()

    # Flatten the page onto one line: every newline becomes a space
    clean_text = " ".join(text.split("\n"))
    # Swap HTML entities for their mapped replacement text
    for entity, replacement in html_code_map.items():
        clean_text = clean_text.replace(entity, replacement)
    # Drop the page header strings
    for boilerplate in header_strs:
        clean_text = clean_text.replace(boilerplate, "")
    # Strip HTML tags first, then both styles of footnote marker
    for pattern in (html_reg, zh_fn_reg, fn_reg):
        clean_text = pattern.sub("", clean_text)
    # Remove ideographic spaces, then decode "&gt;" (done after tag stripping)
    clean_text = clean_text.replace("\u3000", "").replace("&gt;", ">")

    # Save individually
    with open(cur_output_fpath, 'w', encoding='utf-8') as dst:
        dst.write(clean_text)
    clean_texts.append(clean_text)

In [546]:
# Concatenate all cleaned pages, separated by three newlines
full_text = ("\n" * 3).join(clean_texts)

In [547]:
# The combined file goes one directory above the per-page outputs
combined_fname = "RU_clean.txt"
full_output_fpath = os.path.join(output_path, "..", combined_fname)

In [548]:
# Write the combined text to disk as UTF-8
with open(full_output_fpath, mode='w', encoding='utf-8') as combined_file:
    combined_file.write(full_text)