Extract and preprocess text data for each language

Using /iscl-thesis/WikiExtractor

In [1]:
import re, os, string

def clean_sentences(content):
    """Strip extraction artefacts from raw text and split it into sentences.

    The text is passed through a fixed sequence of regex substitutions
    (the order matters, since later patterns see the output of earlier
    ones), then split at periods, and finally every piece is stripped of
    ASCII punctuation and digits and lower-cased.

    Args:
        content: The full text of one extracted corpus file, as a string.

    Returns:
        A list of strings, one per piece produced by the period split.
        Pieces are not whitespace-trimmed and may be empty (e.g. the piece
        after a trailing period); callers are expected to filter those.
    """
    # Ordered (pattern, replacement) steps, applied top to bottom.
    substitutions = (
        # Remove page titles
        (r'(?<=\s\n).+:\n', ''),
        # Remove section headers
        (r'.+\/\/\/\/\/\n', ''),
        # Remove picture links
        (r'(?<=\n)(\w+\|)+.+[^\.]\n', ''),
        # Remove fragments
        (r'(?<=\n)\w+\:\w+.*[^\.](?=\n)', ''),
        # Remove other links
        (r'http[^\s\n]+(?=\s|\n)', ''),
        # Remove parentheses together with their contents
        (r'\([^\)]+\)', ''),
        # Remove square-bracketed spans
        (r'\[+\w+[^\]]*\]+', ''),
        # Remove lists
        (r'(?<=\n)[^\.\n]+\:\s?\n', ''),
        # Remove html tags
        (r'\<[^\>]+\>', ''),
        # Remove number fragments at the end of a line
        (r'\d+(?=\n)', ''),
        # Replace hyphens, dashes, and forward slashes in compound words
        # with spaces so the parts survive as separate words
        (r'[\-\‐\‑\‒\–\—\―\⁃\−\/]', ' '),
    )
    text = content
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Split at periods while trying to overlook abbreviations
    sentences = re.split(
        r'(?<!\w\.\w)(?<!\w\.)(?<!Sr)(?<!Mr)(?<!Ms)(?<!Mrs)(?<!Jr)(?<!Dr)\.(?!\w)',
        text,
    )

    # Build the deletion table once instead of once per sentence.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    # Remove punctuation, numbers, make lowercase
    return [s.translate(strip_table).lower() for s in sentences]

In [2]:
# Folder that holds the extracted per-language .txt files
maindir = '/Users/k/Docs/School/Tuebingen/Thesis/Corpuses/NewWikiExtractorCorpora/'

# The 26 unaccented lowercase Latin letters, shared by many alphabets below
_latin = string.ascii_lowercase

# Language code -> string of the characters counted as that language's alphabet.
# NOTE(review): 'es', 'it' and 'nl' list no accented vowels while 'fr', 'oc'
# and 'af' do; characters missing here are deleted by the processing loop,
# so confirm this difference is intended.
alphabets = dict(
    vo='abcdefghijklmnoprstuvxyzöäü',  # Volapük
    avk='abcdefgijklmnoprstuvwxyzáéíóú',  # Kotava
    lfn=_latin,  # Lingua Franca Nova
    ia=_latin,  # Interlingua
    eo='abcĉdefgĝhĥijĵklmnoprsŝtuŭvz',  # Esperanto
    de=_latin + 'äöüß',  # German
    fr=_latin + 'éàèùëüïâêîôûçæœ',  # French
    en=_latin,  # English
    fi=_latin + 'šžåäö',  # Finnish
    tl=_latin + 'ñ',  # Tagalog
    tr='abcçdefgğhıijklmnoöprsştuüvyz',  # Turkish
    vi='aáàảãạăắằẳẵặâấầẩẫậbcdđeéèẻẽẹêếềểễệfghiíìỉĩịjklmnoóòỏõọôốồổỗộơớờởỡợpqrstuúùủũụưứừửữựvwxyýỳỷỹỵz',  # Vietnamese
    pl='aąbcćdeęfghijklłmnńoópqrsśtuvwxyzźż',  # Polish
    id=_latin,  # Indonesian
    io=_latin,  # Ido
    it=_latin,  # Italian
    nl=_latin,  # Dutch
    oc=_latin + 'àèòáéíóúïüç',  # Occitan
    da=_latin + 'æøå',  # Danish
    sv=_latin + 'åäö',  # Swedish
    # Hungarian: digraphs/trigraphs are spelled out, so some letters repeat
    hu='aábccsddzdzseéfggyhiíjkllymnnyoóöőpqrsszttyuúüűvwxyzzs',
    es=_latin + 'ñ',  # Spanish
    af='aáäbcdeéèêëfghiíîïjklmnŉoóôöpqrstuúûüvwxyýz',  # Afrikaans
    **{'is': 'aábdðeéfghiíjklmnoóprstuúvxyýþæö'},  # Icelandic ('is' is a keyword)
)

# Word budget per corpus: the LFN corpus is the smallest by word count
# (originally about 650000 words, now about 630000), so every other corpus
# is cut down to roughly the same size.
max_wc = 630000
# Running 1-based counter of processed files, used in the progress message
file_count = 1

# for file in os.listdir(maindir):
#     if file.endswith('.txt'):
#         lang = os.path.splitext(os.path.basename(file))[0].split('w')[0]
#         print(lang)

# For every extracted .txt file: clean it, keep only characters of the
# language's alphabet, and write sentences (one per line) until the shared
# word budget max_wc would be reached.
for file in os.listdir(maindir):
    if not file.endswith('.txt'):
        continue

    fullpath = os.path.join(maindir, file)
    # Language code = the part of the file stem before its first 'w'.
    # NOTE(review): this breaks for codes that themselves contain a 'w' —
    # confirm against the actual file naming scheme.
    lang = os.path.splitext(os.path.basename(file))[0].split('w')[0]

    # Without a known alphabet the filter below has nothing valid to use;
    # skip rather than reuse the previous file's alphabet (or hit a
    # NameError on the first file).
    if lang not in alphabets:
        print(f'Skipping {file}: no alphabet defined for language code {lang!r}.')
        continue
    alpha = set(alphabets[lang])

    # Read the input before creating the output, so a failed read does not
    # leave an empty output file behind.
    with open(fullpath, 'r', encoding='utf-8') as f:
        content = f.read()
    sentences = clean_sentences(content)

    sen_count = 0
    current_wc = 0
    with open(f'/Users/k/Docs/School/Tuebingen/Thesis/iscl-thesis/2024_corpora/{lang}_wiki_cleaned.txt', 'w', encoding='utf-8') as out:
        for sentence in sentences:
            # Remove all chars not in the language's alphabet (whitespace is kept)
            s = ''.join(char for char in sentence if char in alpha or char.isspace())
            # Collapse leftover whitespace runs and trim the ends
            s = re.sub(r'\s+', ' ', s).strip()

            n_words = len(s.split())
            # Drop sentence fragments containing at most one word
            if n_words <= 1:
                continue
            # Stop at the first sentence that would reach the word budget
            if n_words + current_wc >= max_wc:
                break

            current_wc += n_words
            out.write(s + '\n')
            sen_count += 1

    print(f'{file_count}/24 {lang} corpus finished. {current_wc} words. {sen_count} sentences.')
    file_count += 1

1/24 is corpus finished. 629995 words. 41847 sentences.
2/24 de corpus finished. 629987 words. 37261 sentences.
3/24 pl corpus finished. 629997 words. 42138 sentences.
4/24 io corpus finished. 629990 words. 43496 sentences.
5/24 af corpus finished. 629994 words. 30737 sentences.
6/24 avk corpus finished. 617400 words. 48145 sentences.
7/24 hu corpus finished. 629946 words. 39916 sentences.
8/24 lfn corpus finished. 628683 words. 32188 sentences.
9/24 da corpus finished. 629999 words. 38260 sentences.
10/24 es corpus finished. 629978 words. 24886 sentences.
11/24 ia corpus finished. 629996 words. 32229 sentences.
12/24 fr corpus finished. 629983 words. 27248 sentences.
13/24 oc corpus finished. 629998 words. 33762 sentences.
14/24 eo corpus finished. 629994 words. 33317 sentences.
15/24 nl corpus finished. 629997 words. 34627 sentences.
16/24 tr corpus finished. 629995 words. 43573 sentences.
17/24 en corpus finished. 629958 words. 29574 sentences.
18/24 tl corpus finished. 629989 words