Extract and preprocess text data for each language

Using /iscl-thesis/WikiExtractor

In [18]:
import re, os, string

def clean_sentences(content):
    """Strip markup from raw extracted text and return normalised sentences.

    ``content`` is run through a fixed, ordered series of regex deletions
    (each pattern sees the output of the previous one), hyphen-like
    characters and forward slashes are turned into spaces, and the text is
    split on periods that do not look like part of an abbreviation.

    Returns:
        A list of strings, one per split piece, with ASCII punctuation and
        digits removed and everything lower-cased. Pieces are not trimmed
        and may be empty; the caller is responsible for tidying whitespace.
    """
    # Applied strictly in this order; every match is deleted outright.
    deletions = (
        # Page titles
        r'(?<=\s\n).+:\n',
        # Section headers
        r'.+\/\/\/\/\/\n',
        # Picture links
        r'(?<=\n)(\w+\|)+.+[^\.]\n',
        # Fragments
        r'(?<=\n)\w+\:\w+.*[^\.](?=\n)',
        # Other links
        r'http[^\s\n]+(?=\s|\n)',
        # Parentheses and their contents
        r'\([^\)]+\)',
        # Square-bracketed spans
        r'\[+\w+[^\]]*\]+',
        # Lists (lines ending in a colon)
        r'(?<=\n)[^\.\n]+\:\s?\n',
        # HTML tags
        r'\<[^\>]+\>',
        # Numbers sitting directly before a line break
        r'\d+(?=\n)',
    )
    text = content
    for pattern in deletions:
        text = re.sub(pattern, '', text)

    # Hyphens, dashes and forward slashes in compound words become spaces
    text = re.sub(r'[\-\‐\‑\‒\–\—\―\⁃\−\/]', ' ', text)

    # Split at periods while trying to overlook abbreviations
    pieces = re.split(
        r'(?<!\w\.\w)(?<!\w\.)(?<!Sr)(?<!Mr)(?<!Ms)(?<!Mrs)(?<!Jr)(?<!Dr)\.(?!\w)',
        text,
    )

    # One shared table: drop ASCII punctuation and digits in a single pass
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    return [piece.translate(strip_table).lower() for piece in pieces]

In [19]:
# Directory with extracted files
# (absolute, machine-specific path; every '.txt' file directly inside it is processed below)
maindir = '/Users/k/Docs/School/Tuebingen/Thesis/Corpuses/WikiExtractorCorpora/'
# Dict of languages + their corresponding alphabets
# Keys are the language codes taken from the input file name (the part before
# the first underscore). Each value is a string of the lower-case letters kept
# for that language: it is turned into a set of characters before use, and any
# non-whitespace character outside that set is deleted from the corpus.
# Because only single characters are tested, repeated letters in a value
# (e.g. the Hungarian digraphs) are harmless.
# NOTE(review): the 'it', 'nl' and 'es' entries contain no accented vowels, so
# letters such as á/è/ó will be deleted from those corpora, shortening the
# affected words — confirm this is intended.
alphabets = {
    # Lingua Franca Nova
    'lfn': 'abcdefghijklmnopqrstuvwxyz', 
    # Interlingua
    'ia': 'abcdefghijklmnopqrstuvwxyz', 
    # Esperanto
    'eo': 'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz', 
    # German
    'de': 'abcdefghijklmnopqrstuvwxyzäöüß', 
    # French
    'fr': 'abcdefghijklmnopqrstuvwxyzéàèùëüïâêîôûçæœ', 
    # English
    'en': 'abcdefghijklmnopqrstuvwxyz',
    # Finnish
    'fi': 'abcdefghijklmnopqrstuvwxyzšžåäö',
    # Tagalog
    'tl': 'abcdefghijklmnopqrstuvwxyzñ',
    # Turkish
    'tr': 'abcçdefgğhıijklmnoöprsştuüvyz',
    # Vietnamese
    'vi': 'aáàảãạăắằẳẵặâấầẩẫậbcdđeéèẻẽẹêếềểễệfghiíìỉĩịjklmnoóòỏõọôốồổỗộơớờởỡợpqrstuúùủũụưứừửữựvwxyýỳỷỹỵz',
    # Polish
    'pl': 'aąbcćdeęfghijklłmnńoópqrsśtuvwxyzźż',
    # Indonesian
    'id': 'abcdefghijklmnopqrstuvwxyz',
    # Ido
    'io': 'abcdefghijklmnopqrstuvwxyz',
    # Italian
    'it': 'abcdefghijklmnopqrstuvwxyz',
    # Dutch
    'nl': 'abcdefghijklmnopqrstuvwxyz',
    # Occitan
    'oc': 'abcdefghijklmnopqrstuvwxyzàèòáéíóúïüç',
    # Danish
    'da': 'abcdefghijklmnopqrstuvwxyzæøå',
    # Swedish
    'sv': 'abcdefghijklmnopqrstuvwxyzåäö',
    # Hungarian (digraphs/trigraphs listed as in the alphabet; only their single letters matter)
    'hu': 'aábccsddzdzseéfggyhiíjkllymnnyoóöőpqrsszttyuúüűvwxyzzs',
    # Spanish
    'es': 'abcdefghijklmnopqrstuvwxyzñ',
    # Afrikaans
    'af': 'aáäbcdeéèêëfghiíîïjklmnŉoóôöpqrstuúûüvwxyýz',
    # Icelandic
    'is': 'aábdðeéfghiíjklmnoóprstuúvxyýþæö',
    }

# LFN corpus is the smallest size based on word count, at about 650000 words, so shrink all other corpora to about the same
max_wc = 650000
file_count = 1

# For every extracted '<lang>_*.txt' file: clean it, keep only sentences of at
# least two words made of the language's letters, and write them one per line
# until the next sentence would reach the word cap.
for file in os.listdir(maindir):
    if not file.endswith('.txt'):
        continue

    fullpath = os.path.join(maindir, file)
    # Language code is the file name up to the first underscore
    lang = os.path.splitext(os.path.basename(file))[0].split('_')[0]

    # Without an alphabet we cannot filter this file. Skip it explicitly rather
    # than crash on an undefined `alpha` or silently reuse the previous
    # language's alphabet.
    if lang not in alphabets:
        print(f'Skipping {file}: no alphabet defined for language code {lang!r}.')
        continue
    alpha = set(alphabets[lang])

    # Read and clean before touching the output file, so a failure here does
    # not leave a truncated/empty corpus behind.
    with open(fullpath, 'r', encoding='utf-8') as f:
        content = f.read()
    sentences = clean_sentences(content)

    current_wc = 0
    sen_count = 0
    with open(f'/Users/k/Docs/School/Tuebingen/Thesis/iscl-thesis/current_corpora/{lang}_wiki_cleaned.txt', 'w', encoding='utf-8') as out:
        for sentence in sentences:
            # Remove all chars not language's alphabet
            s = ''.join(char for char in sentence if char in alpha or char.isspace())
            # Remove leftover whitespaces
            s = re.sub(r'\s+', ' ', s).strip()
            n_words = len(s.split())
            # Remove sentence fragments containing only 1 word (or nothing)
            if n_words <= 1:
                continue
            # Stop at the first sentence that would reach the cap
            if current_wc + n_words >= max_wc:
                break
            current_wc += n_words
            out.write(s + '\n')
            sen_count += 1

    # Total is the number of configured languages (one input file per language expected)
    print(f'{file_count}/{len(alphabets)} {lang} corpus finished. {current_wc} words. {sen_count} sentences.')
    file_count += 1

1/22 tl corpus finished. 649999 words. 30748 sentences.
2/22 oc corpus finished. 649976 words. 34847 sentences.
3/22 af corpus finished. 649985 words. 31840 sentences.
4/22 eo corpus finished. 649988 words. 34620 sentences.
5/22 en corpus finished. 649975 words. 30333 sentences.
6/22 tr corpus finished. 649990 words. 45329 sentences.
7/22 io corpus finished. 649994 words. 45076 sentences.
8/22 de corpus finished. 649989 words. 38738 sentences.
9/22 fr corpus finished. 649993 words. 28319 sentences.
10/22 id corpus finished. 649989 words. 35504 sentences.
11/22 vi corpus finished. 649981 words. 21795 sentences.
12/22 sv corpus finished. 649998 words. 38051 sentences.
13/22 ia corpus finished. 649999 words. 33181 sentences.
14/22 nl corpus finished. 649995 words. 35800 sentences.
15/22 es corpus finished. 649983 words. 25724 sentences.
16/22 da corpus finished. 649981 words. 39197 sentences.
17/22 it corpus finished. 649994 words. 25326 sentences.
18/22 pl corpus finished. 649991 words. 