In [12]:
import os, nltk, math, glob, re
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
def extract_texts_by_language(folder_path, output_folder):
    os.makedirs(output_folder, exist_ok=True)

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        language = file_name.split('_')[-1].split('.')[0]

        try:
            with open(file_path, encoding='utf-8') as file:
                text = file.read()
                pass
        except UnicodeDecodeError:
            print(f"Error opening file: {file_name}")

        output_file = os.path.join(output_folder, f"{language}.txt")
        with open(output_file, 'a', encoding='utf-8') as output:
            output.write(text + '\n')

folder_path = '/Users/k/Dev/Webscraping/build-source-text/parallel'
output_folder = '/Users/k/Docs/School/Tuebingen/Thesis/Corpuses/Parallel/'

extract_texts_by_language(folder_path, output_folder)


In [24]:
# Balance Corpus Data

directory = '/Users/k/Dev/Webscraping/build-source-text/parallel2'
output_dir = '/Users/k/Docs/School/Tuebingen/Thesis/Corpuses/Parallel/Balanced/'

files = glob.glob(os.path.join(directory, '*.txt'))
topics = {}

def shorten_text(text, target_wc):
    words = text.split()
    if len(words) <= target_wc:
        return text
    
    sentences = text.split('. ')  
    wc = 0
    output_sen = []

    for sentence in sentences:
        words = sentence.split()
        sentence_wc = len(words)
        if wc + sentence_wc <= target_wc:
            output_sen.append(sentence)
            wc += sentence_wc
        else:
            break

    shortened_text = '.'.join(output_sen)
    return shortened_text

for file in files:
    topic = os.path.basename(file).split('_')[0]
    if topic in topics:
        topics[topic].append(file)
    else:
        topics[topic] = [file]
        
for topic, files in topics.items():
    wc_min = float('inf')
    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            wc = len(f.read().split())
            
        if wc < wc_min:
            wc_min = wc
        f.close()
        
    for file in files:
        language = file.split('_')[-1].split('.')[0]
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read()
            shortened_text = shorten_text(text, wc_min)
            f.close()
        output_file = os.path.join(output_dir, f"{language}.txt")
        with open(output_file, 'a', encoding='utf-8') as output:
            output.write(shortened_text + '\n')        
        

In [None]:
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    return tokens

def calculate_word_probabilities(tokens):
    word_counts = Counter(tokens)
    total_words = len(tokens)
    word_probabilities = {word: count / total_words for word, count in word_counts.items()}
    return word_probabilities

def calculate_entropy(word_probabilities):
    entropy = 0
    for word, probability in word_probabilities.items():
        entropy += probability * math.log2(probability)
    entropy = -entropy
    return entropy

In [None]:
with open('/Users/k/Docs/School/Tuebingen/Thesis/Corpuses/Parallel/de.txt', encoding='utf-8') as f:
    corpus_text = f.read()
    tokens = preprocess_text(corpus_text)
    word_probabilities = calculate_word_probabilities(tokens)
    entropy = calculate_entropy(word_probabilities)
    
    print("Entropy: ", entropy)

In [None]:
dir = '/Users/k/Docs/School/Tuebingen/Thesis/Corpuses/Parallel/'

for file in os.listdir(dir):
    if file.endswith('.txt'):  
        with open(file, encoding='utf-8') as f:
            corpus_text = f.read()
            tokens = preprocess_text(corpus_text)
            word_probabilities = calculate_word_probabilities(tokens)
            entropy = calculate_entropy(word_probabilities)

            print(f'Entropy of {f}: ', entropy)

Word Length Distribution

In [None]:
with open('/Users/k/Docs/School/Tuebingen/Thesis/Corpuses/Parallel/en.txt') as f:
    text = f.read()
    tokens = nltk.word_tokenize(text.lower())
    word_lengths = [len(word) for word in tokens]
    frequency_distribution = nltk.FreqDist(word_lengths)
    print(frequency_distribution)

In [None]:
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    return tokens

def calculate_word_lengths(tokens):
    word_lengths = [len(word) for word in tokens]
    fdist = nltk.FreqDist(word_lengths)
    return fdist

def plot_word_length_distribution(frequency_distribution):
    plt.bar(frequency_distribution.keys(), frequency_distribution.values())
    plt.xlabel('Word Length')
    plt.ylabel('Frequency')
    plt.title('Word Length Distribution')
    plt.show()

In [None]:
with open('/Users/k/Docs/School/Tuebingen/Thesis/Corpuses/Parallel/en.txt', encoding='utf-8') as f:
    corpus_text = f.read()
    tokens = preprocess_text(corpus_text)
    fdist = calculate_word_lengths(tokens)
    plot_word_length_distribution(fdist)
    

In [None]:
with open('/Users/k/Docs/School/Tuebingen/Thesis/Corpuses/Parallel/lfn.txt', encoding='utf-8') as f:
    corpus_text = f.read()
    tokens = preprocess_text(corpus_text)
    fdist = calculate_word_lengths(tokens)
    fdist.plot(title='Word Length Dist', cumulative=False, percents=False, show=True)

Type Token Ratio

In [None]:
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    return tokens

def calculate_ttr(tokens):
    unique_words = set(tokens)
    total_tokens = len(tokens)
    ttr = len(unique_words) / total_tokens
    return ttr

with open('/Users/k/Docs/School/Tuebingen/Thesis/Corpuses/Parallel/en.txt', encoding='utf-8') as f:
    text = f.read()
    tokens = preprocess_text(text)
    ttr = calculate_ttr(tokens)
    print("Type-Token Ratio:", ttr)


In [18]:
from lexicalrichness import LexicalRichness

In [22]:
with open('/Users/k/Docs/School/Tuebingen/Thesis/Corpuses/Parallel/Balanced/en.txt') as f:
    text = f.read()

lex = LexicalRichness(text)
lex.mattr(window_size=25)

0.8761885656971639

In [23]:
with open('/Users/k/Docs/School/Tuebingen/Thesis/Corpuses/Parallel/Balanced/lfn.txt') as f:
    text = f.read()

lex = LexicalRichness(text)
lex.mattr(window_size=25)

0.8082704607754454