In [1]:
import torch, os, re, glob, string, pandas as pd, numpy as np
from collections import Counter
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
# Corpus file paths (relative to the working directory) used by the cells below.
corpora = [
    'Esperanto.txt',
    'Interlingua.txt',
    'Lojban.txt',
    'Lfn.txt',
    'Russian.txt',
    'English.txt',
    'German.txt',
    'Japanese.txt',
    'Mandarin.txt',
    'Hindi.txt',
]

In [7]:
def char_frequency(corpus):
    """Build a ranked character-frequency table for a text corpus.

    Every non-whitespace character in the file is counted, the characters are
    ranked from most to least frequent, and each observed count is paired
    with the count predicted for its rank by scaling the top count by 1/rank.

    Parameters
    ----------
    corpus : str or path-like
        Path to a UTF-8 encoded text file.

    Returns
    -------
    pandas.DataFrame
        One row per distinct character, indexed by rank starting at 1, with
        columns:

        - ``'Char'``: the character.
        - ``'Act_Freq'``: number of occurrences in the corpus.
        - ``'Rel_Freq'``: the string ``"1/<rank>"``.
        - ``'Zipf_Freq'``: ``top_count * (1 / rank)``.

        The frame has these columns but no rows when the corpus contains no
        non-whitespace characters.
    """
    columns = ['Char', 'Act_Freq', 'Rel_Freq', 'Zipf_Freq']

    # Read with an explicit encoding so the counts do not depend on the
    # platform's default locale (obfuscate_text reads the same files as UTF-8).
    with open(corpus, 'r', encoding='utf-8') as f:
        text = f.read().splitlines()

    counts = Counter(
        char for line in text for char in line if not char.isspace()
    )
    ranked = counts.most_common()
    if not ranked:
        # Nothing to rank: return the empty table instead of failing on ranked[0].
        return pd.DataFrame(columns=columns)

    top_frequency = ranked[0][1]
    # Collect all rows first and build the frame once; growing a DataFrame
    # one row at a time re-allocates on every insert.
    rows = [
        (char, count, "1/{}".format(rank), top_frequency * (1 / rank))
        for rank, (char, count) in enumerate(ranked, start=1)
    ]
    return pd.DataFrame(rows, columns=columns, index=range(1, len(rows) + 1))


def char_vocabulary(corpus=None, df=None):
    """Return a ``{character: count}`` mapping for a corpus.

    Parameters
    ----------
    corpus : str or path-like, optional
        Path to a corpus file. When given (truthy), the frequency table is
        computed from the file with ``char_frequency`` and ``df`` is ignored.
    df : pandas.DataFrame, optional
        A frequency table with ``'Char'`` and ``'Act_Freq'`` columns, used
        when ``corpus`` is not given.

    Returns
    -------
    dict
        Characters mapped to their ``'Act_Freq'`` value, in table row order.

    Raises
    ------
    ValueError
        If neither ``corpus`` nor ``df`` is provided.
    """
    if corpus:
        df = char_frequency(corpus)
    if df is None:
        # Fail with a clear message instead of an opaque subscript error on None.
        raise ValueError("char_vocabulary requires either `corpus` or `df`")
    return dict(zip(df['Char'], df['Act_Freq']))


def plot_char_dist(corpus):
    """Show a bar chart of the 'Zipf_Freq' value for each character of a corpus.

    The frequency table is computed with ``char_frequency(corpus)``; the
    figure is displayed with ``plt.show()`` and nothing is returned.
    """
    freq_table = char_frequency(corpus)
    chars = freq_table['Char']
    zipf_values = freq_table['Zipf_Freq']

    plt.figure(figsize=(10, 10))
    plt.xlabel('Char')
    plt.ylabel('Zipf Frequency')
    # Tick rotation is requested before the bars are drawn; keep this order.
    plt.xticks(rotation=90)
    plt.bar(chars, zipf_values)
    plt.show()
    
    
def obfuscate_text(corpus, vocab):
    """Replace each vocabulary character in a corpus with a position-based code.

    The i-th key of ``vocab`` (0-based, in iteration order) is replaced by
    ``chr(97 + i)``: the first key becomes ``'a'``, the second ``'b'``, and
    so on, continuing past ``'z'`` into the code points that follow.
    Characters that are not keys of ``vocab`` are left unchanged.

    Parameters
    ----------
    corpus : str or path-like
        Path to a UTF-8 encoded text file.
    vocab : dict
        Mapping whose keys are single characters. Only the key order decides
        which code a character receives.

    Returns
    -------
    list of str
        The corpus lines (line endings removed) after substitution.
    """
    with open(corpus, 'r', encoding='utf-8') as f:
        text = f.read().splitlines()

    # Assign codes by position, not by frequency value: keying the table on
    # the frequency made every character with the same count share one code.
    codes = {char: chr(97 + position) for position, char in enumerate(vocab)}
    table = str.maketrans(codes)
    return [line.translate(table) for line in text]


def preprocess_text(corpora):
    """Obfuscate every corpus and return all resulting lines in one flat list.

    For each path in ``corpora`` the character-frequency table is computed,
    turned into a vocabulary, and used to obfuscate that same file. The
    obfuscated lines of all corpora are concatenated in input order.
    """
    obfuscated_lines = []
    for path in corpora:
        frequency_table = char_frequency(path)
        vocabulary = char_vocabulary(corpus=None, df=frequency_table)
        obfuscated_lines.extend(obfuscate_text(path, vocabulary))
    return obfuscated_lines

In [8]:
# Obfuscate every corpus file and collect the resulting lines in one list.
data = preprocess_text(corpora)

In [9]:
def lengths(corpora):
    """Print ``<path> : <number of lines>`` for each file in ``corpora``.

    Each file is read as UTF-8 and its lines are counted with
    ``str.splitlines``. Nothing is returned.
    """
    for path in corpora:
        with open(path, 'r', encoding='utf-8') as handle:
            line_count = len(handle.read().splitlines())
        print(f'{path} : {line_count}')
        
# Print the number of lines in each corpus file.
lengths(corpora)

Esperanto.txt : 300000
Interlingua.txt : 1297382
Lojban.txt : 16287
Lfn.txt : 136241
Russian.txt : 100000
English.txt : 100000
German.txt : 100000
Japanese.txt : 100000
Mandarin.txt : 100000
Hindi.txt : 100000
