In [27]:
import torch, os, re, glob, string, pandas as pd
from collections import Counter
from matplotlib import pyplot as plt
from sklearn import preprocessing

In [46]:
def char_frequency(corpus, encoding='utf-8'):
    """Build a ranked character-frequency table for a text corpus.

    Every non-whitespace character in the file is counted, and the distinct
    characters are ranked from most to least frequent (ties keep the order in
    which the characters were first seen).

    Parameters
    ----------
    corpus : str or path-like
        Path of the text file to analyse.
    encoding : str, default 'utf-8'
        Encoding used to decode the file. It is given explicitly so the result
        does not depend on the platform's default locale encoding.

    Returns
    -------
    pandas.DataFrame
        One row per distinct character, indexed by rank starting at 1, with
        the columns:

        * ``Char``      - the character,
        * ``Act_Freq``  - number of occurrences in the corpus,
        * ``Rel_Freq``  - the string ``"1/<rank>"``,
        * ``Zipf_Freq`` - count of the top-ranked character divided by rank.

        The frame is empty (same columns) when the corpus contains no
        non-whitespace characters.
    """
    columns = ['Char', 'Act_Freq', 'Rel_Freq', 'Zipf_Freq']

    # The `with` block closes the file; no explicit close() is needed.
    with open(corpus, encoding=encoding) as f:
        # Line terminators are whitespace, so they are filtered out together
        # with spaces and tabs.
        counts = Counter(
            char for line in f for char in line if not char.isspace()
        )

    ranked = counts.most_common()
    if not ranked:
        # Nothing to rank: return an empty table instead of failing on ranked[0].
        return pd.DataFrame(columns=columns)

    top_frequency = ranked[0][1]
    # Build all rows first and create the frame once; appending rows one at a
    # time with df.loc re-allocates the frame on every insertion.
    rows = [
        (char, count, "1/{}".format(rank), top_frequency * (1 / rank))
        for rank, (char, count) in enumerate(ranked, start=1)
    ]
    return pd.DataFrame(rows, columns=columns, index=range(1, len(rows) + 1))


def char_vocabulary(df):
    """Return a dict mapping each entry of ``df['Char']`` to the matching
    entry of ``df['Act_Freq']`` (paired positionally)."""
    chars = df['Char']
    counts = df['Act_Freq']
    return dict(zip(chars, counts))

def plot_char_dist(df):
    """Show a bar chart of ``df['Zipf_Freq']`` against ``df['Char']``.

    A new 10x10 figure is created, the x tick labels are rotated by 90
    degrees, and the figure is displayed with ``plt.show()``.
    """
    labels = df['Char']
    heights = df['Zipf_Freq']

    plt.figure(figsize=(10, 10))
    plt.xlabel('Char')
    plt.ylabel('Zipf Frequency')
    plt.xticks(rotation=90)
    plt.bar(labels, heights)
    plt.show()

In [19]:
def word_frequency(corpus, encoding='utf-8'):
    """Build a ranked word-frequency table for a text corpus.

    Each line is split on whitespace; every token is lower-cased and has the
    characters in ``exclude`` stripped from both ends. Tokens that are empty
    after stripping are ignored. The distinct words are ranked from most to
    least frequent (ties keep first-seen order).

    NOTE(review): ``exclude`` is a notebook-level name that is not defined in
    this function; it is passed to ``str.strip``, so it is presumably a string
    of characters (e.g. punctuation) to trim - confirm it is defined before
    this function is called.

    Parameters
    ----------
    corpus : str or path-like
        Path of the text file to analyse.
    encoding : str, default 'utf-8'
        Encoding used to decode the file. It is given explicitly so the result
        does not depend on the platform's default locale encoding.

    Returns
    -------
    pandas.DataFrame
        One row per distinct word, indexed by rank starting at 1, with the
        columns:

        * ``Word``      - the normalised word,
        * ``Act_Freq``  - number of occurrences in the corpus,
        * ``Rel_Freq``  - the string ``"1/<rank>"``,
        * ``Zipf_Freq`` - count of the top-ranked word divided by rank.

        The frame is empty (same columns) when the corpus yields no words.
    """
    columns = ['Word', 'Act_Freq', 'Rel_Freq', 'Zipf_Freq']

    # The `with` block closes the file; no explicit close() is needed.
    with open(corpus, encoding=encoding) as f:
        counts = Counter(
            word.lower().strip(exclude)
            for line in f
            for word in line.split()
            if word.strip(exclude)
        )

    ranked = counts.most_common()
    if not ranked:
        # Nothing to rank: return an empty table instead of failing on ranked[0].
        return pd.DataFrame(columns=columns)

    top_frequency = ranked[0][1]
    # Build all rows first and create the frame once; appending rows one at a
    # time with df.loc re-allocates the frame on every insertion.
    rows = [
        (word, count, "1/{}".format(rank), top_frequency * (1 / rank))
        for rank, (word, count) in enumerate(ranked, start=1)
    ]
    return pd.DataFrame(rows, columns=columns, index=range(1, len(rows) + 1))


def word_vocabulary(df):
    """Return a dict mapping each entry of ``df['Word']`` to the matching
    entry of ``df['Act_Freq']`` (paired positionally)."""
    words = df['Word']
    counts = df['Act_Freq']
    return dict(zip(words, counts))


def plot_word_dist(df):
    """Show a bar chart of ``Zipf_Freq`` against ``Word`` for the first 20
    rows of ``df``.

    A new 10x10 figure is created, the x tick labels are rotated by 90
    degrees, and the figure is displayed with ``plt.show()``.
    """
    # Only the first 20 rows (by position) are plotted.
    words = df['Word'].iloc[:20]
    zipf_values = df['Zipf_Freq'].iloc[:20]

    plt.figure(figsize=(10, 10))
    plt.xlabel('Word')
    plt.ylabel('Zipf Frequency')
    plt.xticks(rotation=90)
    plt.bar(words, zipf_values)
    plt.show()

In [45]:
def obfuscate_text(corpus, vocab):
    """Translate the characters of a corpus into substitute symbols.

    The file is read as UTF-8 and split into lines. Each distinct *value* of
    ``vocab`` is assigned the symbol ``chr(97 + i)``, where ``i`` is the
    position of that value in ``vocab`` (so the first entry gets ``'a'``).
    Every key of ``vocab`` is then replaced, in each line, by the symbol of
    its value. Characters that are not keys of ``vocab`` are left unchanged.

    NOTE(review): symbols are keyed by value, so keys sharing the same value
    all receive the symbol of the last such entry - confirm this collapsing
    of ties is intended.

    Parameters
    ----------
    corpus : str or path-like
        Path of the UTF-8 text file to translate.
    vocab : dict
        Mapping whose keys are handed to ``str.maketrans`` (which requires
        single characters or integer code points).

    Returns
    -------
    list of str
        The translated lines, without line terminators.
    """
    with open(corpus, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    # value -> replacement symbol; a later entry with the same value overwrites
    # the symbol assigned by an earlier one.
    symbol_for_value = {}
    for position, value in enumerate(vocab.values()):
        symbol_for_value[value] = chr(97 + position)

    # key -> replacement symbol, looked up through the key's value.
    substitutions = {}
    for key, value in vocab.items():
        substitutions[key] = symbol_for_value[value]

    table = str.maketrans(substitutions)
    return [line.translate(table) for line in lines]

In [None]:
# Names of the languages referenced by this notebook, one per line.
languages = [
    'Dothraki',
    'Lojban',
    'LdP',
    'LFN',
    'Esperanto',
    'Interlingua',
    'Klingon',
    'German',
    'English',
    'Japanese',
    'Russian',
    'Chinese',
    'Hindi',
    'Arabic',
]

In [None]:
# Join conlang corpora together

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# every rerun process the corpora in the same order.
files = sorted(glob.glob('./Data/Conlangs/*.txt'))

def join_corpora(files):
    """Compute the character-frequency table of each corpus and join them.

    ``char_frequency`` is called on every path in ``files`` and the resulting
    frames are stacked, in the order of ``files``, into a single DataFrame
    with a fresh 0-based index.

    Parameters
    ----------
    files : iterable of str or path-like
        Paths of the corpus files to process.

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation of the per-corpus frequency tables, or an
        empty DataFrame when ``files`` is empty.
    """
    # Collect whole frames. `list += DataFrame` would instead extend the list
    # with the frame's column labels, because iterating a DataFrame yields
    # its column names.
    conlang_dfs = [char_frequency(corpus) for corpus in files]

    if not conlang_dfs:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()

    return pd.concat(conlang_dfs, ignore_index=True)

In [None]:
def wordToTensor(word):
    tensor = torch.zeros(len(word), 1, n_dim)