In [1]:
import torch, re, pandas as pd, numpy as np
from collections import Counter
from matplotlib import pyplot as plt
from sklearn import naive_bayes, svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

In [2]:
# Every corpus file, in the order it is processed downstream.
corpora = [
    'Esperanto.txt',
    'Interlingua.txt',
    'Lojban.txt',
    'Lfn.txt',
    'Russian.txt',
    'English.txt',
    'German.txt',
    'Japanese.txt',
    'Mandarin.txt',
    'Hindi.txt',
]
# Natural languages (labelled 0 by obfuscate_label_text).
natural = [
    'Russian.txt',
    'German.txt',
    'English.txt',
    'Japanese.txt',
    'Mandarin.txt',
    'Hindi.txt',
]
# Constructed languages (labelled 1 by obfuscate_label_text).
constructed = [
    'Esperanto.txt',
    'Interlingua.txt',
    'Lojban.txt',
    'Lfn.txt',
]

Using character frequency distributions

In [3]:
def char_frequency(corpus, limit=None):
    """Build a ranked character-frequency table for one corpus file.

    Args:
        corpus: Path of the text file to read (decoded as UTF-8, matching the
            other readers in this notebook).
        limit: Optional overall line budget. When truthy, only the first
            ``int(limit / len(corpora))`` lines are counted, where ``corpora``
            is the module-level list of corpus files. A falsy value
            (``None`` or ``0``) reads the whole file.

    Returns:
        A ``pandas.DataFrame`` indexed by rank (starting at 1) with columns:
        ``Char`` (the character), ``Act_Freq`` (its observed count),
        ``Rel_Freq`` (the string ``"1/<rank>"``) and ``Zipf_Freq`` (the top
        count divided by the rank). Whitespace characters are not counted.
        The frame is empty when the corpus holds no countable characters.
    """
    columns = ['Char', 'Act_Freq', 'Rel_Freq', 'Zipf_Freq']

    with open(corpus, 'r', encoding='utf-8') as f:
        text = f.read().splitlines()
    if limit:
        text = text[:int(limit / len(corpora))]

    counts = Counter(char for line in text for char in line if not char.isspace())
    ranked = counts.most_common()
    if not ranked:
        # Nothing to rank: return an empty table instead of failing on ranked[0].
        return pd.DataFrame(columns=columns)

    top_frequency = ranked[0][1]
    # Collect all rows first and build the frame once; appending through
    # df.loc re-allocates the frame for every character.
    rows = [
        [char, count, "1/{}".format(rank), top_frequency * (1 / rank)]
        for rank, (char, count) in enumerate(ranked, start=1)
    ]
    return pd.DataFrame(rows, columns=columns, index=range(1, len(rows) + 1))


def char_vocabulary(corpus=None, df=None, limit=None):
    """Return a ``{character: count}`` vocabulary.

    Args:
        corpus: Optional corpus path. When truthy, the frequency table is
            computed from it via ``char_frequency`` and any ``df`` argument
            is ignored.
        df: Optional pre-computed frequency table with ``Char`` and
            ``Act_Freq`` columns; used only when ``corpus`` is falsy.
        limit: Forwarded to ``char_frequency`` when ``corpus`` is given.

    Returns:
        A dict mapping each value of the ``Char`` column to the matching
        value of the ``Act_Freq`` column, in table order.

    Raises:
        TypeError: If neither ``corpus`` nor ``df`` is supplied.
    """
    if corpus:
        df = char_frequency(corpus, limit=limit)
    if df is None:
        raise TypeError("char_vocabulary() requires either 'corpus' or 'df'")
    return dict(zip(df['Char'], df['Act_Freq']))


def plot_char_dist(corpus):
    """Draw a bar chart of the ``Zipf_Freq`` column for every character in a corpus.

    Args:
        corpus: Path of the corpus file; the whole file is used (no line limit).

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # char_frequency only accepts (corpus, limit); passing df=... raises TypeError.
    df = char_frequency(corpus, limit=None)
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.bar(df['Char'], df['Zipf_Freq'])
    ax.set_xlabel('Char')
    ax.set_ylabel('Zipf Frequency')
    # Rotate via tick_params so the setting also applies to ticks created by the bars.
    ax.tick_params(axis='x', labelrotation=90)
    plt.show()
    
    
def corpus_lengths(corpora):
    """Count the lines of each corpus file.

    Args:
        corpora: Iterable of corpus file paths (read as UTF-8).

    Returns:
        A dict keyed by the part of each path before its first ``"."``,
        mapping to the number of lines ``str.splitlines`` finds in the file.
    """
    line_counts = {}
    for path in corpora:
        with open(path, 'r', encoding='utf-8') as handle:
            n_lines = len(handle.read().splitlines())
        line_counts[path.split(".")[0]] = n_lines
    return line_counts


def vocab_sizes(corpora):
    """Report the number of distinct characters found in each corpus.

    Args:
        corpora: Iterable of corpus file paths.

    Returns:
        A dict keyed by the part of each path before its first ``"."``,
        mapping to the size of the vocabulary ``char_vocabulary`` builds
        from the whole file.
    """
    return {
        path.split(".")[0]: len(char_vocabulary(corpus=path, df=None, limit=None))
        for path in corpora
    }
    
    
def obfuscate_label_text(corpus, vocab):
    """Label every line of a corpus and replace its characters with frequency-derived symbols.

    Args:
        corpus: Path of the corpus file (read as UTF-8). It must appear in the
            module-level ``natural`` or ``constructed`` list.
        vocab: ``{character: count}`` mapping used to build the translation
            table; characters absent from it are left unchanged.

    Returns:
        A list of ``(obfuscated_line, label)`` tuples, one per line of the
        file, where ``label`` is 0 for a natural language and 1 for a
        constructed one.

    Raises:
        ValueError: If ``corpus`` is in neither ``natural`` nor ``constructed``.
    """
    # Labeling the data for binary classification, 0 for Natlang and 1 for Conlang
    if corpus in natural:
        label = 0
    elif corpus in constructed:
        label = 1
    else:
        raise ValueError(
            f"{corpus!r} is listed in neither 'natural' nor 'constructed'; cannot assign a label"
        )

    with open(corpus, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    # Each count is assigned the symbol chr(97 + i), i being the position of the
    # count in the vocabulary.
    # NOTE(review): the symbol is keyed by count, so characters sharing a count
    # are translated to the same symbol (the last position wins) - confirm this
    # is intended rather than one symbol per character.
    symbol_for_count = {count: chr(97 + i) for i, count in enumerate(vocab.values())}
    table = str.maketrans({char: symbol_for_count[count] for char, count in vocab.items()})
    return [(line.translate(table), label) for line in lines]


# def one_hot_encode(data, vocab):
#     seq_length = max(len(line[0] for line in data))
#     string_encoded = np.zeros((seq_length, len(vocab)), dtype=np.float32)
#     for string, label in data:
#         for i, char in enumerate(string):
#             if i >= seq_length:
#                 break
#             string_encoded[i][vocab[char]] = 1
#         label_encoded = np.array([label], dtype=np.int64)
#     return string_encoded, label_encoded


def preprocess_text(corpora, limit=None):
    """Obfuscate and label the lines of every corpus, concatenated in order.

    Args:
        corpora: Iterable of corpus file paths.
        limit: Forwarded to ``char_vocabulary`` only, i.e. it bounds the lines
            used to build each vocabulary; ``obfuscate_label_text`` is not
            given a limit.

    Returns:
        A single list of ``(obfuscated_line, label)`` tuples covering all
        corpora in the order given.
    """
    return [
        labelled_line
        for corpus in corpora
        for labelled_line in obfuscate_label_text(
            corpus=corpus,
            vocab=char_vocabulary(corpus=corpus, df=None, limit=limit),
        )
    ]

In [4]:
# Obfuscated, labelled lines from every corpus.
data = preprocess_text(corpora, limit=100000)
text = [line for line, _ in data]
labels = [label for _, label in data]
# Bag-of-characters features; `text` is rebound to the resulting count matrix.
# NOTE(review): CountVectorizer lowercases its input by default, which may merge
# obfuscation symbols that are upper/lower-case pairs - confirm this is acceptable.
vectorizer = CountVectorizer(analyzer='char')
text = vectorizer.fit_transform(text)

In [5]:
# Hold out 20% of the vectorized lines for evaluation; the seed fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    text,
    labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

Classifying using a linear model trained with SGD (logistic regression, `loss="log_loss"`)

In [9]:
# Linear classifier with logistic loss, trained incrementally in mini-batches.
# NOTE(review): per the scikit-learn docs `validation_fraction` is only used when
# early_stopping=True, so it presumably has no effect here - confirm.
clf = SGDClassifier(loss="log_loss", tol=1e-3, verbose=1, random_state=42, validation_fraction=0.1)
batch_size = 10000
# Ceiling division: `n // batch_size + 1` yields a trailing empty batch whenever
# the number of training rows is an exact multiple of batch_size.
num_batches = (x_train.shape[0] + batch_size - 1) // batch_size

# Training
for i in range(num_batches):
    start = i * batch_size
    end = min(start + batch_size, x_train.shape[0])
    # x_train already holds vectorized features, so batches are sliced directly.
    batch_x, batch_y = x_train[start:end], y_train[start:end]
    clf.partial_fit(batch_x, batch_y, classes=[0, 1])

-- Epoch 1
Norm: 483.62, NNZs: 571, Bias: -1.930660, T: 10000, Avg. loss: 171.696578
Total training time: 0.00 seconds.
-- Epoch 1
Norm: 294.83, NNZs: 670, Bias: -1.394658, T: 10000, Avg. loss: 31.843293
Total training time: 0.00 seconds.
-- Epoch 1
Norm: 217.72, NNZs: 787, Bias: -1.090127, T: 10000, Avg. loss: 19.308277
Total training time: 0.00 seconds.
-- Epoch 1
Norm: 171.48, NNZs: 889, Bias: -0.791514, T: 10000, Avg. loss: 13.971323
Total training time: 0.00 seconds.
-- Epoch 1
Norm: 149.21, NNZs: 974, Bias: -0.568083, T: 10000, Avg. loss: 10.667529
Total training time: 0.00 seconds.
-- Epoch 1
Norm: 131.01, NNZs: 1083, Bias: -0.385974, T: 10000, Avg. loss: 8.216275
Total training time: 0.00 seconds.
-- Epoch 1
Norm: 119.12, NNZs: 1166, Bias: -0.235999, T: 10000, Avg. loss: 7.601711
Total training time: 0.00 seconds.
-- Epoch 1
Norm: 108.87, NNZs: 1238, Bias: -0.110418, T: 10000, Avg. loss: 6.014324
Total training time: 0.00 seconds.
-- Epoch 1
Norm: 99.32, NNZs: 1312, Bias: 0.013

In [10]:
# Metrics: score the trained classifier on the held-out split.
accuracy = clf.score(X=x_test, y=y_test)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9304228672587461


Classifying Using RNN