In [1]:
import re
import pandas as pds
import numpy as np

from gensim.models import FastText
from nltk import word_tokenize
from nltk import ngrams



In [2]:
def read_file(fname):
    """Load a UTF-8 text file into a one-column DataFrame, one row per line.

    Leading and trailing whitespace (including the line terminator) is
    stripped from every line before it is stored.
    """
    with open(fname, encoding="utf8") as handle:
        stripped_lines = [line.strip() for line in handle]

    return pds.DataFrame(stripped_lines)

Import files

In [3]:
# Load the parallel text/label files into two-column frames (one row per line).
train = pds.concat([read_file('x_train.txt'), read_file('y_train.txt')], axis=1)
train.columns = ['x', 'label']

test = pds.concat([read_file('x_test.txt'), read_file('y_test.txt')], axis=1)
test.columns = ['x', 'label']

# Integer-encode the training labels and keep a label <-> code lookup table
# whose rows are ordered by code.
train['cat'] = train.label.factorize()[0]
id_to_lang = train[['label', 'cat']].drop_duplicates().sort_values('cat').reset_index(drop=True)

# Encode the test labels with the training mapping. One dict lookup per row
# replaces filtering `id_to_lang` inside an iterrows() loop.
label_to_cat = dict(zip(id_to_lang['label'], id_to_lang['cat']))
unknown_labels = sorted(set(test['label']) - set(label_to_cat))
if unknown_labels:
    # Fail with a readable message instead of an opaque IndexError.
    raise ValueError(f"Test labels missing from the training set: {unknown_labels}")
cats = [label_to_cat[label] for label in test['label']]

test['cat'] = cats
test.cat.to_pickle("y_test_original.pkl")

Regex for splitting each text into words

In [4]:
# Regex testing cell: compare three ways of splitting one training sample.
x = train.iloc[52].x
# Raw string so regex escapes such as \[ \! \? \s reach `re` untouched instead
# of relying on Python passing unknown string escapes through (deprecated since
# 3.6, SyntaxWarning since 3.12). `re` itself understands \n, \r, \t, \xA0 and
# \uXXXX, so the compiled character class matches the same characters.
reg = re.compile(r"[,.、，() «»\[\]0-9-_\!\?—\"=&%#。‧《》〈〉 ໌﹏༌་\s\n\r\t \xA0\u1680\u180E\u2000-\u200B\u202F\u205F\u3000\uFEFF]+")
# 1) split on the single "，" character only
print(x.split("，"), "\n----")
# 2) NLTK word tokenizer
print(word_tokenize(x), "\n----")
# 3) split on the custom delimiter class
print(reg.split(x))

['Indtil 1545 havde flådecheferne kunnet hyre et betydeligt antal frie mænd til galejerne, selv om de kun sjældent var venetianere. De kom fra Dalmatien, Kreta og Grækenland. Herefter gik man i stigende grad over til tvangsudskrivning af fanger og skyldnere, ligesom det længe havde været normalt i resten af Europa. På langt sigt havde det den konsekvens for arbejdsmarkedet, at stadig færre lønmodtagere tjente deres penge på havet.'] 
----
['Indtil', '1545', 'havde', 'flådecheferne', 'kunnet', 'hyre', 'et', 'betydeligt', 'antal', 'frie', 'mænd', 'til', 'galejerne', ',', 'selv', 'om', 'de', 'kun', 'sjældent', 'var', 'venetianere', '.', 'De', 'kom', 'fra', 'Dalmatien', ',', 'Kreta', 'og', 'Grækenland', '.', 'Herefter', 'gik', 'man', 'i', 'stigende', 'grad', 'over', 'til', 'tvangsudskrivning', 'af', 'fanger', 'og', 'skyldnere', ',', 'ligesom', 'det', 'længe', 'havde', 'været', 'normalt', 'i', 'resten', 'af', 'Europa', '.', 'På', 'langt', 'sigt', 'havde', 'det', 'den', 'konsekvens', 'for', 

In [5]:
# Delimiter class (punctuation, digits, brackets, many whitespace variants)
# used to split each text into tokens. Written as a raw string so regex
# escapes such as \[ \! \? \s are not treated as (invalid) Python string
# escapes, which warn on Python 3.6+ and emit a SyntaxWarning on 3.12+. `re`
# resolves \n, \xA0 and \uXXXX itself, so the matched character set is unchanged.
reg = re.compile(r"[,.、，;:() «»\[\]0-9-_\!\?—\"=&%#。‧《》〈〉 ໌﹏༌་\s\n\r\t \xA0\u1680\u180E\u2000-\u200B\u202F\u205F\u3000\uFEFF]+")
# Tokenise every text. Iterating the 'x' column directly avoids building a
# full row Series per sample with iterrows(); reg.split already returns a list.
# NOTE: a text that starts or ends with a delimiter yields an empty-string
# token at that end; these are kept, exactly as before.
corpus = [reg.split(text) for text in train['x']]
corpus_y = [reg.split(text) for text in test['x']]

... and count the words used

In [6]:
# Total number of tokens over all training samples.
words = sum(len(tokens) for tokens in corpus)
print(words)

6695838


Concatenate corpus and save

In [7]:
# Token count of every training sample.
lens = list(map(len, corpus))
# Re-join the tokens of each sample with single spaces.
ncorp = list(map(" ".join, corpus))
y_ncorp = list(map(" ".join, corpus_y))

# Persist the cleaned texts and the label lookup table.
for cleaned_rows, target_path in ((ncorp, "train_data_regex.pkl"),
                                  (y_ncorp, "test_data_regex.pkl")):
    pds.DataFrame(cleaned_rows).to_pickle(target_path)
id_to_lang.to_pickle('id_to_lang.pkl')

Function to generate n-gram dataset for LSTM

In [8]:
def write_to_file(file, grams, N, c, cats, prob):
    """Write randomly sampled windows of N consecutive n-grams to `file`.

    Every start position in `grams` is kept with probability `prob` (one draw
    from NumPy's global RNG per position). A kept position produces one line:
    the N n-grams starting there, each collapsed to a string and joined with
    "&". Windows that run past the end wrap around to the start of `grams`.

    Parameters
    ----------
    file : writable text file object
    grams : sequence of n-grams, each an iterable of strings
    N : number of n-grams per written line
    c : label appended to `cats` once per written line
    cats : list that collects the labels (mutated in place)
    prob : probability of keeping a start position

    Returns
    -------
    int
        Number of lines written.
    """
    total = len(grams)
    written = 0
    for start in range(total):
        # Exactly one random draw per start position, kept or not.
        if not (np.random.random() < prob):
            continue
        # The modulo wraps the window around the end of the sequence.
        window = ("".join(grams[pos % total]) for pos in range(start, start + N))
        file.write("&".join(window) + "\n")
        cats.append(c)
        written += 1

    return written

Write the new datasets; they are stored in .txt format so that they can be read by generators.

In [9]:
n_gram = 5   # characters per n-gram
N = 10       # consecutive n-grams per written line

# Seed NumPy's global RNG so the train/validation split and the sampled
# n-gram windows are the same on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

TRAIN_FRACTION = 0.9   # share of rows routed to the training file
SAMPLE_PROB = 0.3      # probability of keeping each n-gram start position

train_cats = []
valid_cats = []
# True -> row goes to the training file, False -> validation file.
msk = np.random.rand(len(ncorp)) < TRAIN_FRACTION
# Read the label column once; train.iloc[i]['cat'] would build a full row
# Series on every iteration.
train_row_cats = train['cat'].tolist()
with open("x_train_data.txt", "w", encoding="utf8") as file, open("x_valid_data.txt", "w", encoding="utf8") as file2:
    for i in range(len(ncorp)):
        row = ncorp[i]
        c = train_row_cats[i]
        # Character n-grams of the cleaned text, space-padded on both sides.
        grams = list(ngrams(row, n_gram, pad_left=True,
                       left_pad_symbol=" ",
                       pad_right=True,
                       right_pad_symbol=" "))
        if msk[i]:
            write_to_file(file, grams, N, c, train_cats, SAMPLE_PROB)
        else:
            write_to_file(file2, grams, N, c, valid_cats, SAMPLE_PROB)

In [10]:
test_cats = []
test_lens = []
# Space padding applied on both sides of each text before n-gram extraction.
pad_options = dict(pad_left=True, left_pad_symbol=" ",
                   pad_right=True, right_pad_symbol=" ")
with open("x_test_data.txt", "w", encoding="utf8") as file:
    for i, row in enumerate(y_ncorp):
        c = test.iloc[i]['cat']
        grams = list(ngrams(row, n_gram, **pad_options))
        # Keep the number of lines produced by this row.
        lines_for_row = write_to_file(file, grams, N, c, test_cats, 0.1)
        test_lens.append(lines_for_row)

Also save, for each row in the test set, the number of n-gram sequences written for it, along with the labels for each set

In [11]:
# test_lens[k] is the number of n-gram lines written for test row k.
# Presumably used later to group per-line predictions back per row — TODO confirm.
pds.Series(test_lens).to_pickle('x_test_lens.pkl')

In [12]:
# One label id per line, in the same order as the n-gram lines of each set.
label_files = (
    ("y_train_id.txt", train_cats),
    ("y_valid_id2.txt", valid_cats),
    ("y_test_id.txt", test_cats),
)
for label_path, label_ids in label_files:
    with open(label_path, "w", encoding="utf8") as file:
        file.writelines(str(label_id) + "\n" for label_id in label_ids)

For each test-set n-gram sequence, also save the corresponding text label

In [14]:
# Row index -> {'label': ..., 'cat': ...}. The lookup below uses the category
# id as the row index of id_to_lang.
ids = id_to_lang.to_dict('index')
y_test_text_labels = [ids[cat_id]['label'] for cat_id in test_cats]
pds.Series(y_test_text_labels).to_pickle("y_test_labels.pkl")

Generate fastText model

In [15]:
# Dimensionality passed to FastText as the vector size.
wv_size = 100
# Train a FastText model on the token lists in `corpus`.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4.x
# renamed them to `vector_size` and `epochs` — confirm the installed version.
ft_model = FastText(corpus, size = wv_size, workers = 8, iter= 10)

In [16]:
# Persist the trained model to disk.
# NOTE(review): the file name says "5-gram", but the FastText(...) call in this
# notebook does not set min_n/max_n — confirm what the name refers to.
ft_model.save("fasttext_5-gram.model")