In [0]:
import re
from tqdm import tqdm
import numpy as np
import random
from copy import deepcopy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

from collections import defaultdict, Counter
#from umap import UMAP
#from sklearn.decomposition import PCA
#%matplotlib notebook
#import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
#from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, \
                            precision_score, recall_score, accuracy_score, f1_score


In [0]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential

In [0]:
def encodeString(s, c_freqs) -> list:
    res = []
    for c in s:
        res.append(c_freqs[c])
    return res

# def encodeString2(s) -> list:
#     res = np.zeros(len(c_freqs)*7)
#     for i, c in enumerate(s):
#         res[c_freqs[c] + len(c_freqs) * i] = 1
#     return res

def decodeString(s, c_freqs) -> list:
    res = []
    for c in s:
        for c2, k2 in c_freqs.items():
            if k2 == c:
                res.append(c2)
                break
    return "".join(res)

# Это 7-граммы
def extractNGrammsFromFile(filename:str, soi, freq_thr=1) -> (dict, dict):
    with open(filename) as infile:
        test_text = infile.read(200000000)
    for c in tqdm(".,?!:;\t\n\r/[{(<>)}]\"'@#$%^*_+&0123456789=", desc='Replacement'):
        test_text = test_text.replace(c, " ")
    for i in tqdm(range(16), desc='Replacement'):
        test_text = test_text.replace("  ", " ")
    test_text = test_text.lower()
        
    words = test_text.split(" ")
    word_freqs = defaultdict((int))
    for word in tqdm(words, desc='Creating n-gramms'):
        if len(word) < 4:
            continue
        cur = "   " + word[:4]
        if cur[3] in soi:
            word_freqs[cur] += 1
        for c in word[4:]:
            cur = cur[1:] + c
            if cur[3] in soi:
                word_freqs[cur] += 1
        for i in [1,2,3]:
            cur = cur[1:] + " "
            if cur[3] in soi:
                word_freqs[cur] += 1
                
    to_del = []
    for word, freq in word_freqs.items():
        #if freq<10:
        if freq < freq_thr:
            to_del.append(word)

    for word in to_del:
        del word_freqs[word]
    to_del = []
    
    c_freqs = Counter(test_text[:10000000])
    # Можно обойтись без сортировки, если что.
    s_freqs = sorted([(c, f) for c, f in c_freqs.items() if f>2], key=lambda x: x[1], reverse=True)
    c_freqs = {c:i for i, c in enumerate([x[0] for x in s_freqs])}
    return word_freqs, c_freqs

replaces = {"ä":"a", "Ä":"A", "à":"a", "À":"A","â":"a", "Â":"A", "á":"a", "Á":"A","ā":"a", "Ā":"A", "ă":"a", "Ă":"A", \
            "č":"c", "Č":"C", "ć":"c", "Ć":"C", "ç":"c", "Ç":"C", \
            "đ":"d", "Ð": "D", "ď":"d", "Ď":"D", \
            "é":"e", "É":"E", "è":"e", "È":"E", "ê":"e", "Ê":"E", "ē":"e", "Ē":"E", \
            "ğ":"g", "Ğ":"G", "ģ":"g", "Ģ":"G", \
            "î":"i", "Î":"I", "ī":"i", "Ī":"I", "í":"i", "Í":"I", \
            "ķ":"k", "Ķ":"K", \
            "ļ":"l", "Ļ":"L", "ĺ":"l", "Ĺ":"L", "ľ":"l", "Ľ":"L", \
            "ņ":"n", "Ņ":"N", "ň":"n", "Ň":"N", \
            "ö":"o", "Ö":"O", "ô":"o", "Ô":"O", "ó":"o", "Ó":"O", \
            "ŕ":"r", "Ŕ":"R", \
            "ş":"s", "Ş":"S", "š":"s", "Š":"S", \
            "ț":"t", "Ț":"T", "ť":"t", "Ť":"T", \
            "ü":"u", "Ü":"U", "ù":"u", "Ù":"U", "ū":"u", "Ū":"U", "û":"u", "Û":"U", "ú":"u", "Ú":"U", \
            "ý":"y", "Ý":"Y", \
            "ž":"z", "Ž":"Z"
           }

symbols_of_interest = list(replaces.keys())

def makeNumericalVectors(word_freqs: dict, c_freqs: dict) -> (list, list):
    vectors = []
    classes = []
    for word in tqdm(word_freqs.keys(), desc='Vectorizing words'):
        try:
            if word[3] in symbols_of_interest:
                c = 1
                word2 = word[:3] + replaces[word[3]] + word[4:]
            else:
                c = 0
                word2 = word
            e = encodeString(word2, c_freqs) 
            for i in range(word_freqs[word]):
                vectors.append(e)
                classes.append(c)
        except Exception:
            pass
        
    return vectors, classes

def makeNormalizedNumericalVectors(word_freqs: dict, c_freqs: dict) -> (list, list):
    total = sum(c_freqs.values())
    for k in c_freqs.keys():
        c_freqs[k] /= total
    return makeNumericalVectors(word_freqs, c_freqs)

def makeData4GeneratorsAndSplit(word_freqs: dict, c_freqs: dict, percent: float) -> (dict, dict):
    thr = sum(word_freqs.values()) * percent
    prc = 0
    test_words = {}
    train_words = {}
    len1 = len(word_freqs)
    words1 = list(word_freqs.keys())
    
    fail = 0
    while prc < thr and fail < 20:
        index = random.randint(0, len1 - 1)
        w = words1[index]
        try:
            e = encodeString(w, c_freqs) 
        except Exception:
            del words1[index]
            len1 -= 1
            continue
        if w in test_words.keys():
            fail += 1
            continue
        test_words[w] = word_freqs[w]
        prc += word_freqs[w]
        fail = 0
        
    for w, f in word_freqs.items():
        if w not in test_words.keys():
            try:
                e = encodeString(w, c_freqs) 
            except Exception:
                continue
            train_words[w] = f

    return train_words, test_words

def makeData4GeneratorsAndSplit2(word_freqs: dict, c_freqs: dict, percent: float) -> (dict, dict):
    thr = sum(word_freqs.values()) * percent
    prc = 0
    test_words = defaultdict((int))
    train_words = deepcopy(word_freqs)

    for w in word_freqs:
        try:
            e = encodeString(w, c_freqs) 
        except Exception:
            del train_words[w]

    len1 = len(train_words)
    words1 = list(train_words.keys())
    freqs1 = list(train_words.values())
    sum_words = sum(freqs1)
    
    while prc < thr:
        index = random.randint(0, sum_words-1)
        i, j = 0, 0
        while i<len1 and j<index:
          j += freqs1[i]
          i += 1
        if i == len1:
          i -= 1
        w = words1[i]
            
        if freqs1[i] > 10:
          delta = int(freqs1[i] * 0.1)
        else:
          delta = 1
        test_words[w] += delta
        train_words[w] -= delta
        freqs1[i] -= delta
        if train_words[w] == 0:
            del words1[i]
            del freqs1[i]
            len1 -= 1
            del train_words[w]

        prc += delta
        sum_words -= delta
        if (prc%100000) == 0:
          print(prc)
        
    return train_words, test_words


def makeNumericalVectorsAndSplit(word_freqs: dict, c_freqs: dict, percent: float) -> (list, list, list, list):
    train_words, test_words = makeData4GeneratorsAndSplit(word_freqs, c_freqs, percent)
    tx1, ty1 = makeNumericalVectors(train_words, c_freqs)
    tx2, ty2 = makeNumericalVectors(test_words, c_freqs)
    return tx1, tx2, ty1, ty2

def makeNumericalVectorsAndSplit2(word_freqs: dict, c_freqs: dict, percent: float) -> (list, list, list, list):
    tx1, ty1 = makeNumericalVectors(word_freqs, c_freqs)
    return train_test_split(tx1, ty1)



In [0]:
# def endEpoche():
#     train_dict2 = deepcopy(train_dict)

def encodeString3(s) -> list:
    res = np.zeros((len(c_freqs)*7, 1))
    for i, c in enumerate(s):
        res[c_freqs[c] + len(c_freqs) * i] = np.ones(1)
    return res

def generateOneBatch(keys):
    batch = []
    classes = []
    for j in range(batch_size):
#             for j in range(1):
        index = random.randint(0, len(keys) - 1)
        word = keys[index]
        if word[3] in symbols_of_interest:
            c = [1, 0]
            word2 = word[:3] + replaces[word[3]] + word[4:]
        else:
            c = [0, 1]
            word2 = word

        batch.append(encodeString3(word2))
        classes.append(c)
        if train_dict2[word] == 1:
            del keys[index]
            #print("del", len(keys))
        else:
            train_dict2[word] -= 1
    return np.array(batch), np.array(classes), keys
    
def generateBatch():
    while True:
        dlen = int(sum(train_dict2.values()) / batch_size)
        keys = list(train_dict2.keys())
        for i in range(dlen):
            batch, classes, keys = generateOneBatch(keys)
            yield batch, classes

def generateTestBatch():
    while True:
        dlen = int(sum(train_dict2.values()) / batch_size)
        keys = list(train_dict2.keys())
        for i in range(dlen):
            batch, classes, keys = generateOneBatch(keys)
        
            if (i%100) == 0:
                print("batch", i)
            global test_y
            test_y.extend([n[0] for n in classes])
            
            yield batch, classes
            
            
class MyCustomCallback(keras.callbacks.Callback):

    def on_epoch_begin(self, epoch, logs=None):
        global train_dict2, train_dict
        train_dict2 = deepcopy(train_dict)
        print("batch copied")


In [0]:
c_freqs = []

def getDenseData(filename, soi):
    word_freqs, c_freqs = extractNGrammsFromFile(filename, soi, 2)
    #train_x, test_x, train_y, test_y =
    return makeNumericalVectorsAndSplit(word_freqs, c_freqs, 0.2)

def getDenseData2(filename, soi):
    word_freqs, c_freqs = extractNGrammsFromFile(filename, soi, 2)
    #train_x, test_x, train_y, test_y =
    return makeNumericalVectorsAndSplit2(word_freqs, c_freqs, 0.2)

def getGeneratorData(filename, soi):
    global c_freqs
    word_freqs, c_freqs = extractNGrammsFromFile(filename, soi, 2)
    #train_dict, test_dict = 
    return makeData4GeneratorsAndSplit(word_freqs, c_freqs, 0.2)

def getGeneratorData2(filename, soi):
    global c_freqs
    word_freqs, c_freqs = extractNGrammsFromFile(filename, soi, 2)
    #train_dict, test_dict = 
    return makeData4GeneratorsAndSplit2(word_freqs, c_freqs, 0.2)

In [0]:
def getDenseModel():
    model = keras.Sequential()
    model.add(layers.Dense(128, activation="relu", kernel_regularizer=keras.regularizers.l2(0.001)))
    model.add(layers.Dense(128, activation="relu", kernel_regularizer=keras.regularizers.l2(0.001)))
    model.add(layers.Dense(128, activation="relu", kernel_regularizer=keras.regularizers.l2(0.001)))
    model.add(layers.Dense(2, activation="softmax"))
    model.compile(keras.optimizers.Adam(learning_rate=0.001), 
              keras.losses.MeanSquaredError(reduction='sum'),
              metrics=['accuracy']
             )

    return model

def getConvModel():
    model2 = keras.Sequential()
    model2.add(layers.Conv1D(32, 3, activation='relu'))#,input_shape=(100, 7)))
    model2.add(layers.Conv1D(32, 3, activation='relu'))
    model2.add(layers.Flatten())
    model2.add(layers.Dense(128, activation="relu", kernel_regularizer=keras.regularizers.l2(0.001)))
    model2.add(layers.Dense(2, activation="softmax"))

    model2.compile(keras.optimizers.Adam(learning_rate=0.001), keras.losses.BinaryCrossentropy(reduction='sum'))
    
    return model2

def getLSTMModel():
    model2 = keras.Sequential()
    model2.add(layers.Dense(32, activation="relu", kernel_regularizer=keras.regularizers.l2(0.001)))
    # model2.add(layers.Bidirectional(layers.LSTM(128, activation='tanh')))
    #model2.add(layers.Embedding(256, 128, input_length=len(c_freqs)*7))
    model2.add(layers.Bidirectional(layers.LSTM(128)))
    # model2.add(layers.Dropout(0.1))
    model2.add(layers.Flatten())
    #model2.add(layers.Dense(128, activation="relu", kernel_regularizer=keras.regularizers.l2(0.001)))
    model2.add(layers.Dense(2, activation="softmax"))

    model2.compile(keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.), 
    #                keras.losses.BinaryCrossentropy(reduction='sum'),
                   keras.losses.MeanSquaredError(reduction='sum'),
                   metrics=[keras.metrics.Precision()])
    
    return model2

def getLSTMModel2():
    model2 = keras.Sequential()
    #model2.add(layers.Dense(32, activation="relu", kernel_regularizer=keras.regularizers.l2(0.001)))
    #model2.add(layers.Embedding(256, 128, input_length=len(c_freqs)*7))
    # model2.add(layers.Bidirectional(layers.LSTM(128, activation='tanh')))
    model2.add(layers.Bidirectional(layers.LSTM(128)))
    # model2.add(layers.Dropout(0.1))
    model2.add(layers.Flatten())
    #model2.add(layers.Dense(128, activation="relu", kernel_regularizer=keras.regularizers.l2(0.001)))
    model2.add(layers.Dense(2, activation="softmax"))

    model2.compile(keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.), 
    #                keras.losses.BinaryCrossentropy(reduction='sum'),
                   keras.losses.MeanSquaredError(reduction='sum'),
                   metrics=[keras.metrics.Precision()])
    
    return model2


#train_y3 = [(1,0) if y==0 else (0,1) for y in train_y]
#test_y3 = [(1,0) if y==0 else (0,1) for y in test_y]


In [0]:
def evaluate(y_real, y_hat):
    to_print = str(confusion_matrix(y_real, y_hat)) + "\n" + \
               "recall =" + str(recall_score(y_real, y_hat)) + "\n" + \
               "precision =" + str(precision_score(y_real, y_hat)) + "\n" + \
               "accuracy =" + str(accuracy_score(y_real, y_hat)) + "\n" + \
               "f1 =" + str(f1_score(y_real, y_hat)) + "\n"
    return to_print

In [0]:
turkish = {"name":"turkish", "filename":"drive/My Drive/birgun[tr]-2019.txt", 
           "soi":["uUüÜ", "sSşŞ", "cCçÇ", "oOöÖ", "gGğĞ"]}
french = {"name":"french", "filename":"drive/My Drive/figaro_economics[fr]-2018.txt", 
          # "soi":["eEéÉ", "eEèÈ", "aAàÀ",  "uUùÙ",  "aAâÂ", "eEêÊ", "iIîÎ", "oOôÔ", "uUûÛ","cCçÇ"]}
          "soi":["iIîÎ", "oOôÔ", "uUûÛ","cCçÇ"]}
german = {"name":"german", "filename":"drive/My Drive/bild[de]-2018.txt", 
          "soi":["aAäÄ", "oOöÖ", "uUüÜ"]}
croatian = {"name":"croatian", "filename": "drive/My Drive/hr_200.txt", 
           "soi": ["cCčČ", "cCćĆ", "dDđÐ", "sSšŠ", "zZžŽ"]}
latvian = {"name": "latvian", "filename":"drive/My Drive/lv_200.txt",
           "soi":["aAāĀ", "cCčČ", "eEēĒ", "gGģĢ", "iIīĪ", "kKķĶ", "lLļĻ", "nNņŅ", "sSšŠ", "uUūŪ", "zZžŽ"]}
romanian = {"name": "romanian", "filename": "drive/My Drive/ro_200.txt",
            "soi": ["aAăĂ", "aAâÂ", "iIîÎ", "sSșȘ", "tTțȚ"]}
slovak = {"name":"slovak", "filename":"drive/My Drive/sk_200.txt",
#          "soi":["aAáÁ", "aAäÄ", "cCčČ", "dDďĎ", "eEéÉ", "iIíÍ", "lLĺĹ", "lLľĽ", "nNňŇ", "oOóÓ", "oOôÔ", "rRŕŔ", "sSšŠ", "tTťŤ", "uUúÚ", "yYýÝ", "zZžŽ"]}
          "soi":["nNňŇ", "oOóÓ", "oOôÔ", "rRŕŔ", "sSšŠ", "tTťŤ", "uUúÚ", "yYýÝ", "zZžŽ"]}

#languages = [turkish, french, german]
#languages = [croatian, latvian, romanian, slovak]
languages = [french]
#languages = [latvian]
#languages = [turkish]
#languages = [slovak]
#languages = [romanian]
#languages = [croatian]

In [0]:
outfilename = "evaluation34.txt"

for lang in languages:
    for soi in lang["soi"]:
        train_dict, test_dict = getGeneratorData(lang["filename"], soi)

        if sum(train_dict.values()) > 10000000:
          for k, v in tqdm(train_dict.items(), desc = "Reducing frequencies"):
            if v > 5000:
              train_dict[k] = int(train_dict[k] / 2)
            elif v > 1000:
              train_dict[k] = int(train_dict[k] / 1.5)
            elif v > 500:
              train_dict[k] = int(train_dict[k] / 1.3)
            elif v > 100:
              train_dict[k] = int(train_dict[k] / 1.2)


        #model = getLSTMModel()
        model = getConvModel()

        train_dict2 = deepcopy(train_dict)
        batch_size = 512
        steps_per_epoche = int(sum(train_dict.values()) / batch_size) - 1

        model.fit(generateBatch(), steps_per_epoch=steps_per_epoche, epochs=10, callbacks=[MyCustomCallback()])        
        #model.summary()
        
        train_dict2 = deepcopy(test_dict)
        steps_per_epoche = int(sum(train_dict2.values()) / batch_size) - 1
        test_y = []
        print(f"should be {steps_per_epoche} batches")
        yn_hat3 = model.predict(generateTestBatch(), steps=steps_per_epoche)

        yn_hat4 = [0 if y[0]<y[1] else 1 for y in yn_hat3]
        test_y = test_y[:len(yn_hat4)]
        
        eva = evaluate(test_y, yn_hat4)
        print(f"\nlanguage = {lang['name']}, soi={soi}")
        print(eva)
        s = f"------\nConvolution 2x3, one-hot data\n language = {lang['name']}, soi={soi}\n"+eva

        out_file = open(outfilename, "a")
        out_file.write(s)
        out_file.close()      


# with open(outfilename) as file:
#   text = file.read()
 

Replacement: 100%|██████████| 40/40 [00:09<00:00,  4.01it/s]
Replacement: 100%|██████████| 16/16 [00:09<00:00,  1.77it/s]
Creating n-gramms: 100%|██████████| 27262196/27262196 [00:48<00:00, 560516.69it/s]


batch copied
Epoch 1/10
batch copied
Epoch 2/10
batch copied
Epoch 3/10
batch copied
Epoch 4/10
batch copied
Epoch 5/10
batch copied
Epoch 6/10
batch copied
Epoch 7/10
batch copied
Epoch 8/10
batch copied
Epoch 9/10
batch copied
Epoch 10/10
should be 2996 batches
batch 0
batch 100
batch 200
batch 300
batch 400
batch 500
batch 600
batch 700
batch 800
batch 900
batch 1000
batch 1100
batch 1200
batch 1300
batch 1400
batch 1500
batch 1600
batch 1700
batch 1800
batch 1900
batch 2000
batch 2100
batch 2200
batch 2300
batch 2400
batch 2500
batch 2600
batch 2700
batch 2800
batch 2900


  _warn_prf(average, modifier, msg_start, len(result))



language = slovak, soi=rRŕŔ
[[1531538       0]
 [   2414       0]]
recall =0.0
precision =0.0
accuracy =0.9984262871328438
f1 =0.0



Replacement: 100%|██████████| 40/40 [00:09<00:00,  4.03it/s]
Replacement: 100%|██████████| 16/16 [00:09<00:00,  1.73it/s]
Creating n-gramms:  97%|█████████▋| 26329552/27262196 [00:47<00:01, 573136.04it/s]

KeyboardInterrupt: ignored

In [0]:
outfilename = "drive/My Drive/evaluation50.txt"

for lang in languages:
    for soi in lang["soi"]:
        train_dict, test_dict = getGeneratorData2(lang["filename"], soi)

        if sum(train_dict.values()) > 10000000:
          for k, v in tqdm(train_dict.items(), desc = "Reducing frequencies"):
            if v > 5000:
              train_dict[k] = int(train_dict[k] / 2)
            elif v > 1000:
              train_dict[k] = int(train_dict[k] / 1.5)
            elif v > 500:
              train_dict[k] = int(train_dict[k] / 1.3)
            elif v > 100:
              train_dict[k] = int(train_dict[k] / 1.2)


        #model = getLSTMModel()
        model = getConvModel()

        train_dict2 = deepcopy(train_dict)
        batch_size = 512
        steps_per_epoche = int(sum(train_dict.values()) / batch_size) - 1

        model.fit(generateBatch(), steps_per_epoch=steps_per_epoche, epochs=10, callbacks=[MyCustomCallback()])        
        #model.summary()
        
        train_dict2 = deepcopy(test_dict)
        steps_per_epoche = int(sum(train_dict2.values()) / batch_size) - 1
        test_y = []
        print(f"should be {steps_per_epoche} batches")
        yn_hat3 = model.predict(generateTestBatch(), steps=steps_per_epoche)

        yn_hat4 = [0 if y[0]<y[1] else 1 for y in yn_hat3]
        test_y = test_y[:len(yn_hat4)]
        
        eva = evaluate(test_y, yn_hat4)
        print(f"\nlanguage = {lang['name']}, soi={soi}")
        print(eva)
        s = f"------\nConvtolution 2x3, one-hot, mixed words\n language = {lang['name']}, soi={soi}\n"+eva

        out_file = open(outfilename, "a")
        out_file.write(s)
        out_file.close()      


# with open(outfilename) as file:
#   text = file.read()
 

Replacement: 100%|██████████| 40/40 [00:08<00:00,  4.64it/s]
Replacement: 100%|██████████| 16/16 [00:08<00:00,  1.92it/s]
Creating n-gramms: 100%|██████████| 20904669/20904669 [00:24<00:00, 845345.20it/s]


batch copied
Epoch 1/10
batch copied
Epoch 2/10
batch copied
Epoch 3/10
batch copied
Epoch 4/10
batch copied
Epoch 5/10
batch copied
Epoch 6/10
batch copied
Epoch 7/10
batch copied
Epoch 8/10
batch copied
Epoch 9/10
batch copied
Epoch 10/10

In [11]:
outfilename = "drive/My Drive/evaluation39.txt"

for lang in languages:
    for soi in lang["soi"]:
        train_x, test_x, train_y, test_y = getDenseData2(lang["filename"], soi)
        train_y3 = [(1,0) if y==0 else (0,1) for y in train_y]
        test_y3 = [(1,0) if y==0 else (0,1) for y in test_y]

        d1 = np.array(train_x, dtype=np.float32)
        d1 = d1.reshape(-1, 7, 1)
        d2 = np.array(train_y3, dtype=np.float32)
        d3 = np.array(test_x, dtype=np.float32)
        d3 = d3.reshape(-1, 7, 1)
        d4 = np.array(test_y3, dtype=np.float32)

        model = getLSTMModel2()
        #model = getLSTMModel()
        #model = getConvModel()
        model.fit(d1, d2, batch_size=2048, epochs=10, validation_data=(d3, d4))
        #model.fit(train_x, train_y3, batch_size=2048, epochs=15, validation_data=(test_x, test_y3))
        #model.summary()

        yn_hat = model.predict(d3)
        yn_hat2 = [0 if y[0]>y[1] else 1 for y in yn_hat]

        #ynt_hat = model.predict(test_x)
        #yn_hat2 = [0 if y[0]>y[1] and y[0]>0.7 else 1 for y in ynt_hat]

        eva = evaluate(test_y, yn_hat2)
        print(f"language={lang['name']}, soi={soi}")
        print("\n" + eva)
        
        out_file = open(outfilename, "a")
        s = f"------\nLSTM x 128, dense data, mixed words\n language = {lang['name']}, soi={soi}\n"+eva
        
        out_file.write(s)
        out_file.close()    

#with open(outfilename) as file:
#  text = file.read()



Replacement: 100%|██████████| 40/40 [00:12<00:00,  3.25it/s]
Replacement: 100%|██████████| 16/16 [00:10<00:00,  1.46it/s]
Creating n-gramms: 100%|██████████| 30767325/30767325 [00:42<00:00, 724574.77it/s]
Vectorizing words: 100%|██████████| 115287/115287 [00:02<00:00, 40469.72it/s]


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
language=romanian, soi=iIîÎ

[[3592423   19622]
 [  22454  205433]]
recall =0.901468710369613
precision =0.9128124236297794
accuracy =0.989042514294524
f1 =0.9071051039647461



Replacement: 100%|██████████| 40/40 [00:12<00:00,  3.25it/s]
Replacement: 100%|██████████| 16/16 [00:10<00:00,  1.48it/s]
Creating n-gramms: 100%|██████████| 30767325/30767325 [00:39<00:00, 783992.61it/s]
Vectorizing words: 100%|██████████| 41674/41674 [00:00<00:00, 42370.11it/s]


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  average, "true nor predicted", 'F-score is', len(true_sum)


language=romanian, soi=sSșȘ

[[1372908]]
recall =0.0
precision =0.0
accuracy =1.0
f1 =0.0



Replacement: 100%|██████████| 40/40 [00:12<00:00,  3.28it/s]
Replacement: 100%|██████████| 16/16 [00:10<00:00,  1.52it/s]
Creating n-gramms: 100%|██████████| 30767325/30767325 [00:39<00:00, 785175.97it/s]
Vectorizing words: 100%|██████████| 65917/65917 [00:02<00:00, 31482.23it/s]


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
language=romanian, soi=tTțȚ

[[2419529   12358]
 [  17659  190230]]
recall =0.9150556306490483
precision =0.938999348431299
accuracy =0.9886289594268605
f1 =0.9268728820372396



In [11]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
