In [1]:
import main
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# One-off extraction of the word tables from the source PDFs. Left commented
# out because the results are cached as feather files under data/.
# en = main.SearchImageOfLanguage(
#     "data/Fitzgerald - Gatsby.pdf",
#     length_threshold=20,
#     amount_threshold=1,
# )
#
# ru = main.SearchImageOfLanguage(
#     "data/Достоевский - Идиот.pdf",
#     length_threshold=20,
#     amount_threshold=1,
# )
#
# en.save_as("en_data")
# ru.save_as("ru_data")


def _read_labelled_words(path, target):
    """Load a cached word table, drop the count columns and label every row.

    Parameters
    ----------
    path : str
        Location of the feather file to read.
    target : int
        Class label written to the new ``target`` column.

    Returns
    -------
    pandas.DataFrame
        The table without ``amount`` / ``frequency`` plus a ``target`` column.
    """
    words = pd.read_feather(path)
    words = words.drop(columns=["amount", "frequency"])
    words["target"] = target
    return words


# Russian words are class 1, English words are class 0.
ru_data = _read_labelled_words("data/ru_data.feather", 1)
en_data = _read_labelled_words("data/en_data.feather", 0)

# Stack both languages into one frame with a fresh 0..n-1 index.
data = pd.concat([ru_data, en_data], ignore_index=True)
data

Unnamed: 0,word,target
0,annotation,1
1,в,1
2,котором,1
3,творческие,1
4,принципы,1
...,...,...
24580,public,0
24581,orgastic,0
24582,recedes,0
24583,boats,0


In [3]:
# Report the library versions this run uses.
print(
    f"TensorFlow - {tf.__version__}",
    f"Keras - {tf.keras.__version__}",
    sep="\n",
)

# An empty device name means TensorFlow found no GPU.
gpu_name = tf.test.gpu_device_name()
if not gpu_name:
    print("GPU - Off")
else:
    print("GPU - On")
    # TensorFlow / CUDA / CUDnn compatibility table - https://www.tensorflow.org/install/source#gpu
    build_info = tf.sysconfig.get_build_info()
    print(f"CUDA - {build_info['cuda_version']}")

TensorFlow - 2.9.0
Keras - 2.9.0
GPU - On
CUDA - 64_112


In [4]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the label so both parts keep the same
# Russian/English ratio; the fixed seed makes the split repeatable.
train_inputs, test_inputs, train_targets, test_targets = train_test_split(
    data["word"],
    data["target"],
    test_size=0.2,
    random_state=42,
    stratify=data["target"],
)

# Per-sample shape of the raw inputs (everything after the sample axis).
shape = train_inputs.shape[1:]

print(
    f"Train inputs shape - {train_inputs.shape}",
    f"Train targets shape - {train_targets.shape}",
    "",
    f"Test inputs shape - {test_inputs.shape}",
    f"Test targets shape - {test_targets.shape}",
    sep="\n",
)

Train inputs shape - (19668,)
Train targets shape - (19668,)

Test inputs shape - (4917,)
Test targets shape - (4917,)


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Hyper-parameters shared by the tokenizer, the padding step and the model.
num_words = 25000  # vocabulary size cap passed to the tokenizer / embedding
max_len = 20       # length every padded input sequence is brought to
nb_classes = 2     # Russian vs. English

# One-hot encode the integer labels for the softmax / categorical
# cross-entropy setup used further down.
# NOTE(review): both names are overwritten in place, so running this cell a
# second time would one-hot encode the already encoded targets.
train_targets = to_categorical(train_targets, num_classes=nb_classes)
test_targets = to_categorical(test_targets, num_classes=nb_classes)


In [6]:
# Word-level tokenizer capped at `num_words` entries.
tokenizer = Tokenizer(num_words=num_words)

# NOTE(review): the vocabulary is built from the full `data["word"]` column,
# which also contains the rows held out as the test split.
tokenizer.fit_on_texts(texts=data["word"])

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Map every training word to its list of vocabulary indices ...
sequences = tokenizer.texts_to_sequences(texts=train_inputs)

# ... then pad on the left (the default, spelled out here) up to `max_len`.
# NOTE(review): `train_inputs` is replaced by its encoded form, so this cell
# expects the raw words produced by the split cell.
train_inputs = pad_sequences(sequences=sequences, maxlen=max_len, padding="pre")

In [8]:
from tensorflow.keras import optimizers, models, layers, backend, callbacks

# Drop any graph/state left over from a previous run of this cell so layer
# names and memory do not accumulate.
backend.clear_session()

# Embedding -> GRU -> softmax classifier over padded index sequences.
model = models.Sequential(
    [
        layers.Embedding(num_words, 32, input_length=max_len),
        layers.GRU(16),
        # One output unit per class; tied to `nb_classes` so it always matches
        # the width of the one-hot targets produced by `to_categorical`.
        layers.Dense(units=nb_classes, activation="softmax"),
    ],
    name="language_detection",
)

# model.build(input_shape=(1, ))

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# NOTE(review): both callbacks monitor 'accuracy', i.e. the training metric.
# The fit call passes no validation data, so "best" means best fit to the
# training set, not best generalisation.
checkpoint = callbacks.ModelCheckpoint(
    'best_model.hdf5',
    monitor='accuracy',
    verbose=True,
    save_best_only=True,
)

early_stop = callbacks.EarlyStopping(
    monitor='accuracy',
    patience=3,
    restore_best_weights=True,
)

callbacks_list = [checkpoint, early_stop]

# model.summary()

In [9]:
# Train on the padded index sequences and one-hot targets.
# `callbacks_list` is already a list of callback instances, so it is passed
# directly rather than wrapped in another list.
history = model.fit(
    train_inputs,
    train_targets,
    verbose=True,
    epochs=5,
    callbacks=callbacks_list,
)

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.78412, saving model to best_model.hdf5
Epoch 2/5
Epoch 2: accuracy improved from 0.78412 to 0.93375, saving model to best_model.hdf5
Epoch 3/5
Epoch 3: accuracy improved from 0.93375 to 0.99959, saving model to best_model.hdf5
Epoch 4/5
Epoch 4: accuracy improved from 0.99959 to 0.99964, saving model to best_model.hdf5
Epoch 5/5
Epoch 5: accuracy did not improve from 0.99964


In [10]:
# Encode the held-out words with the fitted tokenizer, then left-pad
# (the default, spelled out here) to `max_len`, mirroring the training inputs.
test_sequences = tokenizer.texts_to_sequences(texts=test_inputs)
test_inputs = pad_sequences(sequences=test_sequences, maxlen=max_len, padding="pre")

In [11]:
# Reload the checkpoint written by the ModelCheckpoint callback during training.
my_model = models.load_model(filepath="best_model.hdf5")

In [12]:
# Shapes after tokenising/padding the inputs and one-hot encoding the targets.
print(
    f"Train inputs shape - {train_inputs.shape}",
    f"Train targets shape - {train_targets.shape}",
    "",
    f"Test inputs shape - {test_inputs.shape}",
    f"Test targets shape - {test_targets.shape}",
    sep="\n",
)

Train inputs shape - (19668, 20)
Train targets shape - (19668, 2)

Test inputs shape - (4917, 20)
Test targets shape - (4917, 2)


In [17]:
# Score the reloaded checkpoint on the held-out split; the result lists the
# loss followed by the compiled metrics.
my_model.evaluate(x=test_inputs, y=test_targets, verbose=1)



[0.6933085322380066, 0.709172248840332]

In [40]:
# Show the first encoded test sample and its shape, one per line.
print(test_inputs[0], test_inputs[0].shape, sep="\n")

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0 10445]
(20,)


In [53]:
word = "fields"

# Encode the word through the same tokenizer + padding pipeline used for the
# train/test inputs, so lowercasing, filtering, the `num_words` cut-off and
# the padded length can never drift from what the model was trained on.
encoded = tokenizer.texts_to_sequences([word])

# texts_to_sequences silently drops unknown words; fail loudly instead of
# predicting on an all-padding input.
if not encoded[0]:
    raise KeyError(f"{word!r} is not in the tokenizer vocabulary")

# Shape (1, max_len): a batch containing the single encoded word.
a = pad_sequences(encoded, maxlen=max_len)

my_model.predict(a)



array([[9.9979240e-01, 2.0759509e-04]], dtype=float32)

In [55]:
# `pickle` must be imported for the load below; it was previously commented
# out together with the saving snippet, which breaks on a fresh kernel.
import pickle

# Saving (run once after fitting `tokenizer` to create the file):
# with open('tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Loading.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a 'tokenizer.pickle' that was produced by this notebook.
with open('tokenizer.pickle', 'rb') as handle:
    aboba = pickle.load(handle)

In [56]:
# Look up the vocabulary index the unpickled tokenizer assigns to "fields".
aboba.word_index["fields"]

24570