In [1]:
!pip install python-Levenshtein



In [1]:
#imports
import pandas as pd

from IPython.utils.path import get_long_path_name # TBD
import Levenshtein

Helper Functions:

In [2]:
# Load vocabulary into the set from file
def get_language_words_set(file_path, encoding='utf-8'):

    words_set = set()
    try:
        #with open(file_path, 'r', encoding='cp1251') as file:
        with open(file_path, 'r', encoding=encoding) as file:
            for line in file:
                word = line.strip().lower()  # Remove leading/trailing whitespaces and convert to lowercase
                words_set.add(word)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return words_set

In [6]:
# Save vocabulary into the file
def save_words_to_file(words_set, file_path, encoding='utf-8'):

    try:
        with open(file_path, 'w', encoding=encoding) as file:
            for word in words_set:
                file.write(word + '\n')
    except Exception as e:
        print(f"An error occurred: {e}")

In [7]:
# Check, if word exists in vocabulary
def is_in_set(words_set, word):
    return word in words_set

In [6]:
def jaccard_similarity(str1, str2):
    set1 = set(str1)
    set2 = set(str2)

    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))

    similarity = intersection / union
    return similarity

In [7]:
def ngram_similarity(str1, str2, n):
    def get_ngrams(s, n):
        return [s[i:i+n] for i in range(len(s) - n + 1)]

    ngrams1 = get_ngrams(str1, n)
    ngrams2 = get_ngrams(str2, n)

    intersection = len(set(ngrams1).intersection(ngrams2))
    union = len(set(ngrams1).union(ngrams2))

    similarity = intersection / union
    return similarity

# ---

In [3]:
# Load the language sets:
english_vocabulary = get_language_words_set('vocabulary/english_vocabulary')
hebrew_vocabulary = get_language_words_set('vocabulary/hebrew_vocabulary')
russian_vocabulary = get_language_words_set('vocabulary/russian_vocabulary')

In [1]:
# Make the layouts
russian_layout = 'ёйцукенгшщзхъфывапролджэ\ячсмитьбю.ЁЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭ/ЯЧСМИТЬБЮ,'
english_layout = '''`qwertyuiop[]asdfghjkl;'\zxcvbnm,./~QWERTYUIOP{}ASDFGHJKL:"|ZXCVBNM<>?'''
hebrew_layout = ";/'קראטוןםפ][שדגכעיחלךף,\זסבהנמצתץ."

In [10]:
# Creating conversion maps per characted position
def create_conversion_map(source_layout, target_layout):
    conversion_map = {}

    for src_char, tgt_char in zip(source_layout, target_layout):
        conversion_map[ord(src_char)] = ord(tgt_char)
    return conversion_map

In [11]:
# Converting text from one map to another
def convert_text(text, conversion_map):
    return text.translate(conversion_map)

In [12]:
# Making the conversion
def convert_text_bidirectional(text, from_layout, to_layout):
    if to_layout == hebrew_layout:
        text = text.lower()
    return convert_text(text, create_conversion_map(from_layout, to_layout))

In [13]:
def get_language(word):
    """
    en = 1
    he = 2
    ru = 3
    NA = 0
    """
    lang = [is_in_set(english_vocabulary,word),is_in_set(hebrew_vocabulary,word),is_in_set(russian_vocabulary,word)]
    if lang.count(True) == 0 or lang.count(True)>1:
        return 0
    else:
        for index, value in enumerate(lang):
            if value:
                return index+1

In [14]:
# Generate the maps
russian_to_english_map = create_conversion_map(russian_layout, english_layout)
russian_to_hebrew_map = create_conversion_map(russian_layout, hebrew_layout)
english_to_russian_map = create_conversion_map(english_layout, russian_layout)
english_to_hebrew_map = create_conversion_map(english_layout, hebrew_layout)
hebrew_to_russian_map = create_conversion_map(hebrew_layout, russian_layout)
hebrew_to_english_map = create_conversion_map(hebrew_layout, english_layout)

some examples to use the helpers:

In [15]:
is_in_set(english_vocabulary,'hellow')

False

In [16]:
str1 = "hello"
str2 = "hello1"

lev_distance = Levenshtein.distance(str1, str2)/max(len(str1),len(str2))
print(f"Normalized Levenshtein distance between '{str1}' and '{str2}' is {lev_distance}")

Normalized Levenshtein distance between 'hello' and 'hello1' is 0.16666666666666666


In [17]:
string1 = "hello"
string2 = "hello"
n_value = 2

similarity = ngram_similarity(string1, string2, n_value)
print(f"N-gram similarity between '{string1}' and '{string2}': {similarity}")

N-gram similarity between 'hello' and 'hello': 1.0


In [18]:
convert_text_bidirectional('привет',russian_layout,english_layout)

'ghbdtn'

General ideas.
We need to generate the train/test sets from the vocabularies, together with the dummy words and consequently train the model on dataset n-grams.
The model will learn to predict the correct language set.
short n-grams will probably have a bad accuracy (precision,recall,f1), therefore we will need to understand the correct minimal length for the good prediction

First of all, lets create the ballanced dataset, where the words will be taken from the real vocabulary and the dummies will be converted by transorming the encoding to other forms
en-word : he->en word
en-word : ru->en word
he word : en->he word
he word : ru->he word
ru word : en->ru word
ru word : he->ru word

In [19]:
print(f'the english vocabulary contains {len(english_vocabulary)} words')
print(f'the russian vocabulary contains {len(russian_vocabulary)} words')
print(f'the hebrew vocabulary contains {len(hebrew_vocabulary)} words')

the english vocabulary contains 466562 words
the russian vocabulary contains 1528910 words
the hebrew vocabulary contains 469509 words


english and hebrew vocabularies are almost identical in length. russian is 3 times bigger. we will need to take that into account.
May be the dataset shall contain all the words from all the datasets with 3 different layouts as a data for feature extraction and a label as target?

In [20]:
# make pandas dataset
def makeDataset():
    dataset = pd.DataFrame(columns=['en', 'he', 'ru','target'])


    rset = pd.DataFrame([{'en': convert_text_bidirectional(word,russian_layout,english_layout),
                        'he': convert_text_bidirectional(word,russian_layout,hebrew_layout),
                        'ru': word,
                        'target':get_language(word)}
                         for word in russian_vocabulary])


    eset = pd.DataFrame([{'en': word,
                        'he': convert_text_bidirectional(word,english_layout,hebrew_layout),
                        'ru': convert_text_bidirectional(word,english_layout,russian_layout),
                        'target':get_language(word)}
                         for word in english_vocabulary])

    hset = pd.DataFrame([{'en': convert_text_bidirectional(word,hebrew_layout,english_layout),
                        'he': word,
                        'ru': convert_text_bidirectional(word,hebrew_layout,russian_layout),
                        'target':get_language(word)}
                         for word in hebrew_vocabulary])

    dataset = pd.concat([dataset,rset,eset,hset],ignore_index=True)

    # Shuffle the DataFrame
    #dataset = dataset.sample(frac=1).reset_index(drop=True)

    return dataset

In [32]:
df = makeDataset()

In [33]:
df

Unnamed: 0,en,he,ru,target
0,",fhf,fyobwt.",תכיכתכטםנ'אץ,барабанщицею,3
1,fdnjgjl]`vybrfvb,כגמחעחך[;הטנרכהנ,автоподъёмниками,3
2,pfneitdfkcz,פכמקןאגכלבז,затушевался,3
3,"ibhjrjkj,s[",ןניחרחלחתד],широколобых,3
4,ghjvjrfire,עיחהחרכןרק,промокашку,3
...,...,...,...,...
2464976,vurhi,הורין,мгкрш,2
2464977,fshvo,כדיהם,аырмщ,2
2464978,dhjfubu,גיחכונו,вроагиг,2
2464979,nuskfi,מודלכן,тгылаш,2


In [67]:
# Create a new DataFrame for the entire dataset
def map_target(row):
    def lang(x):
        if x == 1:
            return 'en'
        if x == 2:
            return 'he'
        if x == 3:
            return 'ru'

    new_rows = []
    for i in range(1, 4):
        new_row = {'word': row[lang(i)], 'target': i if row['target'] == i else 0}
        new_rows.append(new_row)

    return pd.DataFrame(new_rows)

new_df = df.apply(lambda row: map_target(row), axis=1)

# Reshape the resulting DataFrame
new_df = pd.concat(new_df.tolist(), ignore_index=True)

In [72]:
shuffled_df.to_pickle('df_data.pkl')

In [73]:
shuffled_df.to_csv('df_data.csv', index=False)

In [70]:
# Shuffle the rows
shuffled_df = new_df.sample(frac=1, random_state=42)

Steps towards the model (it shall be a multi-layer LSTM)
--------

In [1]:
import pandas as pd

In [2]:
# Load the DataFrame from a pickle file
df = pd.read_pickle('df_data.pkl')

In [3]:
# Creating a combined symbol layout for 3 languages:
# Make the layouts
russian_layout = 'ёйцукенгшщзхъфывапролджэ\ячсмитьбю.ЁЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭ/ЯЧСМИТЬБЮ,'
english_layout = '''`qwertyuiop[]asdfghjkl;'\zxcvbnm,./~QWERTYUIOP{}ASDFGHJKL:"|ZXCVBNM<>?'''
hebrew_layout = ";/'קראטוןםפ][שדגכעיחלךף,\זסבהנמצתץ."
special_characters = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ "
numbers = "0123456789"
combined_layout = english_layout+hebrew_layout+russian_layout+numbers+special_characters
unique_chars = set()

for char in combined_layout:
    if char not in unique_chars:
        unique_chars.add(char)

layout = list(unique_chars)

In [4]:
char_to_index = {char: i for i, char in enumerate(layout)}
index_to_char = {i: char for i, char in enumerate(layout)}

In [52]:
import pickle
with open('dictionary.pkl', 'wb') as file:
    pickle.dump(char_to_index, file)

Model

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
torch.manual_seed(0)

<torch._C.Generator at 0x1d56c692590>

In [6]:
# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU")

Using GPU: NVIDIA GeForce GTX 1660 Ti


In [4]:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
# Максимальная длина слова для паддинга
max_length = max([len(word) for word in df['word']])

In [8]:
# Разделение датасета на train и test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [9]:
# Гиперпараметры
input_size = len(char_to_index)  # Размер словаря (количество уникальных символов)
hidden_size = 64  # Размер скрытого состояния LSTM
num_layers = 3  # Количество слоев LSTM
num_classes = 4  # Количество классов (количество языков) включая 0 = нет языка

In [3]:
# Создаем модель
class LanguageClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LanguageClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Изменяем размерность входных данных
        x = x.view(x.size(0), -1)  # Приводим к размерности (batch_size, sequence_length)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(self.embedding(x), (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [11]:
# Создаем кастомный Dataset для загрузки данных из датафрейма
class CustomDataset(Dataset):
    def __init__(self, dataframe, char_to_index, max_length):
        self.data = dataframe
        self.char_to_index = char_to_index
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        word = self.data.iloc[idx]["word"]
        target = self.data.iloc[idx]["target"]

        # Преобразование слова в индексы символов с ограничением до размера словаря
        input_indices = [self.char_to_index.get(char, 0) for char in word if char in self.char_to_index]

        # Добавление паддинга
        if len(input_indices) < self.max_length:
            num_padding = self.max_length - len(input_indices)
            input_indices += [0] * num_padding  # 0 - индекс паддинга

        # Преобразование в тензор PyTorch и изменение формы
        input_tensor = torch.tensor(input_indices).view(1, -1)  # 1 - размер батча, -1 - автоматический расчет размерности

        return input_tensor, target

In [12]:
# Создаем кастомный Dataset
# custom_dataset = CustomDataset(df, char_to_index, max_length)
train_dataset = CustomDataset(train_df, char_to_index, max_length)
test_dataset = CustomDataset(test_df, char_to_index, max_length)

In [13]:
# Создаем DataLoader для обучения
batch_size = 1024  # Размер батча
# dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [4]:
# Создание экземпляра модели
model = LanguageClassifier(input_size, hidden_size, num_layers, num_classes)
model = model.to(device)

NameError: name 'input_size' is not defined

In [15]:
# Определение функции потерь и оптимизатора
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [16]:
def validate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    average_loss = total_loss / len(dataloader)

    return accuracy, average_loss

In [17]:
patience = 3
min_delta = 0.001  # Minimum improvement required to continue

In [23]:
writer = SummaryWriter()

In [24]:
# Обучение модели
num_epochs = 5  # Количество эпох

model.train()

best_loss = float('inf')
consecutive_no_improvement = 0

for epoch in range(num_epochs):
    print(f"Epoch [{epoch + 1}/{num_epochs}]")

    total_loss = 0.0  # To keep track of the total loss in the epoch

    for batch_idx, (inputs, labels) in enumerate(train_dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        writer.add_scalar('Loss/train', loss, global_step=epoch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Print progress every n batches (e.g., every 10 batches)
        if batch_idx % 10 == 0:
            print(f"Batch [{batch_idx}/{len(train_dataloader)}] Loss: {loss.item():.4f}")

    # Calculate the average loss for the epoch
    average_loss = total_loss / len(train_dataloader)

    # Print the average loss for the epoch
    print(f"Epoch [{epoch + 1}/{num_epochs}] Average Loss: {average_loss:.4f}")

    # Validation
    model.eval()  # Switch to evaluation mode
    val_accuracy, val_loss = validate_model(model, val_dataloader, criterion)

    print(f"Epoch [{epoch + 1}/{num_epochs}] Validation Loss: {val_loss:.4f} Accuracy: {val_accuracy:.2f}%")
    model.train()  # Switch back to training mode

    # Check for early stopping
    if val_loss < best_loss - min_delta:
        best_loss = val_loss
        consecutive_no_improvement = 0
        # Save the best model if needed
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        consecutive_no_improvement += 1

    if consecutive_no_improvement >= patience:
        print(f"Early stopping after {epoch + 1} epochs without improvement.")
        break


Epoch [1/5]
Batch [0/5778] Loss: 0.9452
Batch [10/5778] Loss: 0.9665
Batch [20/5778] Loss: 0.9579
Batch [30/5778] Loss: 0.9776
Batch [40/5778] Loss: 0.9112
Batch [50/5778] Loss: 0.9261
Batch [60/5778] Loss: 0.9132
Batch [70/5778] Loss: 0.9361
Batch [80/5778] Loss: 0.9572
Batch [90/5778] Loss: 0.9729
Batch [100/5778] Loss: 0.9371
Batch [110/5778] Loss: 0.9426
Batch [120/5778] Loss: 0.9366
Batch [130/5778] Loss: 0.9413
Batch [140/5778] Loss: 0.9490
Batch [150/5778] Loss: 0.9235
Batch [160/5778] Loss: 0.9416
Batch [170/5778] Loss: 0.9686
Batch [180/5778] Loss: 0.9568
Batch [190/5778] Loss: 0.9646
Batch [200/5778] Loss: 0.9442
Batch [210/5778] Loss: 0.9153
Batch [220/5778] Loss: 0.9404
Batch [230/5778] Loss: 0.9170
Batch [240/5778] Loss: 0.8915
Batch [250/5778] Loss: 0.9583
Batch [260/5778] Loss: 0.9284
Batch [270/5778] Loss: 0.9992
Batch [280/5778] Loss: 0.9665
Batch [290/5778] Loss: 0.9410
Batch [300/5778] Loss: 0.9500
Batch [310/5778] Loss: 0.9199
Batch [320/5778] Loss: 0.9344
Batch [33

In [25]:
# Оценка модели на тестовом наборе данных
# Load the best one
model.load_state_dict(torch.load('best_model.pth'))

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy on test dataset: {accuracy:.2f}%')

Accuracy on test dataset: 99.60%


In [26]:
torch.save(model.state_dict(), 'lang_model.pth')

In [41]:
model.load_state_dict(torch.load('lang_model.pth'))

<All keys matched successfully>

Predicted Values:

In [47]:
def predict_language(text, model, char_to_index, max_length):
    # Шаг 1: Токенизация текста на символы и преобразование символов в индексы
    input_indices = [char_to_index.get(char, 0) for char in text if char in char_to_index]

    # Шаг 2: Добавление паддинга, если необходимо
    if len(input_indices) < max_length:
        num_padding = max_length - len(input_indices)
        input_indices += [0] * num_padding

    # Шаг 3: Преобразование в тензор PyTorch и изменение размерности
    input_tensor = torch.tensor(input_indices).view(1, -1).to(device)

    # Шаг 4: Передача данных в модель для получения предсказания
    with torch.no_grad():
        model.eval()  # Перевести модель в режим оценки (не тренировки)
        output = model(input_tensor)

    # Обработка предсказания
    _, predicted_class = torch.max(output, 1)

    # Вернуть класс языка на основе предсказания
    if predicted_class.item() == 0:
        return "Нет языка"
    elif predicted_class.item() == 1:
        return "Английский"
    elif predicted_class.item() == 2:
        return "Иврит"
    elif predicted_class.item() == 3:
        return "Русский"

# Пример использования
input_text = "привет"
predicted_language = predict_language(input_text, model, char_to_index, max_length)
print("Предсказанный язык:", predicted_language)

Предсказанный язык: Русский
