In [1]:
from transformers import AutoTokenizer
# Load the pretrained GPT-2 tokenizer. It is only used as a template: a later
# cell calls `train_new_from_iterator` on it to train the project tokenizer.
transformers_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [2]:
from datasets import load_dataset
# Read the '#'-separated CSV of (equation, answer) pairs and keep its "train" split.
# NOTE(review): `data` is rebound to a MyDataset instance in a later cell, and the
# tokenizer is trained from the pandas frame `df` instead — this object looks unused
# in the visible code; confirm before relying on it.
data = load_dataset("csv", data_files="pairs_dataset.csv", sep="#")
data = data['train']

In [3]:
def batch_iterator(dataset, batch_size):
    """Yield batches of ``"<equation><sep><answer>"`` strings for tokenizer training.

    Args:
        dataset: Row-sliceable table. ``dataset[i : i + batch_size]`` must return an
            object whose ``"equation"`` and ``"answer"`` entries are equally long
            iterables of strings (e.g. a pandas DataFrame, or a mapping of column
            name to list).
        batch_size: Number of rows per yielded batch.

    Yields:
        list[str]: One concatenated string per row of the current slice.
    """
    for start in range(0, len(dataset), batch_size):
        rows = dataset[start : start + batch_size]
        # Concatenate pair by pair: `column + "<sep>" + column` only works when the
        # columns are pandas Series; list-backed columns would raise TypeError.
        yield [
            equation + "<sep>" + answer
            for equation, answer in zip(rows["equation"], rows["answer"])
        ]

In [4]:
import pandas as pd
# Load the same '#'-separated CSV as a pandas DataFrame; `df` is what the tokenizer
# training and MyDataset cells below consume.
df = pd.read_csv('pairs_dataset.csv', sep='#')
# Preview the first rows.
df.head()

Unnamed: 0,equation,answer
0,"y\prime= \frac{3}{x^3+x}, \;\;\;\; y(1)=0",y = 3 \ln x -\frac{3}{2} \ln {(x^2+1)} + \frac...
1,y\prime=3xy,y = C e^{\frac{3}{2} x^2}
2,\frac{dy}{dx}= xy^2 + 4x + 2y^2 + 8,y =2 \tan{(x^2 +4x +2C)}
3,"\frac{dy}{dx}= e^{x+2y}, y(0)=1",y = -\frac{1}{2} \ln{(-2 e^x + 2+e^{-2})}
4,y\prime = x e^{2x+y},y = - \ln {(-\frac{1}{2} x e^{2x} + \frac{1}{4...


In [5]:
# Markers used when building model inputs: sequence start/end, the
# equation/answer separator, and padding.
special_tokens = ['<sos>', '<eos>', '<sep>', '<pad>']
# Train a new tokenizer (same pipeline as the GPT-2 one) on "<equation><sep><answer>"
# strings, fed in batches of 32 rows, with a requested vocabulary size of 80.
tokenizer = transformers_tokenizer.train_new_from_iterator(batch_iterator(df, 32), vocab_size=80, new_special_tokens=special_tokens)






In [6]:
from torch.utils.data import Dataset
import torch
class MyDataset(Dataset):
    """Equation/answer pairs served as fixed-length tensors of token ids.

    Args:
        tokenizer: Callable mapping a string to a mapping with an ``'input_ids'``
            list; must also expose a ``vocab`` attribute.
        data: pandas DataFrame whose positional column 0 holds the equation text
            and column 1 the answer text.
        maxlen: Length every returned sequence is padded or truncated to.
        sep: Separator text appended to the source and prepended to the target.
    """

    def __init__(self, tokenizer, data, maxlen, sep="<sep>"):
        self.sep = sep
        self.maxlen = maxlen
        self.data = data
        self.tokenizer = tokenizer
        self.vocab_size = len(tokenizer.vocab)

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

    def _encode(self, text):
        """Token ids of ``text`` as produced by the wrapped tokenizer."""
        return self.tokenizer(text)['input_ids']

    def _to_fixed_length(self, ids):
        """Right-pad ``ids`` with the ``<pad>`` ids, then cut to ``maxlen``."""
        missing = self.maxlen - len(ids)
        padded = ids + self._encode("<pad>") * missing
        return torch.tensor(padded[:self.maxlen])

    def __getitem__(self, index):
        """Return ``(source, target)`` id tensors, each of length ``maxlen``.

        The source is ``<sos> + equation + sep`` and the target is
        ``sep + answer + <eos>``.
        """
        equation = self.data.iloc[index, 0]
        solution = self.data.iloc[index, 1]
        source_ids = self._encode("<sos>" + equation + self.sep)
        target_ids = self._encode(self.sep + solution + "<eos>")
        return self._to_fixed_length(source_ids), self._to_fixed_length(target_ids)

In [7]:
import torch
from torch.utils.data import random_split

# Fixed sequence length: the longest equation or answer in the frame.
# NOTE(review): this is measured in characters, while MyDataset pads/truncates
# token ids (plus <sos>/<sep>/<eos>) to this length — confirm it is large enough
# that the trailing special tokens are never truncated.
maxlen = int(max(df['answer'].apply(len).max(), df['equation'].apply(len).max()))
data = MyDataset(tokenizer, df, maxlen)

# 80% / 10% / remainder split.
train_size = int(0.8 * len(data))
val_size = int(0.1 * len(data))
test_size = len(data) - train_size - val_size

# Seed the split so train/val/test membership (and every metric computed on it)
# is reproducible after a kernel restart.
train_data, val_data, test_data = random_split(
    data,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

In [8]:
from torch.utils.data import DataLoader
# Batches of 32 (source, target) tensor pairs; only the training loader shuffles.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

In [9]:
import torch.nn as nn
class Decoder(nn.Module):
    """Embedding -> recurrent stack (RNN / GRU / LSTM) -> linear layer over the vocabulary.

    Args:
        rnn_type: ``"rnn"`` or ``"gru"`` select ``nn.RNN`` / ``nn.GRU``; any other
            value builds an ``nn.LSTM``. ``forward`` and ``init_hidden_cell`` take
            the LSTM path only when the value is exactly ``"lstm"``.
        vocab_size: Number of embedding rows and of output logits.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the recurrent layers.
        num_layers: Number of stacked recurrent layers.
        padding_idx: Token id passed to ``nn.Embedding`` as ``padding_idx``.
        dropout: Dropout between stacked recurrent layers.
        bidirectional: Whether the recurrent stack is bidirectional.
    """

    def __init__(
            self,
            rnn_type,
            vocab_size,
            embedding_dim,
            hidden_dim,
            num_layers,
            padding_idx,
            dropout=0.1,
            bidirectional=False
        ):
        super().__init__()
        self.rnn_type = rnn_type
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)

        rnn_cls = {"rnn": nn.RNN, "gru": nn.GRU}.get(rnn_type, nn.LSTM)
        # `dropout` is forwarded to every cell type; previously only the LSTM
        # received it and the argument was silently ignored for "rnn" / "gru".
        self.rnn = rnn_cls(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=bidirectional,
        )
        self.fc = nn.Linear(hidden_dim * self._num_directions, vocab_size)

    @property
    def _num_directions(self):
        """2 for a bidirectional stack, otherwise 1."""
        return 2 if self.bidirectional else 1

    def forward(self, x, hidden, cell=None):
        """Run token ids through the network.

        Args:
            x: Integer token ids, either batched ``(batch, seq)`` or an unbatched
                1-D sequence.
            hidden: Initial hidden state matching the batching of ``x``.
            cell: Initial cell state; used only on the ``"lstm"`` path.

        Returns:
            ``(logits, hidden, cell)`` on the ``"lstm"`` path, ``(logits, hidden)``
            otherwise. On the ``"lstm"`` path with unbatched input, a leading
            length-1 sequence axis is squeezed out of the logits.
        """
        unbatched = x.dim() == 1
        x = self.embedding(x)
        if self.rnn_type == 'lstm':
            output, (hidden, cell) = self.rnn(x, (hidden, cell))
            if unbatched:
                # Single-step decoding: (1, features) -> (features,). Restricted to
                # unbatched input so a batch of size 1 keeps its batch axis
                # (an unconditional squeeze(0) broke `outputs.transpose(1, 2)`).
                output = output.squeeze(0)
            prediction = self.fc(output)
            return prediction, hidden, cell
        output, hidden = self.rnn(x, hidden)
        return self.fc(output), hidden

    def init_hidden_cell(self, batch_size):
        """Return zero initial state(s) of shape ``(layers * directions, batch_size, hidden_dim)``.

        The tensors are created with the device and dtype of the embedding weights.
        Returns a ``(hidden, cell)`` pair for ``"lstm"`` and a single tensor otherwise.
        """
        shape = (self.num_layers * self._num_directions, batch_size, self.hidden_dim)
        hidden = self.embedding.weight.new_zeros(shape)
        if self.rnn_type == 'lstm':
            return hidden, torch.zeros_like(hidden)
        return hidden

In [10]:
import torch.optim as optim

# Id of the <pad> token: ignored by the embedding gradient and by the loss.
padding_idx = tokenizer("<pad>")['input_ids'][0]
# len(tokenizer) also counts added tokens, whereas `tokenizer.vocab_size` reports
# only the base vocabulary; sizing the embedding from the full count keeps every
# id the tokenizer can emit in range and matches MyDataset.vocab_size.
vocab_size = len(tokenizer)
embedding_dim = 64
hidden_dim = 128
num_layers = 4
learning_rate = 0.005
# Baseline model: bidirectional LSTM with dropout 0.1.
model = Decoder("lstm", vocab_size, embedding_dim, hidden_dim, num_layers, padding_idx, 0.1, True)
criterion = nn.CrossEntropyLoss(ignore_index=padding_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
from IPython.display import clear_output
import matplotlib.pyplot as plt

def _batch_loss(model, criterion, questions, answers):
    """Run one batch through ``model`` from a zero initial state and return its loss."""
    if model.rnn_type == "lstm":
        hidden, cell = model.init_hidden_cell(len(questions))
        outputs, _, _ = model(questions, hidden, cell)
    else:
        hidden = model.init_hidden_cell(len(questions))
        outputs, _ = model(questions, hidden)
    # The criterion is given class scores on axis 1, so swap the last two axes
    # of the model output before comparing with the targets.
    return criterion(outputs.transpose(1, 2), answers)


def train(model, criterion, train_loader, val_loader, optimizer, num_epochs = 1, show=False):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Args:
        model: Module exposing ``rnn_type`` and ``init_hidden_cell(batch_size)``.
        criterion: Loss called as ``criterion(outputs.transpose(1, 2), answers)``.
        train_loader: Sized iterable of ``(questions, answers)`` training batches.
        val_loader: Sized iterable of ``(questions, answers)`` validation batches.
        optimizer: Optimizer stepping the model parameters.
        num_epochs: Number of epochs to run.
        show: If true, plot both loss histories and print a summary every epoch.

    Returns:
        ``(train_loss, val_loss)``: mean per-batch losses of the last epoch
        (``nan`` for both if no epoch was run).
    """
    train_history = []
    val_history = []
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for questions, answers in train_loader:
            optimizer.zero_grad()
            loss = _batch_loss(model, criterion, questions, answers)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        # max(..., 1) keeps an empty loader from dividing by zero.
        train_history.append(train_loss / max(len(train_loader), 1))

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for questions, answers in val_loader:
                val_loss += _batch_loss(model, criterion, questions, answers).item()
        val_history.append(val_loss / max(len(val_loader), 1))

        # Clears the cell output every epoch, including anything the caller printed.
        clear_output(True)
        if show:
            plt.plot(val_history, label="val")
            plt.plot(train_history, label="train")
            plt.legend()
            plt.show()
            print(f'Epoch {epoch + 1}, Train loss: {train_history[-1]}, Val loss: {val_history[-1]}')

    if not train_history:
        return float("nan"), float("nan")
    # Return only after the last epoch; returning from inside the loop ended
    # training after the first epoch regardless of `num_epochs`.
    return train_history[-1], val_history[-1]

In [12]:
import torch.nn.functional as F
import numpy as np
def _decode_step(model, token, hidden, cell):
    """Feed one token to ``model`` and return ``(logits, hidden, cell)`` (cell is None for non-LSTM)."""
    if model.rnn_type == "lstm":
        return model(token, hidden, cell)
    logits, hidden = model(token, hidden)
    return logits, hidden, None


def translate(model, sentence, tokenizer, max_length=50):
    """Sample a continuation of ``sentence`` from ``model`` one token at a time.

    Args:
        model: Module exposing ``rnn_type`` and ``init_hidden_cell``; called with a
            single unbatched token and an unbatched hidden state per step.
        sentence: Prompt text. It may already start with ``<sos>`` and may carry
            trailing ``<pad>`` tokens (e.g. a decoded dataset item); both are
            normalised so the prompt is ``<sos>`` + content.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and ``__call__``.
        max_length: Upper bound on the total number of tokens (prompt + generated).

    Returns:
        str: The decoded generated tokens only (prompt excluded); empty if nothing
        was generated.
    """
    model.eval()
    sos_ids = list(tokenizer.encode("<sos>"))
    eos_id = tokenizer('<eos>')['input_ids'][0]
    pad_id = tokenizer('<pad>')['input_ids'][0]

    prompt = list(tokenizer.encode(sentence))
    while prompt and prompt[-1] == pad_id:
        prompt.pop()
    if prompt[:len(sos_ids)] != sos_ids:
        prompt = sos_ids + prompt
    prompt_len = len(prompt)
    answer = torch.tensor([prompt], dtype=torch.int64)

    # Drop only the batch axis; a bare squeeze() would also remove a size-1
    # layer or hidden axis.
    if model.rnn_type == "lstm":
        hidden, cell = model.init_hidden_cell(batch_size=1)
        hidden, cell = hidden.squeeze(1), cell.squeeze(1)
    else:
        hidden, cell = model.init_hidden_cell(batch_size=1).squeeze(1), None

    with torch.no_grad():
        # Warm up the state on every prompt token except the last, which is fed
        # by the first generation step. (`len(answer)` is the batch size, 1, so
        # iterating over it never consumed the prompt.)
        for i in range(prompt_len - 1):
            _, hidden, cell = _decode_step(model, answer[:, i], hidden, cell)

        # Budget is counted in tokens, not in characters of `sentence`.
        for _ in range(max(0, max_length - prompt_len)):
            logits_next, hidden, cell = _decode_step(model, answer[:, -1], hidden, cell)
            # Accept both (vocab,) and (1, vocab) logits.
            last_logits = logits_next.reshape(-1, logits_next.shape[-1])[-1]
            p_next = F.softmax(last_logits, dim=-1).cpu().numpy()
            next_ix = np.random.choice(len(p_next), p=p_next)
            if next_ix == eos_id:
                break
            next_ix = torch.tensor([[next_ix]], dtype=torch.int64)
            answer = torch.cat([answer, next_ix], dim=1)

    # Decode only what was generated instead of slicing the decoded string by the
    # prompt's character count (which ignored the "<sos>" prefix).
    return tokenizer.decode(answer[0, prompt_len:].tolist())

In [13]:
# Hand-picked validation equations (LaTeX), two per equation family.
# Raw strings are required: in a normal string literal "\f", "\t" and "\a"
# (as in \frac, \tan, \arcsin, \arctan) are escape sequences and would be
# replaced by control characters.
valid_eqs = [
    r"2xy\mathrm{d}x + (x^2 - y^2)\mathrm{d}y = 0",  # exact differential equation
    # NOTE(review): the next equation has no "= 0" right-hand side — confirm.
    r"\frac{3x^2 + y^2}{y^2}\mathrm{d}x - \frac{2x^3 + 5y}{y^3}\mathrm{d}y",  # exact differential equation
    r"y^{\prime}=\mathrm{tg}{\frac{y}{x}}+{\frac{y}{x}}",  # homogeneous
    r"y^{\prime}=\cos^{2}{\frac{y}{x}}+{\frac{y}{x}}",  # homogeneous
    r"y^{\prime}-y={\frac{e^{x}}{x^{2}}}",  # first-order linear
    r"(2x+y^{2})y^{\prime}=y",  # first-order linear
    r"y y^{\prime3}+x=1",  # not solved for the derivative
    r"y^{\prime^{3}}+y^{2}=y y^{\prime}(y^{\prime}+1)",  # not solved for the derivative
    r"2y^{\prime}-\frac{y}{x}=\frac{4x^{2}}{y}",  # Bernoulli equation
    r"xy^{\prime}-2y={\frac{x}{y}}",  # Bernoulli equation
    r"2y^{\prime\prime}+3y^{\prime}-5y=10",  # nonhomogeneous linear
    r"y^{\prime\prime}-2y^{\prime}-8y=x^{2}+3",  # nonhomogeneous linear
    r"{\frac{x\,d x+y\,d y}{y\,\overline{{{1+x^{2}+y^{2}}}}}}+{\frac{y\,d x-x\,d y}{x^{2}+y^{2}}}=0",  # integrating factor
    r"(x^{2}y^{2}-1)\,d y+2x y^{3}\,d x=0",  # integrating factor
]
# Reference solutions, aligned index-by-index with `valid_eqs`.
valid_eqs_answers = [
    # NOTE(review): for the first equation the potential is x^2 y - y^3/3, which
    # suggests "3x^2 y - y^3 = C"; the stored answer has no y in the first term — confirm.
    r"3x^2 - y^3 = C",
    r"x + \frac{x^3}{y^2} + \frac{5}{y} = C",
    r"y=x\arcsin(C x)",
    r"y(x)=x\tan^{-1}(c_{1}+\log(x))",
    r"y(x)=c_{1}\,e^{x}-{\frac{e^{x}}{x}}",
    r"x=y^{2}(\ln y + C)",
    r"(x-1)^{4/3}+y^{4/3}=C",
    r"4y=(x+C)^{2}",
    r"y(x)=-{\sqrt{x}}\;{\sqrt{c_{1}+2x^{2}}}",
    r"y(x)={\frac{\sqrt{x}\;{\sqrt{c_{1}\,x^{3}-2}}}{\sqrt{3}}}",
    r"y(x)=c_{1}\;e^{-(5x)/2}+c_{2}\;e^{x}-2",
    r"y(x)=c_{1}\;e^{-2x}+c_{2}\;e^{4x}-{\frac{x^{2}}{8}}+{\frac{x}{16}}-{\frac{27}{64}}",
    r"{\sqrt{{1+x^{2}+y^{2}}}}+\arctan{\frac{x}{y}}=C",
    r"x^{2}y+{\frac{1}{y}}=C",
]

In [14]:
import evaluate

bleu = evaluate.load("bleu")


def _decode_references(split):
    """Decode the target sequence of every item in ``split`` into plain text.

    Special tokens are skipped so the references do not carry the <sep>/<eos>
    markers and the run of <pad> tokens from the fixed-length targets; with them
    left in, an exact match against a generated answer is impossible.
    """
    return [
        tokenizer.decode(split[i][1].tolist(), skip_special_tokens=True)
        for i in range(len(split))
    ]


val_references = _decode_references(val_data)
test_references = _decode_references(test_data)

def bleu_score(preds, refs):
    """Corpus BLEU of ``preds`` against ``refs`` using the loaded ``bleu`` metric.

    Args:
        preds: Predicted strings.
        refs: Reference strings, aligned with ``preds``.

    Returns:
        float: The ``"bleu"`` value reported by the metric, or ``0.0`` when no
        prediction contains any non-whitespace text.
    """
    # With nothing to score, the candidate length is zero; presumably the metric's
    # brevity-penalty computation divides by it — short-circuit to 0.0 instead.
    if not any(pred.strip() for pred in preds):
        return 0.0
    return bleu.compute(predictions=preds, references=refs)["bleu"]


def accuracy(preds, refs):
    """Fraction of positions where the prediction equals the reference exactly.

    Args:
        preds: Predicted values; must have at least ``len(refs)`` items.
        refs: Reference values.

    Returns:
        float: Matches divided by ``len(refs)``; ``0.0`` when ``refs`` is empty.
    """
    if len(refs) == 0:
        # Avoid dividing by zero on an empty evaluation split.
        return 0.0
    equal_count = sum(preds[i] == refs[i] for i in range(len(refs)))
    return equal_count / len(refs)

2024-04-29 16:38:09.685822: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-29 16:38:09.690160: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-29 16:38:09.754496: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [15]:
import pickle

def predict_valid_eqs(model, file_name):
    """Translate every equation in ``valid_eqs`` and pickle the results.

    The saved object is a dict mapping each reference answer from
    ``valid_eqs_answers`` to the model's prediction for the matching equation.

    Args:
        model: Model passed through to ``translate``.
        file_name: Base name (without extension) of the pickle written under
            ``valid_eqs_preds/``.
    """
    import os

    preds = [translate(model, eq, tokenizer) for eq in valid_eqs]
    d = dict(zip(valid_eqs_answers, preds))
    out_path = f"valid_eqs_preds/{file_name}.pickle"
    # open() does not create missing directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "wb") as handle:
        pickle.dump(d, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
import itertools
import os

# Grid search over model/optimizer hyper-parameters. One row of metrics is
# appended to `result` per (configuration, epoch) and the table is rewritten to
# results.csv after every epoch so partial runs keep their progress.
result = {
    "rnn_type": [],
    "optimizer": [],
    "bidirectional": [],
    "hidden_dim": [],
    "n_layers": [],
    "learning_rate": [],
    "embedding_dim": [],
    "epoch": [],
    "val_bleu": [],
    "val_accuracy": [],
    "test_bleu": [],
    "test_accuracy": [],
}
# NOTE(review): `device` is computed but nothing in this cell moves the model or
# the batches to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rnn_type_options = ["rnn", "gru", "lstm"]
optimizer_options = ["Adam", "AdamW"]
#teacher_forcing_ratio_options = [0.1, 0.3, 0.5, 0.7]
embedding_dim_options = [16, 32, 84, 128]
lr_options = [0.0001, 0.001, 0.01, 0.1]
n_layers_options = [2, 4, 6, 8]
hidden_dim_options = [256, 512, 1024]
pad_index = tokenizer("<pad>")['input_ids'][0]
n_epochs = 6

# torch.save and open() do not create missing directories.
os.makedirs("rnn_models", exist_ok=True)
os.makedirs("valid_eqs_preds", exist_ok=True)


def _predict_split(model, split):
    """Translate the decoded source of every item in ``split``.

    Each item is fetched once; the target length is used as ``max_length``.
    """
    predictions = []
    for i in range(len(split)):
        source_ids, target_ids = split[i]
        predictions.append(
            translate(model, tokenizer.decode(source_ids), tokenizer, len(target_ids))
        )
    return predictions


# Same iteration order as nesting the loops in this order (last option varies fastest).
search_space = itertools.product(
    rnn_type_options,
    embedding_dim_options,
    optimizer_options,
    [False, True],
    hidden_dim_options,
    n_layers_options,
    lr_options,
)
for rnn_type, embedding_dim, optimizer_name, bidirectional, hidden_dim, n_layers, lr in search_space:
    model = Decoder(
        rnn_type,
        vocab_size,
        embedding_dim,
        hidden_dim,
        n_layers,
        pad_index,
        bidirectional=bidirectional
    )
    optimizer_cls = optim.Adam if optimizer_name == "Adam" else optim.AdamW
    optimizer = optimizer_cls(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_index)
    best_valid_loss = float("inf")
    for epoch in range(n_epochs):
        print(f"EPOCH {epoch+1}")
        unique_name = f"type-{rnn_type}_optim-{optimizer_name}__emdeding-dim-{embedding_dim}_bidir-{int(bidirectional)}_hiddim-{hidden_dim}_layers-{n_layers}_lr-{lr}_epoch-{epoch+1}"
        # One epoch per call (train's default num_epochs).
        train_loss, valid_loss = train(
            model,
            criterion,
            train_loader,
            val_loader,
            optimizer
        )
        # Checkpoint whenever validation loss improves for this configuration.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(
                model.state_dict(),
                f"rnn_models/model_{unique_name}.pt",
            )
        print(
            f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}"
        )
        print(
            f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}"
        )

        # compute metrics
        val_predictions = _predict_split(model, val_data)
        test_predictions = _predict_split(model, test_data)
        val_bleu = bleu_score(preds=val_predictions, refs=val_references)
        test_bleu = bleu_score(preds=test_predictions, refs=test_references)
        val_accuracy = accuracy(preds=val_predictions, refs=val_references)
        test_accuracy = accuracy(preds=test_predictions, refs=test_references)

        predict_valid_eqs(model, f"pred_{unique_name}")

        print(
            f"\tValid BLEU: {val_bleu:7.3f} | Valid Accuracy: {val_accuracy:7.3f}"
        )
        epoch_record = {
            "rnn_type": rnn_type,
            "embedding_dim": embedding_dim,
            "optimizer": optimizer_name,
            "bidirectional": bidirectional,
            "n_layers": n_layers,
            "hidden_dim": hidden_dim,
            "learning_rate": lr,
            "epoch": epoch + 1,
            "val_bleu": round(val_bleu, 3),
            "test_bleu": round(test_bleu, 3),
            "val_accuracy": round(val_accuracy, 3),
            "test_accuracy": round(test_accuracy, 3),
        }
        for column, value in epoch_record.items():
            result[column].append(value)
        res_df = pd.DataFrame(result)
        res_df.to_csv("results.csv", index=False)

	Train Loss:   3.303 | Train PPL:  27.204
	Valid Loss:   3.281 | Valid PPL:  26.599
	Valid BLEU:   0.000 | Valid Accuracy:   0.000
EPOCH 5


KeyboardInterrupt: 