In [2]:
from io import open
import unicodedata
import string
import re
import torch
import torch.nn as nn
from torch import optim
import warnings
from scipy.spatial.distance import hamming

warnings.filterwarnings("ignore")

In [None]:
BATCH_SIZE = 512

In [3]:
#алфавит для шифрования Цезаря (без пробела)
letters="abcdefghijklmnopqrstuvwxyz"

CHAR2INDEX={l: i for i, l in enumerate(letters)}
INDEX2CHAR={i: l for i, l in enumerate(letters)}

In [4]:
#функция шифровальщик Цезаря
def Cesar_encoder(s, N):
    #N-количество символов на сколько мы сдвигаемся
    new_s=[]
    for word in s.split(" "):
        new_word=[]
        for c in word.lower().strip():
            if c>='a' and c <='z':
              c=INDEX2CHAR[(CHAR2INDEX[c]+N)%26]
            new_word.append(c)
        new_word="".join(new_word)
        new_s.append(new_word)
    new_s=" ".join(new_s)
    return new_s

In [5]:
#функция дешифровальщик Цезаря
def Cesar_decoder(s, N):
    #N-количество символов на сколько мы сдвигаемся
    new_s=[]
    for word in s.split(" "):
        new_word=[]
        for c in word.lower().strip():
            if c>='a' and c<='z':
              c=INDEX2CHAR[(CHAR2INDEX[c]-N)%26]
            new_word.append(c)
        new_word="".join(new_word)
        new_s.append(new_word)
    new_s=" ".join(new_s)
    return new_s

In [196]:
def load_text(txt_path):
  with open(txt_path, encoding='utf-8') as txt_file:
        text = txt_file.read()
        text = re.sub('[^a-z\.!? ]', '', text)
  txt_file.close()
  return text

In [282]:
def get_sentences(text):
  text=text.lower()
  text = re.sub('[^a-z\.!? ]', '', text)
  sentences=re.split(r"\.\.\.\s*|[.?!]\s*", text)
  for sentence in sentences:
    if len(sentence)==0:
      sentences.remove(sentence)
  return sentences

In [None]:
#функция для подготовки предложений
def prepare_data(txt_path):
    text=load_text(txt_path)
    sentences=get_sentences(text)
    MAX_LEN=max([len(sentence) for sentence in sentences])
    return sentences, MAX_LEN

In [234]:
sentences, MAX_LEN=prepare_data('nietzsche.txt')

In [250]:
#алфавит для векторизации слов (с пробелом)
alphabet=" abcdefghijklmnopqrstuvwxyz"

letter2index={l: i for i, l in enumerate(alphabet)}
index2letter={i: l for i, l in enumerate(alphabet)}

In [None]:
def make_data(sentences, MAX_LEN, N):
    ENCODED_TEXT = []
    DECODED_TEXT = []
    # Формируем X и Y
    for sentence in sentences:
        ENCODED_TEXT.append(Cesar_encoder(sentence,N))  # Формируем наши X
        DECODED_TEXT.append(sentence)    # Формируем Y расшифрованный текст

   #кодируем наши фразы в тензоры
    X = torch.zeros((len(ENCODED_TEXT),MAX_LEN), dtype=int)
    Y = torch.zeros((len(DECODED_TEXT),MAX_LEN), dtype=int)

# Пробегаемся по нашим кусочкам предложений и кодируем под сформированные символы
    for i, sentence in enumerate(ENCODED_TEXT):
        for t, char in enumerate(sentence):
            X[i, t] = letter2index[char]

    for i, sentence in enumerate(DECODED_TEXT):
        for t, char in enumerate(sentence):
            Y[i, t] = letter2index[char]

    dataset = torch.utils.data.TensorDataset(X, Y)
    data = torch.utils.data.DataLoader(dataset, BATCH_SIZE, shuffle=True)

    return data

In [None]:
data = make_data(sentences, MAX_LEN, N=3)

In [253]:
# Строим класс RNN
class Rnn(torch.nn.Module):
                        # тип     размер словаря  размер эмб       скрытые слои   классы
    def __init__(self, dictionary_size, embedding_size, num_hiddens, output_size):
        super().__init__()
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(dictionary_size, embedding_size)
        self.hidden = nn.RNN(embedding_size, num_hiddens, batch_first=True)
        self.fc = nn.Linear(num_hiddens, output_size)
        

    def forward(self, X):
        out = self.embedding(X)
        out, state = self.hidden(out)  
        predictions = self.fc(out) 
        return predictions

In [254]:
model = Rnn(len(letter2index), 64, 128, len(letter2index))

In [255]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [258]:
for ep in range(20):
    train_loss = 0.
    train_passed = 0

    model.train()
    for X_b, y_b in data:
        #print(X_b.shape)
        #print(y_b.shape)
        y_b=y_b.flatten()
        #print(y_b.shape)
        optimizer.zero_grad()
        answers = model(X_b)
        answers = answers.view(-1, len(letter2index))
        #print(answers.shape)
        loss = criterion(answers, y_b)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}   Train Loss: {:.3f}".format(ep, train_loss / train_passed))

Epoch 0   Train Loss: 0.026
Epoch 1   Train Loss: 0.025
Epoch 2   Train Loss: 0.023
Epoch 3   Train Loss: 0.021
Epoch 4   Train Loss: 0.020
Epoch 5   Train Loss: 0.018
Epoch 6   Train Loss: 0.017
Epoch 7   Train Loss: 0.016
Epoch 8   Train Loss: 0.015
Epoch 9   Train Loss: 0.014
Epoch 10   Train Loss: 0.013
Epoch 11   Train Loss: 0.013
Epoch 12   Train Loss: 0.012
Epoch 13   Train Loss: 0.011
Epoch 14   Train Loss: 0.011
Epoch 15   Train Loss: 0.010
Epoch 16   Train Loss: 0.009
Epoch 17   Train Loss: 0.009
Epoch 18   Train Loss: 0.009
Epoch 19   Train Loss: 0.008


# Проверка Модели

In [283]:
sentence="In cryptography, a Caesar cipher, also known as Caesar's cipher, the shift cipher, Caesar's code or Caesar shift, is one of the simplest and most widely known encryption techniques."


In [284]:
sentence=get_sentences(sentence)[0]

In [285]:
print(sentence)
Encoded_text=Cesar_encoder(sentence, 3)
print(Encoded_text)

in cryptography a caesar cipher also known as caesars cipher the shift cipher caesars code or caesar shift is one of the simplest and most widely known encryption techniques
lq fubswrjudskb d fdhvdu flskhu dovr nqrzq dv fdhvduv flskhu wkh vkliw flskhu fdhvduv frgh ru fdhvdu vkliw lv rqh ri wkh vlpsohvw dqg prvw zlghob nqrzq hqfubswlrq whfkqltxhv


In [286]:
model.eval()

Rnn(
  (embedding): Embedding(27, 64)
  (hidden): RNN(64, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=27, bias=True)
)

In [287]:
X_test=torch.zeros((1, len(sentence)), dtype=int)
Y_test=torch.zeros((1, len(sentence)), dtype=int)
for i in range(len(sentence)):
    X_test[0, i] = letter2index[Encoded_text[i]]
    Y_test[0, i] = letter2index[sentence[i]]


Y_test=Y_test.flatten()
answers = model(X_test)
answers = answers.view(-1, len(letter2index))

phrase_indexes=torch.argmax(answers, dim=1)
accuracy=1-hamming(phrase_indexes, Y_test)
print("accuracy: {:.4f}".format(accuracy))
phrase=[]
for i in phrase_indexes:
  phrase.append(index2letter[i.item()])
phrase="".join(phrase)  
print("predicted: ", phrase)

accuracy: 0.9827
predicted:  in cryptography a caesar cipher also fnown as caesars cipher the shift cipher caesars code or caesar shift is one of the simplest and most widely fnown encryption technieues


In [299]:
def evaluate_accuracy(data_iter, model):
   accur=0.0
   data_passed=0
   for X_test,Y_test in data_iter:
       Y_test=Y_test.flatten()
       answers=model(X_test)
       answers = answers.view(-1, len(letter2index))
       accur += (torch.argmax(answers,dim=1) == Y_test).sum().item()
       data_passed += Y_test.shape[0]
   return accur/data_passed    


In [None]:
sentences, MAX_LEN=prepare_data('harry_potter.txt')

In [None]:
data = make_data(sentences, MAX_LEN, N=3)

In [297]:
model.eval()

Rnn(
  (embedding): Embedding(27, 64)
  (hidden): RNN(64, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=27, bias=True)
)

In [300]:
accur=evaluate_accuracy(data, model)
print("accuracy: {:.4f}".format(accur))

accuracy: 0.9990
