In [2]:
# Mount Google Drive under /content/drive so the project files stored there are
# reachable from this Colab runtime. force_remount=True requests a fresh mount
# even if a previous one is still active.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# NOTE(review): in the recorded run this directory was not found (see the
# "[Errno 2]" output below), so this cell left the working directory at /content.
%cd "/content/drive/MyDrive/Ingeniaritza Informatikoa/4. Maila/2. Lauhilekoa/HP/Lana/dialbot/notebook"

[Errno 2] No such file or directory: '/content/drive/MyDrive/Ingeniaritza Informatikoa/4. Maila/2. Lauhilekoa/HP/Lana/dialbot/notebook'
/content


In [4]:
# Switch to the project's notebook directory. The relative paths used later
# ('token.txt', '../model/en/', '../model/eu/') are resolved from here.
%cd "/content/drive/MyDrive/dialbot/notebook"

/content/drive/.shortcut-targets-by-id/13R7fsJaBA2ra3u5WI5-hbHRUYUzCn976/dialbot/notebook


In [5]:
pip install python-telegram-bot



In [6]:
# Path (relative to the working directory) of the credentials file that holds
# the Telegram bot token.
save_path='token.txt'

In [7]:
# Pull the Telegram bot token out of the credentials file. Only the second
# line (index 1) is consulted; it is expected to read "token=<value>".
TOKEN = None
with open(save_path) as creds:
    for i, line in enumerate(creds):
        if i == 1:
            # strip() also drops a trailing "\r" left by Windows line endings,
            # which would otherwise end up inside the token.
            TOKEN = line.replace("token=", "").strip()
            break
if not TOKEN:
    # Fail here with a clear message instead of a NameError much later in main().
    raise ValueError(f"No token found on the second line of {save_path!r}")

In [8]:
import random
from typing import Tuple
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor


class Encoder(nn.Module):
    """Bidirectional GRU encoder for the seq2seq model.

    Token ids are embedded, passed through dropout and fed to a single-layer
    bidirectional GRU. The final forward and backward hidden states are
    concatenated and projected to the decoder's hidden size.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        emb_dim: Embedding dimension.
        enc_hid_dim: Hidden size of the GRU, per direction.
        dec_hid_dim: Hidden size expected by the decoder.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self,
                 input_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)

        # Both directions are concatenated, hence enc_hid_dim * 2 inputs.
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

        # `dropout` is stored only as the module; the raw float that used to be
        # assigned to the same attribute first was immediately overwritten.
        self.dropout = nn.Dropout(dropout)

    def forward(self,
                src: Tensor) -> Tuple[Tensor, Tensor]:
        """Encode a batch of source sequences.

        Args:
            src: Long tensor of token ids laid out as (src_len, batch), the
                sequence-first layout the GRU is constructed with.

        Returns:
            A pair ``(outputs, hidden)``: ``outputs`` has shape
            (src_len, batch, enc_hid_dim * 2) and ``hidden`` has shape
            (batch, dec_hid_dim).
        """
        embedded = self.dropout(self.embedding(src))

        outputs, hidden = self.rnn(embedded)

        # hidden[-2] / hidden[-1] are the final forward and backward states.
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))

        return outputs, hidden

In [9]:
class Attention(nn.Module):
    """Attention scorer over the encoder outputs.

    For each source position the decoder hidden state is concatenated with the
    encoder output at that position, passed through a linear layer and tanh,
    summed over the attention dimension and normalised with a softmax over the
    source positions.

    Args:
        enc_hid_dim: Encoder hidden size per direction.
        dec_hid_dim: Decoder hidden size.
        attn_dim: Output width of the scoring layer.
    """

    def __init__(self,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attn_dim: int):
        super().__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # Width of [decoder hidden ; bidirectional encoder output].
        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim
        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tensor:
        """Return attention weights of shape (batch, src_len).

        Args:
            decoder_hidden: Tensor indexed as (batch, dec_hid_dim).
            encoder_outputs: Tensor indexed as (src_len, batch, enc_hid_dim * 2).
        """
        n_positions = encoder_outputs.shape[0]

        # One copy of the decoder state per source position: (batch, src_len, dec_hid_dim).
        hidden_per_position = decoder_hidden.unsqueeze(1).repeat(1, n_positions, 1)

        # Batch-first view of the encoder outputs so both operands line up.
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)

        scorer_input = torch.cat((hidden_per_position, batch_first_outputs), dim = 2)
        energy = torch.tanh(self.attn(scorer_input))

        scores = torch.sum(energy, dim=2)

        return F.softmax(scores, dim=1)

In [10]:
class Decoder(nn.Module):
    """Single-step GRU decoder with attention.

    Each call consumes one target token per batch element, attends over the
    encoder outputs and returns the vocabulary logits for the next token.

    Args:
        output_dim: Size of the target vocabulary.
        emb_dim: Embedding dimension.
        enc_hid_dim: Encoder hidden size per direction.
        dec_hid_dim: Decoder hidden size.
        dropout: Dropout probability applied to the embeddings.
        attention: Module called as ``attention(decoder_hidden, encoder_outputs)``
            that returns weights of shape (batch, src_len) and exposes an
            ``attn_in`` attribute.
    """

    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float,
                 attention: nn.Module):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)

        # GRU input is [embedded token ; attention-weighted encoder context].
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)

        # Output layer sees [GRU output ; context ; embedded token], whose width
        # is attn_in (= enc_hid_dim * 2 + dec_hid_dim) + emb_dim.
        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)

        # `dropout` is stored only as the module; the raw value that used to be
        # assigned to the same attribute first was immediately overwritten.
        self.dropout = nn.Dropout(dropout)


    def _weighted_encoder_rep(self,
                              decoder_hidden: Tensor,
                              encoder_outputs: Tensor) -> Tuple[Tensor, Tensor]:
        """Compute the attention-weighted sum of the encoder outputs.

        Returns:
            ``(weighted_encoder_rep, a)`` where ``weighted_encoder_rep`` has
            shape (1, batch, enc_hid_dim * 2) and ``a`` holds the attention
            weights with shape (batch, 1, src_len).
        """
        a = self.attention(decoder_hidden, encoder_outputs)

        a = a.unsqueeze(1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        # (batch, 1, src_len) x (batch, src_len, feat) -> (batch, 1, feat)
        weighted_encoder_rep = torch.bmm(a, encoder_outputs)

        weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)

        return weighted_encoder_rep, a


    def forward(self,
                input: Tensor,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
        """Run one decoding step.

        Args:
            input: Long tensor of token ids, one per batch element.
            decoder_hidden: Previous decoder state, (batch, dec_hid_dim).
            encoder_outputs: Encoder outputs, (src_len, batch, enc_hid_dim * 2).

        Returns:
            ``(output, decoder_hidden, attention)``: vocabulary logits of shape
            (batch, output_dim), the new decoder state of shape
            (batch, dec_hid_dim) and the attention weights of shape
            (batch, src_len).
        """
        # Add the length-1 sequence axis the GRU expects.
        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))

        weighted_encoder_rep, a = self._weighted_encoder_rep(decoder_hidden,
                                                          encoder_outputs)

        rnn_input = torch.cat((embedded, weighted_encoder_rep), dim = 2)

        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder_rep = weighted_encoder_rep.squeeze(0)

        output = self.out(torch.cat((output,
                                     weighted_encoder_rep,
                                     embedded), dim = 1))

        return output, decoder_hidden.squeeze(0), a.squeeze(1)

In [11]:
class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes a whole target sequence.

    Args:
        encoder: Module returning ``(encoder_outputs, hidden)`` for a source batch.
        decoder: Module with an ``output_dim`` attribute, called once per step as
            ``decoder(tokens, hidden, encoder_outputs)``.
        device: Device on which the logits buffer is allocated.
    """

    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 device: torch.device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,
                src: Tensor,
                trg: Tensor,
                teacher_forcing_ratio: float = 0.5) -> Tensor:
        """Decode step by step, optionally feeding the ground truth back in.

        Args:
            src: Source batch; its second axis is taken as the batch size.
            trg: Target batch; its first axis is taken as the number of steps
                and ``trg[0]`` supplies the first decoder input.
            teacher_forcing_ratio: Probability, per step, of feeding the ground
                truth token instead of the model's own prediction.

        Returns:
            Tensor of shape (trg_len, batch, decoder.output_dim). Row 0 is left
            as zeros because decoding starts at step 1.
        """
        n_steps = trg.shape[0]
        n_batch = src.shape[1]
        vocab = self.decoder.output_dim

        all_logits = torch.zeros(n_steps, n_batch, vocab).to(self.device)

        memory, state = self.encoder(src)

        # The first decoder input is the first row of the target (the <sos> token).
        step_input = trg[0,:]
        for step in range(1, n_steps):
            logits, state, _ = self.decoder(step_input, state, memory)
            all_logits[step] = logits
            use_truth = random.random() < teacher_forcing_ratio
            _, greedy = logits.max(1)
            if use_truth:
                step_input = trg[step]
            else:
                step_input = greedy

        return all_logits

In [12]:
!pip install tokenizers
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

def train_tokenizer(input_path, output_path, vocab_size=10000):
    """Train a byte-level BPE tokenizer on one text file and save it.

    Args:
        input_path: Path of the training text file.
        output_path: Location passed to ``save_model`` for the trained model.
        vocab_size: Target vocabulary size.

    Returns:
        The trained tokenizer, with a BertProcessing post-processor configured
        from the "</s>" and "<s>" special tokens.
    """
    special_tokens = ["[PAD]", "<s>", "</s>", "<unk>"]

    bpe = ByteLevelBPETokenizer()
    bpe.train(files=[input_path], vocab_size=vocab_size, special_tokens=special_tokens)

    # BertProcessing takes the separator pair first and the start pair second.
    sep_pair = ("</s>", bpe.token_to_id("</s>"))
    cls_pair = ("<s>", bpe.token_to_id("<s>"))
    bpe._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

    bpe.save_model(output_path)
    return bpe

def get_tokenizer(path):
    """Load a previously saved byte-level BPE tokenizer.

    Args:
        path: Prefix to which 'vocab.json' and 'merges.txt' are appended by
            plain string concatenation, so a directory must end with a
            path separator.

    Returns:
        The tokenizer, with a BertProcessing post-processor configured from the
        "</s>" and "<s>" special tokens.
    """
    vocab_file = path + 'vocab.json'
    merges_file = path + 'merges.txt'
    bpe = ByteLevelBPETokenizer(vocab_file, merges_file)

    # BertProcessing takes the separator pair first and the start pair second.
    sep_pair = ("</s>", bpe.token_to_id("</s>"))
    cls_pair = ("<s>", bpe.token_to_id("<s>"))
    bpe._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)
    return bpe



In [13]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re

def clean_line(s):
    """Normalise a raw line of text before tokenisation.

    The text is lower-cased, every run of exactly three dots is replaced by a
    single dot, and each run of characters other than ASCII letters, '.', '!'
    and '?' is replaced by one space.
    """
    lowered = s.lower()
    dots_collapsed = re.sub(r"\.{3}", r".", lowered)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", dots_collapsed)

def tokenize_line(line):
    """Split ``line`` with ``word_tokenize`` and re-join the tokens with single spaces."""
    return ' '.join(word_tokenize(line))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
import torch
import random

# Upper bound on the number of decoding steps evaluate() runs for one reply.
MAX_LENGTH = 30
# Decoding strategy currently in effect ('top1', 'topk' or 'multinomial').
# The bot's command and callback handlers rebind this module-level name.
decoding_strategy='multinomial'

def decode(logits, tokenizer, decoding_strategy='multinomial', k=3, temp=0.4):
    """Pick the next token id from one step of decoder logits.

    Args:
        logits: Decoder output for one step, indexed as (batch, vocab); the
            'topk' branch only looks at the first row.
        tokenizer: Unused; kept so existing callers keep working.
        decoding_strategy: 'top1' takes the arg-max, 'topk' picks uniformly at
            random among the ``k`` highest-scoring tokens, and any other value
            samples from the temperature-scaled distribution.
        k: Number of candidates for the 'topk' strategy.
        temp: Temperature for sampling; lower values sharpen the distribution.

    Returns:
        A tensor holding the chosen token id (shape (1,) for a batch of one).
    """
    if decoding_strategy=='top1':
        target = logits.max(1)[1]
    elif decoding_strategy=='topk':
        target = logits.topk(k)[1][0][random.randint(0, k-1)].unsqueeze(-1)
    else:
        # softmax yields the same distribution as exp(logits / temp) after
        # normalisation, but does not overflow to inf for large logits (which
        # makes torch.multinomial raise).
        probs = torch.softmax(logits.squeeze().div(temp), dim=-1).cpu()
        target = torch.multinomial(probs, 1)
    return target

def evaluate(sentence, model, tokenizer, decoding_strategy='multinomial', k=3, temp=0.4):
    """Generate a reply to ``sentence`` with a trained seq2seq ``model``.

    The sentence is cleaned and tokenised, encoded once, and then decoded one
    token at a time for at most ``MAX_LENGTH`` steps, stopping early when the
    "</s>" token is produced.

    Args:
        sentence: Raw user text.
        model: Object exposing ``encoder`` and ``decoder`` sub-modules.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and ``token_to_id``.
        decoding_strategy: Forwarded to ``decode``.
        k: Forwarded to ``decode``.
        temp: Forwarded to ``decode``.

    Returns:
        ``(sentence, reply, attentions)``: the preprocessed input sentence, the
        decoded reply text and the attention weights, one row per executed
        decoding step when "</s>" was produced, otherwise ``MAX_LENGTH`` rows.
    """
    sentence = clean_line(sentence)
    sentence = tokenize_line(sentence)

    # Work on the device the model lives on rather than relying on a
    # module-level `device` variable being defined by another cell.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    # Look the special ids up once, and encode the sentence once.
    sos_id = tokenizer.token_to_id('<s>')
    eos_id = tokenizer.token_to_id('</s>')
    src_ids = tokenizer.encode(sentence).ids

    with torch.no_grad():
        # The first input to the decoder is the <s> token.
        target = torch.tensor([sos_id], dtype=torch.long, device=run_device)
        src = torch.tensor(src_ids, dtype=torch.long, device=run_device).unsqueeze(-1)
        output_sentence = []
        encoder_outputs, hidden = model.encoder(src)
        attentions = torch.zeros(MAX_LENGTH, 1, len(src_ids), device=run_device)
        for i in range(MAX_LENGTH):
            output, hidden, attention = model.decoder(target, hidden, encoder_outputs)
            attentions[i] = attention
            # decode() may hand back a CPU tensor; keep the next input on the model's device.
            target = decode(output, tokenizer, decoding_strategy, k, temp).to(run_device)
            token_id = target.item()
            if token_id == eos_id:
                return sentence, tokenizer.decode(output_sentence), attentions[:i+1]
            output_sentence.append(token_id)
    return sentence, tokenizer.decode(output_sentence), attentions

In [15]:
# Hyper-parameters of the English model. The network built from them must
# match the layer sizes stored in the checkpoint loaded below.
INPUT_DIM = 10000
OUTPUT_DIM = 10000
ENC_EMB_DIM = 512
DEC_EMB_DIM = 512
ENC_HID_DIM = 1024
DEC_HID_DIM = 1024
ATTN_DIM = 1024
ENC_DROPOUT = 0.2
DEC_DROPOUT = 0.2
# Build the English network on the CPU, restore its trained weights and load
# the matching tokenizer from the same directory.
device = 'cpu'
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
model_en = Seq2Seq(enc, dec, device).to(device)
model_en.load_state_dict(torch.load('../model/en/model.pt', map_location=device))
tokenizer_en = get_tokenizer('../model/en/')
# Switch to inference mode (turns the dropout layers off).
model_en.eval()

# Hyper-parameters of the Basque ('eu') model; note they differ from the
# English ones above.
INPUT_DIM_EU = 10000
OUTPUT_DIM_EU = 10000
ENC_EMB_DIM_EU = 256
DEC_EMB_DIM_EU = 256
ENC_HID_DIM_EU = 512
DEC_HID_DIM_EU = 512
ATTN_DIM_EU = 64
ENC_DROPOUT_EU = 0.5
DEC_DROPOUT_EU = 0.5
# Build and restore the Basque network. `enc`, `attn` and `dec` are rebound
# here; model_en keeps its own references to the English sub-modules.
device = 'cpu'
enc = Encoder(INPUT_DIM_EU, ENC_EMB_DIM_EU, ENC_HID_DIM_EU, DEC_HID_DIM_EU, ENC_DROPOUT_EU)
attn = Attention(ENC_HID_DIM_EU, DEC_HID_DIM_EU, ATTN_DIM_EU)
dec = Decoder(OUTPUT_DIM_EU, DEC_EMB_DIM_EU, ENC_HID_DIM_EU, DEC_HID_DIM_EU, DEC_DROPOUT_EU, attn)
model_eu = Seq2Seq(enc, dec, device).to(device)
model_eu.load_state_dict(torch.load('../model/eu/model.pt', map_location=device))
tokenizer_eu = get_tokenizer('../model/eu/')
# Switch to inference mode; as the last expression of the cell, the returned
# module is also what the notebook displays.
model_eu.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(10000, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=64, bias=True)
    )
    (embedding): Embedding(10000, 256)
    (rnn): GRU(1280, 512)
    (out): Linear(in_features=1792, out_features=10000, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [16]:
from telegram import InlineKeyboardButton, InlineKeyboardMarkup, Update

```
/en - change language to english
/eu - change language to euskera
/top1 - change decoding to top1
/topk - change decoding to topk
/multinomial - change decoding to multinomial
/language - check current language
/setlanguage - choose a language
/decoding - check current decoding strategy
/setdecoding - choose a decoding strategy
/help - show list of commands
/settings - show settings
```

In [17]:
import logging

from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackQueryHandler, ConversationHandler ,CallbackContext
# Active bot language ('en' or 'eu'). It is a module-level name, so the value
# is shared by every chat the bot serves; the language handlers rebind it.
lang='en'
# Enable logging: timestamp, logger name, level and message at INFO level.
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.INFO)

logger = logging.getLogger(__name__)


# Define a few command handlers. These take the two arguments update and
# context. The error handler has the same signature and reads the raised
# exception from context.error.
def start(update, context):
    """Greet the user, in the active language, when /start is issued."""
    greeting = (
        'Kaixo! Dialbot naiz. Komandoen zerrenda ikusteko /help erabili🧐.'
        if lang == 'eu'
        else 'Hi! I\'m dialbot. Use /help to see a list of all comands🧐.'
    )
    update.message.reply_text(greeting)


def language_changed(update, context):
    """Confirm a language switch, phrased in the language now active."""
    confirmation = 'Hizkuntza aldatu da.' if lang == 'eu' else 'Language changed.'
    update.message.reply_text(confirmation)

def decoding_changed(update, context):
    """Confirm a decoding-strategy switch, naming the strategy now active."""
    notice = (
        f"Dekodeketa estrategia aldatu da: {decoding_strategy}"
        if lang == 'eu'
        else f"Decoding strategy changed: {decoding_strategy}"
    )
    update.message.reply_text(notice)

def help(update, context):
    """Send the command overview when the command /help is issued.

    The text is sent with ``parse_mode='MarkdownV2'``, where '-' is a reserved
    character and must appear as a backslash followed by a hyphen. The lines
    that contain it are raw strings, so that backslash is spelled explicitly
    instead of relying on an invalid escape sequence in a normal string.
    """
    if lang == 'eu':
        lines = (
            "",
            "*Hizkuntza*",
            r"/en \- hizkuntza ingelesera aldatu",
            r"/eu \- hizkuntza euskerara aldatu",
            r"/language \- hizkuntza egiaztatu",
            r"/setlanguage \- hizkuntza aukeratu ",
            "",
            "*Dekodeketa estrategia*",
            r"/multinomial \- dekodeketa estrategia aldatu multionomial\-era",
            r"/top1 \- dekodeketa estrategia aldatu top1\-era",
            r"/topk \- dekodeketa estrategia aldatu topk\-ra",
            r"/decoding \- aukeratutako dekodeketa estrategia ikusi ",
            r"/setdecoding \- aukeratu dekodeketa estrategia ",
            "",
            "*Ezarpen nagusiak*",
            r"/settings \- ezarpenak ikusi",
            r"/help \- komandoen lista ikusi",
        )
    else:
        lines = (
            "",
            "*Language*",
            r"/en \- change language to English",
            r"/eu \- change language to Euskera ",
            r"/language \- check current language",
            r"/setlanguage \- choose a language ",
            "",
            "*Decoding strategy*",
            r"/multinomial \- change decoding to multinomial",
            r"/top1 \- change decoding to top1",
            r"/topk \- change decoding to topk",
            r"/decoding \- check current decoding strategy",
            r"/setdecoding \- choose a decoding strategy ",
            "",
            "*General settings*",
            r"/settings \- show settings",
            r"/help \- show list of commands",
        )
    # The leading empty entry reproduces the newline the message starts with.
    text = "\n".join(lines)
    update.message.reply_text(text, parse_mode='MarkdownV2')

def eu(update, context):
    """Switch the bot's active language to Basque ('eu') and confirm it to the user."""
    # `lang` is module-level state, so the assignment needs the global declaration.
    global lang
    lang = 'eu'
    language_changed(update,context)

def en(update, context):
    """Switch the bot's active language to English ('en') and confirm it to the user."""
    # `lang` is module-level state, so the assignment needs the global declaration.
    global lang
    lang = 'en'
    language_changed(update, context)

def top1(update, context):
    """Set the decoding strategy to 'top1' and confirm it to the user."""
    # `decoding_strategy` is module-level state read when replies are generated.
    global decoding_strategy
    decoding_strategy = 'top1'
    decoding_changed(update,context)

def topk(update, context):
    """Set the decoding strategy to 'topk' and confirm it to the user."""
    # `decoding_strategy` is module-level state read when replies are generated.
    global decoding_strategy
    decoding_strategy = 'topk'
    decoding_changed(update, context)

def multinomial(update, context):
    """Set the decoding strategy to 'multinomial' and confirm it to the user."""
    # `decoding_strategy` is module-level state read when replies are generated.
    global decoding_strategy
    decoding_strategy = 'multinomial'
    decoding_changed(update, context)

def language(update, context):
    """Reply with the name of the language currently in use."""
    current_name = 'Euskara' if lang == 'eu' else 'English'
    update.message.reply_text(current_name)

def settings(update, context):
    """Reply with the current language and decoding strategy.

    The message is sent with ``parse_mode='MarkdownV2'`` so the heading renders
    in bold.
    """
    # Each message starts with a single space followed by a newline.
    if lang == 'eu':
        summary = f" \n*Ezarpenak*\nHizkuntza: {lang}\nDeskodeketa estrategia: {decoding_strategy}"
    else:
        summary = f" \n*Settings*\nLanguage: {lang}\nDecoding strategy: {decoding_strategy}"
    update.message.reply_text(summary, parse_mode='MarkdownV2')

def set_language(update, context):
    """Offer an inline keyboard for choosing between English and Euskara.

    Each button carries the language code ('en' or 'eu') as its callback data.
    """
    bot = context.bot
    prompt = 'Aukeratu hizkuntza' if lang == 'eu' else 'Choose a language'

    # (button label, callback data) pairs, one button per row.
    options = zip(['English', 'Euskara'], ['en', 'eu'])
    buttons = [InlineKeyboardButton(label, callback_data=code) for label, code in options]

    keyboard = InlineKeyboardMarkup(build_menu(buttons, n_cols=1))
    bot.send_message(chat_id=update.message.chat_id, text=prompt, reply_markup=keyboard)

def decoding(update, context):
    """Reply with the name of the decoding strategy currently in use."""
    current_strategy = decoding_strategy
    update.message.reply_text(current_strategy)
    
def set_decoding(update, context):
    """Offer an inline keyboard for choosing top1, topk or multinomial decoding.

    Each button uses the strategy name both as its label and as its callback data.
    """
    bot = context.bot
    prompt = 'Aukeratu deskodeketa estrategia' if lang == 'eu' else 'Choose a decoding strategy'

    buttons = [
        InlineKeyboardButton(strategy, callback_data = strategy)
        for strategy in ('top1', 'topk', 'multinomial')
    ]

    keyboard = InlineKeyboardMarkup(build_menu(buttons, n_cols=1))
    bot.send_message(chat_id=update.message.chat_id, text=prompt, reply_markup=keyboard)

def build_menu(buttons, n_cols, header_buttons=None, footer_buttons=None):
    """Arrange ``buttons`` into rows of at most ``n_cols`` items.

    Args:
        buttons: Flat list of buttons.
        n_cols: Maximum number of buttons per row.
        header_buttons: Optional row placed before all other rows.
        footer_buttons: Optional row placed after all other rows.

    Returns:
        A list of rows, each row being a list of buttons.
    """
    rows = []
    for first in range(0, len(buttons), n_cols):
        rows.append(buttons[first:first + n_cols])
    if header_buttons:
        rows = [header_buttons] + rows
    if footer_buttons:
        rows = rows + [footer_buttons]
    return rows

# Callbacks

def decoding_callback(update, context):
    """Handle a press on one of the decoding-strategy inline buttons.

    The button's callback data becomes the new module-level
    ``decoding_strategy`` and a confirmation is sent to the chat the keyboard
    was shown in.
    """
    global decoding_strategy
    query = update.callback_query
    decoding_strategy = query.data
    # Telegram clients keep showing a progress indicator on the pressed button
    # until the callback query has been answered.
    query.answer()
    chat_id = query.message.chat.id
    if lang == 'eu':
        context.bot.send_message(chat_id=chat_id, text=f"Dekodeketa estrategia aldatu da: {decoding_strategy}")
    else:
        context.bot.send_message(chat_id=chat_id, text=f"Decoding strategy changed: {decoding_strategy}")

def language_callback(update, context):
    """Handle a press on one of the language inline buttons.

    The button's callback data becomes the new module-level ``lang`` and a
    confirmation, phrased in the newly selected language, is sent to the chat
    the keyboard was shown in.
    """
    global lang
    query = update.callback_query
    chat_id = query.message.chat.id
    lang = query.data
    # Telegram clients keep showing a progress indicator on the pressed button
    # until the callback query has been answered.
    query.answer()
    if lang == 'eu':
        context.bot.send_message(chat_id=chat_id, text="Hizkuntza aldatu da")
    else:
        context.bot.send_message(chat_id=chat_id, text="Language changed")
    
# Messages

def answer(update, context):
    """Reply to a plain text message with a model-generated sentence.

    The model and tokenizer matching the active language are used together
    with the active decoding strategy. The generated text is capitalised
    before it is sent.
    """
    user_text = update.message.text
    if lang == 'eu':
        model, tokenizer = model_eu, tokenizer_eu
    else:
        model, tokenizer = model_en, tokenizer_en
    sentence, output, attention = evaluate(user_text, model, tokenizer, decoding_strategy)
    # The model can end the sentence immediately, which yields an empty reply;
    # Telegram rejects empty message text, so send a placeholder instead.
    reply = output.capitalize() if output.strip() else '...'
    update.message.reply_text(reply)

def unknown(update, context):
    """Tell the user, in the active language, that the command was not recognised."""
    apology = (
        "Barkatu, komando hori ez dut ezagutzen. Komandoen zerrenda ikusteko /help erabili."
        if lang == 'eu'
        else "Sorry, I didn't understand that command. Use /help to see a list of all comands."
    )
    update.message.reply_text(apology)

# Errors

def error(update, context):
    """Log, at warning level, the update together with the error it caused."""
    failure = context.error
    logger.warning('Update "%s" caused error "%s"', update, failure)

def main():
    """Start the bot: register every handler, begin polling and block until stopped."""
    # Create the Updater and pass it the bot's token. use_context=True selects
    # the context-based callback signature (update, context) that all handlers
    # in this notebook use.
    updater = Updater(TOKEN, use_context=True)

    # Get the dispatcher to register handlers
    dp = updater.dispatcher

    # on different commands - answer in Telegram
    dp.add_handler(CommandHandler("start", start))
    dp.add_handler(CommandHandler("help", help))
    dp.add_handler(CommandHandler("eu", eu))
    dp.add_handler(CommandHandler("en", en))
    dp.add_handler(CommandHandler("top1", top1))
    dp.add_handler(CommandHandler("topk", topk))
    dp.add_handler(CommandHandler("multinomial", multinomial))
    dp.add_handler(CommandHandler('settings', settings))
    dp.add_handler(CommandHandler('language', language))
    dp.add_handler(CommandHandler('setlanguage', set_language))
    dp.add_handler(CommandHandler('decoding', decoding))
    dp.add_handler(CommandHandler('setdecoding', set_decoding))
    # callback handlers. The alternatives are grouped so that ^ and $ apply to
    # every one of them; without the group, "^en|eu$" also accepts any data
    # that merely starts with "en".
    dp.add_handler(CallbackQueryHandler(decoding_callback, pattern="^(top1|topk|multinomial)$"))
    dp.add_handler(CallbackQueryHandler(language_callback, pattern="^(en|eu)$"))
    # on non-command text messages
    dp.add_handler(MessageHandler(Filters.text & (~Filters.command), answer))
    # on unknown commands (registered last, after the specific command handlers)
    dp.add_handler(MessageHandler(Filters.command, unknown))

    # log all errors
    dp.add_error_handler(error)

    # Start the Bot
    updater.start_polling()


    # Run the bot until you press Ctrl-C or the process receives SIGINT,
    # SIGTERM or SIGABRT. This should be used most of the time, since
    # start_polling() is non-blocking and will stop the bot gracefully.
    updater.idle()


# Running this cell starts the bot. main() ends with updater.idle(), so the
# cell keeps running until the kernel is interrupted.
if __name__ == '__main__':
    main()

2021-05-06 14:43:48,545 - apscheduler.scheduler - INFO - Scheduler started
2021-05-06 15:16:03,258 - telegram.ext.updater - INFO - Received signal 2 (SIGINT), stopping...
2021-05-06 15:16:03,266 - apscheduler.scheduler - INFO - Scheduler has been shut down
