## Requirements

In [1]:
# !pip install transformers==4.18.0
# !pip install sentencepiece==0.1.96
# !pip install sanic==22.3.1
# !pip install torch==1.11.0

# I. Load translation models and test

## 1. Facebook mbart large
multi-language MT model: Facebook mbart model https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt

- First load MT model and tokenizer

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model = MBartForConditionalGeneration.from_pretrained("models/mbart-large-50-many-to-many-mmt")

tokenizer = MBart50TokenizerFast.from_pretrained("models/mbart-large-50-many-to-many-mmt", src_lang="en_XX")


- Then use model and tokenizer for inference

In [None]:
model_inputs = tokenizer("Hello, how are you today?", return_tensors = "pt")
generated_tokens = model.generate(**model_inputs, forced_bos_token_id = tokenizer.lang_code_to_id ["vi_VN"])
translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
translation

## 2. Opus Machine translation En-Vi
Model for Eng-Viet translation from Helsinki-NLP https://huggingface.co/Helsinki-NLP/opus-mt-en-vi

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("models/opus-mt-en-vi")

# Initialize the model
model = AutoModelForSeq2SeqLM.from_pretrained("models/opus-mt-en-vi")

# Tokenize text
text = "everything is the same"
tokenized_text = tokenizer.prepare_seq2seq_batch([text], return_tensors='pt')

# Perform translation and decode the output
translation = model.generate(**tokenized_text)
translated_text = tokenizer.batch_decode(translation, skip_special_tokens=True)[0]

# Print translated text
print(translated_text)

  from .autonotebook import tqdm as notebook_tqdm
2022-06-26 11:02:54.563276: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-26 11:02:54.563395: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For

♪ mọi chuyện là như nhau ♪


## 3. NlpHUST Machine Translation
Model for Viet-Eng and Eng-Viet translation from NlpHUST

- Eng-Viet translation model: https://huggingface.co/Helsinki-NLP/opus-mt-en-vi

In [1]:
import torch

from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

t5_model = T5ForConditionalGeneration.from_pretrained("NlpHUST/t5-en-vi-small")
t5_tokenizer = T5Tokenizer.from_pretrained("NlpHUST/t5-en-vi-small")
t5_model.to(device)

srcs = ["In school , we spent a lot of time studying the history of Kim Il-Sung , but we never learned much about the outside world , except that America , South Korea , Japan are the enemies .",
       "I hope you're doing fine",
       "When the rain keeps falling down. Trouble keeps on coming round",
       "When the rain keeps falling down, trouble keeps on coming round",
       "If you're hiding what you feel. Underneath a bitter pill",
       "If you're hiding what you feel. underneath a bitter pill",
       "hello", "hi", "the robots",
       "the butterflies", "robots", "butterfly",
       "i'm my own boss", "i am my own boss",
       "i am ok",
       "hi, so sweet i can't resist your charm",
       "don't play with my heart. Before we let it fall too far",
       "don't play with my heart. before we let it fall too far",
       "It is a little chilly, but not too bad. how about where you are at?",
       "It is a little chilly, but not too bad. How about where you are at?",
       "It is a little chilly, but not too bad. how about Where you are at?".lower(),
       "hello. i'm doing just fine. how about you?",
       "hello, i'm doing just fine. How about you?",
       "i am right here. But why can't you see. Please scan more!",
       "i am right here. but why can't you see. Please scan more!",
       "One room just ain't enough. But it's hard to leave you alone",
       "One room just ain't enough. but it's hard to leave you alone",
       "One room just ain't enough. but it's hard to leave you alone".lower(),
       "That I know that i'm still free. Be anywhere that I wanna be",
       "That I know that i'm still free. be anywhere that I wanna be",
       "That I know that i'm still free. be anywhere that I wanna be".lower(),
       "Distance makes the heart grows. Even when I'm lonely, happy knowing that your love is never far. When we are apart",
       "Distance makes the heart grows. Even when I'm lonely, happy knowing that your love is never far. when we are apart",
       "Distance makes the heart grows. Even when I'm lonely, happy knowing that your love is never far. when we are apart".lower(),
       "When you love. It makes a heart. In the middle of a fight. Walk away to make it right",
       "When you love, It makes a heart. In the middle of a fight; Walk away to make it right",
       "When you love, It makes a heart. In the middle of a fight; Walk away to make it right".lower(),]
for src in srcs:
    tokenized_text = t5_tokenizer.encode(src, return_tensors="pt").to(device)
    t5_model.eval()
    summary_ids = t5_model.generate(
                        tokenized_text,
                        max_length=128, 
                        num_beams=5,
                        repetition_penalty=2.5, 
                        length_penalty=1.0, 
                        early_stopping=True)
    output = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print(f"{src} \n--->\t {output}\n\n")

  from .autonotebook import tqdm as notebook_tqdm
2022-06-30 10:06:43.867673: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-30 10:06:43.867714: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


No GPU available, using the CPU instead.
In school , we spent a lot of time studying the history of Kim Il-Sung , but we never learned much about the outside world , except that America , South Korea , Japan are the enemies . 
--->	 Ở trường, chúng tôi dành nhiều thời gian để nghiên cứu về lịch sử Kim Il-Sung, nhưng chúng tôi chưa bao giờ học được nhiều về thế giới bên ngoài, ngoại trừ Mỹ, Hàn Quốc, Nhật Bản là kẻ thù.


I hope you're doing fine 
--->	 Tôi hi vọng anh đang làm tốt.


When the rain keeps falling down. Trouble keeps on coming round 
--->	 Khi mưa tiếp tục rơi xuống.


When the rain keeps falling down, trouble keeps on coming round 
--->	 Khi mưa tiếp tục rơi xuống, rắc rối tiếp tục đến.


If you're hiding what you feel. Underneath a bitter pill 
--->	 Nếu anh đang che giấu những gì anh cảm thấy.


If you're hiding what you feel. underneath a bitter pill 
--->	 Nếu cô đang che giấu những gì cô cảm thấy. dưới một viên thuốc cay


hello 
--->	 Xin chào.


hi 
--->	 Xin chào

- Viet-Eng translation model: https://huggingface.co/Helsinki-NLP/opus-mt-vi-en

In [2]:
t5_model_vi = T5ForConditionalGeneration.from_pretrained("NlpHUST/t5-vi-en-small")
t5_tokenizer_vi = T5Tokenizer.from_pretrained("NlpHUST/t5-vi-en-small")
t5_model_vi.to(device)

srcs = ["xin chào ngày mới. thời tiết ở đó thế nào?",
       "Indonesia phỏng đoán nguyên nhân tàu ngầm chở 53 người mất tích bí ẩn",
       "bạn khỏe không? Rất vui được gặp mặt!",
       "bạn có khỏe không? rất vui được gặp lại bạn!",
       "bạn khỏe không? rất vui được gặp mặt!",
       "bạn khỏe không? Rất vui được gặp mặt!"]
for src in srcs:
    tokenized_text = t5_tokenizer_vi.encode(src, return_tensors="pt").to(device)
    t5_model_vi.eval()
    summary_ids = t5_model_vi.generate(
                        tokenized_text,
                        max_length=256, 
                        num_beams=5,
                        repetition_penalty=2.5, 
                        length_penalty=1.0, 
                        early_stopping=True
                    )
    output = t5_tokenizer_vi.decode(summary_ids[0], skip_special_tokens=True)
    print(src, " ----> \t", output)

Downloading: 100%|██████████| 607/607 [00:00<00:00, 301kB/s]
Downloading: 100%|██████████| 1.12G/1.12G [06:05<00:00, 3.29MB/s]   
Downloading: 100%|██████████| 4.11M/4.11M [00:02<00:00, 1.68MB/s]
Downloading: 100%|██████████| 98.0/98.0 [00:00<00:00, 22.4kB/s]
Downloading: 100%|██████████| 81.0/81.0 [00:00<00:00, 23.8kB/s]


xin chào ngày mới. thời tiết ở đó thế nào?  ----> 	 Hello, new day. How's the weather there?
Indonesia phỏng đoán nguyên nhân tàu ngầm chở 53 người mất tích bí ẩn  ----> 	 Indonesia anticipates the cause of the submarine transporting 53 mysterious missing persons
bạn khỏe không? Rất vui được gặp mặt!  ----> 	 How are you?
bạn có khỏe không? rất vui được gặp lại bạn!  ----> 	 How are you? I'm so happy to see you again!
bạn khỏe không? rất vui được gặp mặt!  ----> 	 How are you? I'm so happy to meet you!
bạn khỏe không? Rất vui được gặp mặt!  ----> 	 How are you?


# II. Test Sanic Service
Following cells tests open-chat-bot-kit: en-vi vi-en services and open-chatbot service
- translate services, en-vi vi-en:  port 8000
- open-chatbot service listens:     port 8001

In [None]:
# deploy sanic service on local machine by executing:
## python -m sanic <app file name, e.g app.py then use 'app'>:<sanic app defined in python file> -H 0.0.0.0 -p <listening port>

- 3 service blocks tested separately:

In [4]:
# VIETNAMESE-TO-ENGLISH TRANSLATION SERVICE
import requests
import json

url_obj = {'text': "xin chào, hôm nay thời tiết như thế nào?"}

stt_server = 'http://0.0.0.0:8000/vi-to-en'

text = requests.post(stt_server, data=json.dumps(url_obj))  # parse the dictionary to json with json.dumps()
vi_en = text.text
print(vi_en)

Hello, how's the weather today?


In [4]:
# OPEN-DOMAIN-CHATBOT SERVICE

# pass vi_en from previous cell to open chat service
stt_server = 'http://0.0.0.0:8001/open-chat'
text_obj = {'text': vi_en}
text = requests.post(stt_server, data=json.dumps(text_obj))
bot_answer = text.text[9:-2]
print(bot_answer)

 It is a little chilly, but not too bad. How about where you are at?


In [5]:
# ENGLISH-TO-VIETNAMESE TRANSLATION SERVICE

url_obj = {'text': "i'm my own boss"}
stt_server = 'http://0.0.0.0:8000/en-to-vi'

text = requests.post(stt_server, data=json.dumps(url_obj))
en_vi = text.text
print(en_vi)

tôi là sếp của riêng tôi.


- 3 blocks tested in 1 flow (raw input fed into translation models)

In [14]:
payload = {'text': "xin chào một ngày tốt lành".lower()}


text = requests.post('http://0.0.0.0:8000/vi-to-en', data=json.dumps(payload))
vi_en = text.text

text_obj = {'text': vi_en.lower()}
text = requests.post('http://0.0.0.0:8001/open-chat', data=json.dumps(text_obj))
bot_answer = text.text[9:-2]

text_obj = {'text': bot_answer.lower()}
text = requests.post('http://0.0.0.0:8000/en-to-vi', data=json.dumps(text_obj))
en_vi = text.text

- 3 blocks tested in 1 flow: with preprocessed inputs (break inputs into partitions by punctuations) before fed into MT models for better translation quality

In [9]:
import requests
import json
import re

print("\t###EN-TO-VI")
msg = "cung hoàng đạo của bạn là gì?"
url_objs = []

# break original input by punctuations, then translate each partition, which gives better translation result
for i in re.split("([,\?\!\.])", msg.lower()):
    if len(i) > 0:
        url_objs.append({'text': i})

stt_server = 'http://0.0.0.0:8000/vi-to-en'
vi_en = ""
# translate each partition
for i, url_obj in enumerate(url_objs):
    if url_obj["text"] not in [",", "?", "!", "."]:
        text = requests.post(stt_server, data=json.dumps(url_obj))
        vi_en += text.text
    else:
        vi_en += url_obj["text"] + " "
print(vi_en)


print("\t###BLENDER-BOT")
stt_server = 'http://0.0.0.0:8001/open-chat'
url_obj = {'text': re.sub(r"[\?\!\.]", ",", vi_en.lower())}

text = requests.post(stt_server, data=json.dumps(url_obj))
bot_answer = text.text[9:-2]
print(bot_answer)


print("\t###EN-TO-VI")
url_objs = []
# break original input by punctuations, then translate each partition, which gives better translation result
for i in re.split("([,\?\!\.])", bot_answer.lower()):
    if len(i) > 0:
        url_objs.append({'text': i})
print(url_objs)

stt_server = 'http://0.0.0.0:8000/en-to-vi'
en_vi = ""
# translate each partition
for i, url_obj in enumerate(url_objs):
    if url_obj["text"] not in [",", "?", "!", "."]:
        text = requests.post(stt_server, data=json.dumps(url_obj))
        en_vi += text.text
    else:
        en_vi += url_obj["text"] + " "
print(en_vi)

	###EN-TO-VI
[{'text': 'cung hoàng đạo của bạn là gì'}, {'text': '?'}]
What is your horoscope?? 
	###BLENDER-BOT
i don't have a horoscope. i'm not sure what it is.
	###EN-TO-VI
[{'text': "i don't have a horoscope"}, {'text': '.'}, {'text': " i'm not sure what it is"}, {'text': '.'}]
tôi không có tử vi. tôi không chắc nó là gì. 


# III. Test Rasa chatbot eqquiped with Open-domain-kit via REST

The 3 services of open-domain-kit are deployed along with Rasa server (in docker-compose), and will be summoned in action server (handled via out_of_scope intents)

In [26]:
import json
import requests

payload = {"sender": "test", "message": "dự báo thời tiết hôm nay"}
# open-domain-kit integrated into Rasa at 
text = requests.post('http://localhost:5005/webhooks/rest/webhook', data=json.dumps(payload))
vi_en = text.text
print(json.loads(vi_en))

[{'recipient_id': 'test', 'text': 'Lúc nào cũng vui khi có cơ hội ngắm nhìn thời tiết tốt nhất thế giới.'}]
