In [55]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
import pandas as pd
import math
import numpy as np
import re
import torch
import shutil

from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from google.colab import files


# keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization, Activation
from tensorflow.keras.losses import MeanSquaredError
from keras.callbacks import EarlyStopping

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder

# **1. Dataset**

In [57]:
ds = pd.read_csv("/content/drive/MyDrive/colab-notebooks/healthcare-customer-support-chatbot/dataset/domain_specific_chatbot_data.csv")
ds

Unnamed: 0,query,response,intent,domain
0,What are the side effects of the COVID-19 vacc...,Common side effects of the COVID-19 vaccine in...,side effects inquiry,healthcare
1,How can I schedule an appointment with my doctor?,You can schedule an appointment by calling our...,appointment booking,healthcare
2,What should I do if I miss a dose of my medica...,"If you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
3,How can I check my account balance?,You can check your balance by logging into you...,balance inquiry,finance
4,What is the interest rate for a personal loan?,The current interest rate for personal loans i...,loan inquiry,finance
...,...,...,...,...
2995,"I lost my credit card, what should I do?",Please contact our customer service immediatel...,lost card reporting,finance
2996,What are the symptoms of flu?,"Flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
2997,How do I update my contact details on my account?,"To update your contact details, log into your ...",contact update,finance
2998,Can I make changes to my loan repayment schedule?,Changes to your loan repayment schedule can be...,loan repayment adjustment,finance


In [58]:
ds.shape

(3000, 4)

In [59]:
ds["query"] # series 1D

Unnamed: 0,query
0,What are the side effects of the COVID-19 vacc...
1,How can I schedule an appointment with my doctor?
2,What should I do if I miss a dose of my medica...
3,How can I check my account balance?
4,What is the interest rate for a personal loan?
...,...
2995,"I lost my credit card, what should I do?"
2996,What are the symptoms of flu?
2997,How do I update my contact details on my account?
2998,Can I make changes to my loan repayment schedule?


In [60]:
ds[["query"]] # Dataframe 2D

Unnamed: 0,query
0,What are the side effects of the COVID-19 vacc...
1,How can I schedule an appointment with my doctor?
2,What should I do if I miss a dose of my medica...
3,How can I check my account balance?
4,What is the interest rate for a personal loan?
...,...
2995,"I lost my credit card, what should I do?"
2996,What are the symptoms of flu?
2997,How do I update my contact details on my account?
2998,Can I make changes to my loan repayment schedule?


In [61]:
ds.isnull().sum()   # check null

Unnamed: 0,0
query,0
response,0
intent,0
domain,0


In [62]:
ds.info() # check kiểu

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   query     3000 non-null   object
 1   response  3000 non-null   object
 2   intent    3000 non-null   object
 3   domain    3000 non-null   object
dtypes: object(4)
memory usage: 93.9+ KB


In [63]:
for col in ds.columns:
    print(ds[col].map(type).value_counts(), '\n')
    print("=====================================")

query
<class 'str'>    3000
Name: count, dtype: int64 

response
<class 'str'>    3000
Name: count, dtype: int64 

intent
<class 'str'>    3000
Name: count, dtype: int64 

domain
<class 'str'>    3000
Name: count, dtype: int64 



# **1.1. String-columns**

In [64]:
ds["intent"] = ds["intent"].map(lambda val: val.strip() if (type(val) == str) else val)  # bỏ khoảng trắng
ds["domain"] = ds["domain"].map(lambda val: val.strip() if (type(val) == str) else val)  # bỏ khoảng trắng


In [65]:
ds["intent"] = ds["intent"].apply(lambda val: str(val) if pd.notnull(val) else "") # chuyển thành string
ds["domain"] = ds["domain"].apply(lambda val: str(val) if pd.notnull(val) else "") # chuyển thành string


In [66]:
# mode
for col in ['intent', "domain"]:
  mode_value = ds[col].mode()[0]
  ds[col] = ds[col].fillna(mode_value)

In [67]:
def clean_text(text):
    text = re.sub(r'\r\n', ' ', text)       # "\n" thành " "
    text = re.sub(r'\s+', ' ', text)        # Khoảng trắng thành ""
    text = re.sub(r'<.*?>', '', text)       # Loại bỏ thẻ
    text = text.strip().lower()
    return text

for col in ['query', "response"]:
  ds[col] = ds[col].apply(clean_text)

# **1.2. Split**

In [68]:
train_ds, val_ds = train_test_split(ds, test_size=0.2, random_state=123)

# 2D
print("train_ds shape:", train_ds.shape)
print("val_ds shape:", val_ds.shape)

train_ds shape: (2400, 4)
val_ds shape: (600, 4)


In [69]:
train_ds

Unnamed: 0,query,response,intent,domain
497,how do i update my contact details on my account?,"to update your contact details, log into your ...",contact update,finance
21,how can i schedule an appointment with my doctor?,you can schedule an appointment by calling our...,appointment booking,healthcare
1710,what are the side effects of the covid-19 vacc...,common side effects of the covid-19 vaccine in...,side effects inquiry,healthcare
2323,how can i check my account balance?,you can check your balance by logging into you...,balance inquiry,finance
1516,what are the symptoms of flu?,"flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
...,...,...,...,...
1147,how do i update my contact details on my account?,"to update your contact details, log into your ...",contact update,finance
2154,what is the interest rate for a personal loan?,the current interest rate for personal loans i...,loan inquiry,finance
1766,what are the symptoms of flu?,"flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
1122,what should i do if i miss a dose of my medica...,"if you miss a dose, take it as soon as you rem...",medication inquiry,healthcare


In [70]:
train_ds = train_ds.reset_index(drop=True)
val_ds = val_ds.reset_index(drop=True)

In [71]:
train_ds

Unnamed: 0,query,response,intent,domain
0,how do i update my contact details on my account?,"to update your contact details, log into your ...",contact update,finance
1,how can i schedule an appointment with my doctor?,you can schedule an appointment by calling our...,appointment booking,healthcare
2,what are the side effects of the covid-19 vacc...,common side effects of the covid-19 vaccine in...,side effects inquiry,healthcare
3,how can i check my account balance?,you can check your balance by logging into you...,balance inquiry,finance
4,what are the symptoms of flu?,"flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
...,...,...,...,...
2395,how do i update my contact details on my account?,"to update your contact details, log into your ...",contact update,finance
2396,what is the interest rate for a personal loan?,the current interest rate for personal loans i...,loan inquiry,finance
2397,what are the symptoms of flu?,"flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
2398,what should i do if i miss a dose of my medica...,"if you miss a dose, take it as soon as you rem...",medication inquiry,healthcare


# **1.3. Tokenization**

In [72]:
type(train_ds)

In [73]:
train_ds.head(1)

Unnamed: 0,query,response,intent,domain
0,how do i update my contact details on my account?,"to update your contact details, log into your ...",contact update,finance


In [74]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

def preprocess_function(ds):
    # Tokenize input (query)
    model_inputs = tokenizer(
        ds["query"],
        padding="max_length",
        truncation=True,
        max_length=250
    )

    # T5 yêu cầu set chế độ target tokenizer khi tokenize đầu ra
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            ds["response"],
            padding="max_length",
            truncation=True,
            max_length=250
        )

    # Gán input_ids của response làm "labels" để huấn luyện
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

train_ds = train_ds.apply(preprocess_function, axis=1)   # axis = 1: theo dòng
val_ds = val_ds.apply(preprocess_function, axis=1)



In [75]:
type(train_ds)

In [76]:
# mảng 2D
train_ds = np.array(train_ds)
val_ds = np.array(val_ds)

train_ds[0]

{'input_ids': [149, 103, 3, 23, 2270, 82, 574, 1030, 30, 82, 905, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

# **2. Model**

In [77]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    eval_strategy="epoch",
    eval_steps=50,
    save_steps=500,
    report_to="none",  # Thêm dòng này để tắt báo cáo W&B
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,0.2734,0.186192
2,0.0252,0.005858
3,0.0073,0.000719
4,0.0039,0.00022
5,0.0027,0.000123
6,0.0024,0.000103


TrainOutput(global_step=1800, training_loss=0.7882537937578228, metrics={'train_runtime': 617.9074, 'train_samples_per_second': 23.304, 'train_steps_per_second': 2.913, 'total_flos': 951622041600000.0, 'train_loss': 0.7882537937578228, 'epoch': 6.0})

# **3. Đánh giá**

# **4. Save**

In [78]:
# save
model.save_pretrained("/content/drive/MyDrive/colab-notebooks/healthcare-customer-support-chatbot/model")
tokenizer.save_pretrained("/content/drive/MyDrive/colab-notebooks/healthcare-customer-support-chatbot/model")

('/content/drive/MyDrive/colab-notebooks/healthcare-customer-support-chatbot/model/tokenizer_config.json',
 '/content/drive/MyDrive/colab-notebooks/healthcare-customer-support-chatbot/model/special_tokens_map.json',
 '/content/drive/MyDrive/colab-notebooks/healthcare-customer-support-chatbot/model/spiece.model',
 '/content/drive/MyDrive/colab-notebooks/healthcare-customer-support-chatbot/model/added_tokens.json')

In [81]:
# load
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/colab-notebooks/healthcare-customer-support-chatbot/model")
tokenizer = T5Tokenizer.from_pretrained("/content/drive/MyDrive/colab-notebooks/healthcare-customer-support-chatbot/model")

# **5. Chatbot System**

In [82]:
device = model.device
model.eval()  # chuyển sang inference mode

def generate_response(query):
    query = clean_text(query)

    inputs = tokenizer(
        query,
        padding="max_length",
        truncation=True,
        max_length=250,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=250,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            early_stopping=True
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


In [83]:
while True:
    user_input = input("You: ")
    if user_input.lower() == 'quit':
        break
    response = generate_response(user_input)
    print("Bot:", response)

You: how can I schedule an appointment with my doctor?


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Bot: you can schedule an appointment by calling our office or using our online portal.


KeyboardInterrupt: Interrupted by user

In [84]:
shutil.make_archive("/content/drive/MyDrive/colab-notebooks/healthcare-customer-support-chatbot/model", 'zip', "/content/drive/MyDrive/colab-notebooks/healthcare-customer-support-chatbot/model")
files.download("/content/drive/MyDrive/colab-notebooks/healthcare-customer-support-chatbot/model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>