<a href="https://colab.research.google.com/github/irhafidz/2024chatbot_halaltourism_WestSumatra/blob/main/chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install and Import Libraries**

In [None]:
# For Google Colab
!pip install torch
!pip install accelerate
!pip install torchinfo
!pip install Sastrawi

In [None]:
import json
import random
import re
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from nltk import download
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from scipy import stats
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torchinfo import summary
from tqdm import tqdm
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
from transformers import Trainer
from transformers import TrainingArguments

In [None]:
# Download the NLTK stopword corpus (read later via stopwords.words('indonesian'))
# and register tqdm with pandas so that `.progress_apply` is available.
download('stopwords')
tqdm.pandas()

# **Prepare Dataset**

## **Load Dataset**

In [None]:
# Load the labelled chat dataset; later cells read its 'text' and 'labelx' columns.
df = pd.read_csv('data/chatlist.csv')

In [None]:
# Show column names, dtypes and non-null counts of the loaded dataset.
df.info()

In [None]:
# Count how many question variations exist for each label.
# The Series repr of value_counts() is interpolated directly into the message.
print(f"Jumlah variasi pertanyaan per {df['labelx'].value_counts()}")

In [None]:
# Show the label distribution as relative frequencies (normalize=True).
print(f"Distribusi {df['labelx'].value_counts(normalize = True)}")

In [None]:
# Collect every unique label; np.unique returns them sorted.
# len(categories) is used later as the number of output classes of the classifier.
categories = np.unique(list(df['labelx']))
categories

## **Cleaning Dataset**

In [None]:
# Map every textual label in 'labelx' to an integer id stored in a new 'label' column
le = LabelEncoder()
df['label'] = le.fit_transform(df['labelx'])

# List the resulting id -> label mapping
print('Hasil label encoding:')
for class_id, class_name in enumerate(le.classes_):
  print(f"{class_id} = {class_name}")

In [None]:
# Module-level resources used by prepare_question:
#   stemmer - Sastrawi stemmer instance created through StemmerFactory
#   stop    - list of Indonesian stopwords from the NLTK corpus
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop = stopwords.words('indonesian')

def prepare_question(text: str) -> str:
  """Normalize a raw question into the cleaned, stemmed form used by the classifier.

  Processing order: lowercase, drop URLs, neutralize literal escape sequences,
  replace non-ASCII characters, turn punctuation into spaces, drop stopwords,
  and finally stem the remaining words.

  Args:
    text: Raw question text.

  Returns:
    The result of ``stemmer.stem`` applied to the cleaned, stopword-free text.
  """
  text = text.lower()

  # URLs are removed first, while their dots are still intact. Previously a space
  # was inserted after every period before the URL pattern ran, so no URL could
  # ever match. A scheme or "www." prefix is required so that ordinary text with
  # a missing space after a period (e.g. "wisata.apa") is not mistaken for a URL.
  text = re.sub(r'\b(?:https?://|www\.)\S+', ' ', text)

  # Replace literal (backslash-escaped) \t, \n and \u sequences found in the text
  text = text.replace('\\t', ' ').replace('\\n', ' ').replace('\\u', ' ')

  # Non-ASCII characters (emoji, CJK, ...) become '?', which the next step turns into a space
  text = text.encode('ascii', 'replace').decode('ascii')

  # Turn every punctuation character into a space. This also covers commas and
  # periods, so no separate "add a space after , or ." step is needed.
  text = re.sub(r'[\!\"\#\$\%\&\'\(\)\*\+\,\.\-\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]', ' ', text)

  # split() discards leading/trailing and repeated whitespace, so no explicit
  # strip or whitespace-collapsing pass is required before stopword removal.
  kept_words = [word for word in text.split() if word not in stop]
  return stemmer.stem(' '.join(kept_words))

In [None]:
# Shuffle all rows reproducibly, then attach the cleaned question as a 'stem' column
df_shuffle = df.sample(frac=1, random_state=42)
df_shuffle = df_shuffle.assign(
    stem=df_shuffle['text'].progress_apply(prepare_question)
)
df_shuffle.head()

## **Split Training and Validation Set**

In [None]:
# Inputs are the cleaned questions; targets are the integer-encoded labels
dataset_text = list(df_shuffle['stem'])
dataset_labels = list(df_shuffle['label'])

# Hold out 20% of the rows as a validation set; the remaining 80% are used for training
train_texts, val_texts, train_labels, val_labels = train_test_split(
    dataset_text,
    dataset_labels,
    test_size=0.2,
    random_state=42,
)

n_train, n_val = len(train_texts), len(val_texts)
print(f"Data training   : {n_train}")
print(f"Data validation : {n_val}")
print(f"Total data      : {n_train + n_val}")

In [None]:
# Save the shuffled dataset (including the 'label' and 'stem' columns) without the index column
df_shuffle.to_csv('data/clean.csv', index=False)

# **Retraining BERT Model**

In [None]:
# Name of the pre-trained checkpoint used for both the tokenizer and the classifier
model_name = 'bert-base-multilingual-uncased'

In [None]:
# Check whether a GPU is available; fall back to the CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running code akan menggunakan \"{device}\"")

In [None]:
# Load the BERT tokenizer that matches the pre-trained checkpoint
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize the input texts. The training and validation sets are encoded in
# separate calls, each with padding and truncation enabled.
train_encodings = tokenizer(train_texts, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, padding=True, truncation=True)

# Dataset wrapper that pairs tokenizer output with integer labels
class PyTorchDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings and their labels.

    Each item is a dict holding one tensor per encoding field plus a
    'labels' tensor for the sample at the requested index.
    """

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-sample sequences
        # labels: per-sample labels, indexed in the same order as the encodings
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encodings and labels as PyTorch datasets for the Trainer
train_dataset = PyTorchDataset(train_encodings, train_labels)
val_dataset = PyTorchDataset(val_encodings, val_labels)

## **Fine-Tune Pre-Trained Model**

In [None]:
# Fine-tune the pre-trained BERT model for intent classification.
# The class names from the label encoder are written into the model config
# (id2label / label2id) so a saved checkpoint records which class index
# corresponds to which intent.
label_names = [str(name) for name in le.classes_]
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# No evaluation schedule is configured above, so without this call the
# validation set handed to the Trainer would never be used. The returned
# metrics dict is displayed as the cell output.
trainer.evaluate()

## **Save Fine-Tuned Model**

In [None]:
# Save the fine-tuned model to a local directory.
# Only the model is saved here; the tokenizer is loaded again by name in the chatbot section.
output_dir = "./bert-base-multilingual-uncased-halal-tourism"
model.save_pretrained(output_dir)
print ("Model ", output_dir, "telah disimpan ....")

# **Chatbot**

## **Load Tag-Answer**

In [None]:
# Load the tag -> answers list, stored as a JSON file, into a dictionary.
# The encoding is given explicitly so that non-ASCII characters in the responses
# are read correctly on platforms whose default encoding is not UTF-8.
with open('data/answer.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
data

In [None]:
# Extract the tag of every intent and keep the sorted unique tags as `categories`.
# get_prediction indexes `categories` with the predicted class index.
# NOTE(review): this assumes the sorted tags line up with the label ids used
# during fine-tuning — confirm against the training data.
tags = [intent["tag"] for intent in data["intents"]]
categories = np.unique(tags)
for category in categories:
  print(category)

## **Load BERT Model**

In [None]:
# Load the tokenizer from the base checkpoint name, then point `model_name` at the
# local fine-tuned directory and load the classifier from there.
# NOTE(review): presumably len(categories) must equal the number of classes the
# checkpoint was fine-tuned with — confirm.
model_name = 'bert-base-multilingual-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model_name = "./bert-base-multilingual-uncased-halal-tourism"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(categories))

In [None]:
# Check whether a GPU is available; fall back to the CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running code akan menggunakan \"{device}\"")

# Move the model to the selected device
model = model.to(device)

# Display a torchinfo summary of the loaded model
summary(model)

## **Predict Intent**

In [None]:
def get_prediction(str):
  """Predict the intent label for a single message.

  Every character other than ASCII letters and spaces is removed before the
  text is tokenized and passed through the model.

  Args:
    str: Message text. The name shadows the built-in and is kept as-is for
      compatibility with existing callers.

  Returns:
    The entry of ``categories`` at the index of the highest-probability class.
  """
  letters_only = re.sub(r'[^a-zA-Z ]+', '', str)
  encoded = tokenizer(letters_only, padding=True, truncation=True, return_tensors='pt').to(device)

  # Inference only: no gradients are needed
  with torch.no_grad():
    logits = model(**encoded).logits

  probabilities = torch.softmax(logits, dim=-1)
  best_index = torch.argmax(probabilities, dim=-1).item()
  return categories[best_index]

def get_response(message):
  """Predict the intent of a message and pick one of its configured responses.

  Args:
    message: Message text passed on to ``get_prediction``.

  Returns:
    A tuple ``(text, interval, intent)`` where ``text`` is the formatted
    "Intent: ...\\nResponse: ..." string, ``interval`` is the elapsed time in
    seconds, and ``intent`` is the predicted tag.

  Raises:
    LookupError: If no intent in ``data['intents']`` carries the predicted tag,
      or that intent has no responses. Previously the first case surfaced as an
      UnboundLocalError on ``result``.
  """
  # perf_counter is a monotonic, high-resolution clock intended for measuring short durations
  start = time.perf_counter()
  intent = get_prediction(message)

  # Responses of the first intent whose tag matches the prediction (None when nothing matches)
  responses = next(
    (item["responses"] for item in data['intents'] if item["tag"] == intent),
    None,
  )
  if not responses:
    raise LookupError(f"No responses configured for intent {intent!r}")

  result = random.choice(responses)
  interval = time.perf_counter() - start
  return "Intent: "+ intent + '\n' + "Response: " + result, interval, intent

In [None]:
# Load the held-out test questions
df_test = pd.read_csv('data/testing.csv')

# Report the columns, the number of rows and the per-label counts
test_columns = df_test.columns.to_list()
print(f"Ada {len(test_columns)} kolom, yaitu {test_columns}")
print(f"Ada {len(df_test.index)} untuk testing\n")
print(df_test['labelx'].value_counts(), "\n")

# Clean the questions the same way as the training data, then shuffle reproducibly.
# The shuffle keeps the original index labels, so the index is no longer in positional order.
df_test['stem'] = df_test['text'].apply(prepare_question)
df_test = df_test.sample(frac=1, random_state=42)
df_test.head()

In [None]:
# Run every test question through the chatbot, recording the predicted intent
# and the response time in the same order as the rows of df_test
intent_list = []
time_list = []

for question, stemmed in zip(df_test['text'], df_test['stem']):
  answer, interval, intent = get_response(stemmed)
  intent_list.append(intent)
  time_list.append(interval)
  print(f"Question: {question}")
  print(answer)
  print()

print("Selesai...")

## **Evaluate**

In [None]:
# Print each response time next to the question it was measured for.
# time_list follows the row order of the shuffled df_test, so the questions are
# paired positionally; looking them up with .loc[i] would use the original index
# labels and attach each time to a different question.
for elapsed, question in zip(time_list, df_test['text']):
  print(f"{round(elapsed, 5)} detik => {question}")

# Fastest, slowest and mean response time
print()
print(f"Waktu tercepat = {min(time_list)}")
print(f"Waktu terlama  = {max(time_list)}")
print(f"Rata-rata      = {np.mean(time_list)} detik")

In [None]:
# Per-class classification report: true labels ('labelx') vs. predicted intents.
# Both sequences follow the row order of the shuffled df_test.
print(classification_report(df_test.loc[:, 'labelx'], intent_list))

In [None]:
# Confusion matrix of true labels ('labelx') vs. predicted intents
print(confusion_matrix(df_test.loc[:, 'labelx'], intent_list))