In [1]:
# Libraries
import tensorflow
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, ConcatDataset
from transformers import AutoTokenizer

import pandas as pd
import datetime
import time
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sn
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import pickle
from itertools import product

from ABSA_SentimentMultiEmiten.model.bert import bert_ABSA
from ABSA_SentimentMultiEmiten.data.dataset import dataset_ABSA

2023-06-21 22:08:13.528996: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
# Select the device used for all computation.
# The notebook targets the second GPU ("cuda:1"); fall back to the first GPU
# when only one is visible, and to the CPU when CUDA is not available at all.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    DEVICE = torch.device("cpu")

print("Tensorflow\t: ", tensorflow.__version__)
print("Torch\t\t: ", torch.__version__)
print("Device\t\t: ", DEVICE)
# get_device_name() raises without CUDA, so only query it for a GPU device;
# DEVICE is passed explicitly so the reported name matches the device in use.
print("GPU\t\t: ", torch.cuda.get_device_name(DEVICE) if DEVICE.type == "cuda" else "-")
print("CUDA\t\t: ", torch.version.cuda)

Tensorflow	:  2.4.1
Torch		:  1.1.0
Device		:  cuda:1
GPU		:  Tesla T4
CUDA		:  9.0.176


In [3]:
# Initialise the pretrained IndoBERT checkpoint, its tokenizer and the ABSA model
pretrained_model_name = "indolem/indobert-base-uncased"
# Base name used when building checkpoint file names later in the notebook
model_name = "indolem-indobert-gs"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
# Build the ABSA model from the pretrained checkpoint and move it to DEVICE
model_ABSA = bert_ABSA(pretrained_model_name)
model_ABSA.to(DEVICE)

KeyboardInterrupt: 

In [None]:
# Convert a duration in seconds into hours, minutes and seconds
def evl_time(t):
    """Split a duration given in seconds into whole hours, minutes and seconds.

    Args:
        t: Duration in seconds (int or float).

    Returns:
        Tuple ``(hours, minutes, seconds)``; each component is truncated
        with ``int``.
    """
    total_minutes, seconds = divmod(t, 60)
    hours, minutes = divmod(total_minutes, 60)
    return int(hours), int(minutes), int(seconds)

# Save a model's weights
def save_model(model, name):
    """Serialise the state dict of ``model`` to ``name`` with ``torch.save``.

    Args:
        model: Object exposing ``state_dict()``.
        name: Destination handed to ``torch.save``.
    """
    weights = model.state_dict()
    torch.save(weights, name)

# Load saved weights into a model
def load_model(model, path, map_location=None):
    """Load a saved state dict from ``path`` into ``model`` and return the model.

    Args:
        model: Module whose ``load_state_dict`` receives the loaded weights.
        path: Source handed to ``torch.load``.
        map_location: Where to map the stored tensors. Defaults to the
            notebook-wide ``DEVICE`` so the checkpoint also loads on hosts
            without a second GPU (or without CUDA at all).

    Returns:
        The same ``model`` instance, with the weights loaded.
    """
    if map_location is None:
        map_location = DEVICE
    # strict=False: checkpoint keys that do not match the model are not treated
    # as an error, so a mismatching checkpoint loads without raising.
    model.load_state_dict(torch.load(path, map_location=map_location), strict=False)
    return model

In [None]:
# Collate a list of dataset samples into one padded mini-batch
def create_mini_batch(samples):
    """Collate samples into padded batch tensors (used as ``collate_fn``).

    Each sample is indexed positionally: item 1 holds the token ids, item 2
    the segment ids and item 3 the label tensor.

    Args:
        samples: Sequence of indexable samples as described above.

    Returns:
        Tuple ``(ids_tensors, segments_tensors, masks_tensors, label_ids)``.
        The id and segment tensors are padded batch-first; the mask is a
        ``torch.long`` tensor holding 1 wherever the padded token id is
        non-zero and 0 elsewhere.
    """
    def _pad_field(index):
        # Gather one field from every sample and pad it to a common length
        return pad_sequence([sample[index] for sample in samples], batch_first=True)

    token_ids = _pad_field(1)
    segment_ids = _pad_field(2)
    labels = torch.stack([sample[3] for sample in samples])

    # Any position whose token id is 0 is treated as padding
    attention_mask = torch.zeros(token_ids.shape, dtype=torch.long).masked_fill(token_ids != 0, 1)

    return token_ids, segment_ids, attention_mask, labels

In [None]:
# Train the ABSA model
def train_model_ABSA(loader, epochs, model_name):
    """Train the notebook-level ``model_ABSA`` with ``optimizer_ABSA``.

    Relies on the notebook globals ``model_ABSA``, ``optimizer_ABSA`` and
    ``DEVICE``. After every batch a progress line is printed with the running
    mean loss and an estimate of the remaining time; after every epoch the
    model is saved to ``model_name``, overwriting the previous save.

    Args:
        loader: Iterable with ``len()`` yielding
            ``(ids_tensors, segments_tensors, masks_tensors, label_ids)``.
        epochs: Number of passes over ``loader``.
        model_name: Destination passed to ``save_model``.

    Returns:
        ``{'loss': [...]}`` holding the mean batch loss of each epoch.
    """
    epoch_mean_losses = []
    batches_per_epoch = len(loader)

    for epoch_index in range(epochs):
        batch_losses = []
        batch_durations = []

        for batch_number, batch in enumerate(loader, start=1):
            started = time.time()
            ids_tensors, segments_tensors, masks_tensors, label_ids = (
                tensor.to(DEVICE) for tensor in batch
            )

            # The model returns the loss when labels are supplied
            # ("lable_tensors" is the keyword spelling the model is called with).
            loss = model_ABSA(ids_tensors=ids_tensors, lable_tensors=label_ids, masks_tensors=masks_tensors, segments_tensors=segments_tensors)
            batch_losses.append(loss.item())
            loss.backward()
            optimizer_ABSA.step()
            optimizer_ABSA.zero_grad()

            # Remaining time = mean batch duration * number of batches still to run
            batch_durations.append(round(time.time() - started, 3))
            mean_duration = np.mean(batch_durations)
            left_this_epoch = mean_duration * (batches_per_epoch - batch_number)
            left_later_epochs = mean_duration * batches_per_epoch * (epochs - epoch_index - 1)
            hours, minutes, seconds = evl_time(left_this_epoch + left_later_epochs)
            print('epoch:', epoch_index+1, "/" , epochs," batch:", batch_number, "/" , batches_per_epoch, " loss:", np.mean(batch_losses), " hr:", hours, " min:", minutes," sec:", seconds)

        epoch_mean_losses.append(np.mean(batch_losses))
        save_model(model_ABSA, model_name)

    return {'loss': epoch_mean_losses}

# Function untuk pengujian model
def test_model_ABSA(loader):
    pred = []
    truth = []
    
    with torch.no_grad():
        # Pengulangan setiap mini-batch
        for data in loader:
            ids_tensors, segments_tensors, masks_tensors, label_ids = data
            ids_tensors = ids_tensors.to(DEVICE)
            segments_tensors = segments_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            outputs = model_ABSA(ids_tensors, None, masks_tensors=masks_tensors, segments_tensors=segments_tensors)
            
            _, predictions = torch.max(outputs, dim=1)

            pred += list([int(i) for i in predictions])
            truth += list([int(i) for i in label_ids])

    return truth, pred

In [None]:
def record_time(start_time, end_time):
    """Return the elapsed time between two timestamps as a string.

    Only the difference matters, so the timestamps are not converted to
    calendar dates here.

    Args:
        start_time: Start timestamp in seconds (e.g. from ``time.time()``).
        end_time: End timestamp in seconds.

    Returns:
        ``str(datetime.timedelta(seconds=end_time - start_time))``,
        e.g. ``"1:01:01"``.
    """
    return str(datetime.timedelta(seconds=end_time - start_time))

# Compute the span between two timestamps and write it to a file
def record_time_to_file(start_time, end_time, file):
    """Write the start time, end time and total duration to ``file``.

    The file is opened in write mode (existing content is replaced) and
    receives three lines without a trailing newline. Both timestamps are
    rendered in local time as ``dd/mm/yy HH:MM:SS``.

    Args:
        start_time: Start timestamp in seconds (e.g. from ``time.time()``).
        end_time: End timestamp in seconds.
        file: Path of the file to write.
    """
    total = datetime.timedelta(seconds=end_time - start_time)

    stamp_format = "%d/%m/%y %H:%M:%S"
    started_at = datetime.datetime.fromtimestamp(start_time).strftime(stamp_format)
    finished_at = datetime.datetime.fromtimestamp(end_time).strftime(stamp_format)

    with open(file, "w") as handle:
        handle.writelines((
            "Waktu mulai: " + started_at,
            "\nWaktu selesai: " + finished_at,
            "\nWaktu total: " + str(total),
        ))

In [None]:
# Build the training and test datasets from the experiment CSV files,
# passing the tokenizer to each dataset_ABSA instance
emiten_train_ds = dataset_ABSA(pd.read_csv("data_experiment/data_balance_experiment_training.csv"), tokenizer)
emiten_test_ds = dataset_ABSA(pd.read_csv("data_experiment/data_balance_experiment_testing.csv"), tokenizer)

# Wrap each dataset in a ConcatDataset (each currently has a single member)
train_ds = ConcatDataset([emiten_train_ds])
test_ds = ConcatDataset([emiten_test_ds])

In [None]:
# Hyperparameter search space (2 x 3 x 3 = 18 combinations)
bs_list = [16, 32]  # batch sizes
lr_list = [0.00002, 0.00003, 0.00005]  # learning rates
epoch_list = [5, 25, 50]  # numbers of training epochs

In [None]:
# Inisialisasi variabel untuk menyimpan hasil
path = "indolem-indobert-gs/"
indolem_result = []
j = 0

# Waktu mulai
start_time = time.time()

# Grid Search
for bs, lr, epoch in product(bs_list, lr_list, epoch_list):
    # Inisialisasi pretrained-model
    model_ABSA = bert_ABSA(pretrained_model_name)
    model_ABSA.to(DEVICE)
    
    print(f'Training with batch size={bs}, learning rate={lr}, epoch={epoch}')

    optimizer_ABSA = torch.optim.Adam(model_ABSA.parameters(), lr=lr)

    # Mengubah dataset menjadi mini-batch
    train_loader = DataLoader(train_ds, batch_size=bs, collate_fn=create_mini_batch, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=bs, collate_fn=create_mini_batch, shuffle=True)
    
    # Waktu mulai training model
    model_start = time.time()
    
    # Train model
    history = %time train_model_ABSA(train_loader, epoch, (path+model_name+'-'+str(j)+'.pkl'))
    model_end = time.time()
    
    # Classification report
    model_ABSA = load_model(model_ABSA, (path+model_name+'-'+str(j)+'.pkl'))
    x, y = test_model_ABSA(test_loader)
    report = classification_report(x, y, target_names=[str(i) for i in range(3)])
    
    # Simpan hasil ke dalam variabel
    result = {'bs': bs, 'lr': lr, 'epoch': epoch, 'history': history, 'report': report,
              'name': (model_name+'-'+str(j)+'.pkl'), 'time': record_time(model_start, model_end)}
    indolem_result.append(result)
    
    j+=1
    
# Waktu selesai
end_time = time.time()  
record_time_to_file(start_time, end_time, path+'waktu_indolem_total.txt')

# Simpan hasil ke dalam file
with open(path+'indolem_gs_result.pkl', 'wb') as f:
    pickle.dump(indolem_result, f)