<a href="https://colab.research.google.com/github/khavitidala/vladilena-milize/blob/main/multi-class-classification/multi_class_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multiclass Classification Using BERT

## Install Transformers Library and Download Helper Code

In [None]:
# Install the Hugging Face Transformers library into the notebook runtime.
!pip install transformers
# Download the helper module (utils.py) that the next cell imports from.
!wget https://raw.githubusercontent.com/khavitidala/vladilena-milize/main/multi-class-classification/utils.py

## Load Helper Code and Mount Drive

In [None]:
# Star-import the helper module downloaded above. No other import appears in
# this notebook, so later cells presumably rely on it for names such as
# pd, re, os, shutil, torch, optim and F as well as the training helpers
# (NOTE(review): confirm these are all exposed by utils.py).
from utils import *
from google.colab import drive
# Mount Google Drive; the dataset path and model directory configured below
# both live under /content/drive.
drive.mount('/content/drive')

## Data Configuration and Hyperparameter Setup

In [13]:
# Label vocabulary: class index -> label name, and the inverse lookup
# (label name -> class index) derived from it.
i2w = {0:'C1', 1:'C2', 2:'C3', 3:'C4', 4:'C5'}
w2i = {label: index for index, label in i2w.items()}

# All experiment settings live in one dictionary that is handed to the helpers.
args = {
    # --- Data ---
    'path_data': '/content/drive/MyDrive/PDT/dataset awal.xlsx',  # Excel workbook holding the splits

    # --- Model / experiment identity ---
    'num_labels': 5,
    'valid_criterion': 'F1',  # if F1 = avg. macro F1
    'model_checkpoint': 'indobenchmark/indobert-base-p2',
    'experiment_name': 'indobert_phase2_default',  # used as the last component of the output directory
    'model_dir': '/content/drive/MyDrive/PDT/model/save/',  # root directory for saved models
    'max_seq_len': 512,  # max number of tokens, passed to the data loaders
    'train_batch_size': 8,  # batch size for training
    'valid_batch_size': 8,  # batch size for validation (also used for the test loader)
    'lr': 4e-5,  # learning rate for Adam
    'dataset': 'dataset-awal',  # used as the middle component of the output directory

    # --- Task wiring (callables come from utils) ---
    'task': 'sequence_classification',
    'forward_fn': forward_sequence_classification,
    'metrics_fn': document_multiclass_metrics_fn,
    'k_fold': 1,
    'word_tokenizer_class': TweetTokenizer,

    # --- Optimisation ---
    'max_norm': 10.0,  # clipping gradient norm
    'n_epochs': 25,  # number of training epochs (previously 10)
    'num_layers': 12,  # number of layers
    'device': 'cuda',  # device (cuda or cpu)
    'fp16': '',  # set to O0, O1, O2 or O3 for fp16 training (see apex documentation); empty disables it
    'seed': 42,  # random seed
    'step_size': 1,  # passed to train() as step_size (presumably LR-scheduler step; confirm in utils)
    'early_stop': 12,  # passed to train() as early_stop (presumably patience, previously 3; confirm in utils)
    'gamma': 0.5,  # passed to train() as gamma (presumably LR decay factor; confirm in utils)

    # --- Flags ---
    'debug': True,  # debugging mode
    'force': True,  # allow reusing an existing experiment folder
    'no_special_token': True,  # not adding special token as the input
    'lower': True,  # lower case
}

## Load Data

In [14]:
def load_data(path_data, train_sheet='data_train', valid_sheet='data_test', test_sheet='data_test'):
  """Read the train/validation/test splits from an Excel workbook.

  Every sheet is processed the same way: rows containing missing values are
  dropped, spaces are removed from the ``label`` column, labels are mapped to
  integer class ids through the module-level ``w2i`` dictionary, and the two
  columns are renamed to ``text`` and ``label``.

  Previously the validation labels were built from the *train* frame and then
  overwritten with the *test* frame's labels, which only gave the right answer
  because validation and test happen to read the same sheet. Each split now
  derives its labels from its own rows.

  Args:
    path_data: Path to the Excel workbook.
    train_sheet: Sheet name of the training split.
    valid_sheet: Sheet name of the validation split. Defaults to the test
      sheet, matching the original behaviour of this notebook.
    test_sheet: Sheet name of the test split.

  Returns:
    Tuple ``(df_train, df_valid, df_test)`` of independent DataFrames with
    columns ``['text', 'label']``.

  Raises:
    KeyError: If a label (after removing spaces) is not a key of ``w2i``.
  """
  def _load_split(sheet_name):
    # Load one sheet and normalise it into a (text, label-id) frame.
    df = pd.read_excel(path_data, sheet_name=sheet_name)
    df = df.dropna().reset_index(drop=True)
    # Convert labels to integers, e.g. C1 becomes 0, C2 becomes 1, and so on.
    # Spaces are stripped first so that values such as 'C 1' still resolve.
    df['label'] = df['label'].apply(lambda x: w2i[re.sub(' +', '', x)])
    df.columns = ['text', 'label']
    return df

  return _load_split(train_sheet), _load_split(valid_sheet), _load_split(test_sheet)

In [15]:
# Load the three splits from the workbook configured in args['path_data'].
df_train, df_valid, df_test = load_data(args['path_data'])

## Load and Fine Tune the Pretrained Model

In [17]:
# Make sure cuda is deterministic
torch.backends.cudnn.deterministic = True

# Create the experiment directory: <model_dir>/<dataset>/<experiment_name>
model_dir = '{}/{}/{}'.format(args["model_dir"],args["dataset"],args['experiment_name'])
if not os.path.exists(model_dir):
    os.makedirs(model_dir, exist_ok=True)
elif args['force']:
    print(f'overwriting model directory `{model_dir}`')
else:
    raise Exception(f'model directory `{model_dir}` already exists, use force if you want to overwrite the folder')

# Set random seed
set_seed(args['seed'])  # Added here for reproducibility

# Collected per-run results; this notebook performs a single run (exp_id=0).
metrics_scores = []
result_dfs = []

# load model
model, tokenizer, vocab_path, config_path = load_model(args)

# Move the model to the GPU *before* creating the optimizer and before
# amp.initialize: PyTorch recommends constructing optimizers after the model
# is on its final device, and apex amp expects CUDA parameters.
if args['device'] == "cuda":
    model = model.cuda()

optimizer = optim.Adam(model.parameters(), lr=args['lr'])

if args['fp16']:
    from apex import amp  # Apex is only required if we use fp16 training
    model, optimizer = amp.initialize(model, optimizer, opt_level=args['fp16'])

print("=========== TRAINING PHASE ===========")

# NOTE(review): the training loader does not shuffle; if the rows of the
# training sheet are ordered (e.g. grouped by label) consider shuffle=True.
train_dataset = DocumentMultiClassDataset(df_train, tokenizer, lowercase=args["lower"], no_special_token=args['no_special_token'])
train_loader = DocumentMultiClassDataLoader(dataset=train_dataset, max_seq_len=args['max_seq_len'], batch_size=args['train_batch_size'], num_workers=2, shuffle=False)

valid_dataset = DocumentMultiClassDataset(df_valid, tokenizer, lowercase=args["lower"], no_special_token=args['no_special_token'])
valid_loader = DocumentMultiClassDataLoader(dataset=valid_dataset, max_seq_len=args['max_seq_len'], batch_size=args['valid_batch_size'], num_workers=2, shuffle=False)

# The test loader reuses the validation batch size.
test_dataset = DocumentMultiClassDataset(df_test, tokenizer, lowercase=args["lower"], no_special_token=args['no_special_token'])
test_loader = DocumentMultiClassDataLoader(dataset=test_dataset, max_seq_len=args['max_seq_len'], batch_size=args['valid_batch_size'], num_workers=2, shuffle=False)

# Train
train(args, model, train_loader=train_loader, valid_loader=valid_loader, optimizer=optimizer, forward_fn=args['forward_fn'], metrics_fn=args['metrics_fn'], valid_criterion=args['valid_criterion'], i2w=i2w, n_epochs=args['n_epochs'], evaluate_every=1, early_stop=args['early_stop'], step_size=args['step_size'], gamma=args['gamma'], model_dir=model_dir, exp_id=0)

# Save Meta: keep the vocabulary and model config next to the checkpoint
if vocab_path:
    shutil.copyfile(vocab_path, f'{model_dir}/vocab.txt')
if config_path:
    shutil.copyfile(config_path, f'{model_dir}/config.json')

# Load best model (presumably written by train() for exp_id=0; confirm in utils)
model.load_state_dict(torch.load(model_dir + "/best_model_0.th"))

# Evaluate
print("=========== EVALUATION PHASE ===========")
test_loss, test_metrics, test_hyp, test_label, test_seq = evaluate(args, model, data_loader=test_loader, forward_fn=args['forward_fn'], metrics_fn=args['metrics_fn'], i2w=i2w, is_test=True)

metrics_scores.append(test_metrics)
result_dfs.append(pd.DataFrame({
    'seq':test_seq, 
    'hyp': test_hyp, 
    'label': test_label
}))

result_df = pd.concat(result_dfs)
metric_df = pd.DataFrame.from_records(metrics_scores)

print('== Prediction Result ==')
print(result_df.head())
print()

print('== Model Performance ==')
print(metric_df.describe())

# Persist the predictions and the metric summary next to the model files
result_df.to_excel(model_dir + "/prediction_result.xlsx")
metric_df.describe().to_excel(model_dir + "/evaluation_result.xlsx")



## Inference

In [None]:
# Ask the user for one example sentence to classify.
text = input("Masukkan contoh data uji : ")
# Cap the input at args['max_seq_len'] tokens, the same limit passed to the
# data loaders, so an overly long sentence cannot exceed the model's input size.
# NOTE(review): the datasets were built with no_special_token=args['no_special_token'],
# while encode() adds special tokens by default; confirm in utils whether
# inference should pass add_special_tokens accordingly.
subwords = tokenizer.encode(text, truncation=True, max_length=args['max_seq_len'])
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: switch off dropout and gradient tracking explicitly instead
# of relying on the state left behind by the evaluation cell.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

## Reference

All of the code here is refactored from IndoNLU:

Bryan Wilie, Karissa Vincentio, Genta Indra Winata, Samuel Cahyawijaya, X. Li, Zhi Yuan Lim, S. Soleman, R. Mahendra, Pascale Fung, Syafri Bahar, & A. Purwarianti (2020). IndoNLU: Benchmark and Resources for Evaluating Indonesian Natural Language Understanding. In *Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing.*

