📌 **An updated version of this notebook is available in [this repository](https://github.com/jhj0517/document_classification)!**

🖋 **Author**: [jhj0517](https://github.com/jhj0517/document_classification/tree/master/notebook/document_classification.ipynb)

In [None]:
#@title #Check GPU
#@markdown To train the model, you should enable GPU
# Shell escape: runs `nvidia-smi` so its output can be used to check the GPU of this runtime.
!nvidia-smi

In [None]:
#@title #Installation
#@markdown This cell will install dependencies for training
!git clone https://github.com/jhj0517/document_classification.git
%cd document_classification
!pip install -r requirements.txt

In [None]:
#@title #Mount your Gdrive
#@markdown The model file should be saved in your Gdrive path.
from google.colab import drive

# Directory under which Google Drive is mounted; the default MODEL_PATH of the
# configuration cells lives below it.
GDRIVE_MOUNT_POINT = '/gdrive'
drive.mount(GDRIVE_MOUNT_POINT, force_remount=True)

In [3]:
#@title # Configure arguments
#@markdown This section is used to configure some path arguments.

#@markdown The dataset must have a "document" column and a "label" column, like this:

#@markdown | label    | document       |
#@markdown |----------|----------------|
#@markdown | sadness   | I'm so sad  |
#@markdown | happiness  | I'm so happy  |

#@markdown See [example dataset](https://github.com/jhj0517/document_classification/tree/master/example_data) to see what the actual dataset looks like.

import shlex

DATA_PATH = '/content/document_classification/example_data/example_dataset.xlsx' #@param {type: "string"}
MODEL_PATH = '/gdrive/MyDrive/document_classification' #@param {type: "string"}


def build_path_arguments(data_path, model_path):
  """Build the path part of the command line passed to train.py.

  Args:
    data_path: Dataset path; omitted from the result when empty.
    model_path: Model output path; omitted from the result when empty.

  Returns:
    A string such as " --data_path X --model_path Y" (possibly empty).
    Each value is shell-quoted because the string is later interpolated
    into a shell command (`!python train.py {arguments}`); without quoting,
    a path containing spaces would be split into several arguments.
  """
  fragment = ""
  if data_path:
    fragment += f" --data_path {shlex.quote(data_path)}"
  if model_path:
    fragment += f" --model_path {shlex.quote(model_path)}"
  return fragment


arguments = build_path_arguments(DATA_PATH, MODEL_PATH)

In [14]:
#@title # Configure training arguments (Optional)
#@markdown This section is used to configure some training arguments. You can skip this section if you are not familiar with them.
import re

batch_size = 32 #@param {type: "integer"}
learning_rate = 5e-5 #@param {type: "number"}
max_seq_length = 128 #@param {type: "integer"}
epochs = 6 #@param {type: "integer"}
seed = 7 #@param {type: "integer"}


def strip_training_arguments(argument_string):
  """Return `argument_string` without the training flags this cell appends.

  Removing flags left over from an earlier run keeps the cell idempotent:
  running it again replaces the training flags instead of duplicating them.
  """
  return re.sub(
      r" --(?:batch_size|learning_rate|max_seq_length|epochs|seed) \S+",
      "",
      argument_string,
  )


# Continue from the existing `arguments` string when an earlier cell defined it;
# otherwise start empty, so this cell also works on its own.
arguments = strip_training_arguments(globals().get("arguments", ""))
if batch_size:
  arguments += f" --batch_size {batch_size}"
if learning_rate:
  arguments += f" --learning_rate {learning_rate}"
if max_seq_length:
  arguments += f" --max_seq_length {max_seq_length}"
if epochs:
  arguments += f" --epochs {epochs}"
# Compare against None (not truthiness) so that a seed of 0 is still forwarded.
if seed is not None:
  arguments += f" --seed {seed}"

In [None]:
#@title #Train
#@markdown This section begins the training.
if 'arguments' in locals():
  print(f'training starts with arguments: {arguments}')
  !python train.py {arguments}
else:
    !python train.py

In [None]:
#@title #Test model
#@markdown Test your model here with any text input.

from ratsnlp.nlpbook.classification import ClassificationDeployArguments
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
import torch
from torch.nn.functional import softmax
import pandas as pd
import argparse

# NOTE: DATA_PATH and MODEL_PATH are assigned here again (the "Configure arguments"
# cell also sets them), so running this cell overwrites those notebook-level values.
# DATA_PATH: spreadsheet read by DataSet to obtain the label names.
DATA_PATH = '/content/document_classification/example_data/example_dataset.xlsx' #@param {type: "string"}
# MODEL_PATH: passed to ClassificationDeployArguments as downstream_model_dir.
MODEL_PATH = '/gdrive/MyDrive/document_classification' #@param {type: "string"}
# INPUT: sentence handed to ClassificationModel.inference in the __main__ block below.
INPUT = "I'm surprised" #@param {type: "string"}

class DataSet:
    """Label source backed by the dataset spreadsheet.

    The Excel file is read once at construction time; the distinct values of
    its ``label`` column are exposed through ``get_labels`` and ``num_labels``.
    """

    def __init__(self, data_path=None):
        """Read the dataset into ``self.df``.

        Args:
            data_path: Path of the Excel file to read. When omitted, the
                notebook-level ``DATA_PATH`` is used, which keeps the
                original no-argument ``DataSet()`` call working.
        """
        self.data_path = DATA_PATH if data_path is None else data_path
        self.df = pd.read_excel(self.data_path)

    def get_labels(self):
        """Return the distinct values of the ``label`` column as a list.

        Raises:
            KeyError: If the dataset has no ``label`` column.
        """
        if 'label' not in self.df.columns:
            # Same exception type as a plain column lookup, but with a message
            # that tells the user what is wrong with the spreadsheet.
            raise KeyError(
                f"dataset has no 'label' column; columns found: {list(self.df.columns)}"
            )
        return list(self.df['label'].unique())

    @property
    def num_labels(self):
        """Number of distinct labels in the dataset."""
        return len(self.get_labels())

class ClassificationModel:
    """Fine-tuned BERT sequence classifier used for single-sentence inference.

    ``__init__`` loads the tokenizer, the checkpoint and an untrained model
    skeleton; the checkpoint weights are copied into the model lazily, on the
    first call to ``inference`` (see ``activate_model``).
    """

    def __init__(self, data_set, model_path=None, max_seq_length=128):
        """Load tokenizer, checkpoint and model configuration.

        Args:
            data_set: Object whose ``get_labels()`` returns the label names
                printed next to the probabilities.
            model_path: Directory passed as ``downstream_model_dir``. When
                omitted, the notebook-level ``MODEL_PATH`` is used.
            max_seq_length: Length the tokenizer pads/truncates inputs to.
        """
        self.data_set = data_set
        self.args = ClassificationDeployArguments(
            pretrained_model_name="beomi/kcbert-base",
            downstream_model_dir=MODEL_PATH if model_path is None else model_path,
            max_seq_length=max_seq_length,
        )
        self.tokenizer = BertTokenizer.from_pretrained(
            self.args.pretrained_model_name,
            do_lower_case=False,
        )
        # SECURITY: torch.load deserializes with pickle, which can execute
        # arbitrary code. Only load checkpoints that you produced or trust.
        self.fine_tuned_model_ckpt = torch.load(
            self.args.downstream_model_checkpoint_fpath,
            map_location=torch.device("cpu")
        )
        self.pretrained_model_config = BertConfig.from_pretrained(
            self.args.pretrained_model_name,
            # The classifier bias has one entry per label, so its size gives
            # the number of labels the checkpoint was trained with.
            num_labels=self.fine_tuned_model_ckpt['state_dict']['model.classifier.bias'].shape.numel(),
        )
        self.model = BertForSequenceClassification(self.pretrained_model_config)
        # True once the checkpoint weights have been loaded into self.model.
        self.activate = False

    @staticmethod
    def _strip_prefix(state_dict, prefix="model."):
        """Return a copy of ``state_dict`` with a leading ``prefix`` removed from its keys.

        Only a prefix at the very start of a key is removed; later occurrences
        of the same text inside the key are left untouched.
        """
        cut = len(prefix)
        return {
            (key[cut:] if key.startswith(prefix) else key): value
            for key, value in state_dict.items()
        }

    def activate_model(self):
        """Load the checkpoint weights into the model once and switch it to eval mode."""
        if self.activate:
            return
        self.model.load_state_dict(
            self._strip_prefix(self.fine_tuned_model_ckpt['state_dict'])
        )
        self.model.eval()
        # Mark as active only after a successful load, so a failed load is
        # retried on the next call instead of leaving untrained weights in use.
        self.activate = True

    def inference(self, sentence):
        """Classify one sentence and print the probability of every label.

        Args:
            sentence: Text to classify.

        Returns:
            A list with one probability per model output, in model output order.
        """
        self.activate_model()
        # NOTE(review): assumes get_labels() returns the labels in the same
        # order as the label indices used during training -- confirm against train.py.
        label_names = self.data_set.get_labels()

        # Preprocess the input
        inputs = self.tokenizer(
            [sentence],
            max_length=self.args.max_seq_length,
            padding="max_length",
            truncation=True,
        )

        # Inference
        with torch.no_grad():
            outputs = self.model(**{k: torch.tensor(v) for k, v in inputs.items()})

        # The batch holds exactly one sentence: take row 0 instead of squeeze(),
        # which would also collapse the label axis when there is a single label
        # and return a bare float instead of a list.
        probabilities = softmax(outputs.logits, dim=1)[0].tolist()

        # Print the probabilities per label
        for label, prob in zip(label_names, probabilities):
            print(f"{label}: {prob:.4f}")

        return probabilities


# Entry point of the test cell: read the dataset for its label names, build the
# model from the checkpoint, then print the per-label probabilities for INPUT.
# `data_set` and `my_model` are bound at notebook level by these statements.
if __name__ == '__main__':
    data_set = DataSet()
    my_model = ClassificationModel(data_set)
    my_model.inference(INPUT)
