In [1]:
from google.colab import drive
drive.mount('/content/drive')
# drive.mount('/gdrive')
%cd /content/drive/MyDrive/github/intent_model

Mounted at /content/drive
/content/drive/MyDrive/github/intent_model


In [None]:
# !git pull
# !ls -a
# !cat .gitignore
!git add .

### imports

In [None]:
!pip install indic-num2words

Collecting indic-num2words
  Downloading indic_num2words-1.2.0-py3-none-any.whl (16 kB)
Installing collected packages: indic-num2words
Successfully installed indic-num2words-1.2.0


In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import torch
import utils
# import importlib
# importlib.reload(utils)

lang_list = ['hi', 'en']

### Fill Placeholders

In [None]:
# for lang in lang_list:
#   for i in range(10):
#     utils.fill_placeholders(lang)

## Training

### Get manually created data

In [None]:
def get_training_data(lang_list):
    df = pd.DataFrame()
    # lang = 'en'
    all_data = pd.DataFrame()
    for lang in lang_list:
        read_dir = os.path.join(os.getcwd(),'formatted_data', lang)
        files = os.listdir(read_dir)
        data_files = [file for file in files if file.endswith(".csv")]

        for file in data_files:
            file_df = pd.read_csv(f'{os.path.join(read_dir, file)}')
            df = pd.concat([df, file_df], ignore_index=True)

    return df

In [None]:
data = get_training_data(lang_list)
data = data.rename(columns={"sentence": "Text", "intent": "Label", 'language': 'lang'})
print(f'before de-dupe, shape is {data.shape}')
data = data.drop_duplicates()
data = data.reset_index(drop=True)
print(f'after de-dupe, shape is {data.shape}')

before de-dupe, shape is (136500, 3)
after de-dupe, shape is (66927, 3)


In [None]:
# data.tail()

### Training utilities

In [None]:
# !pip install transformers
# !pip install sentencepiece
# !pip install transformers[torch]
# # !pip install accelerate -U

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
torch.cuda.set_device(0)
torch.cuda.empty_cache()

MODEL = "ai4bharat/indic-bert"

tokenizer = AutoTokenizer.from_pretrained(MODEL)


def tokenize_data(texts):
    tokenized_inputs = tokenizer(
        texts, padding="max_length", max_length=512, truncation=True
    )
    return tokenized_inputs


def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average="micro")
    precision = precision_score(y_true=labels, y_pred=pred, average="micro")
    f1 = f1_score(y_true=labels, y_pred=pred, average="micro")

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


def train(data):
    df = data
    df = df.dropna()

    labels = sorted(list(set(df["Label"])))
    labels_to_ids = {k: v for v, k in enumerate(sorted(labels))}

    X = list(df["Text"])
    y = list(df["Label"])
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    X_train_tokenized = tokenizer(
        X_train, padding=True, truncation=True, max_length=512
    )
    X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL, num_labels=len(labels)
    )
    train_dataset = utils.Dataset(X_train_tokenized, y_train, labels_to_ids)
    val_dataset = utils.Dataset(X_val_tokenized, y_val, labels_to_ids)
    output_dir = os.path.join(os.getcwd(),'models')

    train_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="steps",
        eval_steps=500,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        seed=0,
        save_steps=500,
        load_best_model_at_end=True,
    )
    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    with open(os.path.join(output_dir, "labels-dict.pkl"), "wb") as f:
        pickle.dump(labels_to_ids, f)

    train_metrics = trainer.train(resume_from_checkpoint=os.path.join(os.getcwd(),'models', 'checkpoint-4000'))
    eval_metrics = trainer.evaluate()

    print(train_metrics)
    print(eval_metrics)

## Train the model

In [None]:
# train(data)

### Filter GPT generated data

In [None]:
lang_list = ['hi', 'en']
def get_gpt_data(lang_list):
  df = pd.read_csv(os.path.join(os.getcwd(),'gpt_data', 'all-lang-all.csv'))
  df = df[df['lang'].isin(lang_list)]
  return df

In [None]:
gpt_data = get_gpt_data(lang_list)

In [None]:
# gpt_data.head()

In [None]:
import pickle

import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
from utils import Dataset

MODEL = "ai4bharat/indic-bert"


def softmax(x):
    e_x = np.exp(x)
    return e_x / np.sum(e_x, axis=1, keepdims=True)


class IntentRecognizer:
    def __init__(self, model_path, label_dict_pkl, conf_threshold):
        with open(label_dict_pkl, "rb") as f:
            self.labels_to_ids = pickle.load(f)
        self.ids_to_labels = {
            intent_id: intent_label
            for intent_label, intent_id in self.labels_to_ids.items()
        }
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_path, num_labels=len(self.labels_to_ids)
        )
        self.test_trainer = Trainer(self.model)
        self.conf_threshold = conf_threshold

    def predict(self, sentence):
        sentence = [sentence]
        sentence_tokenized = self.tokenizer(
            sentence, padding=True, truncation=True, max_length=512
        )
        model_in = Dataset(sentence_tokenized)
        raw_pred, label_ids, _ = self.test_trainer.predict(model_in)
        # print(raw_pred, metrics)
        probs = softmax(raw_pred)
        # print(probs)
        y_pred = np.argmax(probs, axis=1)[0]
        pred_prob = np.max(probs)

        orig_pred = self.ids_to_labels[y_pred]
        # pred = self.ids_to_labels[y_pred]
        # if pred_prob < self.conf_threshold:
        #     pred = "unknown"
        return orig_pred, pred_prob

In [None]:
import pandas as pd
# from intent_recognizer import IntentRecognizer
from sklearn.metrics import classification_report
from tqdm import tqdm


def test(data):
    df = data
    df = df.dropna()
    model_path = os.path.join(os.getcwd(),'models', 'filtere_data_checkpoint')

    intent_recognizer = IntentRecognizer(
        model_path, os.path.join(os.getcwd(),'models', 'labels-dict.pkl'), conf_threshold=0.85
    )

    true_intent = list()
    pred_intent = list()
    pred_probability = list()

    for i, row in tqdm(df.iterrows()):
        true_intent.append(row["Label"])
        intent, pred_prob = intent_recognizer.predict(row["Text"])
        pred_intent.append(intent)
        pred_probability.append(pred_prob)

    df["Predicted Intent"] = pred_intent
    df["Predicted Prob"] = pred_probability
    print(classification_report(true_intent, pred_intent))
    # df.to_csv(os.path.join(os.getcwd(),'gpt_data', csv'), index=False)
    return df


In [None]:
f = test(gpt_data.head())
print(f)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
0it [00:00, ?it/s]

1it [00:00,  9.49it/s]

5it [00:00, 28.65it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                          precision    recall  f1-score   support

           balance_check       1.00      1.00      1.00         2
     electricity_payment       0.00      0.00      0.00         1
       insurance_renewal       1.00      1.00      1.00         1
mobile_recharge_postpaid       0.00      0.00      0.00         0
            upi_creation       1.00      1.00      1.00         1

                accuracy                           0.80         5
               macro avg       0.60      0.60      0.60         5
            weighted avg       0.80      0.80      0.80         5

                                                Text                Label  \
0  id is linked to my electricity bill i just nee...  electricity_payment   
1                               account activity now        balance_check   
2  i am interested in finding out what my balance is        balance_check   
3  i want to renew my sons insurance policy with ...    insurance_renewal   
4  can you help me

### Use filtered data to finetune earlier model

In [None]:
def get_filtere_data():
  df = pd.read_csv(os.path.join(os.getcwd(),'gpt_data', 'filtered_en_hi_data.csv'))
  df = df[df['Label'] == df['Predicted Intent']]
  df = df.reset_index(drop=True)
  return df

In [None]:
filtered_data = get_filtere_data()
filtered_data.shape

(34509, 5)

In [None]:
train(filtered_data)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss


TrainOutput(global_step=4000, training_loss=0.0, metrics={'train_runtime': 0.2235, 'train_samples_per_second': 370526.582, 'train_steps_per_second': 11582.73, 'total_flos': 328882366672860.0, 'train_loss': 0.0, 'epoch': 2.39})
{'eval_loss': 0.02108871191740036, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 40.1833, 'eval_samples_per_second': 171.763, 'eval_steps_per_second': 5.375, 'epoch': 2.39}
