In [1]:
# With evaluation
from sklearn.preprocessing import LabelEncoder
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from joblib import dump
import torch

# prepare data
def prepare_data_for_bert(df):
    """Return a copy of ``df`` with an added ``bert_input`` text column.

    ``bert_input`` is the string ``"Text: "`` followed by the string form of
    each value in the ``text`` column.

    Args:
        df: pandas DataFrame that contains a ``text`` column.

    Returns:
        A new DataFrame with all original columns plus ``bert_input``.
        The frame passed in is left untouched, so re-running the calling
        cell (or passing a slice of another frame) has no side effects on
        the caller's data.

    Raises:
        KeyError: if ``df`` has no ``text`` column.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(bert_input="Text: " + df['text'].astype(str))

# Read the training and evaluation splits from CSV files in the working directory.
df_train = pd.read_csv('bert_train_gpt.csv')  # make sure the CSV file has 'text' and 'output' columns
df_eval = pd.read_csv('bert_test_gpt.csv')  # Evaluation data; its 'text' and 'output' columns are read below as well

# Label encoding for the output labels: the raw 'output' values are replaced by integer class ids.
# The encoder is fitted on the training labels only; the evaluation labels are mapped with
# that same fitted encoder.
# NOTE(review): scikit-learn documents LabelEncoder.transform as raising ValueError for labels
# not seen during fit — confirm every label in the eval CSV also occurs in the train CSV.
labelencoder = LabelEncoder()
df_train['output'] = labelencoder.fit_transform(df_train['output'])
df_eval['output'] = labelencoder.transform(df_eval['output']) 

# Build the 'bert_input' text column for both splits.
prepared_df_train = prepare_data_for_bert(df_train)
prepared_df_eval = prepare_data_for_bert(df_eval)

# transform to Hugging Face Dataset
dataset_train = Dataset.from_pandas(prepared_df_train)
dataset_eval = Dataset.from_pandas(prepared_df_eval)

# initialize model and tokenizer from the local './bert-base-uncased' directory;
# the classifier is sized to the number of classes known to the label encoder
tokenizer = BertTokenizer.from_pretrained('./bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('./bert-base-uncased', num_labels=len(labelencoder.classes_))

# tokenize function
def tokenize_and_add_labels(batch):
    """Tokenize a batch of ``bert_input`` texts and attach their labels.

    Uses the module-level ``tokenizer``. Every text is truncated and padded
    to exactly 128 tokens, and the batch's ``output`` values are stored
    under the ``labels`` key of the tokenizer result.

    Args:
        batch: mapping with ``bert_input`` and ``output`` entries, as passed
            by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output with an extra ``labels`` entry.
    """
    encoded = tokenizer(
        batch['bert_input'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    encoded['labels'] = batch['output']
    return encoded

# Tokenize both splits in batches; tokenize_and_add_labels also attaches the 'labels' entry.
tokenized_dataset_train = dataset_train.map(tokenize_and_add_labels, batched=True)
tokenized_dataset_eval = dataset_eval.map(tokenize_and_add_labels, batched=True)


# training arguments
training_args = TrainingArguments(
    output_dir='./results',  # Trainer output directory
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=500,
    evaluation_strategy="steps",  # Evaluate the model every 'logging_steps' (no separate eval_steps is given)
)

# create a trainer
# NOTE(review): no compute_metrics is passed, and the captured output only shows
# Training Loss / Validation Loss columns.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_eval  # Evaluation data
)

# Persist the fitted label encoder. This runs before training starts, so the file is
# written regardless of how the training call ends.
dump(labelencoder, 'labelencoder.joblib')
trainer.train()
# Save the fine-tuned model under ./results/trained_model. Only the model is saved here;
# the tokenizer is not written to that directory.
model.save_pretrained("./results/trained_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/11717 [00:00<?, ? examples/s]

Map:   0%|          | 0/3093 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mhexplode2021[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [2]:
# Evaluate
import pandas as pd
import torch

# Function to compute the confusion matrix
def compute_confusion_matrix(true_labels, pred_labels, num_classes):
    """Count (true, predicted) label pairs into a square matrix.

    Args:
        true_labels: iterable of integer class ids (ground truth).
        pred_labels: iterable of integer class ids (predictions), paired
            element-wise with ``true_labels``.
        num_classes: side length of the resulting matrix.

    Returns:
        A ``num_classes x num_classes`` torch tensor (default float dtype)
        where entry ``[t, p]`` is the number of samples with true label
        ``t`` that were predicted as ``p``.
    """
    counts = torch.zeros(num_classes, num_classes)
    for actual, predicted in zip(true_labels, pred_labels):
        # Rows index the true class, columns the predicted class.
        counts[actual, predicted] += 1
    return counts

# Function to compute rates based on the confusion matrix
def compute_rates(confusion_matrix):
    """Derive per-class one-vs-rest counts and rates from a confusion matrix.

    The matrix is read with rows as true labels and columns as predicted
    labels, which is how ``compute_confusion_matrix`` fills it.

    Args:
        confusion_matrix: square 2-D torch tensor of pair counts.

    Returns:
        A 9-tuple of 1-D tensors, one entry per class, in this order:
        ``tp, fp, fn, tn, tpr, fpr, fnr, tnr, class_count``.

        ``class_count`` is ``tp + fp``, i.e. the number of samples that were
        *predicted* as each class.
        NOTE(review): the name suggests the true sample count per class
        (``tp + fn``) — confirm which one is intended.

        No guard is applied to zero denominators, so a class with no true
        samples (or no negatives) gets a non-finite rate rather than an error.
    """
    grand_total = confusion_matrix.sum()
    predicted_totals = confusion_matrix.sum(dim=0)  # column sums
    actual_totals = confusion_matrix.sum(dim=1)     # row sums

    tp = torch.diag(confusion_matrix)
    fp = predicted_totals - tp
    fn = actual_totals - tp
    tn = grand_total - (fp + fn + tp)

    # Shared denominators: all actual positives / all actual negatives per class.
    positives = tp + fn
    negatives = fp + tn

    tpr = tp / positives
    fpr = fp / negatives
    fnr = fn / positives
    tnr = tn / negatives
    class_count = tp + fp

    return tp, fp, fn, tn, tpr, fpr, fnr, tnr, class_count

# Generate predictions
# NOTE(review): `trainer`, `tokenized_dataset_eval` and `labelencoder` are not defined in this
# cell; they must already exist in the kernel from the training cell.
predictions = trainer.predict(tokenized_dataset_eval)
pred_labels = predictions.predictions.argmax(axis=1)  # predicted class id = index of the largest score along axis 1
true_labels = tokenized_dataset_eval["labels"]  # the encoded 'output' values stored by tokenize_and_add_labels

# Generate confusion matrix and compute rates
num_classes = len(labelencoder.classes_)
confusion_matrix = compute_confusion_matrix(true_labels, pred_labels, num_classes)
tp, fp, fn, tn, tpr, fpr, fnr, tnr, class_count = compute_rates(confusion_matrix)

# Store confusion matrix in a Pandas DataFrame, labelling both axes with the original class names
# (rows are true labels, columns are predicted labels, as filled by compute_confusion_matrix).
# NOTE(review): confusion_df is not used again in the cells visible here.
confusion_df = pd.DataFrame(confusion_matrix.numpy(), columns=labelencoder.classes_, index=labelencoder.classes_)

# Store rates in a Pandas DataFrame, one row per class name
rates_df = pd.DataFrame({
    'True Positive': tp.numpy(),
    'False Positive': fp.numpy(),
    'False Negative': fn.numpy(),
    'True Negative': tn.numpy(),
    'True Positive Rate': tpr.numpy(),
    'False Positive Rate': fpr.numpy(),
    'False Negative Rate': fnr.numpy(),
    'True Negative Rate': tnr.numpy(),
    'class_count': class_count.numpy()  # tp + fp from compute_rates, i.e. how often each class was predicted
}, index=labelencoder.classes_)


# print("\nRates:")
# display(rates_df.head(10))

In [3]:
# Write the per-class rates table to CSV.
# NOTE(review): despite the "confusion_matrix" file name, this saves rates_df (the derived
# counts and rates), not the confusion matrix held in confusion_df — confirm this is intended.
rates_df.to_csv('./results/confusion_matrix_bert_gpt.csv')

In [4]:
# Display the class names held by the fitted label encoder.
labelencoder.classes_

array(['2GIG Technologies', '2Wire Inc', 'ADT Security Services',
       'APC by Schneider Electric', 'ARRIS', 'ASDF Technologies',
       'ASRock', 'ASUS', 'AVM', 'Abode', 'Acer', 'Adafruit', 'Adax',
       'Aeotec', 'AirTV', 'Airthings', 'Aladdin Connect', 'Alibaba Group',
       'Amazon', 'Ambient Weather', 'Amcrest', 'Amlogic', 'Anker',
       'Ankuoo', 'Apple', 'Aqara', 'Asustor', 'Athom', 'Audio Pro',
       'August Home', 'Aukey', 'Avatar Controls', 'Awair', 'Awox',
       'Axis Communications', 'AzureWave Technologies',
       'BT (British Telecom)', 'Bang & Olufsen', 'Beautyrest', 'Belkin',
       'Besder', 'Big Ass Fans', 'BlackBerry', 'Blink', 'Blitzwolf',
       'BloomSky', 'Bluesound', 'Bond Home', 'Bosch', 'Bose',
       'Bouygues Telecom', 'Bowers & Wilkins', 'Brilliant', 'British Gas',
       'BroadLink', 'Brother', 'Bryant', 'Bticino', 'Buffalo', 'CE Smart',
       'Caavo', 'Canary', 'Canon', 'Canonical', 'Carrier', 'Cecotec',
       'Chamberlain', 'ChargePoint', 'Char

In [5]:
# single test
from transformers import BertTokenizer, BertForSequenceClassification
from joblib import load
import torch

# load saved model and the original tokenizer
# (the model comes from the fine-tuned './results/trained_model' directory; the tokenizer is
# loaded from the base './bert-base-uncased' directory)
# NOTE: this cell rebinds `model`, `tokenizer`, `labelencoder` and `predictions`, replacing
# any objects of the same name left in the kernel by earlier cells.
model = BertForSequenceClassification.from_pretrained('./results/trained_model')
tokenizer = BertTokenizer.from_pretrained('./bert-base-uncased')

# load saved LabelEncoder
# NOTE(review): joblib's load unpickles the file — only load encoder files from a trusted source.
labelencoder = load('labelencoder.joblib')

# the text we want to predict
new_text = "Murata Manufacturing Co., Ltd.,,homekit,TRADFRI gateway,TRADFRI gateway,"

# preprocess: add the same "Text: " prefix that prepare_data_for_bert puts on the training inputs
bert_input = f"Text: {new_text}"

# tokenize into PyTorch tensors, truncating at the same 128-token limit used for training
inputs = tokenizer(bert_input, padding=True, truncation=True, max_length=128, return_tensors="pt")

# inference (gradient tracking disabled)
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)  # index of the largest logit = predicted class id

# decode the label: map the predicted class id back to its original label string
predicted_label = labelencoder.inverse_transform([predictions.item()])[0]
print(f"Predicted output label is: {predicted_label}")

Predicted output label is: IKEA


In [6]:
# ONNX convert
import torch
from transformers import BertForSequenceClassification

import os

# load trained model (pytorch original)
model = BertForSequenceClassification.from_pretrained("./results/trained_model")

# create a dummy input to fit the input format: a single sequence of 128 random token ids
# plus an all-ones attention mask of the same shape
input_ids = torch.randint(0, model.config.vocab_size, (1, 128))  # length is 128
# The mask is deliberately float32, as in the original export.
# NOTE(review): the graph is traced with a float32 mask, so consumers of the ONNX file
# presumably have to feed a float32 attention_mask as well — confirm against the runtime.
attention_mask = torch.ones(1, 128, dtype=torch.float32)
dummy_input = (input_ids, attention_mask)

# set to evaluate mode
model.eval()

# output to ONNX format
onnx_model_path = "./onnx/model.onnx"
# Make sure the target directory exists; nothing else in the notebook creates it, and the
# export cannot write the file into a missing directory.
os.makedirs(os.path.dirname(onnx_model_path), exist_ok=True)
torch.onnx.export(model, dummy_input, onnx_model_path, input_names=['input_ids', 'attention_mask'],
                  output_names=['output'], opset_version=11)


verbose: False, log level: Level.ERROR

