In [1]:
# With evaluation
from sklearn.preprocessing import LabelEncoder
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from joblib import dump
import torch

# Prepare the data
def prepare_data_for_bert(df):
    """Add a ``bert_input`` column: each row's ``text`` behind a "Text: " prefix.

    The ``text`` column is cast to ``str`` before the prefix is prepended.
    The frame passed in is modified in place and that same object is returned.
    """
    text_as_str = df['text'].astype(str)
    df['bert_input'] = "Text: " + text_as_str
    return df

# Load the data
df_train = pd.read_csv('bert_train_gpt.csv')  # make sure your CSV file has both a 'text' and an 'output' column
df_eval = pd.read_csv('bert_test_gpt.csv')  # Evaluation data

# Label encoding for the output labels
# The encoder is fitted on the training labels only; the evaluation labels are
# then mapped with that same fitted encoder.
labelencoder = LabelEncoder()
df_train['output'] = labelencoder.fit_transform(df_train['output'])
df_eval['output'] = labelencoder.transform(df_eval['output']) 

# Add the 'bert_input' column to both frames
prepared_df_train = prepare_data_for_bert(df_train)
prepared_df_eval = prepare_data_for_bert(df_eval)

# Convert to Hugging Face Datasets
dataset_train = Dataset.from_pandas(prepared_df_train)
dataset_eval = Dataset.from_pandas(prepared_df_eval)

# Initialize the model and the tokenizer from the local './bert-base-uncased' directory;
# the classification head gets one output per class learned by the label encoder
tokenizer = BertTokenizer.from_pretrained('./bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('./bert-base-uncased', num_labels=len(labelencoder.classes_))

# Tokenization
def tokenize_and_add_labels(batch):
    """Tokenize a batch's ``bert_input`` entries and attach its labels.

    Every sequence is truncated and padded to a fixed length of 128 tokens.
    The batch's ``output`` values are stored under the ``labels`` key of the
    tokenizer result, which is then returned.
    """
    encoded = tokenizer(
        batch['bert_input'],
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    encoded['labels'] = batch['output']
    return encoded

# Apply the tokenization to both splits (batched mapping)
tokenized_dataset_train = dataset_train.map(tokenize_and_add_labels, batched=True)
tokenized_dataset_eval = dataset_eval.map(tokenize_and_add_labels, batched=True)


# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=500,
    evaluation_strategy="steps",  # Evaluate the model every 'logging_steps'
)

# Create the Trainer instance and train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_eval  # Evaluation data
)

# Persist the fitted label encoder before training starts, then train and
# save the resulting model weights under ./results/trained_model
dump(labelencoder, 'labelencoder.joblib')
trainer.train()
model.save_pretrained("./results/trained_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/11717 [00:00<?, ? examples/s]

Map:   0%|          | 0/3093 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mhexplode2021[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [5]:
import pandas as pd
import torch

# Function to compute the confusion matrix
def compute_confusion_matrix(true_labels, pred_labels, num_classes):
    """Count (true, predicted) label pairs into a square matrix.

    Returns a ``num_classes x num_classes`` tensor of zeros-initialised
    counts in which entry ``[t, p]`` is the number of pairs whose true label
    is ``t`` and whose predicted label is ``p``. The two label sequences are
    walked in lockstep with ``zip``.
    """
    counts = torch.zeros(num_classes, num_classes)
    for actual, predicted in zip(true_labels, pred_labels):
        counts[actual, predicted] = counts[actual, predicted] + 1
    return counts

# Function to compute rates based on the confusion matrix
def compute_rates(confusion_matrix):
    """Derive per-class counts and rates from a square confusion matrix.

    Dim 0 (rows) is treated as the true class and dim 1 (columns) as the
    predicted class. Returns the tuple
    ``(tp, fp, fn, tn, tpr, fpr, fnr, tnr, class_count)`` with one entry per
    class in each tensor. ``class_count`` is ``tp + fp``, i.e. how many times
    the class was predicted. Denominators are not guarded against zero.
    """
    hits = torch.diag(confusion_matrix)
    predicted_totals = confusion_matrix.sum(dim=0)
    actual_totals = confusion_matrix.sum(dim=1)

    fp = predicted_totals - hits
    fn = actual_totals - hits
    tn = confusion_matrix.sum() - (fp + fn + hits)

    # Shared denominators: actual positives and actual negatives per class
    positives = hits + fn
    negatives = fp + tn

    return (
        hits,
        fp,
        fn,
        tn,
        hits / positives,
        fp / negatives,
        fn / positives,
        tn / negatives,
        hits + fp,
    )

# Run the trainer over the evaluation split and take the arg-max class per row
predictions = trainer.predict(tokenized_dataset_eval)
pred_labels = predictions.predictions.argmax(axis=1)
true_labels = tokenized_dataset_eval["labels"]

# Build the confusion matrix and derive the per-class counts and rates
num_classes = len(labelencoder.classes_)
confusion_matrix = compute_confusion_matrix(true_labels, pred_labels, num_classes)
(tp, fp, fn, tn, tpr, fpr, fnr, tnr, class_count) = compute_rates(confusion_matrix)

# Confusion matrix as a DataFrame labelled with the class names on both axes
class_names = labelencoder.classes_
confusion_df = pd.DataFrame(
    confusion_matrix.numpy(),
    index=class_names,
    columns=class_names,
)

# Per-class rates as a DataFrame, one row per class name
rate_tensors = {
    'True Positive': tp,
    'False Positive': fp,
    'False Negative': fn,
    'True Negative': tn,
    'True Positive Rate': tpr,
    'False Positive Rate': fpr,
    'False Negative Rate': fnr,
    'True Negative Rate': tnr,
    'class_count': class_count,
}
rates_df = pd.DataFrame(
    {column: tensor.numpy() for column, tensor in rate_tensors.items()},
    index=class_names,
)


print("\nRates:")
display(rates_df.head(10))



Rates:


Unnamed: 0,True Positive,False Positive,False Negative,True Negative,True Positive Rate,False Positive Rate,False Negative Rate,True Negative Rate,class_count
2GIG Technologies,0.0,0.0,0.0,3093.0,,0.0,,1.0,0.0
2Wire Inc,0.0,0.0,0.0,3093.0,,0.0,,1.0,0.0
ADT Security Services,0.0,0.0,1.0,3092.0,0.0,0.0,1.0,1.0,0.0
APC by Schneider Electric,0.0,0.0,0.0,3093.0,,0.0,,1.0,0.0
ARRIS,9.0,1.0,1.0,3082.0,0.9,0.000324,0.1,0.999676,10.0
ASDF Technologies,0.0,0.0,1.0,3092.0,0.0,0.0,1.0,1.0,0.0
ASRock,2.0,0.0,1.0,3090.0,0.666667,0.0,0.333333,1.0,2.0
ASUS,13.0,8.0,6.0,3066.0,0.684211,0.002602,0.315789,0.997398,21.0
AVM,7.0,0.0,3.0,3083.0,0.7,0.0,0.3,1.0,7.0
Abode,0.0,0.0,0.0,3093.0,,0.0,,1.0,0.0


In [6]:
# Write the per-class rates table to CSV.
# NOTE(review): the file name says "confusion_matrix", but the frame written here is rates_df.
rates_df.to_csv('./results/confusion_matrix_bert_gpt_10epochs.csv')

In [4]:
# Show the class labels held by the label encoder
labelencoder.classes_

array(['Apple', 'Bang & Olufsen', 'Chamberlain', 'IKEA', 'Mediabridge',
       'Microsoft', 'Pixel Magic Systems Ltd', 'Sony', 'Vizio', 'Xiaomi',
       'acer', 'actiontec', 'airtv', 'amazon', 'amplifi', 'apple',
       'aqara', 'arcadyan', 'arris', 'askey', 'asus', 'athom', 'august',
       'avm', 'awox', 'belkin', 'blink', 'bluesound', 'bose', 'bravia',
       'buffalo', 'canon', 'cisco', 'd-link', 'dell', 'denon', 'devialet',
       'devolo', 'directv', 'doorbird', 'drobo', 'echostar', 'ecobee',
       'eero', 'elgato', 'eve', 'facebook', 'fibaro', 'freebox',
       'freenas', 'gardena', 'google', 'hama', 'heatmiser', 'hikvision',
       'hisense', 'hitron', 'homebridge', 'homee', 'hp', 'huawei',
       'humax', 'idevices', 'ieast', 'ihome', 'ikea', 'insignia',
       'integra', 'intel', 'jbl', 'konnected', 'koogeek', 'lacie',
       'legrand', 'lenovo', 'leviton', 'lg', 'libratone', 'lifx',
       'linksys', 'loewe', 'logitech', 'lutron', 'marantz', 'mediatek',
       'medion', 'mi

In [7]:
from transformers import BertTokenizer, BertForSequenceClassification
from joblib import load
import torch

# Load the saved model and the tokenizer
model = BertForSequenceClassification.from_pretrained('./results/trained_model')
tokenizer = BertTokenizer.from_pretrained('./bert-base-uncased')

# Load the saved LabelEncoder
labelencoder = load('labelencoder.joblib')

# New text sample to predict (replace with your own text and known output label)
new_text = "Murata Manufacturing Co., Ltd.,,homekit,TRADFRI gateway,TRADFRI gateway,"

# Preprocess the data (same "Text: " prefix format as used for training)
bert_input = f"Text: {new_text}"

# Tokenize into PyTorch tensors, truncating to at most 128 tokens
inputs = tokenizer(bert_input, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Model inference (no gradient tracking); the arg-max over the last logits dimension is the predicted class index
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Decode the predicted class index back to its label
predicted_label = labelencoder.inverse_transform([predictions.item()])[0]
print(f"Predicted output label is: {predicted_label}")

Predicted output label is: IKEA


In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
from joblib import load
import torch

# Select the inference device: use the GPU when CUDA is available, otherwise
# fall back to the CPU (a hard-coded 'cuda' device fails on machines without CUDA)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the saved model and the tokenizer
model = BertForSequenceClassification.from_pretrained('./results/trained_model')
tokenizer = BertTokenizer.from_pretrained('./bert-base-uncased')

# Move the model to the selected device
model.to(device)

# Load the saved LabelEncoder
labelencoder = load('labelencoder.joblib')

# New text sample to predict (replace with your own text and known output label)
new_text = "Murata Manufacturing Co., Ltd.,,homekit,TRADFRI gateway,TRADFRI gateway,"

# Preprocess the data (same "Text: " prefix format as used for training)
bert_input = f"Text: {new_text}"

# Tokenize into PyTorch tensors, truncating to at most 128 tokens
inputs = tokenizer(bert_input, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Move every input tensor to the same device as the model
inputs = {key: val.to(device) for key, val in inputs.items()}

# Model inference (no gradient tracking); the arg-max over the last logits dimension is the predicted class index
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Decode the predicted class index back to its label
predicted_label = labelencoder.inverse_transform([predictions.item()])[0]
print(f"Predicted output label is: {predicted_label}")

Predicted output label is: IKEA


In [4]:
import os

import torch
from transformers import BertForSequenceClassification

# Load the fine-tuned model
model = BertForSequenceClassification.from_pretrained("./results/trained_model")

# Build a dummy input that matches the model's input format
input_ids = torch.randint(0, model.config.vocab_size, (1, 128))  # assumes an input length of 128
# The mask is created directly as float32, the same dtype the export used before.
# NOTE(review): presumably the tokenizer emits an integer attention mask at
# inference time; confirm a float32 mask input is what the ONNX consumer expects.
attention_mask = torch.ones(1, 128, dtype=torch.float32)
dummy_input = (input_ids, attention_mask)

# Put the model in evaluation mode
model.eval()

# Export the model. The target directory is created first so the export does
# not fail when ./onnx does not exist yet.
onnx_model_path = "./onnx/model.onnx"
os.makedirs(os.path.dirname(onnx_model_path), exist_ok=True)
torch.onnx.export(model, dummy_input, onnx_model_path, input_names=['input_ids', 'attention_mask'], 
                  output_names=['output'], opset_version=11)


verbose: False, log level: Level.ERROR

