In [3]:
import pandas as pd
data_path = "/kaggle/input/classificiation-dataset/classification_dataset.csv"
df = pd.read_csv(data_path)
df

Unnamed: 0,name,main_category,sub_category
0,Electronic Spices 2.75 Inch 4ω (Ohm) 400w Max ...,car & motorbike,Car Electronics
1,"ZQWINT Bluetooth Car Adapter, Mini USB Bluetoo...",car & motorbike,Car Electronics
2,PROTECTRON 6.35X32mm GLASS FUSE/INVERTER FUSE ...,car & motorbike,Car Electronics
3,Cave Maruti Suzuki Male-Female Stereo Coupler ...,car & motorbike,Car Electronics
4,COVERBLACK Rubber Back Cover for Infinix X6815...,car & motorbike,Car Electronics
...,...,...,...
1103165,VOIV Digital Lux Meter LCD Display Handheld Il...,industrial supplies,"Test, Measure & Inspect"
1103166,VOIV 10K NTC Thermistor Line Cable Sensor Prob...,industrial supplies,"Test, Measure & Inspect"
1103167,VOIV Handheld Digital Anemometer Professional ...,industrial supplies,"Test, Measure & Inspect"
1103168,"VOIV 55,000 Counts Digital Multimeter High Acc...",industrial supplies,"Test, Measure & Inspect"


In [4]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [5]:
df = df.dropna(subset=["name", "main_category"])

In [8]:
label_encoder = LabelEncoder()

In [9]:
df["label"] = label_encoder.fit_transform(df["main_category"])

In [15]:
# Chọn 100,000 dòng ban đầu
df_sampled_1 = df.sample(n=100000, random_state=82)

# Loại bỏ các dòng đã chọn trước đó
df_remaining = df.drop(df_sampled_1.index, axis=0, errors='ignore')

# Chọn tiếp 100,000 dòng mới từ dữ liệu còn lại
df_sampled_2 = df_remaining.sample(n=100000, random_state=42)

# Loại bỏ các dòng đã chọn trước đó
df_remaining = df_remaining.drop(df_sampled_2.index, axis=0, errors='ignore')

# Chọn 2,000 dòng test từ phần còn lại
df_test = df_remaining.sample(n=2000, random_state=24)

In [18]:
df_test

Unnamed: 0,name,main_category,sub_category,label
500561,FURO Men's R1042 1465 Sneaker,men's shoes,Sports Shoes,11
891591,Optimuss Running Camping Hydration Backpack Pa...,sports & fitness,Camping & Hiking,14
838626,"Chambor Orosa Defining 10h Eyeliner Pencil, Bl...",beauty & health,Luxury Beauty,3
160294,Girl's Fashion Hub's Designer Georgette Yellow...,women's clothing,Clothing,18
1030575,Royal Fashion Plus Size Thong Mens Lingerie (R...,men's clothing,Innerwear,10
...,...,...,...,...
457994,TFG Mini Ultra HD 5.5mm Endoscope Camera Flexi...,"tv, audio & cameras",Security Cameras,17
1078518,Le Platinum Sling Bag For Kids/Girls/Women/Tee...,accessories,Handbags & Clutches,0
649529,(Renewed) Jbl Tune 230Nc Bluetooth 5.2 Truly W...,"tv, audio & cameras",Headphones,17
18429,A.W.Faber-Castell I Pvt Textliner Pastel -Wall...,accessories,Bags & Luggage,0


In [40]:
label_map = {idx: label for idx, label in enumerate(label_encoder.classes_)}

In [11]:
label_map

{0: 'accessories',
 1: 'appliances',
 2: 'bags & luggage',
 3: 'beauty & health',
 4: 'car & motorbike',
 5: 'grocery & gourmet foods',
 6: 'home & kitchen',
 7: 'home, kitchen, pets',
 8: 'industrial supplies',
 9: "kids' fashion",
 10: "men's clothing",
 11: "men's shoes",
 12: 'music',
 13: 'pet supplies',
 14: 'sports & fitness',
 15: 'stores',
 16: 'toys & baby products',
 17: 'tv, audio & cameras',
 18: "women's clothing",
 19: "women's shoes"}

In [75]:
X_train, X_val, y_train, y_val = train_test_split(df_sampled_2["name"].tolist(), df_sampled_2["label"].tolist(), test_size=0.2, random_state=47)

In [76]:
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=256)

In [44]:
import torch

class ProductDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [77]:
train_dataset = ProductDataset(train_encodings, y_train)
val_dataset = ProductDataset(val_encodings, y_val)

In [46]:
train_dataset

<__main__.ProductDataset at 0x7997c78e4700>

In [47]:
num_labels = len(label_encoder.classes_)
model =BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
training_args = TrainingArguments(
    output_dir="./results_2",
    report_to="none",
    num_train_epochs=3,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    gradient_accumulation_steps=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy"
)

In [50]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1).numpy()

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [80]:
from transformers import Trainer

model.to("cuda")
torch.cuda.empty_cache()
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3247,0.349539,0.91195,0.907376,0.91195,0.904764
2,0.2387,0.339741,0.91615,0.911578,0.91615,0.909902
3,0.163,0.365533,0.9178,0.912593,0.9178,0.913545


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=15000, training_loss=0.2623891301472982, metrics={'train_runtime': 5267.3481, 'train_samples_per_second': 45.564, 'train_steps_per_second': 2.848, 'total_flos': 1.04850253728e+16, 'train_loss': 0.2623891301472982, 'epoch': 3.0})

In [81]:
model.to("cpu")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [90]:
index_to_category = {int(k): v for k, v in label_map.items()}
def predict_category(text, threshold=0.8):
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)  # Chuyển logits thành xác suất
    
    predicted_index = torch.argmax(probabilities, dim=1).item()
    confidence = probabilities[0][predicted_index].item()  # Lấy xác suất cao nhất

    if confidence >= threshold:
        category_name = index_to_category.get(predicted_index, "Unknown Category")
    else:
        category_name = "Other"  # Nếu thấp hơn ngưỡng, trả về Other
    
    return predicted_index, category_name, confidence

# Test thử với một mô tả sản phẩm
test_text = "Apple AirPods Pro 2nd Gen with active noise cancellation and crystal clear sound."
predicted_index, predicted_category, confidence = predict_category(test_text)

print(f"Text: {test_text}")
print(f"Predicted Index: {predicted_index}")
print(f"Predicted Category Name: {predicted_category}")
print(f"Confidence: {confidence:.2f}")


Text: Apple AirPods Pro 2nd Gen with active noise cancellation and crystal clear sound.
Predicted Index: 17
Predicted Category Name: tv, audio & cameras
Confidence: 1.00


In [91]:
df

Unnamed: 0,name,main_category,sub_category,label
0,Electronic Spices 2.75 Inch 4ω (Ohm) 400w Max ...,car & motorbike,Car Electronics,4
1,"ZQWINT Bluetooth Car Adapter, Mini USB Bluetoo...",car & motorbike,Car Electronics,4
2,PROTECTRON 6.35X32mm GLASS FUSE/INVERTER FUSE ...,car & motorbike,Car Electronics,4
3,Cave Maruti Suzuki Male-Female Stereo Coupler ...,car & motorbike,Car Electronics,4
4,COVERBLACK Rubber Back Cover for Infinix X6815...,car & motorbike,Car Electronics,4
...,...,...,...,...
1103165,VOIV Digital Lux Meter LCD Display Handheld Il...,industrial supplies,"Test, Measure & Inspect",8
1103166,VOIV 10K NTC Thermistor Line Cable Sensor Prob...,industrial supplies,"Test, Measure & Inspect",8
1103167,VOIV Handheld Digital Anemometer Professional ...,industrial supplies,"Test, Measure & Inspect",8
1103168,"VOIV 55,000 Counts Digital Multimeter High Acc...",industrial supplies,"Test, Measure & Inspect",8


In [93]:
df["sub_category"].nunique()

112

In [94]:
!pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-8.17.2-py3-none-any.whl.metadata (8.8 kB)
Collecting elastic-transport<9,>=8.15.1 (from elasticsearch)
  Downloading elastic_transport-8.17.0-py3-none-any.whl.metadata (3.6 kB)
Downloading elasticsearch-8.17.2-py3-none-any.whl (717 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m718.0/718.0 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading elastic_transport-8.17.0-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.5/64.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.17.0 elasticsearch-8.17.2


In [96]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch("http://localhost:9200")
df_filtered = df[["name", "main_category", "sub_category"]]
df_filtered = df_filtered.drop_duplicates().reset_index(drop=True)

In [97]:
df_filtered

Unnamed: 0,name,main_category,sub_category
0,Electronic Spices 2.75 Inch 4ω (Ohm) 400w Max ...,car & motorbike,Car Electronics
1,"ZQWINT Bluetooth Car Adapter, Mini USB Bluetoo...",car & motorbike,Car Electronics
2,PROTECTRON 6.35X32mm GLASS FUSE/INVERTER FUSE ...,car & motorbike,Car Electronics
3,Cave Maruti Suzuki Male-Female Stereo Coupler ...,car & motorbike,Car Electronics
4,COVERBLACK Rubber Back Cover for Infinix X6815...,car & motorbike,Car Electronics
...,...,...,...
460387,AzzureCollection Women's Synthetic Secret Pock...,stores,Women's Fashion
460388,Caprese MARI women's Tote Bag (ORANGE),stores,Women's Fashion
460389,Skechers Womens Flex Appeal 4.0-Vivid Spirit C...,stores,Women's Fashion
460390,Harpa Shoulder Straps Printed Dresses,stores,Women's Fashion


In [99]:
index_name = "product_subcategories"
mapping = {
    "mappings": {
        "properties": {
            "name": {"type": "text"},
            "main_category": {"type": "keyword"},
            "sub_category": {"type": "keyword"}
        }
    }
}

In [None]:
es.indices.create(index=index_name, body=mapping)
for _, row in df_filtered.iterrows():
    doc = {
        "title": row["title"],
        "main_category": row["main_category"],
        "sub_category": row["sub_category"]
    }
    es.index(index=index_name, body=doc)

In [17]:
model = BertForSequenceClassification.from_pretrained("/kaggle/input/bert_test/transformers/default/1")
tokenizer = BertTokenizer.from_pretrained("/kaggle/input/bert_test/transformers/default/1")

In [20]:
from datasets import Dataset
dataset = Dataset.from_pandas(df_test[["name", "label"]])

In [23]:
def tokenize_function(example):
    return tokenizer(example["name"], padding="max_length", truncation=True)

In [24]:
dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [25]:
dataset

Dataset({
    features: ['name', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2000
})

In [26]:
dataset = dataset.remove_columns(["name"])
dataset = dataset.rename_column("label", "labels")
dataset.set_format("torch")

In [35]:
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch.utils.data import DataLoader

# Kiểm tra GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Chuyển model sang GPU nếu có
model.to(device)

test_dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

preds = []
true_labels = []

# Đảm bảo không tính gradient trong quá trình dự đoán
with torch.no_grad():
    for batch in test_dataloader:
        # Chuyển dữ liệu vào device (GPU/CPU)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Dự đoán
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Đưa về CPU để xử lý numpy
        preds.extend(predictions.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Đánh giá mô hình
accuracy = accuracy_score(true_labels, preds)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(true_labels, preds))

Using device: cuda
Accuracy: 0.9030
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.89      0.92       397
           1       0.95      0.98      0.97       128
           2       0.49      0.74      0.59        31
           3       0.88      0.93      0.90        40
           4       0.89      0.83      0.86        30
           5       0.88      0.78      0.82         9
           6       0.87      0.90      0.89        52
           8       0.92      0.79      0.85        14
           9       0.68      0.83      0.75        46
          10       0.97      0.99      0.98       289
          11       0.84      0.93      0.89       213
          12       0.78      0.88      0.82         8
          13       1.00      0.50      0.67         2
          14       0.82      0.82      0.82        40
          15       0.59      0.42      0.49       111
          16       0.84      0.90      0.87        29
          17       0.9