In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import pandas as pd

In [2]:
# Load and preprocess the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
df = pd.read_csv(r"C:\Users\k_pow\OneDrive\Documents\Capstone\EDA\filtered_first_10000.csv")


In [4]:
def preprocess(data):
    # Convert Series to list if necessary
    if isinstance(data, pd.Series):
        data = data.tolist()
    return tokenizer(data, padding=True, truncation=True, max_length=64, return_tensors="pt")

In [5]:
# Assuming 'df' is your labeled DataFrame with 'PRODUCT DESCRIPTION' and 'label'
train_texts, val_texts, train_labels, val_labels = train_test_split(df['PRODUCT DESCRIPTION'], df['Commodity Category'], test_size=0.2)


In [6]:
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset


# Convert labels to numeric values
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)

# Convert labels to tensor
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

train_encodings = preprocess(train_texts)
val_encodings = preprocess(val_texts)

# Prepare data in dictionary format
train_data = {
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
}
val_data = {
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels
}

# Create Hugging Face Dataset objects
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)


In [7]:
# Load the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['Commodity Category'].unique()))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer with correct dataset format
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()




  0%|          | 0/1500 [00:00<?, ?it/s]

{'loss': 0.7092, 'grad_norm': 10.539091110229492, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.3681996762752533, 'eval_runtime': 129.0254, 'eval_samples_per_second': 15.501, 'eval_steps_per_second': 0.969, 'epoch': 1.0}
{'loss': 0.2402, 'grad_norm': 18.807851791381836, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.22671830654144287, 'eval_runtime': 131.3246, 'eval_samples_per_second': 15.229, 'eval_steps_per_second': 0.952, 'epoch': 2.0}
{'loss': 0.1274, 'grad_norm': 0.08937319368124008, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.22119183838367462, 'eval_runtime': 209.867, 'eval_samples_per_second': 9.53, 'eval_steps_per_second': 0.596, 'epoch': 3.0}
{'train_runtime': 7119.4604, 'train_samples_per_second': 3.371, 'train_steps_per_second': 0.211, 'train_loss': 0.3589092356363932, 'epoch': 3.0}


TrainOutput(global_step=1500, training_loss=0.3589092356363932, metrics={'train_runtime': 7119.4604, 'train_samples_per_second': 3.371, 'train_steps_per_second': 0.211, 'total_flos': 789411124224000.0, 'train_loss': 0.3589092356363932, 'epoch': 3.0})

In [9]:
# Define the path where you want to save the model
save_path = r"C:\Users\k_pow\OneDrive\Documents\Capstone\EDA\SCx\BERT"

# Save the trained model and tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


('C:\\Users\\k_pow\\OneDrive\\Documents\\Capstone\\EDA\\SCx\\BERT\\tokenizer_config.json',
 'C:\\Users\\k_pow\\OneDrive\\Documents\\Capstone\\EDA\\SCx\\BERT\\special_tokens_map.json',
 'C:\\Users\\k_pow\\OneDrive\\Documents\\Capstone\\EDA\\SCx\\BERT\\vocab.txt',
 'C:\\Users\\k_pow\\OneDrive\\Documents\\Capstone\\EDA\\SCx\\BERT\\added_tokens.json')

In [None]:
# from transformers import BertForSequenceClassification, BertTokenizer

# # Load the saved model and tokenizer
# model = BertForSequenceClassification.from_pretrained('./saved_model')
# tokenizer = BertTokenizer.from_pretrained('./saved_model')


In [10]:
df_class = pd.read_csv(r"C:\Users\k_pow\OneDrive\Documents\Capstone\EDA\df_class.csv")

In [11]:
# Sample 100 rows from df_class
df_sample = df_class.sample(n=100, random_state=42)
# Extract PRODUCT DESCRIPTION as test_texts
test_texts = df_sample['PRODUCT DESCRIPTION'].tolist()
# Assuming the label column is named 'label'
test_labels = df_sample['Commodity Category'].tolist()


In [15]:
# Load the tokenizer and model (if they’re not already loaded in your session)
model = BertForSequenceClassification.from_pretrained(r"C:\Users\k_pow\OneDrive\Documents\Capstone\EDA\SCx\BERT")
tokenizer = tokenizer  # Ensure your tokenizer is the same as used for training

In [16]:
# Step 1: Tokenize the test set
test_encodings = tokenizer(
    test_texts,  # List of test descriptions
    padding=True,
    truncation=True,
    max_length=64,
    return_tensors="pt"
)


In [18]:
# Prepare test data in dictionary format
test_data = {
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels  # Now in tensor format with numeric labels
}

# Convert to Dataset format
test_dataset = Dataset.from_dict(test_data)


  0%|          | 0/7 [00:00<?, ?it/s]

Test Set Evaluation Results: {'eval_runtime': 10.1996, 'eval_samples_per_second': 9.804, 'eval_steps_per_second': 0.686, 'epoch': 3.0}


In [19]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Step 3: Define custom metric computation function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [21]:
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics  # Ensure this is included
)

# Step 4: Evaluate the model on the test set
results = trainer.evaluate(test_dataset)

# Print the evaluation results
print("Test Set Evaluation Results:", results)

  0%|          | 0/13 [00:00<?, ?it/s]

Test Set Evaluation Results: {'eval_model_preparation_time': 0.008, 'eval_runtime': 10.1345, 'eval_samples_per_second': 9.867, 'eval_steps_per_second': 1.283}


In [25]:
# Sample 10,000 rows and prepare test_texts and test_labels
df_sample = df_class.sample(n=1000, random_state=42)
test_texts = df_sample['PRODUCT DESCRIPTION'].tolist()
test_labels = df_sample['Commodity Category'].tolist()

# Convert test_labels to numeric format using the same LabelEncoder
test_labels = label_encoder.transform(test_labels)  # Apply label encoding on test labels
test_labels = torch.tensor(test_labels)  # Convert test labels to tensor

# Tokenize the test set
test_encodings = tokenizer(
    test_texts,
    padding=True,
    truncation=True,
    max_length=64,
    return_tensors="pt"
)

# Prepare test data in dictionary format
test_data = {
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels  # Ensure test_labels is now a tensor with numeric labels
}

# Convert test data to Hugging Face Dataset
test_dataset = Dataset.from_dict(test_data)

# Define custom metric computation function
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define training arguments for evaluation only
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=16,
    eval_strategy="no"  # Set to "no" since we only want to evaluate without retraining
)

# Initialize Trainer for evaluation only
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics
)

# Step 4: Evaluate the model on the test set
results = trainer.evaluate(test_dataset)

# Print the evaluation results
print("Test Set Evaluation Results:", results)


  0%|          | 0/63 [00:00<?, ?it/s]

Test Set Evaluation Results: {'eval_loss': 2.4619579315185547, 'eval_model_preparation_time': 0.0, 'eval_accuracy': 0.529, 'eval_f1': 0.40789703923602527, 'eval_precision': 0.5451971491003285, 'eval_recall': 0.529, 'eval_runtime': 124.327, 'eval_samples_per_second': 8.043, 'eval_steps_per_second': 0.507}


In [26]:

# Step 1: Generate predictions on the test set
predictions = trainer.predict(test_dataset)

# Extract predicted labels (class with highest probability)
predicted_labels = predictions.predictions.argmax(axis=-1)

# Convert predicted labels and actual test labels back to category names
predicted_labels_named = label_encoder.inverse_transform(predicted_labels)
actual_labels_named = label_encoder.inverse_transform(test_labels.numpy())

# Step 2: Add the actual and predicted labels to the DataFrame
df_sample['Predicted Category'] = predicted_labels_named
df_sample['Actual Category'] = actual_labels_named

# Step 3: Print the first few rows and save the DataFrame with predictions to a CSV
print(df_sample[['PRODUCT DESCRIPTION', 'Actual Category', 'Predicted Category']].head(20))

# Save the DataFrame with predictions to a CSV file for further analysis
df_sample.to_csv(r"C:\Users\k_pow\OneDrive\Documents\Capstone\EDA\SCx\predicted_results.csv", index=False)


  0%|          | 0/63 [00:00<?, ?it/s]

                                      PRODUCT DESCRIPTION  \
76727   BATTING GLOVE DISPLAY CASE BASKETBALL/SOCCERDI...   
90293   OFFICE CHAIR HS-CODE 940130 NO S.W.P.M. S C#S2...   
131220  AUTO CLUTCH PARTS - AUTO CLUTCH PARTS 18 CASES...   
72764   WALKER HS CODE 98170096 RAISED TOILET SEAT HSC...   
127438  CARTONS DISH PAN WHITE BENCH PANIER EMPLIABLE ...   
31168   SHOES ( WITH SHOES'S BOXES) SHOES ( WITH SHOES...   
23706   ONE 40 HC CONTAINER,SLAC 19170 PIECES INTO 33 ...   
34916                               REUSABLE SHOPPING BAG   
109344  UMBRELLA PO1920381 THIS SHIPMENT DOES NO CON -...   
57914   FROZEN RAW PEELED AND DEVEINED TAIL TOTAL 1700...   
3615    RADIAL PASSENGER TIRES (4011.10) RADIAL PASSEN...   
83836   HAND SOPA, SOAP, SHAMPOO, BODY CREAM, WIPES KI...   
5658    (ONE) 20' DV CNTR WITH POLISHED STONE SLABS, A...   
44190   PAPER BAG CY/CY THIS SHIPMENT CONTAINS NO WOOD...   
51010   HAND DRYER SOAP DISPENSER (ALKALINE BATTERY LR...   
18544   INV#21JPGE81062 

In [29]:
df_sample.sample(20)

Unnamed: 0.1,Unnamed: 0,PRODUCT DESCRIPTION,Commodity Category,Predicted Category,Actual Category
124095,161414,(1)FERROUS SULPHATE DRIED USP (100 PPM ALUMI M...,Other,Chemicals,Other
71511,90532,CERAMIC GLAZED DIGITAL FLOOR TILES. HS-CODE: 6...,Building Materials,Building Materials,Building Materials
103213,132322,SUMAC-THYME / BASIL-MINT / APPLE TEA-TEA / SOU...,Furniture,Food,Furniture
96280,123370,20 PLASTIC PALLETS HAVING TOTAL 2720 SMALL CAR...,Chemicals,Chemicals,Chemicals
20441,24429,PEELED GARLICP.O.NO.:ILP21038-PTEMP:-1.5'CVENT...,Other,Food,Other
131416,171808,MACAROONS 3840 CARTONS MACAROONS HS CODE 19059...,Other,Food,Other
125668,163681,FORDHAM 46 IN RECTANGULAR FIRE PIT P.O. NO.: 5...,Building Materials,Building Materials,Building Materials
62619,78753,MEDICAL / HOSPITAL EQUIPMENT / INSTRUME MEDICA...,Building Materials,Building Materials,Building Materials
50857,63668,KERRI WALSH USA/OLYMPIC EMB-VB RNBW AQUATICZ P...,Chemicals,Chemicals,Chemicals
32817,40071,"PAPER AND PAPERBOARD, COATED ON ONE OR BOTH SI...",Other,Building Materials,Other
