In [None]:
## 1. Data Loading
# Install simpletransformers package
# !pip install simpletransformers

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset (replace with your dataset path)
data = pd.read_csv('NLP Manufacturer Dataset - newdatasets(1).csv')

# Exploratory Data Analysis (EDA)
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
data.info()  # Overview of data structure
print(data['Prediction'].value_counts())  # Class distribution

# Split dataset into train and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Drop rows whose label is missing. This is done AFTER the split so that the
# assignment of the remaining rows to train/validation is exactly what the
# random_state=42 split produces on the full file. Left in place, a missing
# label would either be encoded as an extra class (inflating the number of
# labels the models are built with) or make le.transform fail on the
# validation split.
train_data = train_data.dropna(subset=['Prediction']).copy()
val_data = val_data.dropna(subset=['Prediction']).copy()

# Convert labels to numerical format (fit on the training labels only)
le = LabelEncoder()
train_data['Prediction'] = le.fit_transform(train_data['Prediction'])
val_data['Prediction'] = le.transform(val_data['Prediction'])

# Preparing the data in the correct format for SimpleTransformers:
# a 'text' column (from 'Input') and an integer 'labels' column (from 'Prediction')
train_df = pd.DataFrame({
    'text': train_data['Input'],
    'labels': train_data['Prediction']
})

val_df = pd.DataFrame({
    'text': val_data['Input'],
    'labels': val_data['Prediction']
})

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1549 entries, 0 to 1548
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Input       1549 non-null   object
 1   Prediction  1547 non-null   object
dtypes: object(2)
memory usage: 24.3+ KB
None
Prediction
Minor Defect    616
Compliant       509
Major Issue     422
Name: count, dtype: int64


In [None]:
## 2. Text Processing
import re

# Define a function to clean text data
def clean_text(text):
    """Normalise a raw text value before it is fed to a model.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, whitespace runs (including those left behind by
    removed punctuation/digits) are collapsed to a single space, and leading
    and trailing whitespace is stripped.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            ``None`` or a float NaN from pandas) is treated as empty.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    # After lower-casing only a-z can remain as letters, so A-Z is not needed.
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    # Removing characters can leave several spaces in a row ("a - b" -> "a  b");
    # collapse them so the result contains single spaces only.
    return re.sub(r'\s+', ' ', letters_only).strip()

# Run every training and validation text through clean_text and store the
# result back in the 'text' column of the respective frame.
train_df['text'] = [clean_text(raw) for raw in train_df['text']]
val_df['text'] = [clean_text(raw) for raw in val_df['text']]

# Quick look at the cleaned training rows
print(train_df.head())


                                                   text  labels
1249  the product has been tested and meets all perf...       2
1048  some wooden picture frames had slightly uneven...       2
1159  some glass measuring cups had slightly faded v...       2
818   the product has a minor issue with the display...       2
741   a batch of medical face masks failed filtratio...       1


In [None]:
## 3. Text Embedding using BERT and RoBERTa
from simpletransformers.classification import ClassificationModel

# Number of output classes = number of distinct labels the encoder has seen
num_labels = len(le.classes_)

# BERT sequence classifier built from the 'bert-base-uncased' checkpoint,
# with CUDA explicitly disabled
bert_model = ClassificationModel(
    'bert',
    'bert-base-uncased',
    num_labels=num_labels,
    use_cuda=False,
)

# RoBERTa sequence classifier built from the 'roberta-base' checkpoint,
# with CUDA explicitly disabled
roberta_model = ClassificationModel(
    'roberta',
    'roberta-base',
    num_labels=num_labels,
    use_cuda=False,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
## 4. Model Training with BERT and RoBERTa
from simpletransformers.classification import ClassificationArgs

def build_model_args(output_dir):
    """Create a fresh ``ClassificationArgs`` holding the shared hyperparameters.

    A new object is built for every model so the two models never share (and
    mutate) the same arguments instance, and each gets its own output
    directory so one training run cannot overwrite the other's checkpoints.

    Args:
        output_dir: Directory where this model's checkpoints are written.

    Returns:
        A ``ClassificationArgs`` instance configured for fine-tuning.
    """
    return ClassificationArgs(
        num_train_epochs=3,
        train_batch_size=8,
        eval_batch_size=8,
        learning_rate=3e-5,
        max_seq_length=128,
        weight_decay=0.01,
        warmup_steps=0,
        logging_steps=50,
        save_steps=200,
        overwrite_output_dir=True,  # Allow overwriting existing output directory
        output_dir=output_dir,
        manual_seed=42,  # Same seed as the train/validation split, for repeatable runs
    )


# Train the BERT model with custom hyperparameters
model_args = build_model_args('outputs/bert/')
bert_model = ClassificationModel('bert', 'bert-base-uncased', num_labels=num_labels, args=model_args, use_cuda=False)
bert_model.train_model(train_df)

# Train the RoBERTa model with custom hyperparameters
roberta_model_args = build_model_args('outputs/roberta/')
roberta_model = ClassificationModel('roberta', 'roberta-base', num_labels=num_labels, args=roberta_model_args, use_cuda=False)
# Left as the last expression so the cell still displays the training result
roberta_model.train_model(train_df)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/155 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/155 [00:00<?, ?it/s]

(465, 0.44188968075178964)

In [None]:
## 5. Evaluation on Validation Set
# Evaluate BERT on validation data
# eval_model returns the metrics dict, the raw model outputs and the
# misclassified examples.
result_bert, model_outputs_bert, wrong_predictions_bert = bert_model.eval_model(val_df)

print("BERT Evaluation Results:")
print(result_bert)

# Evaluate RoBERTa on validation data
result_roberta, model_outputs_roberta, wrong_predictions_roberta = roberta_model.eval_model(val_df)

print("RoBERTa Evaluation Results:")
print(result_roberta)

BERT Evaluation Results:
result_bert


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/39 [00:00<?, ?it/s]

RoBERTa Evaluation Results:
{'mcc': np.float64(0.8278904412109951), 'eval_loss': 0.5329986420006324}


In [None]:
## 6. Saving the Best Model
import os

# Create directories if they don't exist
os.makedirs("bert_best_model", exist_ok=True)
os.makedirs("roberta_best_model", exist_ok=True)

# Save the BERT Model.
# The underlying transformer is passed explicitly via `model=`: without it
# save_model has nothing to serialise and the directory is left without
# weights, so the later ClassificationModel('bert', 'bert_best_model') load
# would not find a trained model.
bert_model.save_model("bert_best_model", model=bert_model.model)

# Save the RoBERTa Model (same reasoning as above)
roberta_model.save_model("roberta_best_model", model=roberta_model.model)

print("Models saved successfully!")


Models saved successfully!


In [None]:
# Write each fine-tuned model into its own directory, handing the wrapped
# transformer to save_model explicitly through `model=`.
bert_model.save_model(
    "bert_best_model",
    model=bert_model.model,
)
roberta_model.save_model(
    "roberta_best_model",
    model=roberta_model.model,
)

In [None]:
# Load the saved BERT model
bert_model = ClassificationModel('bert', 'bert_best_model', use_cuda=False)

# Real-world input text
real_world_text = ["The batch is compliant.", "Minor defects were found.", "Major issues detected in production."]

# The models were trained on text normalised by clean_text (section 2), so the
# same normalisation is applied here; otherwise the inputs would differ in
# casing/punctuation from what the models saw during training.
real_world_text_clean = [clean_text(text) for text in real_world_text]

# Predict the class (the second return value holds the raw model outputs).
# A named variable is used instead of `_`, which IPython reserves for the
# last cell output.
predictions_bert, raw_outputs_bert = bert_model.predict(real_world_text_clean)

print(f"BERT Predictions: {predictions_bert}")

# Load the saved RoBERTa model
roberta_model = ClassificationModel('roberta', 'roberta_best_model', use_cuda=False)

# Predict the class
predictions_roberta, raw_outputs_roberta = roberta_model.predict(real_world_text_clean)

print(f"RoBERTa Predictions: {predictions_roberta}")


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

BERT Predictions: [0, 2, 1]


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

RoBERTa Predictions: [0, 2, 0]


In [None]:
# Map the numeric predictions back to their original label names.
# The actual prediction arrays are decoded (rather than hand-typed indices)
# so the printed labels always correspond to what the models returned.
decoded_labels_bert = le.inverse_transform(predictions_bert)
decoded_labels_roberta = le.inverse_transform(predictions_roberta)

print("Decoded BERT Predictions:", decoded_labels_bert)
print("Decoded RoBERTa Predictions:", decoded_labels_roberta)

Decoded BERT Predictions: ['Minor Defect' 'Major Issue' 'Minor Defect']
Decoded RoBERTa Predictions: ['Minor Defect' 'Minor Defect' 'Minor Defect']


In [None]:
# Importing necessary libraries
from sklearn.metrics import accuracy_score
from simpletransformers.classification import ClassificationModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset (replace with your dataset path)
data = pd.read_csv('NLP Manufacturer Dataset - newdatasets(1).csv')

# Split dataset into train and validation sets (same parameters as the split
# used for training, so the validation rows are the held-out ones)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Drop rows with a missing label AFTER splitting, so the train/validation
# assignment of the remaining rows is unchanged. Unlabeled rows cannot be
# scored and would otherwise be encoded as an extra class or break transform.
train_data = train_data.dropna(subset=['Prediction']).copy()
val_data = val_data.dropna(subset=['Prediction']).copy()

# Convert labels to numerical format using LabelEncoder
le = LabelEncoder()
train_data['Prediction'] = le.fit_transform(train_data['Prediction'])  # Fit on training data
val_data['Prediction'] = le.transform(val_data['Prediction'])  # Transform on validation data

# Preparing the validation data in the correct format for SimpleTransformers.
# The text goes through clean_text (defined in section 2) because the models
# were trained on cleaned text; scoring raw text would measure the models on
# inputs formatted differently from their training data.
val_df = pd.DataFrame({
    'text': val_data['Input'].apply(clean_text),
    'labels': val_data['Prediction']
})

# Load the previously saved BERT model
bert_model = ClassificationModel('bert', 'bert_best_model', use_cuda=False)

# Load the previously saved RoBERTa model
roberta_model = ClassificationModel('roberta', 'roberta_best_model', use_cuda=False)

# 1. Evaluate the BERT model on the validation set and calculate accuracy
result_bert, model_outputs_bert, wrong_predictions_bert = bert_model.eval_model(val_df)

# Get the true labels from the validation set
y_true_bert = val_data['Prediction']  # Actual labels
y_pred_bert = model_outputs_bert.argmax(axis=1)  # Predicted labels (highest-scoring class per row)

# Calculate Accuracy for BERT
accuracy_bert = accuracy_score(y_true_bert, y_pred_bert)
print(f"BERT Accuracy: {accuracy_bert}")

# 2. Evaluate the RoBERTa model on the validation set and calculate accuracy
result_roberta, model_outputs_roberta, wrong_predictions_roberta = roberta_model.eval_model(val_df)

# Get the true labels from the validation set
y_true_roberta = val_data['Prediction']  # Actual labels
y_pred_roberta = model_outputs_roberta.argmax(axis=1)  # Predicted labels (highest-scoring class per row)

# Calculate Accuracy for RoBERTa
accuracy_roberta = accuracy_score(y_true_roberta, y_pred_roberta)
print(f"RoBERTa Accuracy: {accuracy_roberta}")


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/39 [00:00<?, ?it/s]

BERT Accuracy: 0.8806451612903226


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/39 [00:00<?, ?it/s]

RoBERTa Accuracy: 0.8838709677419355
