In [1]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch






In [2]:
# Load the custom dataset
dataset = load_dataset('csv', data_files='Ezitech Data.csv')

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



Generating train split: 0 examples [00:00, ? examples/s]



In [3]:
# Define a function to tokenize the input text
def tokenize_function(examples):
    return tokenizer(examples['Question'], padding='max_length', truncation=True)



In [4]:
# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/38 [00:00<?, ? examples/s]

In [5]:
# Map the labels (Assuming that the 'Answer' column needs to be mapped to numerical labels)
# Convert lists to tuples to make them hashable
unique_labels = list(set([tuple(answer) for answer in tokenized_datasets['train']['Answer']]))



In [6]:
# Create label mapping for single and multiple answers
label_mapping = {}
for idx, label in enumerate(unique_labels):
    label_mapping[label] = idx
    if len(label) > 1:  # For multiple-answer scenarios
        for sub_label in label:
            label_mapping[tuple([sub_label])] = idx



In [7]:
# Define a function to map the labels
def map_labels(example):
    # Convert list to tuple before looking up in the dictionary
    key = tuple(example['Answer'])
    if key not in label_mapping:
        key = tuple([example['Answer'][0]])  # Handle case where key isn't found
    example['label'] = label_mapping[key]
    return example



In [8]:
# Apply the label mapping function to the dataset
tokenized_datasets = tokenized_datasets.map(map_labels)



Map:   0%|          | 0/38 [00:00<?, ? examples/s]

In [9]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(unique_labels))



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Split the dataset into train and test sets
tokenized_datasets = tokenized_datasets['train'].train_test_split(test_size=0.2) # Split the dataset into train and test sets



In [11]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',               # Output directory for results
    evaluation_strategy="epoch",          # Evaluate every epoch
    learning_rate=2e-5,                   # Learning rate for fine-tuning
    per_device_train_batch_size=16,       # Batch size for training
    per_device_eval_batch_size=16,        # Batch size for evaluation
    num_train_epochs=3,                   # Number of epochs for training
    weight_decay=0.01                     # Weight decay to prevent overfitting
)





In [12]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'], # Use the training split of the data
    eval_dataset=tokenized_datasets['test'], # Use the testing split of the data
    tokenizer=tokenizer,
    compute_metrics=None  # Optionally add metrics if needed
)



In [13]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,3.655311
2,No log,3.670395
3,No log,3.682681


TrainOutput(global_step=6, training_loss=3.6331351598103843, metrics={'train_runtime': 1109.6184, 'train_samples_per_second': 0.081, 'train_steps_per_second': 0.005, 'total_flos': 23687649054720.0, 'train_loss': 3.6331351598103843, 'epoch': 3.0})

In [14]:
# Save the trained model and tokenizer
trainer.save_model('./bert_faq_model')
tokenizer.save_pretrained('./bert_faq_model')

('./bert_faq_model\\tokenizer_config.json',
 './bert_faq_model\\special_tokens_map.json',
 './bert_faq_model\\vocab.txt',
 './bert_faq_model\\added_tokens.json')