In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset

In [None]:
# Colab-only step: attach Google Drive so the CSV files under it can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Read the two source tables: the generated examples and the original examples.
generated_csv = '/content/drive/MyDrive/generated_data.csv'
original_csv = '/content/drive/MyDrive/original_data.csv'

generated_data, original_data = (
    pd.read_csv(csv_path) for csv_path in (generated_csv, original_csv)
)

In [None]:
# Combine generated and original data.
# Both frames come from pd.read_csv with the default 0..n-1 index, so a plain
# concat would yield duplicate row labels; ignore_index=True relabels the rows
# 0..N-1 (generated rows first, then original rows).
combined_data = pd.concat([generated_data, original_data], ignore_index=True)

# Split the data into train and test sets (80/20, stratified on the intent).
# A fixed random_state keeps the split identical across kernel restarts, so
# metrics reported further down are comparable between runs.
SPLIT_SEED = 42
train_data, test_data = train_test_split(
    combined_data,
    test_size=0.2,
    stratify=combined_data['intent'],
    random_state=SPLIT_SEED,
)

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

In [None]:
# Load pre-trained model and tokenizer
model_name = "distilbert-base-uncased"

# Build a deterministic intent <-> class-id mapping. The model is trained on
# integer class ids in [0, num_labels), so every intent value is encoded here.
intent_names = sorted(combined_data['intent'].unique().tolist())
label2id = {intent: class_id for class_id, intent in enumerate(intent_names)}
id2label = {class_id: str(intent) for intent, class_id in label2id.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(intent_names),
    id2label=id2label,
    label2id={str(intent): class_id for intent, class_id in label2id.items()},
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of examples and attach integer class labels.

    Args:
        examples: a batch mapping with a "text" list and, optionally, an
            "intent" list holding values present in ``label2id``.

    Returns:
        The tokenizer output for ``examples["text"]`` (padded to max length,
        truncated). When the batch has an "intent" column, a "labels" entry
        with the encoded class ids is added.
    """
    encoded = tokenizer(examples["text"], padding="max_length", truncation=True)
    # Trainer discards the "intent" column (it is not a model input), so the
    # targets must be supplied under "labels" for a loss to be computed.
    if "intent" in examples:
        encoded["labels"] = [label2id[intent] for intent in examples["intent"]]
    return encoded

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

In [None]:
# Training configuration: output/log locations first, then the hyperparameters.
training_config = dict(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
)
training_args = TrainingArguments(**training_config)

In [None]:
# Build the Trainer from the model, the training arguments and the tokenized
# train / test splits (the test split doubles as the evaluation set).
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_test,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run fine-tuning; the returned training summary is shown as the cell output.
train_output = trainer.train()
train_output

In [None]:
# Run the fine-tuned model over the test split, then take the highest-scoring
# class along the last axis as the predicted class id for each row.
predictions = trainer.predict(tokenized_test)
test_logits = predictions.predictions
preds = np.argmax(test_logits, axis=-1)

In [None]:
# Calculate metrics
# `preds` holds integer class ids. Score them against the encoded labels the
# model was evaluated on when the prediction output carries them; otherwise
# fall back to the raw intent column.
true_labels = (
    predictions.label_ids
    if predictions.label_ids is not None
    else test_data['intent'].to_numpy()
)

accuracy = accuracy_score(true_labels, preds)
# zero_division=0 scores a class that was never predicted as 0 without
# emitting an undefined-metric warning.
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels, preds, average='weighted', zero_division=0
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1: {f1:.4f}")

In [None]:
# Compare performance on generated vs original data
# generated_data.index and original_data.index overlap (both are default
# 0..n-1 labels), so index membership cannot tell the sources apart. Match on
# the text content instead. NOTE(review): a text present in both sources is
# counted in both subsets.
generated_mask = test_data['text'].isin(generated_data['text']).to_numpy()
original_mask = test_data['text'].isin(original_data['text']).to_numpy()

# Use the encoded labels from the prediction output when present so they are
# in the same id space as `preds`; otherwise use the raw intent column.
subset_true_labels = (
    predictions.label_ids
    if predictions.label_ids is not None
    else test_data['intent'].to_numpy()
)


def _subset_accuracy(mask):
    """Return accuracy over the test rows selected by a boolean mask.

    Returns NaN when the mask selects no rows, so an empty subset is reported
    explicitly instead of being scored.
    """
    if not mask.any():
        return float('nan')
    return accuracy_score(subset_true_labels[mask], preds[mask])


generated_accuracy = _subset_accuracy(generated_mask)
original_accuracy = _subset_accuracy(original_mask)

print(f"Generated Data Accuracy: {generated_accuracy:.4f}")
print(f"Original Data Accuracy: {original_accuracy:.4f}")