# Data Preprocessing for E-Commerce FAQ Chatbot

This notebook handles data preparation for fine-tuning Falcon-7B on e-commerce customer support data.

In [None]:
!pip install datasets pandas transformers -q

In [None]:
from datasets import load_dataset
import pandas as pd
import json
from collections import Counter

## 1. Load Dataset

We use the Bitext Customer Support dataset from Hugging Face, which contains ~27,000 customer query/response pairs.

In [None]:
# Fetch the Bitext customer-support dataset by its Hugging Face Hub identifier
# and print the resulting object so the available splits are visible.
dataset = load_dataset(
    "bitext/Bitext-customer-support-llm-chatbot-training-dataset"
)
print(f"Dataset loaded: {dataset}")

In [None]:
# Convert the 'train' split to pandas with the dataset's own exporter instead of
# pd.DataFrame(dataset['train']), which has to iterate the split row by row.
df = dataset['train'].to_pandas()
print(f"Total samples: {len(df)}")
print(f"\nColumns: {df.columns.tolist()}")
# Last expression of the cell: rendered as a rich table of the first rows.
df.head()

## 2. Explore Dataset

In [None]:
# Show the first instruction/response pair to get a feel for the raw text.
# sep="\n" puts each value on the line after its label.
print("Sample instruction:", df['instruction'].iat[0], sep="\n")
print("\nSample response:", df['response'].iat[0], sep="\n")

In [None]:
# Tally rows per support category, then display the ten leading entries.
category_counts = df['category'].value_counts()
print("\nCategory distribution:")
print(category_counts.iloc[:10])

In [None]:
# Attach character-length columns for both text fields directly onto df.
df['instruction_length'], df['response_length'] = (
    df['instruction'].str.len(),
    df['response'].str.len(),
)

# Summary statistics for each length column, one labelled block per field.
print("\nInstruction length stats:", df['instruction_length'].describe(), sep="\n")
print("\nResponse length stats:", df['response_length'].describe(), sep="\n")

## 3. Format Data for Instruction Tuning

We format the data using a standard instruction template for causal language model fine-tuning.

In [None]:
def format_instruction(row):
    """Render one record as a single instruction-tuning training string.

    Args:
        row: Any mapping-like object supporting ``row['instruction']`` and
            ``row['response']`` (for example a DataFrame row).

    Returns:
        A string made of three sections -- a fixed system instruction, the
        customer query, and the response -- each introduced by a ``###``
        heading and separated by a blank line.
    """
    sections = (
        "### Instruction:",
        "You are a helpful e-commerce customer support assistant. Answer the customer's question professionally and helpfully.",
        "",
        "### Customer Query:",
        f"{row['instruction']}",
        "",
        "### Response:",
        f"{row['response']}",
    )
    # Joining on newlines reproduces the layout of the multi-line template.
    return "\n".join(sections)

# Build the training text for every row; axis=1 passes each row to format_instruction.
df['formatted_text'] = df.apply(format_instruction, axis=1)
# Print the first formatted example beneath its label.
print("Sample formatted text:", df['formatted_text'].iloc[0], sep="\n")

## 4. Create Train/Validation Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; random_state is fixed so the split
# is the same on every run.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)
print(
    f"Training samples: {len(train_df)}",
    f"Validation samples: {len(val_df)}",
    sep="\n",
)

## 5. Convert to Hugging Face Dataset Format

In [None]:
from datasets import Dataset, DatasetDict

# Columns carried into the saved dataset; the helper *_length columns are left out.
export_columns = ['instruction', 'response', 'formatted_text', 'category']

# After the shuffled split the frames no longer have a plain RangeIndex, so
# from_pandas would otherwise export the old row labels as an extra
# '__index_level_0__' column. preserve_index=False keeps only export_columns.
train_dataset = Dataset.from_pandas(train_df[export_columns], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[export_columns], preserve_index=False)

# Bundle both splits under the names used by the rest of this notebook.
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

print(dataset_dict)

## 6. Save Processed Dataset

In [None]:
# Write both splits to the 'processed_dataset' directory (relative path).
dataset_dict.save_to_disk('processed_dataset')
print("Dataset saved to 'processed_dataset' directory")

In [None]:
# Export each split as a plain CSV with the raw text and category columns only
# (no formatted_text, no row index).
for split_frame, csv_path in ((train_df, 'train_data.csv'), (val_df, 'val_data.csv')):
    split_frame[['instruction', 'response', 'category']].to_csv(csv_path, index=False)
print("CSV files saved")

## 7. Create Test Set for Evaluation

In [None]:
# Number of examples to export for evaluation.
test_sample_size = 100

# NOTE(review): these rows are drawn from val_df, so they also appear in the
# validation split written by the earlier save cells -- confirm that overlap
# is intended before treating this file as an independent test set.
# The size is capped at len(val_df) so that a small (e.g. subsampled) run does
# not make DataFrame.sample raise ValueError for n larger than the population.
test_samples = val_df.sample(n=min(test_sample_size, len(val_df)), random_state=42)
test_samples[['instruction', 'response', 'category']].to_csv('test_samples.csv', index=False)
print(f"Test samples saved: {len(test_samples)}")

## Summary

Data preprocessing complete. Files created:
- `processed_dataset/` - Hugging Face dataset format
- `train_data.csv` - Training data
- `val_data.csv` - Validation data
- `test_samples.csv` - 100 examples sampled from the validation split, for evaluation