# Working with Custom Datasets in Hugging Face

This notebook demonstrates how to create, load, and use custom datasets with the Hugging Face Transformers and Datasets libraries.

In [None]:
# Install required packages
!pip install datasets transformers torch pandas

## 1. Creating a Custom Dataset from Lists

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Toy sentiment-analysis corpus: eight short movie reviews
texts = [
    "I love this movie! It's amazing.",
    "This film is terrible and boring.",
    "Great acting and wonderful story.",
    "Not my favorite, but decent.",
    "Absolutely fantastic! Highly recommend.",
    "Waste of time. Very disappointing.",
    "Good movie with great visuals.",
    "Average film, nothing special."
]

# One label per review, aligned by position with `texts`
# (1 = positive, 0 = negative)
labels = [1, 0, 1, 0, 1, 0, 1, 0]
# Position in this list is the label id, so label_names[label] gives its name
label_names = ["negative", "positive"]

# Column-oriented mapping: each key becomes a column of the dataset
dataset_dict = {'text': texts, 'label': labels}

dataset = Dataset.from_dict(dataset_dict)

# Summarise what was built, then preview the first three rows
print(
    "Dataset created:",
    f"Number of examples: {len(dataset)}",
    f"Features: {dataset.features}",
    "\nFirst 3 examples:",
    sep="\n",
)
for i in range(3):
    example = dataset[i]
    print(
        f"{i+1}. Text: {example['text']}",
        f"   Label: {example['label']} ({label_names[example['label']]})",
        sep="\n",
    )

## 2. Loading Custom Dataset from CSV File

In [None]:
import os
from datasets import load_dataset

# Create the sample data that will be written out as a CSV file
sample_data = {
    'text': [
        "The weather is beautiful today.",
        "I'm feeling sad about the news.",
        "This is the best day ever!",
        "Traffic is really annoying.",
        "I love spending time with family.",
        "Work is stressful these days.",
        "The sunset looks amazing tonight.",
        "I'm worried about the future.",
        "This book is incredibly interesting.",
        "The food was disappointing."
    ],
    'label': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
    'category': ['weather', 'news', 'personal', 'transport', 'family', 'work', 'nature', 'personal', 'books', 'food']
}

df = pd.DataFrame(sample_data)
csv_path = 'custom_sentiment_data.csv'

# The CSV is only a temporary fixture for this demo. Writing and loading happen
# inside try/finally so the file is removed even if one of those steps fails,
# instead of being left behind in the working directory.
try:
    # Save to CSV (index=False keeps the row index out of the file)
    df.to_csv(csv_path, index=False)

    # Load dataset from CSV; the loaded data is read from the 'train' split
    csv_dataset = load_dataset('csv', data_files=csv_path)['train']
finally:
    # Clean up (guarded so a failed write does not raise a second error here)
    if os.path.exists(csv_path):
        os.remove(csv_path)

print("Dataset loaded from CSV:")
print(f"Number of examples: {len(csv_dataset)}")
print(f"Features: {csv_dataset.features}")
print("\nFirst 3 examples:")
for i in range(3):
    example = csv_dataset[i]
    print(f"{i+1}. Text: {example['text']}")
    print(f"   Label: {example['label']}, Category: {example['category']}")

## 3. Tokenizing and Preprocessing Custom Dataset

In [None]:
from transformers import AutoTokenizer

# Fetch the tokenizer for the DistilBERT (uncased) checkpoint
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch of examples.

    Intended for use with ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch of rows; only its ``'text'`` entry is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 128 tokens
        and padded (``padding=True``).
    """
    tokenizer_options = dict(truncation=True, padding=True, max_length=128)
    return tokenizer(examples['text'], **tokenizer_options)

# Run the tokenizer over the whole dataset, one batch at a time
tokenized_dataset = dataset.map(tokenize_function, batched=True)

print(
    "Tokenized dataset:",
    f"Features: {tokenized_dataset.features}",
    "\nExample tokenized data:",
    sep="\n",
)

# Inspect the first row; only the first 10 token positions are shown to keep
# the output short
example = tokenized_dataset[0]
print(
    f"Text: {dataset[0]['text']}",
    f"Input IDs: {example['input_ids'][:10]}...",
    f"Attention Mask: {example['attention_mask'][:10]}...",
    f"Label: {example['label']}",
    sep="\n",
)

# Sanity check: map the token ids back to a string
decoded_text = tokenizer.decode(example['input_ids'])
print(f"\nDecoded tokens: {decoded_text}")

## 4. Creating Train/Validation/Test Splits

In [None]:
# Twenty product reviews: enough rows for the splits below to be non-trivial.
# Positive and negative reviews alternate, starting with a positive one.
extended_texts = [
    "I love this product! Highly recommended.",
    "Terrible quality, waste of money.",
    "Amazing service and fast delivery.",
    "Not worth the price, very disappointing.",
    "Excellent quality and great value.",
    "Poor customer service experience.",
    "Beautiful design and works perfectly.",
    "Broke after one week of use.",
    "Outstanding performance, very satisfied.",
    "Difficult to use and confusing interface.",
    "Great features and easy to use.",
    "Overpriced for what you get.",
    "Perfect for my needs, love it!",
    "Quality control issues, not recommended.",
    "Impressive build quality and design.",
    "Arrived damaged and took forever to ship.",
    "Exceeded my expectations in every way.",
    "Cheap materials and poor construction.",
    "Fantastic customer support team.",
    "Worst purchase I've made this year."
]

# Labels follow the same alternating pattern as the texts: 1, 0, 1, 0, ...
extended_labels = [1, 0] * 10

# Build the dataset from the two aligned columns
extended_dataset = Dataset.from_dict({'text': extended_texts, 'label': extended_labels})

# First cut: keep 70% for training and hold out the remaining 30%
train_test_split = extended_dataset.train_test_split(test_size=0.3, seed=42)

# Second cut: divide the held-out 30% evenly into validation and test
val_dataset = train_test_split['test']
val_test_split = val_dataset.train_test_split(test_size=0.5, seed=42)

train_dataset = train_test_split['train']
val_dataset = val_test_split['train']
test_dataset = val_test_split['test']

# Bundle the three splits under their conventional names
dataset_dict = DatasetDict({'train': train_dataset, 'validation': val_dataset, 'test': test_dataset})

print(
    "Dataset splits:",
    f"Train: {len(dataset_dict['train'])} examples",
    f"Validation: {len(dataset_dict['validation'])} examples",
    f"Test: {len(dataset_dict['test'])} examples",
    sep="\n",
)

# Apply tokenization to all splits in one call
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

print("\nTokenized dataset splits:")
for split_name, split_dataset in tokenized_datasets.items():
    print(f"{split_name}: {len(split_dataset)} examples")

# Show label distribution
print("\nLabel distribution:")
for split_name, split_dataset in dataset_dict.items():
    # Use a dedicated name here: assigning to `labels` would overwrite the
    # `labels` list defined in section 1. Materialise the column as a plain
    # list so sum()/len() behave the same regardless of the column type.
    split_labels = list(split_dataset['label'])
    pos_count = sum(split_labels)  # labels are 0/1, so the sum counts positives
    neg_count = len(split_labels) - pos_count
    print(f"{split_name}: {pos_count} positive, {neg_count} negative")

## 5. Custom Dataset for Question Answering

In [None]:
# Three aligned columns: position i of each list forms one
# (context, question, answer) example
qa_data = {
    'context': [
        "Python is a high-level programming language created by Guido van Rossum in 1991. It emphasizes code readability and simplicity.",
        "Machine learning is a subset of artificial intelligence that enables computers to learn from data without explicit programming.",
        "The Transformer architecture was introduced in 2017 and revolutionized natural language processing with its attention mechanism.",
        "BERT (Bidirectional Encoder Representations from Transformers) was developed by Google and released in 2018.",
        "GPT (Generative Pre-trained Transformer) models are autoregressive language models trained to predict the next word in a sequence."
    ],
    'question': [
        "Who created Python?",
        "What is machine learning?",
        "When was the Transformer architecture introduced?",
        "Which company developed BERT?",
        "What type of models are GPT models?"
    ],
    'answer': [
        "Guido van Rossum",
        "a subset of artificial intelligence",
        "2017",
        "Google",
        "autoregressive language models"
    ]
}

# Turn the column mapping into a Dataset
qa_dataset = Dataset.from_dict(qa_data)

print(
    "Question Answering Dataset:",
    f"Number of examples: {len(qa_dataset)}",
    f"Features: {qa_dataset.features}",
    sep="\n",
)

# Walk through every example; the context is cut to its first 60 characters
# to keep the listing compact
print("\nQA Examples:")
for i, example in enumerate(qa_dataset):
    print(
        f"\n{i+1}. Context: {example['context'][:60]}...",
        f"   Question: {example['question']}",
        f"   Answer: {example['answer']}",
        sep="\n",
    )

# Tokenize for QA model
def tokenize_qa(examples):
    """Tokenize a batch of question/context pairs as paired inputs.

    Intended for use with ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch of rows; its ``'question'`` and ``'context'`` entries
            are passed to the tokenizer as the first and second sequence.

    Returns:
        The tokenizer output for the batch, limited to 256 tokens per pair
        and padded (``padding=True``).
    """
    return tokenizer(
        examples['question'],
        examples['context'],
        # Truncate only the context (the second sequence). With plain
        # truncation=True an over-long pair could lose question tokens too.
        truncation="only_second",
        padding=True,
        max_length=256
    )

# Tokenize every question/context pair in batches
tokenized_qa_dataset = qa_dataset.map(tokenize_qa, batched=True)

print(
    "\nTokenized QA Dataset:",
    f"Features: {tokenized_qa_dataset.features}",
    sep="\n",
)

# Inspect the first tokenized pair: its length and the text it decodes back to
example = tokenized_qa_dataset[0]
print(
    f"\nTokenized example:",
    f"Input IDs shape: {len(example['input_ids'])}",
    f"Decoded input: {tokenizer.decode(example['input_ids'])}",
    sep="\n",
)