# Text Generation with Genesis

Generate synthetic text data using LLMs (OpenAI or HuggingFace).

In [None]:
import pandas as pd
import os

from genesis.generators.text import LLMTextGenerator

## Sample Text Data

In [None]:
# Sample customer reviews, written row-wise: one (text, rating, category) record per review
review_records = [
    ("Great product! Exactly what I needed. Fast shipping too.", 5, 'Electronics'),
    ("Not worth the price. Quality is mediocre at best.", 2, 'Clothing'),
    ("Amazing customer service. They resolved my issue quickly.", 5, 'Electronics'),
    ("Product arrived damaged. Still waiting for replacement.", 1, 'Home'),
    ("Five stars! Will definitely buy again.", 5, 'Electronics'),
]
reviews = pd.DataFrame(review_records, columns=['text', 'rating', 'category'])

# Bare expression so the notebook displays the table as the cell's output
reviews

## OpenAI Backend

Requires the `OPENAI_API_KEY` environment variable to be set.

In [None]:
# Set API key (or use environment variable)
# os.environ['OPENAI_API_KEY'] = 'your-api-key'

# Configure a text generator that uses the OpenAI backend
openai_generator = LLMTextGenerator(
    backend='openai',
    model='gpt-3.5-turbo',
    temperature=0.7,
    privacy_filter=True,  # Filter out PII
)

# Analyze text patterns in the sample reviews
openai_generator.fit(reviews['text'])

# Generate synthetic reviews; `context` describes the kind of text to produce
synthetic_reviews = openai_generator.generate(
    n_samples=5,
    context="customer product review",
)

# Number the samples from 1 using enumerate's `start` rather than adjusting the index by hand
for i, text in enumerate(synthetic_reviews, start=1):
    print(f"{i}. {text}\n")

## HuggingFace Backend

Local model inference (no API key needed).

In [None]:
# Configure a text generator that uses the HuggingFace backend
hf_generator = LLMTextGenerator(
    backend='huggingface',
    model='gpt2',  # Or 'facebook/opt-350m', 'EleutherAI/gpt-neo-125M'
    temperature=0.8,
    max_length=100,
)

# Fit on the same sample reviews used for the OpenAI generator
hf_generator.fit(reviews['text'])

# No `context` is passed here, unlike the OpenAI example
hf_synthetic = hf_generator.generate(n_samples=5)

# Number the samples from 1 using enumerate's `start` rather than adjusting the index by hand
for i, text in enumerate(hf_synthetic, start=1):
    print(f"{i}. {text}\n")

## Conditional Generation

Generate text with specific attributes.

In [None]:
# Reuse the OpenAI generator fitted earlier; only the `context` string differs per request.
positive_context = "positive 5-star customer review"
negative_context = "negative 1-star customer complaint"

# Positive reviews only
positive_reviews = openai_generator.generate(context=positive_context, n_samples=3)

print("Positive Reviews:")
for text in positive_reviews:
    print(f"- {text}")

# Negative reviews (the leading "\n" in the heading separates the two lists in the output)
negative_reviews = openai_generator.generate(context=negative_context, n_samples=3)

print("\nNegative Reviews:")
for text in negative_reviews:
    print(f"- {text}")

## Privacy-Safe Generation

Filter out personally identifiable information (PII).

In [None]:
# Example inputs that deliberately contain PII: a name with an email address,
# a phone number with a first name, and a social security number.
pii_examples = [
    "Contact John Smith at john.smith@email.com for details.",
    "Call 555-123-4567 to speak with Sarah.",
    "My SSN is 123-45-6789 and I need help.",
]
pii_texts = pd.Series(pii_examples)

# Generator with the privacy filter switched on.
# NOTE(review): presumably `pii_patterns` selects which PII categories get filtered;
# confirm the accepted names against the Genesis documentation.
safe_generator = LLMTextGenerator(
    pii_patterns=['email', 'phone', 'ssn', 'name'],
    privacy_filter=True,
    backend='openai',
)

# Fit on the PII-laden examples, then sample from the fitted generator
safe_generator.fit(pii_texts)
safe_synthetic = safe_generator.generate(n_samples=3)

print("Privacy-filtered synthetic text:")
for text in safe_synthetic:
    print(f"- {text}")