In [3]:
# Inline chunking experiment: split the complaints with several
# (chunk_size, chunk_overlap) settings, report basic statistics for each,
# and write every result to its own CSV for later inspection.
# NOTE(review): this cell uses `pd` and `chunk_complaints` without defining
# them, so it relies on an earlier cell having provided both — confirm it
# still runs after Restart Kernel -> Run All.
df = pd.read_csv("../data/filtered_complaints_2.csv")

# Three candidate setups, written as (chunk_size, chunk_overlap) pairs.
configurations = [
    {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    for chunk_size, chunk_overlap in [(300, 50), (500, 100), (700, 150)]
]

# Evaluate the setups one by one so their statistics can be compared.
for config in configurations:
    size, overlap = config["chunk_size"], config["chunk_overlap"]

    print(f"\n🔍 Testing chunk_size={size}, chunk_overlap={overlap}")
    chunked_df = chunk_complaints(df, size, overlap)

    # Number of rows produced by the chunker for this setup.
    total_chunks = len(chunked_df)
    print(f"Total chunks: {total_chunks}")

    # Mean length of the entries in the 'text_chunk' column.
    mean_chunk_length = chunked_df['text_chunk'].apply(len).mean()
    print(f"Average chunk length: {mean_chunk_length:.2f} characters")

    # One CSV per setup; the file name encodes the size and overlap used.
    filename = f"../data/chunked_complaints_{size}_{overlap}.csv"
    chunked_df.to_csv(filename, index=False)
    print(f"✅ Saved to {filename}")



🔍 Testing chunk_size=300, chunk_overlap=50
Total chunks: 2143457
Average chunk length: 269.93 characters
✅ Saved to ../data/chunked_complaints_300_50.csv

🔍 Testing chunk_size=500, chunk_overlap=100
Total chunks: 1381982
Average chunk length: 426.99 characters
✅ Saved to ../data/chunked_complaints_500_100.csv

🔍 Testing chunk_size=700, chunk_overlap=150
Total chunks: 1034331
Average chunk length: 566.16 characters
✅ Saved to ../data/chunked_complaints_700_150.csv


In [1]:
# Run the chunking experiments through the reusable `chunking` module
# instead of an inline loop.

# Step 1: Imports (standard library first, then third-party)
import sys

import pandas as pd

# Step 2: Add src to sys.path so the project module can be imported.
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate "../src" entry each time.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Step 3: Import the custom module (must come after the sys.path update)
from chunking import run_chunking_experiments

# Step 4: Load the filtered complaint data
df = pd.read_csv("../data/filtered_complaints_2.csv")

# Step 5: Define the (chunk_size, chunk_overlap) setups to compare and run
configurations = [
    {"chunk_size": 300, "chunk_overlap": 50},
    {"chunk_size": 500, "chunk_overlap": 100},
    {"chunk_size": 700, "chunk_overlap": 150}
]

run_chunking_experiments(df, configurations)



🔍 Testing chunk_size=300, chunk_overlap=50
Total chunks: 2143457
Average chunk length: 269.93 characters
✅ Saved to ../data/chunked_complaints_300_50.csv

🔍 Testing chunk_size=500, chunk_overlap=100
Total chunks: 1381982
Average chunk length: 426.99 characters
✅ Saved to ../data/chunked_complaints_500_100.csv

🔍 Testing chunk_size=700, chunk_overlap=150
Total chunks: 1034331
Average chunk length: 566.16 characters
✅ Saved to ../data/chunked_complaints_700_150.csv


### 🧪 Chunking Configuration Experiments

| Chunk Size | Overlap | Total Chunks | Avg Length (chars) |
|------------|---------|---------------|---------------------|
| **300**    | 50      | 2,143,457     | 269.93              |
| **500**    | 100     | 1,381,982     | 426.99              |
| **700**    | 150     | 1,034,331     | 566.16              |

---

### ✅ Final Selection

We selected:

- `chunk_size = 500`
- `chunk_overlap = 100`

This configuration strikes a **good balance** between:

- **Semantic coherence**
- **Retrieval performance**
- **Storage cost**

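It retains enough context for meaningful retrieval while avoiding excessive fragmentation or memory usage during vector indexing. Compared with the 300/50 setup, it produces about 36% fewer chunks (1,381,982 vs. 2,143,457), so there are fewer vectors to store and index; compared with the 700/150 setup, its chunks are shorter on average (426.99 vs. 566.16 characters), so each retrieved chunk stays more focused. Note that the table above reports only chunk counts and average lengths, so the balance described here is a judgment based on those numbers.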
