# In [None]:
# %% [markdown]
# # Task 2: Text Chunking, Embedding, and Vector Store Indexing
# 
# ## Objective
# Convert the cleaned text narratives into a format suitable for efficient semantic search.

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import json
from typing import List, Dict, Any
import hashlib

# Silence library warnings so the cell output stays readable.
warnings.filterwarnings('ignore')

# Make sure both output locations written to later in this script exist.
for _output_dir in ("data/processed", "vector_store"):
    Path(_output_dir).mkdir(parents=True, exist_ok=True)

# %%
# Read the cleaned complaints written by Task 1 and report their layout.
print("Loading cleaned dataset...")
df = pd.read_csv('data/processed/filtered_complaints.csv')
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# %%
# Count complaints per product category and show the table.
print("Product category distribution:")
category_dist = df['product_category'].value_counts()
display(category_dist)

# Bar chart of the same counts, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
category_dist.plot(kind='bar', ax=ax)
ax.set_title('Complaint Distribution by Product Category')
ax.set_xlabel('Product Category')
ax.set_ylabel('Number of Complaints')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# %%
# Stratified Sampling
print("Creating stratified sample...")

# Target at most 15,000 complaints (the whole dataset when it is smaller).
total_samples = min(15000, len(df))  # Cap at 15,000
print(f"Target sample size: {total_samples}")

# Distinct category labels present in the data.
categories = df['product_category'].unique()
print(f"Unique categories: {categories}")

# Share of the full dataset held by each category.
category_proportions = df['product_category'].value_counts(normalize=True)
print("\nOriginal proportions:")
for category, prop in category_proportions.items():
    print(f"  {category}: {prop:.3f} ({int(prop * len(df))} samples)")

# Give every category its proportional share of the target, rounded down.
samples_per_category = {
    name: int(category_proportions[name] * total_samples)
    for name in categories
}

# Rounding down can leave the total short of the target; any remainder is
# handed to the most frequent category.
total_allocated = sum(samples_per_category.values())
adjustment = total_samples - total_allocated
if adjustment != 0:
    largest_category = category_proportions.idxmax()
    samples_per_category[largest_category] += adjustment

print("\nSampling strategy:")
for category, count in samples_per_category.items():
    print(f"  {category}: {count} samples")

# %%
# Draw the allocated number of rows from each category (fixed seed).
sampled_dfs = []
for category, quota in samples_per_category.items():
    category_df = df[df['product_category'] == category]

    # Never request more rows than the category actually holds.
    n_samples = min(quota, len(category_df))
    if n_samples <= 0:
        continue

    sampled_dfs.append(category_df.sample(n=n_samples, random_state=42))

# Stack the per-category samples into one frame with a fresh index.
sampled_df = pd.concat(sampled_dfs, ignore_index=True)
print(f"\nFinal sampled dataset shape: {sampled_df.shape}")
print(f"Sampling rate: {len(sampled_df)/len(df)*100:.1f}%")

# Report the category mix of the sample next to the absolute row counts.
print("\nSampled distribution:")
sampled_dist = sampled_df['product_category'].value_counts(normalize=True)
sampled_counts = sampled_df['product_category'].value_counts()
for category, prop in sampled_dist.items():
    print(f"  {category}: {prop:.3f} ({sampled_counts[category]} samples)")

# %%
# Persist the sample so later steps can reload it.
sampled_path = 'data/processed/sampled_complaints.csv'
sampled_df.to_csv(sampled_path, index=False)
print(f"\nSaved sampled dataset to: {sampled_path}")

# %%
# Text Chunking Implementation
print("\n" + "="*50)
print("TEXT CHUNKING")
print("="*50)

from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken  # For token counting

# Whitespace-delimited word count of every narrative (values are coerced to
# text first), kept aligned with the sampled frame's index.
text_lengths = pd.Series(
    [len(str(narrative).split()) for narrative in sampled_df['cleaned_narrative']],
    index=sampled_df.index,
    name='cleaned_narrative',
)
print("Text length statistics (words):")
print(f"  Mean: {text_lengths.mean():.1f}")
print(f"  Median: {text_lengths.median():.1f}")
print(f"  Min: {text_lengths.min():.1f}")
print(f"  Max: {text_lengths.max():.1f}")
print(f"  95th percentile: {text_lengths.quantile(0.95):.1f}")

# %%
# Function to count tokens (approximate)
def count_tokens(text, chars_per_token=4):
    """Approximate the number of tokens in ``text`` from its length.

    Uses the rule of thumb that one token is roughly four characters of
    English text; no tokenizer is involved.

    Args:
        text: The string to measure.
        chars_per_token: Assumed average characters per token (default 4).

    Returns:
        ``len(text) // chars_per_token`` as an int.

    Raises:
        ValueError: If ``chars_per_token`` is not positive.
    """
    if chars_per_token <= 0:
        raise ValueError("chars_per_token must be a positive integer")
    return len(text) // chars_per_token

# Pick one narrative to experiment with. Missing values are dropped first:
# a NaN entry is a float, so len() on it would raise; the result is also
# coerced to text, as the other cells in this script do.
_available_narratives = sampled_df['cleaned_narrative'].dropna()
test_text = str(_available_narratives.iloc[0]) if len(_available_narratives) else ""
print("\nTesting chunking on sample text:")
print(f"Original text length: {len(test_text)} characters")
print(f"Approximate tokens: {count_tokens(test_text)}")

# %%
# Splitter configuration.
# Based on the pre-built vector store specifications:
# Chunk size: 500 characters, Overlap: 50 characters
chunk_size = 500
chunk_overlap = 50

# Lengths are measured in characters (len); separators are listed from
# paragraph breaks down to the empty string.
splitter_settings = {
    'chunk_size': chunk_size,
    'chunk_overlap': chunk_overlap,
    'length_function': len,
    'separators': ["\n\n", "\n", ". ", " ", ""],
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Try the splitter on the sample narrative.
test_chunks = text_splitter.split_text(test_text)
print(f"\nNumber of chunks created: {len(test_chunks)}")
print(f"Chunk sizes: {[len(chunk) for chunk in test_chunks]}")

# Show up to three chunks, each truncated to a 200-character preview.
print("\nSample chunks:")
for i, chunk in enumerate(test_chunks[:3]):
    print(f"\nChunk {i+1} ({len(chunk)} chars):")
    if len(chunk) > 200:
        preview = chunk[:200] + "..."
    else:
        preview = chunk
    print(preview)

# %%
# Apply chunking to entire sampled dataset
print("\nApplying chunking to entire dataset...")


def _meta_or_unknown(row, column):
    """Return ``row[column]``, or 'Unknown' if the column is absent or the value is missing."""
    value = row.get(column, 'Unknown')
    # row.get() only falls back when the column does not exist; a NaN cell
    # in an existing column must be mapped to the default explicitly.
    return 'Unknown' if pd.isna(value) else value


chunks = []      # chunk texts, in creation order
metadata = []    # one dict per chunk, parallel to `chunks`
skipped_complaints = 0

for idx, row in sampled_df.iterrows():
    narrative = row['cleaned_narrative']

    # A missing narrative is NaN; str(NaN) would be the literal text "nan",
    # which is not blank and would otherwise be indexed as a real chunk.
    if pd.isna(narrative):
        skipped_complaints += 1
        continue

    text = str(narrative)

    # Skip empty texts
    if not text.strip():
        skipped_complaints += 1
        continue

    # Split text into chunks
    text_chunks = text_splitter.split_text(text)

    # Create metadata for each chunk
    for chunk_idx, chunk in enumerate(text_chunks):
        chunks.append(chunk)

        metadata.append({
            'complaint_id': row.get('Complaint ID', f'id_{idx}'),
            'product_category': row['product_category'],
            'product': _meta_or_unknown(row, 'Product'),
            'issue': _meta_or_unknown(row, 'Issue'),
            'sub_issue': _meta_or_unknown(row, 'Sub-issue'),
            'company': _meta_or_unknown(row, 'Company'),
            'state': _meta_or_unknown(row, 'State'),
            'date_received': _meta_or_unknown(row, 'Date received'),
            'chunk_index': chunk_idx,
            'total_chunks': len(text_chunks),
            'original_length': len(text)
        })

print(f"Total complaints processed: {len(sampled_df)}")
if skipped_complaints:
    print(f"Complaints skipped (missing or empty narrative): {skipped_complaints}")
print(f"Total chunks created: {len(chunks)}")
print(f"Average chunks per complaint: {len(chunks)/len(sampled_df):.2f}")

# %%
# Character length of every chunk, then summary statistics.
chunk_lengths = list(map(len, chunks))
print("\nChunk Statistics:")
print(f"Mean chunk length: {np.mean(chunk_lengths):.1f} characters")
print(f"Median chunk length: {np.median(chunk_lengths):.1f} characters")
print(f"Min chunk length: {np.min(chunk_lengths):.1f} characters")
print(f"Max chunk length: {np.max(chunk_lengths):.1f} characters")

# Two panels side by side: overall histogram and per-category box plot.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: length histogram with the configured chunk size marked.
hist_ax.hist(chunk_lengths, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('Chunk Length (characters)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Chunk Lengths')
hist_ax.axvline(chunk_size, color='red', linestyle='--', label=f'Target: {chunk_size}')
hist_ax.legend()

# Right panel: box plot by product category, ordered by median length.
chunk_df = pd.DataFrame({
    'chunk': chunks,
    'length': chunk_lengths,
    'category': [m['product_category'] for m in metadata]
})
median_by_category = chunk_df.groupby('category')['length'].median()
category_order = median_by_category.sort_values().index
sns.boxplot(data=chunk_df, x='category', y='length', order=category_order, ax=box_ax)
box_ax.set_xlabel('Product Category')
box_ax.set_ylabel('Chunk Length')
box_ax.set_title('Chunk Length by Product Category')
plt.setp(box_ax.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()

# %%
# Embedding Model Selection
print("\n" + "="*50)
print("EMBEDDING MODEL")
print("="*50)

from sentence_transformers import SentenceTransformer
import torch

# Choose embedding model
# We'll use all-MiniLM-L6-v2 as specified in requirements
model_name = "all-MiniLM-L6-v2"
print(f"Selected model: {model_name}")

# Model specifications:
# - 384 dimensions
# - Good balance of speed and quality
# - Well-suited for semantic search
# - Approximately 80MB in size

print("\nLoading embedding model...")
try:
    model = SentenceTransformer(model_name)
    print("Model loaded successfully!")

    # Smoke-test the model on two short sentences and report the output shape.
    test_sentences = ["Customer complaint about credit card fees",
                      "Issue with money transfer service"]
    test_embeddings = model.encode(test_sentences)
    print(f"Embedding shape: {test_embeddings.shape}")
    print(f"Number of dimensions: {test_embeddings.shape[1]}")

except Exception as e:
    # No alternative model is configured: this branch reloads the SAME model
    # once. It goes through `model_name` so the model in use always matches
    # the name recorded in the saved configuration.
    print(f"Error loading model: {e}")
    print(f"Retrying load of '{model_name}' (no alternative fallback model is configured)...")
    model = SentenceTransformer(model_name)

# %%
# Encode every chunk with the embedding model.
print("\nGenerating embeddings for chunks...")

# Work through the chunks in fixed-size slices to avoid memory issues.
batch_size = 128
embeddings = []

for batch_number, i in enumerate(range(0, len(chunks), batch_size)):
    batch = chunks[i:i + batch_size]
    embeddings.extend(model.encode(batch, show_progress_bar=False))

    # Progress line on every tenth batch (including the first).
    if batch_number % 10 == 0:
        print(f"Processed {min(i + batch_size, len(chunks))}/{len(chunks)} chunks")

# Stack the per-chunk vectors into a single array.
embeddings_array = np.array(embeddings)
print(f"\nEmbeddings shape: {embeddings_array.shape}")
print(f"Memory size: {embeddings_array.nbytes / 1024 / 1024:.2f} MB")

# %%
# Vector Store Indexing
print("\n" + "="*50)
print("VECTOR STORE INDEXING")
print("="*50)

# Option 1: Using ChromaDB
# Builds a persistent collection from the chunks, their metadata and the
# precomputed embeddings, then runs one test query. Only a missing chromadb
# package is tolerated; any other failure propagates.
try:
    import chromadb
    from chromadb.config import Settings
    
    print("Creating ChromaDB vector store...")
    
    # Initialize ChromaDB client
    chroma_client = chromadb.PersistentClient(path="vector_store/chroma_db")
    
    # Create or get collection
    collection_name = "complaint_chunks"
    
    # Check if collection exists.
    # NOTE(review): depending on the installed ChromaDB release,
    # list_collections() yields Collection objects or plain name strings
    # (see the Chroma migration notes), so both forms are accepted here.
    existing_collections = chroma_client.list_collections()
    existing_names = [
        col if isinstance(col, str) else col.name
        for col in existing_collections
    ]
    if collection_name in existing_names:
        print(f"Collection '{collection_name}' already exists, recreating...")
        chroma_client.delete_collection(collection_name)
    
    collection = chroma_client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"}  # Using cosine similarity
    )
    
    # Prepare data for ChromaDB
    print("Adding documents to ChromaDB...")
    
    # Add in batches; ids are derived from the chunk's position in `chunks`.
    batch_size = 1000
    for i in range(0, len(chunks), batch_size):
        batch_end = min(i + batch_size, len(chunks))
        
        ids = [f"chunk_{j}" for j in range(i, batch_end)]
        documents = chunks[i:batch_end]
        metadatas = metadata[i:batch_end]
        embeddings_batch = embeddings_array[i:batch_end].tolist()
        
        collection.add(
            ids=ids,
            documents=documents,
            metadatas=metadatas,
            embeddings=embeddings_batch
        )
        
        if (i // batch_size) % 10 == 0:
            print(f"Added {batch_end}/{len(chunks)} chunks")
    
    print("ChromaDB collection created successfully!")
    
    # Test retrieval with a query embedded by the same model.
    test_query = "credit card fees issue"
    test_query_embedding = model.encode(test_query).tolist()
    
    results = collection.query(
        query_embeddings=[test_query_embedding],
        n_results=3
    )
    
    print("\nTest retrieval results:")
    for i, doc in enumerate(results['documents'][0]):
        print(f"\nResult {i+1}:")
        print(f"Document: {doc[:200]}...")
        print(f"Metadata: {results['metadatas'][0][i]}")
    
except ImportError:
    print("ChromaDB not available, using FAISS...")

# %%
# Option 2: Using FAISS (if ChromaDB fails or as alternative)
import faiss

print("\nCreating FAISS vector store...")

# L2-normalise the embeddings in place so that the inner-product index
# below scores by cosine similarity.
faiss.normalize_L2(embeddings_array)

# Flat inner-product index sized to the embedding width.
dimension = embeddings_array.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner product (cosine similarity)
index.add(embeddings_array)
print(f"FAISS index created with {index.ntotal} vectors")

# Persist the index together with the chunk metadata and chunk texts.
faiss.write_index(index, "vector_store/faiss_index.bin")

metadata_df = pd.DataFrame(metadata)
chunks_df = pd.DataFrame({"chunk": chunks})
metadata_df.to_parquet("vector_store/chunk_metadata.parquet", index=False)
chunks_df.to_parquet("vector_store/chunks.parquet", index=False)

print("FAISS index and metadata saved successfully!")

# %%
# Test FAISS retrieval
print("\nTesting FAISS retrieval...")

# Embed and normalise the query the same way the indexed vectors were.
test_query = "high interest rates on personal loans"
test_query_embedding = model.encode([test_query])
faiss.normalize_L2(test_query_embedding)

# Search
k = 3
distances, indices = index.search(test_query_embedding, k)

print(f"Query: '{test_query}'")
print("\nTop results:")
for i, (distance, idx) in enumerate(zip(distances[0], indices[0])):
    # FAISS reports -1 for slots it could not fill (fewer than k vectors in
    # the index); chunks[-1] would silently display the last chunk instead.
    if idx < 0:
        continue
    print(f"\nResult {i+1} (similarity: {distance:.4f}):")
    print(f"Chunk: {chunks[idx][:200]}...")
    print(f"Metadata: {metadata[idx]['product_category']} - {metadata[idx]['issue']}")

# %%
# Print headline numbers for the vector store built above.
print("\n" + "="*50)
print("VECTOR STORE SUMMARY")
print("="*50)

print(f"Total complaints in sample: {len(sampled_df)}")
print(f"Total chunks created: {len(chunks)}")
print(f"Embedding dimension: {dimension}")
print(f"Vector store size: {embeddings_array.nbytes / 1024 / 1024:.2f} MB")

# Tally chunks per product category from the chunk metadata.
category_counts = {}
for meta in metadata:
    category = meta['product_category']
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# Largest categories first.
ranked_categories = sorted(
    category_counts.items(), key=lambda pair: pair[1], reverse=True
)

print("\nChunks by product category:")
for category, count in ranked_categories:
    print(f"  {category}: {count} chunks ({count/len(chunks)*100:.1f}%)")

# %%
# Record the parameters used to build the vector store.
config = {
    "model_name": model_name,
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "total_chunks": len(chunks),
    "embedding_dimension": dimension,
    "sampling_strategy": "stratified",
    "sample_size": len(sampled_df),
    "total_complaints": len(df),
    "vector_store_type": "FAISS",
    "similarity_metric": "cosine"
}

# Serialise first, then write the resulting text in one call.
config_text = json.dumps(config, indent=2)
with open("vector_store/config.json", "w") as f:
    f.write(config_text)

print("\nConfiguration saved to vector_store/config.json")

# %%
# Create a simple retrieval function for testing
def retrieve_similar_chunks(query, k=5):
    """
    Retrieve the chunks most similar to a query from the FAISS index.

    Relies on the module-level ``model``, ``index``, ``chunks`` and
    ``metadata`` objects created earlier in this script.

    Args:
        query: Query text to embed and search for.
        k: Maximum number of results to return.

    Returns:
        A list of dicts with keys ``rank`` (1-based), ``similarity`` (the
        index score as a float), ``chunk`` (chunk text) and ``metadata``
        (that chunk's metadata dict). The list is shorter than ``k`` when
        the index holds fewer vectors, and empty when ``k <= 0`` or the
        index is empty.
    """
    if k <= 0 or index.ntotal == 0:
        return []

    # Encode and normalise the query the same way the indexed vectors were.
    query_embedding = model.encode([query])
    faiss.normalize_L2(query_embedding)

    # Never ask for more neighbours than the index contains.
    distances, indices = index.search(query_embedding, min(k, index.ntotal))

    # Prepare results
    results = []
    for distance, idx in zip(distances[0], indices[0]):
        # FAISS marks unfilled result slots with -1; indexing chunks[-1]
        # would silently return the last chunk, so those slots are skipped.
        if idx < 0:
            continue
        position = int(idx)
        results.append({
            'rank': len(results) + 1,
            'similarity': float(distance),
            'chunk': chunks[position],
            'metadata': metadata[position]
        })

    return results

# Run a few example queries through the retrieval helper.
print("\nTesting retrieval function...")
test_queries = [
    "credit card annual fee too high",
    "money transfer took too long",
    "savings account interest rate low"
]

for query in test_queries:
    print(f"\nQuery: '{query}'")
    results = retrieve_similar_chunks(query, k=2)
    # One line per hit: product category plus the first 100 characters.
    hit_lines = [
        f"  - {result['metadata']['product_category']}: {result['chunk'][:100]}..."
        for result in results
    ]
    for hit_line in hit_lines:
        print(hit_line)