In [6]:
# Data handling
import pandas as pd
import numpy as np

# Text chunking
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Embeddings
from sentence_transformers import SentenceTransformer

# Vector store
import faiss
import pickle
import os

# For reproducibility
np.random.seed(42)


In [7]:
# Load the filtered complaints dataset from Task 1
df = pd.read_csv("../data/processed/filtered_complaints.csv")

# Check columns and size
print(df.shape)
df.head()


(80667, 20)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,word_count,clean_text
0,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78230,Servicemember,Consent provided,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,91,a xxxx xxxx card was opened under my name by a...
1,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,11220,,Consent provided,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,156,dear cfpb i have a secured credit card with ci...
2,2025-06-12,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,60067,,Consent provided,Web,2025-06-12,Closed with explanation,Yes,,14040217,233,i have a citi rewards cards the credit balance...
3,2025-06-09,Credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b'I am writing to dispute the following charge...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78413,Older American,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13968411,454,bi am writing to dispute the following charges...
4,2025-06-09,Credit card,General-purpose credit card or charge card,Problem when making payments,Problem during payment process,"Although the account had been deemed closed, I...",Company believes it acted appropriately as aut...,Atlanticus Services Corporation,NY,11212,Older American,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13965746,170,although the account had been deemed closed i ...


In [8]:
# Sample 12,000 complaints stratified by product
sample_size = 12000
products = df['Product'].unique()
sampled_df = df.groupby('Product', group_keys=False).apply(
    lambda x: x.sample(frac=min(1, sample_size/len(df)), random_state=42)
)

print(sampled_df['Product'].value_counts())
sampled_df.shape


Product
Credit card    12000
Name: count, dtype: int64


  sampled_df = df.groupby('Product', group_keys=False).apply(


(12000, 20)

In [9]:
# Initialize LangChain text splitter
chunk_size = 500      # characters per chunk
chunk_overlap = 50    # overlap between chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

# Split complaints into chunks
all_chunks = []
metadata = []

for idx, row in sampled_df.iterrows():
    chunks = text_splitter.split_text(row['clean_text'])
    all_chunks.extend(chunks)
    metadata.extend([{
        "complaint_id": row['Complaint ID'],
        "product": row['Product']
    }]*len(chunks))

print(f"Total text chunks: {len(all_chunks)}")


Total text chunks: 34010


In [10]:
# Load pre-trained sentence transformer model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = SentenceTransformer(model_name)

# Generate embeddings for all chunks
embeddings = embed_model.encode(all_chunks, show_progress_bar=True)
print(f"Embeddings shape: {embeddings.shape}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1063 [00:00<?, ?it/s]

Embeddings shape: (34010, 384)


In [11]:
# Dimension of embeddings
dim = embeddings.shape[1]

# Initialize FAISS index
index = faiss.IndexFlatL2(dim)
index.add(np.array(embeddings, dtype='float32'))

# Save index and metadata
os.makedirs("../vector_store", exist_ok=True)
faiss.write_index(index, "../vector_store/faiss_index.index")

with open("../vector_store/metadata.pkl", "wb") as f:
    pickle.dump(metadata, f)

print("FAISS index and metadata saved in vector_store/")


FAISS index and metadata saved in vector_store/


In [12]:
# Example: retrieve top 3 similar complaints for a query
query = "Unauthorized credit card opened"
query_emb = embed_model.encode([query])

D, I = index.search(np.array(query_emb, dtype='float32'), k=3)

for i, idx in enumerate(I[0]):
    print(f"\nResult {i+1}:")
    print("Product:", metadata[idx]['product'])
    print("Complaint ID:", metadata[idx]['complaint_id'])
    print("Text chunk:", all_chunks[idx][:300], "...")



Result 1:
Product: Credit card
Complaint ID: 9147642
Text chunk: credit card opened without my consent due to identity theft ...

Result 2:
Product: Credit card
Complaint ID: 8455611
Text chunk: fraudulent credit card that was opened without my consent then sold to a collection called xxxx xxxx ...

Result 3:
Product: Credit card
Complaint ID: 7976487
Text chunk: i was xxxx in xxxx when this card was opened and another was opened recently ive advised the company i didnt open this and it wasnt authorized by me they werent even mailed to me i requested the addresses and application information and removal from credit reporting but they just opened another card ...
