# In [None]:
#1️⃣ Load CFPB Complaint Dataset
import pandas as pd
import matplotlib.pyplot as plt
import re

# Load the raw CFPB complaint export. low_memory=False makes pandas parse the
# file in one pass, so each column's dtype is inferred from the whole column
# rather than chunk by chunk (which can yield mixed-type columns).
df = pd.read_csv("data/raw/cfpb_complaints.csv", low_memory=False)

# A bare expression is only echoed when it is the last statement of a notebook
# cell, so print explicitly to make the preview visible wherever this runs.
print(df.shape)
print(df.head())

# 2️⃣ Initial EDA (structure & quality)
# DataFrame.info() writes its summary to stdout on its own.
df.info()

# The ten columns with the most missing values. Printed explicitly because a
# bare expression in the middle of a cell/script is silently discarded.
missing_counts = df.isna().sum().sort_values(ascending=False)
print(missing_counts.head(10))


# Key columns used throughout the analysis.
TEXT_COL = "Consumer complaint narrative"
PRODUCT_COL = "Product"

# 3️⃣ Distribution of complaints by product
# value_counts() returns the products ordered from most to least frequent.
product_counts = df[PRODUCT_COL].value_counts()

# Open a fresh figure, draw the bars on it, then label the returned axes.
plt.figure()
ax = product_counts.plot.bar()
ax.set_title("Complaint Distribution by Product")
ax.set_xlabel("Product")
ax.set_ylabel("Number of Complaints")
plt.show()

# 4️⃣ Narrative length analysis (word count)
# Missing narratives are NaN; casting them with astype(str) alone would turn
# each one into the literal string "nan" and count it as a 1-word narrative.
# Replace them with "" first so they get a word count of 0.
df["word_count"] = df[TEXT_COL].fillna("").astype(str).str.split().str.len()

# Analyse only complaints that actually have a narrative; otherwise the rows
# without one would pile up in the first bin and skew the summary statistics.
narrative_word_counts = df.loc[df[TEXT_COL].notna(), "word_count"]

plt.figure()
narrative_word_counts.hist(bins=50)
plt.title("Distribution of Complaint Narrative Length")
plt.xlabel("Word Count")
plt.ylabel("Frequency")
plt.show()

# Printed explicitly: a bare expression mid-cell would be discarded.
print(narrative_word_counts.describe())


# ✔ The word-count distribution reveals both extremes:
#   - very short narratives (under 20 words)
#   - extremely long narratives (1,000+ words)

# 5️⃣ Complaints with vs. without narratives
# int() converts the NumPy integers returned by sum() to plain Python ints.
with_narrative = int(df[TEXT_COL].notna().sum())
without_narrative = int(df[TEXT_COL].isna().sum())

# Printed explicitly: a bare tuple expression mid-cell would be discarded.
print(f"Complaints with narrative:    {with_narrative:,}")
print(f"Complaints without narrative: {without_narrative:,}")

# 6️⃣ Filter dataset (project requirements)
# ✅ Products in scope for this project.
# NOTE(review): isin() matches labels exactly (case- and wording-sensitive) —
# confirm these strings match the values actually present in df[PRODUCT_COL].
ALLOWED_PRODUCTS = [
    "Credit card",
    "Personal loan",
    "Savings account",
    "Money transfers",
]

# Keep only in-scope products that come with a narrative. .copy() detaches the
# result from df so new columns can be added to it without pandas warning
# about writing to a slice of another frame.
in_scope = df[PRODUCT_COL].isin(ALLOWED_PRODUCTS)
has_narrative = df[TEXT_COL].notna()
filtered_df = df[in_scope & has_narrative].copy()

# Printed explicitly: a bare expression mid-cell would be discarded.
print(filtered_df.shape)

#7️⃣ Text Cleaning for Embedding Quality
# Patterns are compiled once here rather than looked up on every call.
# Boilerplate opening sentence: runs up to the first period that is followed
# by whitespace or by the end of the text ('.' does not cross line breaks).
_BOILERPLATE_RE = re.compile(r"i am writing to file a complaint.*?\.(?:\s+|$)")
# Anything that is not a lowercase letter, a digit, or whitespace.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")
# Runs of whitespace (spaces, tabs, newlines).
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalize a complaint narrative for embedding.

    Steps, in order: lowercase the text, drop the boilerplate sentence that
    starts with "i am writing to file a complaint", replace every character
    other than a-z, 0-9 and whitespace with a space, then collapse runs of
    whitespace and trim both ends.

    Args:
        text: The raw narrative. Anything that is not a ``str`` (for example
            ``None`` or a NaN float from pandas) is treated as missing.

    Returns:
        The cleaned narrative, or ``""`` when the input is missing or nothing
        is left after cleaning.
    """
    if not isinstance(text, str):
        # Missing values would otherwise raise AttributeError on .lower().
        return ""

    text = text.lower()
    text = _BOILERPLATE_RE.sub("", text)
    text = _NON_ALNUM_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()

filtered_df["clean_narrative"] = filtered_df[TEXT_COL].apply(clean_text)

# (Optional extra normalization such as stopword removal is not required for
# RAG and is usually avoided.)

# 8️⃣ Save final cleaned dataset (DELIVERABLE)
output_path = "data/filtered_complaints.csv"

# A narrative made up solely of boilerplate/punctuation cleans down to "".
# An empty CSV field is read back as NaN by pandas' default settings, so leave
# those rows out of the export. filtered_df itself is left untouched.
has_clean_text = filtered_df["clean_narrative"].str.len() > 0
# Use PRODUCT_COL rather than repeating the column name as a literal.
export_df = filtered_df.loc[has_clean_text, [PRODUCT_COL, "clean_narrative"]]
export_df.to_csv(output_path, index=False)

print(f"Saved cleaned dataset to {output_path}")


# ✅ This output file is a mandatory project deliverable.