In [None]:
# Step 1: Load necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os


In [None]:
# Step 2: Load the dataset
DATA_PATH = 'c:/data science/10 academy/week-6/Technical content/Data/complaint.xlsx'

# The path carries an .xlsx extension, and pd.read_csv cannot parse a real Excel
# workbook. XLSX workbooks are ZIP containers, which begin with the bytes
# b'PK\x03\x04', so sniff the file header and pick the matching reader. If the
# file is actually delimited text saved under a misleading extension, fall back
# to read_csv so the previous behavior is kept.
# NOTE(review): pd.read_excel reads the first sheet by default and needs an
# Excel engine (e.g. openpyxl) installed - confirm both suit this dataset.
with open(DATA_PATH, 'rb') as data_file:
    is_excel_workbook = data_file.read(4) == b'PK\x03\x04'

df = pd.read_excel(DATA_PATH) if is_excel_workbook else pd.read_csv(DATA_PATH)

In [None]:
# Step 3: Initial EDA
print(df.columns)

# Check for missing values.
# Only the last expression of a cell is rendered automatically, so the per-column
# null counts must be printed explicitly or they are silently discarded.
print(df.isnull().sum())

# Number of complaints with and without narratives
narrative_present = df['Consumer complaint narrative'].notnull()
has_narrative = narrative_present.sum()
no_narrative = (~narrative_present).sum()

print(f"Complaints with narrative: {has_narrative}")
print(f"Complaints without narrative: {no_narrative}")

In [None]:
# Step 4: Analyze product distribution
# Complaint counts per product, most frequent first (value_counts sorts descending).
top_products = df['Product'].value_counts()
top_ten = top_products.head(10)

# Bar chart of the ten products with the most complaints.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_ten.index, y=top_ten.values, ax=ax)
ax.set_title("Top 10 Products by Complaint Count")
plt.xticks(rotation=45)
ax.set_ylabel("Number of Complaints")
fig.tight_layout()
plt.show()

In [None]:
# Step 5: Analyze narrative length
# Word count (whitespace-separated tokens) for every complaint that has a
# narrative; rows without one stay NaN after index alignment on assignment.
narratives = df['Consumer complaint narrative'].dropna()
df['narrative_length'] = narratives.apply(lambda text: len(text.split()))

word_counts = df['narrative_length'].dropna()

# Histogram of word counts with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(word_counts, bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Narrative Word Count")
ax.set_xlabel("Word Count")
ax.set_ylabel("Frequency")
plt.show()

print("Shortest narrative:", word_counts.min())
print("Longest narrative:", word_counts.max())

In [None]:
# Step 6: Filter dataset for specific products
# Product labels to keep; matching below is exact and case-sensitive.
TARGET_PRODUCTS = [
    'Credit card',
    'Personal loan',
    'Buy Now, Pay Later (BNPL)',
    'Savings account',
    'Money transfer, virtual currency, or money service',
]

# Strip surrounding whitespace from the Product column so padded labels still match.
df['Product'] = df['Product'].str.strip()

# Keep rows that belong to a target product AND carry a narrative.
is_target_product = df['Product'].isin(TARGET_PRODUCTS)
has_text = df['Consumer complaint narrative'].notnull()
filtered_df = df[is_target_product & has_text].copy()

print("Filtered dataset shape:", filtered_df.shape)

In [None]:
# Step 7: Clean narratives
# Patterns and phrases are defined once here instead of on every call.
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Common boilerplate (optional); matched against lowercased, punctuation-free text.
_BOILERPLATE_PHRASES = (
    "i am writing to file a complaint",
    "i would like to report",
    "this is regarding",
)


def clean_text(text):
    """Normalize a complaint narrative for downstream text processing.

    Steps: lowercase, drop every character that is not a-z, 0-9 or whitespace,
    collapse whitespace runs (including newlines) to single spaces, remove the
    boilerplate phrases, then collapse whitespace again and trim.

    Args:
        text: The raw narrative. Non-string values (e.g. None or NaN from a
            missing cell) are treated as empty.

    Returns:
        The cleaned narrative as a string; '' for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _NON_ALNUM_RE.sub('', text)
    # Collapse whitespace before matching so phrases split across newlines or
    # padded with extra spaces are still found.
    text = _WHITESPACE_RE.sub(' ', text)
    for phrase in _BOILERPLATE_PHRASES:
        text = text.replace(phrase, '')
    # Removing a phrase from the middle of the text leaves two adjacent spaces;
    # collapse again so the result never contains doubled whitespace.
    text = _WHITESPACE_RE.sub(' ', text)
    return text.strip()

# Run clean_text over every raw narrative and keep the result in a new column.
raw_narratives = filtered_df['Consumer complaint narrative']
filtered_df['cleaned_narrative'] = raw_narratives.apply(clean_text)


In [None]:
# Step 8: Save cleaned dataset
OUTPUT_PATH = '../data/filtered_complaints.csv'

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
filtered_df.to_csv(OUTPUT_PATH, index=False)

# Report the path that was actually written (relative to the notebook's working directory).
print(f"Saved to {OUTPUT_PATH}")