In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/complaints-data/small_complaints_check.csv


In [None]:
pip install -U chromadb langchain transformers sentence-transformers langchain-community

In [2]:
import os
import sys
import pandas as pd

# Load the complaints CSV from the Kaggle input directory into a DataFrame.
complaints_csv_path = '/kaggle/input/complaints-data/small_complaints_check.csv'
df = pd.read_csv(complaints_csv_path)

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [3]:
# Strip every character that is not a word character, whitespace, period, or comma
# from the complaint narratives, writing the cleaned text back to the same column.
narrative_column = "Consumer_complaint_narrative"
cleaned_narratives = df[narrative_column].str.replace(r'[^\w\s.,]', '', regex=True)
df[narrative_column] = cleaned_narratives

In [4]:
# Keep only the complaint identifier and its narrative, dropping any row
# where either of the two values is missing.
selected_columns = ["Complaint_ID", "Consumer_complaint_narrative"]
complaint_data = df.loc[:, selected_columns].dropna(axis="index", how="any")

complaint_data.head()

Unnamed: 0,Complaint_ID,Consumer_complaint_narrative
0,2321847,School balance was turned over to a collection...
1,2382606,I spoke with the bank Representative around XX...
2,2376700,Mailings packet from American Express Gold Car...
3,2274211,Bank of America has illegally placed an inquir...
4,2418769,I had no communication or knowledge of account...


In [5]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Build a Chroma index over the complaint narratives.

# Step 1: Prepare data
# The input is `complaint_data` (Complaint_ID + narrative, rows with missing values dropped).

# Step 2: Initialize the text splitter
# NOTE(review): CharacterTextSplitter presumably splits only on its separator, so a narrative
# without that separator may remain a single chunk longer than chunk_size — confirm in the docs.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=50)

# Step 3: Split the complaint narratives into chunks
# Each chunk also receives a deterministic id "<row label>-<chunk number>".
# Without explicit ids the store generates fresh random ids, so re-running this cell in the
# same kernel would add a second copy of every chunk to the "complaints" collection and
# duplicate search hits. Stable ids let a re-run overwrite the existing entries instead.
# Assumes the index labels of `complaint_data` are unique — TODO confirm.
documents = []
chunk_ids = []
for row_label, row in complaint_data.iterrows():
    chunks = text_splitter.split_text(row["Consumer_complaint_narrative"])
    for chunk_number, chunk in enumerate(chunks):
        documents.append(
            Document(page_content=chunk, metadata={"Complaint_ID": row["Complaint_ID"]})
        )
        chunk_ids.append(f"{row_label}-{chunk_number}")

# Step 4: Set up ChromaDB and embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = Chroma(collection_name="complaints", embedding_function=embedding_model)

# Step 5: Add documents to ChromaDB
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]
db.add_texts(texts=texts, metadatas=metadatas, ids=chunk_ids)

print("Data has been added to ChromaDB successfully.")


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  db = Chroma(collection_name="complaints", embedding_function=embedding_model)


Data has been added to ChromaDB successfully.


In [6]:
# Semantic search: fetch the five chunks closest to a natural-language question.
query = "What complaints are related to delayed payments?"
top_k = 5
results = db.similarity_search(query, k=top_k)

# Print each returned document (chunk text plus its metadata) on its own line.
for hit in results:
    print(hit)

page_content='This item is being reported as 30 days late in XXXXXXXX when payments have never been late.' metadata={'Complaint_ID': 2420135}
page_content='I made a payment over the phone for my auto loan on the day the payment was due and I was assured that the late payment would report as paid on time. The payment was never 30 days late and its been reporting as late since XXXX2014.' metadata={'Complaint_ID': 2284493}
page_content='I have time periods on my credit report where this student loan is being reported as late. I ve never been late on payments and recently paid it off. When I was not actively making payments, I was in deferment. I contac ted USA Funds to resolve the issue and have yet to receive resolution.' metadata={'Complaint_ID': 2422529}
page_content='My loan was in forbearance with Navient from XXXX2011 to XXXX2011 and they reported a 30 day late payment in XXXX2011. This is incorrect and they are hurting my ability to use my credit and need to fix it ASAP.' metadata=

In [17]:
# Retrieve a wider candidate set, this time with a score attached to every hit.
# NOTE: len(results) can never exceed k, so the printed number is the count of
# hits returned, not the number of complaints that concern delayed payments.
query = "What many complaints have delayed payments?"
candidate_count = 50
results = db.similarity_search_with_score(query, k=candidate_count)  # Retrieve top 50 matches
print(len(results))

50


In [20]:
# Flatten the (document, score) pairs from the scored search into one record per hit.
# NOTE(review): "Score" is presumably a distance (lower = closer match) rather than a
# similarity — confirm against the vector store documentation before ranking on it.
data = [
    dict(
        Complaint_ID=doc.metadata["Complaint_ID"],
        Chunk=doc.page_content,
        Score=distance,
    )
    for doc, distance in results
]

# Tabulate the records and display them as the cell output.
results_df = pd.DataFrame(data=data)
results_df

Unnamed: 0,Complaint_ID,Chunk,Score
0,2420135,This item is being reported as 30 days late in...,0.766188
1,2284493,I made a payment over the phone for my auto lo...,0.827877
2,2427046,My loan was in forbearance with Navient from X...,0.917813
3,2270891,They are reporting that i missed a payment on ...,0.928003
4,2422529,I have time periods on my credit report where ...,0.928521
5,2377301,I got a 30 day late XXXX2017 and it s reported...,0.943166
6,2422587,This item is being reported as late on my cred...,0.955112
7,2370828,I was shocked when I reviewed my credit report...,0.965598
8,2415559,I was shocked when I reviewed my credit report...,0.967168
9,2419197,I make complaint about my credit report that w...,0.971709
