In [None]:
pip install pandas

In [1]:
import pandas as pd

# Load the CSV data
data = pd.read_csv("10cus.csv")

# Check the first few rows of the dataset
print(data.head())

# Optionally, preprocess the data to create a feature set
# For example, we can convert categorical columns into numerical labels
# This would normally be necessary for embedding or more sophisticated processing


  CustomerID  Age  Gender MaritalStatus               Occupation  IncomeLevel  \
0   CUST0001   44    Male       Widowed  Private Sector Employee        54278   
1   CUST0002   40  Female        Single      Government Employee        61471   
2   CUST0003   43    Male        Single  Private Sector Employee        80036   
3   CUST0004   21    Male      Divorced                  Student         2431   
4   CUST0005   41  Female       Widowed      Government Employee        51298   

   CreditLimit  CreditScore CardType  YearsWithBank  NumberOfCreditCards  \
0        11254          645     Gold              6                    2   
1         5371          505   Silver             10                    2   
2         5708          341   Silver              2                    2   
3         4642          721     Gold              2                    2   
4         7360          520   Silver              9                    1   

   AverageMonthlySpending  LatePayments  CreditCardUsage

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()
api_key = os.getenv("EMBEDDING_KEY")
endpoint = os.getenv("EMBEDDING_END_POINT")

In [26]:
import pandas as pd
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma

In [27]:
def batch_documents(documents, batch_size):
    total_docs = len(documents)
    for i in range(0, total_docs, batch_size):
        yield documents[i:i + batch_size]


In [28]:

def add_documents_to_vectorstore_in_batches(documents, vectorstore, batch_size=64):
    for batch in batch_documents(documents, batch_size):
        texts = [doc.page_content for doc in batch]
        metadatas = [doc.metadata for doc in batch]
        vectorstore.add_texts(texts=texts, metadatas=metadatas)

In [30]:
# Load the CSV data
data = pd.read_csv('10cus.csv')

# Prepare the list of documents
documents = []

for _, row in data.iterrows():
    customer_info = f"""
    Customer ID: {row['CustomerID']}
    Age: {row['Age']}
    Gender: {row['Gender']}
    Marital Status: {row['MaritalStatus']}
    Occupation: {row['Occupation']}
    Income Level: {row['IncomeLevel']}
    Credit Limit: {row['CreditLimit']}
    Credit Score: {row['CreditScore']}
    Card Type: {row['CardType']}
    Years With Bank: {row['YearsWithBank']}
    Number of Credit Cards: {row['NumberOfCreditCards']}
    Average Monthly Spending: {row['AverageMonthlySpending']}
    Late Payments: {row['LatePayments']}
    Credit Card Usage: {row['CreditCardUsage']}
    Mobile Banking Usage: {row['MobileBankingUsage']}
    Customer Satisfaction Rating: {row['CustomerSatisfactionRating']}
    """
    # Create a Document object
    doc = Document(page_content=customer_info, metadata={"CustomerID": row['CustomerID']})
    documents.append(doc)


In [37]:
# Initialize embedding
embeddings = OpenAIEmbeddings(
    openai_api_key=api_key,
    openai_api_base=f"{endpoint}/openai/deployments",  # Ensure correct formatting
    model="text-embedding-ada-002",
    api_version="2024-04-01-preview"
)
# Initialize vector store

CHROMA_DATA_PATH = "./chroma_data"
os.makedirs(CHROMA_DATA_PATH, exist_ok=True)

vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory=CHROMA_DATA_PATH
)

In [38]:
# Add documents to the vector store in batches
batch_size = 64  # Adjust as needed
add_documents_to_vectorstore_in_batches (documents, vectorstore, batch_size)

print("Documents have been processed and stored in the vector store.")


NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}

In [2]:
import pandas as pd
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma

# Initialize embedding
embeddings = OllamaEmbeddings(model="llama3.2", base_url="http://ollama_container:11434")

# Initialize vector store
CHROMA_DATA_PATH = "/data"
os.makedirs(CHROMA_DATA_PATH, exist_ok=True)
vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory=CHROMA_DATA_PATH
)

In [3]:


# Function to search for risky customers based on similarity threshold
def search_risk_in_vectorstore(vectorstore, new_customer_info, embeddings, threshold=0.8):
    # Embed the new customer info using the same embedding model
    new_customer_embedding = embeddings.embed_query(new_customer_info)  # FIX: Use embed_query for embedding
    
    # Search the vector store for similar documents (risk threshold can be adjusted)
    results = vectorstore.similarity_search_with_score(query=new_customer_info)
    
    # Filter results based on similarity threshold
    risky_customers = [result for result in results if result[1] >= threshold]
    
    if risky_customers:
        print(f"Found {len(risky_customers)} similar customers at or above the risk threshold of {threshold}.")
        for i, (doc, score) in enumerate(risky_customers, 1):
            print(f"Risky customer {i} with similarity score {score}:")
            print(f"Customer ID: {doc.metadata['CustomerID']}")
            print(doc.page_content)
    else:
        print("No customers found that are similar enough to indicate risk.")
    
    return risky_customers

# Step 4: Perform risk search for a new customer profile
new_customer_info = """
Customer ID: CUST0101
Age: 35
Gender: Female
Marital Status: Single
Occupation: Private Sector Employee
Income Level: 25000000
Credit Limit: 8000000
Credit Score: 550
Card Type: Gold
Years With Bank: 5
Number of Credit Cards: 2
Average Monthly Spending: 1500000
Late Payments: 6
Credit Card Usage: 0.75
Mobile Banking Usage: Yes
Customer Satisfaction Rating: 3
"""

# Perform risk search based on similarity threshold
risk_threshold = 0.8  # Define a similarity threshold to identify risky customers
risky_customers = search_risk_in_vectorstore(vectorstore, new_customer_info, embeddings, threshold=risk_threshold)

print("Risky customers identified:", risky_customers)


Found 4 similar customers at or above the risk threshold of 0.8.
Risky customer 1 with similarity score 8590.0947265625:
Customer ID: CUST0086

    Customer ID: CUST0086
    Age: 28
    Gender: Female
    Marital Status: Widowed
    Occupation: Government Employee
    Income Level: 66853407
    Credit Limit: 11941292
    Credit Score: 467
    Card Type: Gold
    Years With Bank: 12
    Number of Credit Cards: 4
    Average Monthly Spending: 3990211
    Late Payments: 5
    Credit Card Usage: 0.39
    Mobile Banking Usage: Yes
    Customer Satisfaction Rating: 5
    
Risky customer 2 with similarity score 8706.7578125:
Customer ID: CUST0008

    Customer ID: CUST0008
    Age: 24
    Gender: Other
    Marital Status: Single
    Occupation: Business Owner
    Income Level: 63909529
    Credit Limit: 17397951
    Credit Score: 507
    Card Type: Classic
    Years With Bank: 7
    Number of Credit Cards: 3
    Average Monthly Spending: 3062615
    Late Payments: 5
    Credit Card Usage: 0.6