# Project Overview: 


## Collaborators:

1. Agnes Chomba

2. Derrick Malinga

3. Erick Okacha

4. Judah Odida

5. Lucas Ominde

6. Nick Mwai

7. Olgah Omollo

# FinComBot - Compliance Chatbot 

## 1. Background
Financial institutions face increasing pressure to comply with stringent regulatory frameworks governing customer onboarding, Know Your Customer (KYC), Customer Due Diligence (CDD), Enhanced Due Diligence (EDD), Anti-Money Laundering (AML), Counter-Terrorism Financing (CTF), Counter-Proliferation Financing (CPF), and sanctions screening. These obligations are complex, continuously evolving, and vary across jurisdictions.

Staff often face difficulties accessing and interpreting regulatory documents and internal policies, leading to:
-	Delays in onboarding, affecting customer experience and revenue.
-	Inconsistent application of compliance procedures.
-	Overdependence on compliance officers for basic guidance.
-	Increased risk of regulatory breaches, which may lead to fines from regulators and put the bank at risk of having its license suspended.





## 2. Business Objective

a.)  Build a chatbot that retrieves accurate compliance information 
from the bank’s KYC/AML/CTF/CPF policies and responds to staff queries.



## 3. Target Audience

a.) Front office / Relationship Managers (who onboard customers)

b.)  Operations staff (who process documents)

c.) Compliance officers (for guidance validation)

d.) New staff (as a training tool)

e.) Risk & Audit teams (for oversight)


##  4. Data Understanding
Data Source:
a. Internal compliance policy, stored in Word (.docx) format. It contains KYC procedures, AML red flags, CDD/EDD checklists, the risk rating methodology, and regulatory guidelines (FATF, CBK, CMA).

Data Characteristics: Unstructured text (paragraphs, checklists) spread across multiple sections (policies, procedures, workflows); it needs preprocessing before AI ingestion.


### a. Data Loading

In [1]:

import docx
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import openai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sentence_transformers import SentenceTransformer
from wordcloud import WordCloud


ModuleNotFoundError: No module named 'docx'

In [None]:
# -------------------------
# 1. Load Word Document
# -------------------------
def extract_text_from_docx(file_path):
    """Return the non-empty paragraphs of a Word document as stripped strings.

    Args:
        file_path: Location of the .docx file; passed unchanged to
            ``docx.Document``.

    Returns:
        A list with one whitespace-stripped string per paragraph, in document
        order. Paragraphs that are empty after stripping are omitted.
    """
    # NOTE(review): only ``doc.paragraphs`` is read; text held in tables is
    # presumably not included - confirm against the python-docx documentation.
    document = docx.Document(file_path)
    stripped_paragraphs = (para.text.strip() for para in document.paragraphs)
    return [content for content in stripped_paragraphs if content]

# Example: replace with your compliance manual path
# (relative path to the Word document that the rest of the notebook indexes)
file_path = "compliance_manual.docx"
# List of non-empty paragraph strings; consumed by chunking and the plots below
paragraphs = extract_text_from_docx(file_path)
print(f"Extracted {len(paragraphs)} paragraphs from document.")

# -------------------------
# 2. Chunk the Text
# -------------------------
def chunk_text(paragraphs, chunk_size=300):
    """Greedily pack consecutive paragraphs into chunks of about ``chunk_size`` characters.

    A paragraph is appended (space-separated) to the current chunk while the
    combined character count stays below ``chunk_size``; otherwise the current
    chunk is closed and the paragraph starts a new one.  A single paragraph
    longer than ``chunk_size`` is kept whole as its own chunk.

    Args:
        paragraphs: Iterable of paragraph strings, in document order.
        chunk_size: Soft upper bound, in characters, for one chunk.

    Returns:
        List of whitespace-stripped chunk strings; never contains an empty
        string.
    """
    chunks, current = [], ""

    def flush(text):
        # Skip empty chunks: without this guard a first paragraph that is
        # already >= chunk_size closes the still-empty initial chunk and an
        # empty string ends up being embedded and indexed.
        text = text.strip()
        if text:
            chunks.append(text)

    for para in paragraphs:
        if len(current) + len(para) < chunk_size:
            current += " " + para
        else:
            flush(current)
            current = para
    flush(current)
    return chunks

# Merge the extracted paragraphs into chunks using chunk_text's default chunk_size
chunks = chunk_text(paragraphs)
print(f"Created {len(chunks)} chunks for embedding.")

# -------------------------
# 3. Visualizations: Data Understanding
# -------------------------

# Paragraph Length Distribution (length = whitespace-separated word count)
lengths = [len(p.split()) for p in paragraphs]
plt.figure(figsize=(8,5))
sns.histplot(lengths, bins=20, kde=True)
plt.title("Distribution of Paragraph Lengths (in words)")
plt.xlabel("Words per Paragraph")
plt.ylabel("Frequency")
plt.show()

# Chunk Length Distribution (same measure, applied to the merged chunks)
chunk_lengths = [len(c.split()) for c in chunks]
plt.figure(figsize=(8,5))
sns.histplot(chunk_lengths, bins=20, color="orange", kde=True)
plt.title("Distribution of Chunk Lengths (in words)")
plt.xlabel("Words per Chunk")
plt.ylabel("Frequency")
plt.show()

# Top Keywords
all_text = " ".join(paragraphs).lower().split()
# Strip punctuation stuck to a token so that "customer," "(customer)" and
# "customer" are counted as one term instead of three.
stripped_words = (w.strip(".,;:!?()[]{}\"'“”‘’") for w in all_text)
common_words = [w for w in stripped_words if len(w) > 3]  # remove very short words
word_freq = Counter(common_words).most_common(20)

if word_freq:
    words, freqs = zip(*word_freq)
    plt.figure(figsize=(10,5))
    sns.barplot(x=list(freqs), y=list(words))
    plt.title("Top 20 Most Frequent Terms in Compliance Manual")
    plt.xlabel("Frequency")
    plt.ylabel("Keyword")
    plt.show()

    # Word Cloud (built from the same filtered tokens as the bar chart)
    wc = WordCloud(width=800, height=400, background_color="white").generate(" ".join(common_words))
    plt.figure(figsize=(12,6))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title("Word Cloud of Compliance Manual")
    plt.show()
else:
    # zip(*[]) cannot be unpacked into two names and a word cloud needs at
    # least one word, so skip both keyword plots for an empty vocabulary.
    print("No keywords longer than 3 characters found; skipping keyword plots.")

# -------------------------
# 4. Create Embeddings
# -------------------------
# Sentence-embedding model; the same object is reused by search() to embed queries
model = SentenceTransformer('all-mpnet-base-v2')
# Embed every chunk; returned as a NumPy array (presumably one row per chunk)
embeddings = model.encode(chunks, convert_to_numpy=True)

# -------------------------
# 5. Store in FAISS
# -------------------------
# Index dimensionality is taken from the second axis of the embedding array
dimension = embeddings.shape[1]
# IndexFlatL2 ranks by L2 distance, so lower distances mean closer matches
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} vectors.")

# -------------------------
# 6. Retrieval Function
# -------------------------
def search(query, top_k=3):
    """Retrieve the chunks closest to ``query`` and plot their distances.

    Relies on the module-level ``model`` (query embedding), ``index`` (FAISS
    index) and ``chunks`` (texts aligned with the index ids).

    Args:
        query: Question text to embed and look up.
        top_k: Maximum number of passages to retrieve.

    Returns:
        Tuple ``(results, distances)``.  ``results`` is the list of retrieved
        chunk strings, at most ``top_k`` long and shorter when the index
        holds fewer vectors.  ``distances`` is the unmodified distance array
        returned by ``index.search``.
    """
    query_emb = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_emb, top_k)

    # FAISS pads the id row with -1 when it finds fewer than top_k neighbours.
    # Indexing chunks[-1] would silently return the LAST chunk as a "match",
    # so keep only real hits.
    found = indices[0] >= 0
    results = [chunks[i] for i in indices[0][found]]
    found_distances = distances[0][found]

    # Visualization: Show similarity scores (only for the hits actually found)
    if results:
        plt.figure(figsize=(6,4))
        sns.barplot(x=list(range(1, len(results) + 1)), y=found_distances)
        plt.title(f"Retrieval Scores for Query: {query}")
        plt.xlabel("Top-k Retrieved Passages")
        plt.ylabel("Distance (Lower = More Similar)")
        plt.show()

    return results, distances

# -------------------------
# 7. Generate Answer
# -------------------------
import os

# Prefer the OPENAI_API_KEY environment variable so the real key never has to
# be pasted into the notebook, and so a key already configured in the
# environment is not overwritten by the placeholder below.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")  # replace with your key

def generate_answer(query, retrieved_passages):
    """Ask the chat model to answer ``query`` using only the retrieved passages.

    Args:
        query: The user's question.
        retrieved_passages: List of context strings (e.g. from ``search``).

    Returns:
        The model's answer text, or the fixed fallback
        ``"Not available in policies."`` when no passages were retrieved.
    """
    # With no context there is nothing to ground an answer on: return the
    # fallback the prompt itself mandates instead of spending an API call.
    if not retrieved_passages:
        return "Not available in policies."

    context = "\n".join(retrieved_passages)
    prompt = f"""
    You are FinComBot, a compliance assistant.
    Use ONLY the context below to answer. 
    If not found, say "Not available in policies."

    Context:
    {context}

    Question: {query}
    """
    messages = [{"role": "user", "content": prompt}]

    # openai>=1.0 removed openai.ChatCompletion and exposes the module-level
    # client as openai.chat; use it when present, else fall back to the
    # legacy 0.x call so both library generations work.
    if hasattr(openai, "chat"):
        response = openai.chat.completions.create(
            model="gpt-4-turbo",
            messages=messages,
        )
        return response.choices[0].message.content

    response = openai.ChatCompletion.create(
        model="gpt-4-turbo",
        messages=messages,
    )
    return response['choices'][0]['message']['content']

# -------------------------
# 8. Example Query
# -------------------------
# End-to-end demo: retrieve context for one question, then generate the answer
user_query = "What documents are needed for Enhanced Due Diligence (EDD)?"
retrieved_passages, scores = search(user_query)
final_answer = generate_answer(user_query, retrieved_passages)

# Print the question, the supporting passages and the answer, in that order
for label, value in (
    ("\n🔍 Query:", user_query),
    ("\n📑 Retrieved Context:", retrieved_passages),
    ("\n🤖 FinComBot Answer:", final_answer),
):
    print(label, value)

In [None]:
# ========================================
# 📘 FinComBot - Compliance Chatbot (MVP with Interactive Clustering)
# ========================================

# Requirements:
# pip install python-docx sentence-transformers faiss-cpu openai matplotlib seaborn wordcloud scikit-learn plotly

import docx
import faiss
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sentence_transformers import SentenceTransformer
from wordcloud import WordCloud
from sklearn.manifold import TSNE
import plotly.express as px
import openai

# -------------------------
# 1. Load Word Document
# -------------------------
def extract_text_from_docx(file_path):
    """Collect the stripped text of every non-blank paragraph in a .docx file.

    Args:
        file_path: Location of the Word document; handed to ``docx.Document``.

    Returns:
        List of paragraph strings in document order, each with surrounding
        whitespace removed; blank paragraphs are skipped.
    """
    document = docx.Document(file_path)
    collected = []
    for paragraph in document.paragraphs:
        content = paragraph.text.strip()
        if content:
            collected.append(content)
    return collected

# Relative path to the Word document to ingest
file_path = "compliance_manual.docx"
# Non-empty paragraph strings; input to chunk_text below
paragraphs = extract_text_from_docx(file_path)
print(f"Extracted {len(paragraphs)} paragraphs from document.")

# -------------------------
# 2. Chunk the Text
# -------------------------
def chunk_text(paragraphs, chunk_size=300):
    """Merge consecutive paragraphs into chunks of roughly ``chunk_size`` characters.

    Paragraphs are joined with a space while the running character count
    stays below ``chunk_size``; the paragraph that would reach the limit
    starts a new chunk.  A paragraph longer than ``chunk_size`` is not split
    and forms a chunk by itself.

    Args:
        paragraphs: Iterable of paragraph strings, in document order.
        chunk_size: Soft character limit per chunk.

    Returns:
        List of stripped chunk strings with no empty entries.
    """
    chunks, current = [], ""
    for para in paragraphs:
        if len(current) + len(para) < chunk_size:
            current += " " + para
            continue
        # Close the running chunk, but never emit an empty one: when the very
        # first paragraph already reaches chunk_size, `current` is still ""
        # and would otherwise be embedded and indexed as a blank chunk.
        if current.strip():
            chunks.append(current.strip())
        current = para
    if current.strip():
        chunks.append(current.strip())
    return chunks

# Chunk the paragraphs with the default chunk_size; `chunks` is what gets embedded
chunks = chunk_text(paragraphs)
print(f"Created {len(chunks)} chunks for embedding.")

# -------------------------
# 3. Create Embeddings
# -------------------------
# Sentence-embedding model; search() reuses it to embed incoming queries
model = SentenceTransformer('all-mpnet-base-v2')
# Embed all chunks as a NumPy array; feeds both t-SNE and the FAISS index below
embeddings = model.encode(chunks, convert_to_numpy=True)

# -------------------------
# 4. Topic Clustering with t-SNE + Interactive Plotly
# -------------------------
print("Running t-SNE... this may take a few minutes for large documents.")
# scikit-learn requires perplexity < n_samples, so a fixed 15 raises for
# documents with 15 or fewer chunks.  Cap it just below the sample count
# (a single chunk still cannot be projected and will raise).
tsne_perplexity = min(15, max(1, len(embeddings) - 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
# 2-D projection of the embeddings, used only for the scatter plot
embeddings_2d = tsne.fit_transform(embeddings)

# Assign topics by keyword
def assign_topic(chunk):
    """Label a chunk with the first compliance topic it mentions.

    Topics are tested in a fixed priority order (KYC, AML, EDD, CDD, Risk);
    the first one whose keyword occurs in the lower-cased text wins.

    Args:
        chunk: Text of one chunk.

    Returns:
        One of ``"KYC"``, ``"AML"``, ``"EDD"``, ``"CDD"``, ``"Risk"`` or
        ``"Other"`` when no keyword matches.
    """
    import re  # function-scope import: `re` is not among the notebook's imports

    # Short acronyms that also occur inside ordinary words are matched on word
    # boundaries: a plain substring test labels "seamless" as AML, "embedded"
    # as EDD and "asterisk" as Risk.  "kyc" and "cdd" are distinctive enough
    # to stay substring matches (this keeps e.g. "eKYC" working).
    topic_patterns = (
        ("KYC", r"kyc|know your customer"),
        ("AML", r"\baml\b|anti-money laundering"),
        ("EDD", r"\bedd\b|enhanced due diligence"),
        ("CDD", r"cdd|customer due diligence"),
        ("Risk", r"\brisk"),  # prefix match: risk, risks, risky, risk-based
    )

    text = chunk.lower()
    for topic, pattern in topic_patterns:
        if re.search(pattern, text):
            return topic
    return "Other"

# One topic label per chunk, in chunk order (drives the point colours)
labels = list(map(assign_topic, chunks))

# Interactive scatter of the 2-D projection; hovering a point shows its chunk
scatter_options = {
    "x": embeddings_2d[:, 0],
    "y": embeddings_2d[:, 1],
    "color": labels,
    "hover_data": {"Chunk": chunks},
    "title": "t-SNE Clustering of Compliance Manual (Interactive)",
    "labels": {"x": "t-SNE Dimension 1", "y": "t-SNE Dimension 2"},
}
fig = px.scatter(**scatter_options)
fig.update_traces(marker={"size": 8, "opacity": 0.7})
fig.show()

# -------------------------
# 5. Store in FAISS
# -------------------------
# Index dimensionality is taken from the second axis of the embedding array
dimension = embeddings.shape[1]
# IndexFlatL2 ranks by L2 distance; search() reads this module-level index
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} vectors.")

# -------------------------
# 6. Retrieval Function
# -------------------------
def search(query, top_k=3):
    """Look up the chunks nearest to ``query`` and chart their distances.

    Uses the module-level ``model`` to embed the query, ``index`` to find
    neighbours and ``chunks`` to map index ids back to text.

    Args:
        query: Question text to embed and search for.
        top_k: Maximum number of passages to return.

    Returns:
        Tuple ``(results, distances)``.  ``results`` holds the retrieved chunk
        strings, fewer than ``top_k`` when the index is smaller.
        ``distances`` is the raw distance array from ``index.search``.
    """
    query_emb = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_emb, top_k)

    # When fewer than top_k vectors exist FAISS fills the remaining slots with
    # id -1; chunks[-1] would then return the last chunk as a bogus hit.
    is_hit = indices[0] >= 0
    results = [chunks[i] for i in indices[0][is_hit]]
    hit_distances = distances[0][is_hit]

    # Retrieval score visualization (one bar per real hit)
    if results:
        plt.figure(figsize=(6,4))
        sns.barplot(x=list(range(1, len(results) + 1)), y=hit_distances)
        plt.title(f"Retrieval Scores for Query: {query}")
        plt.xlabel("Top-k Retrieved Passages")
        plt.ylabel("Distance (Lower = Better)")
        plt.show()

    return results, distances

# -------------------------
# 7. Generate Answer
# -------------------------
import os

# Take the key from the OPENAI_API_KEY environment variable when it is set, so
# the secret stays out of the notebook and an already-configured key is not
# replaced by the placeholder.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")  # replace with your key

def generate_answer(query, retrieved_passages):
    """Generate an answer to ``query`` grounded only in the retrieved passages.

    Args:
        query: The user's question.
        retrieved_passages: List of context strings (e.g. from ``search``).

    Returns:
        The model's answer text, or the fixed fallback
        ``"Not available in policies."`` when there are no passages.
    """
    # No context means no grounded answer is possible: return the fallback
    # wording required by the prompt rather than calling the API.
    if not retrieved_passages:
        return "Not available in policies."

    context = "\n".join(retrieved_passages)
    prompt = f"""
    You are FinComBot, a compliance assistant.
    Use ONLY the context below to answer. 
    If not found, say "Not available in policies."

    Context:
    {context}

    Question: {query}
    """
    messages = [{"role": "user", "content": prompt}]

    # openai>=1.0 dropped openai.ChatCompletion in favour of the module-level
    # client openai.chat; prefer it when available and keep the legacy 0.x
    # call as a fallback.
    if hasattr(openai, "chat"):
        response = openai.chat.completions.create(
            model="gpt-4-turbo",
            messages=messages,
        )
        return response.choices[0].message.content

    response = openai.ChatCompletion.create(
        model="gpt-4-turbo",
        messages=messages,
    )
    return response['choices'][0]['message']['content']

# -------------------------
# 8. Example Query
# -------------------------
# Demo run: one question through retrieval and answer generation
user_query = "What documents are needed for Enhanced Due Diligence (EDD)?"
retrieved_passages, scores = search(user_query)
final_answer = generate_answer(user_query, retrieved_passages)

# Report the question, the retrieved context and the final answer, in that order
report_rows = [
    ("\n🔍 Query:", user_query),
    ("\n📑 Retrieved Context:", retrieved_passages),
    ("\n🤖 FinComBot Answer:", final_answer),
]
for row in report_rows:
    print(*row)
