In [1]:
import pandas as pd
import torch
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
import chromadb
from sentence_transformers import SentenceTransformer
import os



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class GenAIAgent:
    """Base class for Gen AI agents with advanced prompt engineering"""

    def __init__(self, model_name="microsoft/DialoGPT-medium", max_length=512):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self.device)

        # Set pad token if not available
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.max_length = max_length

    def generate_response(self, prompt, temperature=0.7, max_new_tokens=256):
        """Generate response using LLM with advanced parameters"""
        try:
            inputs = self.tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=self.max_length).to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    do_sample=True,
                    top_p=0.9,
                    top_k=50,
                    pad_token_id=self.tokenizer.eos_token_id,
                    repetition_penalty=1.2,
                    no_repeat_ngram_size=3
                )

            response = self.tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
            return response.strip()
        except Exception as e:
            return f"Error generating response: {str(e)}"

In [3]:
# agent = GenAIAgent()
# prompt = "Hi how are you"
# response = agent.generate_response(prompt)
# print(response)

# FIXING INSIGHT GENERATION AGENT

This prompt didn't require much changes; the prompt juust takes in the entire dataframe we have in our merged dataset and passes to the agent now

In [None]:
class InsightGenerationAgent(GenAIAgent):
    """Agent for generating business insights and recommendations"""

    def __init__(self):
        super().__init__()

        # opportunities data is our MERGED dataframe
    def generate_pipeline_insights(self, opportunities_data):
        """Generate intelligent pipeline insights"""

        # Calculate pipeline metrics
        total_pipeline = opportunities_data['close_value'].sum()
        avg_deal_size = opportunities_data['close_value'].mean()
        conversion_rate = len(opportunities_data[opportunities_data['deal_stage'] == 'Won']) / len(opportunities_data) * 100

        context = f"""
        PIPELINE ANALYSIS DATA:
        - Total Pipeline Value: ${total_pipeline:,}
        - Number of Opportunities: {len(opportunities_data)}
        - Average Deal Size: ${avg_deal_size:,.0f}
        - Conversion Rate: {conversion_rate:.1f}%
        - Top Industries: {opportunities_data['sector'].value_counts().head(3).to_dict()}
        - Stage Distribution: {opportunities_data['deal_stage'].value_counts().to_dict()}
        - Top Performing Sales Agents: {opportunities_data['sales_agent'].value_counts().head(3).to_dict()}
        - Regional Breakdown: {opportunities_data['regional_office'].value_counts().to_dict()}
        """

        prompt = f"""
        You are a senior sales analyst. Analyze the pipeline data and provide actionable insights:

        {context}

        Generate insights covering:
        1. Pipeline Health Assessment
        2. Deal Size and Conversion Trends
        3. Industry and Segment Performance
        4. Stage-Specific Recommendations
        5. Risk Factors and Mitigation Strategies
        6. Growth Opportunities

        Provide specific, actionable recommendations with data-driven reasoning.

        PIPELINE INSIGHTS:
        """
        insights = self.generate_response(prompt, temperature=0.2, max_new_tokens=350)
        print("insights:", insights)
        return insights

# Fixing ACCOUNT AGENT


The account agent prompts all use the data we have from earlier BEFORE the merge (accounts, opportunity), however the columns/features that they expect from this data are different from what we have.

For example, 'account_id' is needed, and these are also expected: 
'Account_name' 
'Industry' 
'Annual_revenue' 
'Employee_count' 
'Account_type' 
'region'
'account_status'
'account_owner'
'created_date'
'Last_activity'

Some of those don't exist in our data, such as account_status, account_owner, so remove them from the prompts. Also added parent company into the prompt ('subsidiary_of') and added account_id in notebook 'data_fixing_for_agent' and have new csv in data/original called "account_with_id.csv"


The opportunity agent similarly expects: 
'opportunity_name'
'stage'
'amount'
'probability'
'close_date'

I subbed out ‘amount’ for ‘close_value’, calculated open_opps as open if the close date is not none, and changed ‘stage’ to ‘deal_stage’

It expects an 'activities' dataframe which we don't have, but according to the code if we pass in nothing as the activity, it should be fine

In summary, for the account agent, I fixed the prompts to pull the OUR columns from OUR data because it did not match our data before.


In [5]:

class AccountSummaryAgent(GenAIAgent):
    """Specialized agent for generating dynamic account summaries"""

    def __init__(self):
        super().__init__()

    def create_summary_prompt(self, account_data, opportunities, activities, ml_insights=None):
        """Create dynamic prompt for account summary generation"""

        # Calculate key metrics
        total_pipeline = opportunities['close_value'].sum() if not opportunities.empty else 0
        open_opps = len(opportunities[opportunities['close_date'].notna()])
        won_opps = len(opportunities[opportunities['deal_stage'] == 'Won'])

        prompt = f"""
        You are an expert CRM analyst. Generate a comprehensive, professional account summary based on the following data:

        ACCOUNT INFORMATION:
        - Company: {account_data['account']}  
        - Industry: {account_data['sector']}
        - Annual Revenue: ${account_data['revenue']:,}
        - Employees: {account_data['employees']}
        - Region: {account_data['office_location']}
        - Parent company: {account_data['subsidiary_of'] if account_data['subsidiary_of'] else "N/A"}

        SALES PERFORMANCE:
        - Total Pipeline Value: ${total_pipeline:,}
        - Open Opportunities: {open_opps}
        - Won Opportunities: {won_opps}

        OPPORTUNITY DETAILS:
        {opportunities[['opportunity_id', 'sales_agent', 'product', 'deal_stage', 'engage_date', 'close_date', 'close_value']].to_string(index=False) if not opportunities.empty else "No opportunities found"}

        RECENT ACTIVITIES:
        {activities[['activity_type', 'subject', 'outcome', 'activity_date']].head(5).to_string(index=False) if not activities.empty else "No recent activities"}

        Generate a professional account summary that includes:
        1. Executive Overview (2-3 sentences about the account's status and potential)
        2. Key Metrics and Performance Indicators
        3. Opportunity Pipeline Analysis
        4. Engagement and Activity Summary
        5. Risk Assessment and Recommendations
        6. Next Steps and Action Items

        Format the response in clear sections with bullet points where appropriate. Be analytical, insights-driven, and actionable.

        ACCOUNT SUMMARY:
        """
        return prompt

    def generate_account_summary(self, account_data, opportunities, activities):
        """Generate dynamic account summary using LLM"""
        prompt = self.create_summary_prompt(account_data, opportunities, activities)
        summary = self.generate_response(prompt, temperature=0.3, max_new_tokens=400)

        # Post-process to ensure quality
        if len(summary) < 50:
            return f"""
        **Account Summary: {account_data['account']}**

        **Executive Overview:**
        {account_data['account']} is a {account_data['sector'].lower()} company with ${account_data['revenue']:,} in annual revenue and {account_data['employees']} employees. This account is located the {account_data['office_location']} region and is a parent company of {account_data['subsidiary_of']}. 

        **Key Metrics:**
        • Annual Revenue: ${account_data['revenue']:,}
        • Company Size: {account_data['employees']} employees
        

        **Pipeline Analysis:**
        • Total Opportunities: {len(opportunities)}
        • Pipeline Value: ${opportunities['close_value'].sum():,}
        • Open Deals: {len(opportunities[opportunities['close_date'].notna()])}


        **Recommendations:**
        Based on the account profile and activity level, focus on nurturing the relationship and identifying expansion opportunities.
        """

        return summary


# Fixing EMAIL DRAFT AGENT

{"The close date is approaching soon, so create urgency." if (opportunity_data['close_date'] - datetime.now()).days < 30 else ""}
For this line, it calculates the current date difference from the close date. All the close dates are YEARRS ago so I delete this

It expects opportunity['probability']. I think our model generates that so im gonna exclude it
Theres also 'lead_data' which the introduction email uses, but we dont have it, i think can delete that for now
I added additional details to the prompt that our dataset captures like the product series, engage date, etc

In [6]:
class EmailDraftingAgent(GenAIAgent):
    """Specialized agent for generating contextual emails"""

    def __init__(self):
        super().__init__()

    def create_email_prompt(self, email_type, context_data, additional_context=""):
        """Create dynamic prompt for email generation"""

        base_prompt = f"""
        You are an expert sales professional writing personalized, engaging emails. Generate a professional email based on the context provided.

        EMAIL TYPE: {email_type}
        CONTEXT DATA: {context_data}
        ADDITIONAL CONTEXT: {additional_context}

        Email Guidelines:
        - Professional but warm tone
        - Personalized and specific to the recipient
        - Clear call-to-action
        - Appropriate length (not too long or short)
        - Include relevant business value
        - Use compelling subject line
        - Follow best practices for sales communication

        Generate a complete email with:
        1. Subject Line
        2. Professional greeting
        3. Body with clear purpose and value proposition
        4. Specific call-to-action
        5. Professional closing

        EMAIL:
        """
        return base_prompt

    def draft_follow_up_email(self, opportunity_data, account_data, last_activity=None):
        """Draft intelligent follow-up email"""

        context = f"""
OPPORTUNITY: {opportunity_data['opportunity_id']}
ACCOUNT: {account_data['account']}
INDUSTRY: {account_data['sector']}
CURRENT STAGE: {opportunity_data['deal_stage']}
DEAL VALUE: ${opportunity_data['close_value']:,}
CLOSE DATE: {opportunity_data['close_date']}
LAST CONTACT: {last_activity if last_activity else 'No recent activity recorded'}"""

        additional_context = f"""
The opportunity is currently in {opportunity_data['deal_stage']} stage.
Focus on moving the deal forward and addressing any potential concerns."""

        prompt = self.create_email_prompt("Follow-up", context, additional_context)
        email = self.generate_response(prompt, temperature=0.4, max_new_tokens=300)

        return self.format_email_output(email, opportunity_data, account_data)


# ignore??
    # def draft_introduction_email(self, lead_data, additional_info=""):
    #     """Draft introduction email for new leads"""

    #     context = f"""
    #     LEAD: {lead_data['first_name']} {lead_data['last_name']}
    #     TITLE: {lead_data['title']}
    #     COMPANY: {lead_data['company']}
    #     INDUSTRY: {lead_data['sector']}
    #     LEAD SCORE: {lead_data['lead_score']}/100
    #     SOURCE: {lead_data['source']}
    #     """

    #     additional_context = f"""
    #     This is a new lead with a score of {lead_data['lead_score']}/100.
    #     The lead came from {lead_data['source']}.
    #     Focus on introducing your company's value proposition relevant to their industry ({lead_data['sector']}).
    #     Keep it brief and focus on scheduling a discovery call.
    #     """

    #     prompt = self.create_email_prompt("Introduction", context, additional_context)
    #     email = self.generate_response(prompt, temperature=0.5, max_new_tokens=250)

    #     return self.format_lead_email_output(email, lead_data)

    def draft_proposal_email(self, opportunity_data, account_data, proposal_details=""):
        """Draft proposal presentation email"""

        context = f"""
        OPPORTUNITY INFORMATION:
        - Opportunity ID: {opportunity_data['opportunity_id']}
        - Product: {opportunity_data['product']}
        - Deal Stage: {opportunity_data['deal_stage']}
        - Sales Agent: {opportunity_data['sales_agent']}
        - Manager: {opportunity_data['manager']}
        - Regional Office: {opportunity_data['regional_office']}
        - Series: {opportunity_data['series']}
        - Engage Date: {opportunity_data['engage_date']}
        - Expected Close Date: {opportunity_data['close_date']}
        - Deal Value: ${opportunity_data['close_value']:,}
        - Sales Price: ${opportunity_data['sales_price']:,}

        ACCOUNT INFORMATION:
        - Account Name: {opportunity_data['account']}
        - Industry (Sector): {opportunity_data['sector']}
        - Year Established: {opportunity_data['year_established']}
        - Annual Revenue: ${opportunity_data['revenue']:,}
        - Employees: {opportunity_data['employees']}
        - Office Location: {opportunity_data['office_location']}
        - Subsidiary Of: {opportunity_data['subsidiary_of']}
        """

        additional_context = f"""
        The opportunity is ready for proposal presentation.
        Focus on scheduling a meeting to present the proposal.
        Highlight the business value and ROI for their {account_data['sector']} industry.
        Create excitement about the solution and next steps.
        """

        prompt = self.create_email_prompt("Proposal Presentation", context, additional_context)
        print("Email Prompt:", prompt)
        email = self.generate_response(prompt, temperature=0.3, max_new_tokens=280)
        print("Generated Proposal Email:", email)

        return self.format_email_output(email, opportunity_data, account_data)

    def format_email_output(self, email_content, opportunity_data, account_data):
        """Format and enhance email output"""

        # If the generated email is too short or incomplete, provide a structured fallback
        if len(email_content.strip()) < 100:
            return f"""
Subject: Following up on {opportunity_data['opportunity_id']} - Next Steps

Dear {account_data['account']} Team,

I hope this email finds you well. I wanted to follow up on opportunity {opportunity_data['opportunity_id']} that we've been discussing.

**Current Status:**
• Deal Stage: {opportunity_data['deal_stage']}
• Project Value: ${opportunity_data['close_value']:,}
• Target Timeline: {opportunity_data['close_date']}

**Next Steps:**
I'd like to schedule a brief call this week to discuss any questions you might have and outline the next steps in our process. This will help ensure we stay on track for your {opportunity_data['close_date']} timeline.

**Value Proposition:**
Our solution is specifically designed for {account_data['sector']} companies like {account_data['account']}, helping organizations achieve measurable results while reducing operational complexity.

Would you be available for a 30-minute call this week? I have openings on Tuesday and Thursday afternoons.

Best regards,
{opportunity_data['sales_agent']}

P.S. I've attached some relevant case studies from similar {account_data['sector']} implementations that you might find interesting.
"""

        return email_content

#     def format_lead_email_output(self, email_content, lead_data):
#         """Format lead introduction email output"""

#         if len(email_content.strip()) < 80:
#             return f"""
# Subject: Introduction - Helping {lead_data['company']} Optimize {lead_data['sector']} Operations

# Hello {lead_data['first_name']},

# I hope this email finds you well. I'm reaching out because I noticed {lead_data['company']}'s recent activity and thought there might be an opportunity for us to help.

# **Why I'm Reaching Out:**
# We specialize in helping {lead_data['sector']} companies like yours streamline operations and drive growth. Given your role as {lead_data['title']}, I thought you'd be interested in learning how we've helped similar organizations achieve significant results.

# **Quick Question:**
# Are you currently facing any challenges with [specific industry challenge] that's impacting your team's efficiency or bottom line?

# I'd love to share some quick insights that might be valuable for {lead_data['company']}. Would you be open to a brief 15-minute conversation this week?

# Best regards,
# {lead_data['owner']}

# P.S. No sales pitch - just a quick exchange of ideas that might be mutually beneficial.
# """

#         return email_content


In [31]:
class CRMChatbot:
    def __init__(self, client, embedder, ml_models, accounts, opportunities, leads, activities):
        self.client = client
        self.embedder = embedder
        self.ml_models = ml_models
        self.accounts = accounts
        self.opportunities = opportunities
        self.leads = leads
        self.activities = activities

        # Initialize AI Agents
        self.summary_agent = AccountSummaryAgent()
        self.email_agent = EmailDraftingAgent()
        self.insight_agent = InsightGenerationAgent()

    def return_preprocessed_lead_scoring_data(self, df):
        df["qualified_lead"] = (
            (df.get("deal_stage_ENGAGING", 0) == 1) |
            (df.get("deal_stage_WON", 0) == 1) |
            (df.get("deal_stage_LOST", 0) == 1)
        ).astype(int)
        TARGET = "qualified_lead"
        temporal_cols = ['engage_date', 'close_date', 'engage_year', 'engage_month', 
                        'engage_dayofweek', 'days_to_close', 'closed_within_30d']
        outcome_cols = ['deal_stage_PROSPECTING', 'deal_stage_ENGAGING', 
                        'deal_stage_WON', 'deal_stage_LOST', 'won_deal', 
                        'has_close_date', 'close_value', 'close_value_log']
        remove_cols = temporal_cols + outcome_cols
        raw_features = ['revenue', 'employees', 'sales_price']
        feature_cols = [c for c in df.columns if c not in remove_cols + raw_features + [TARGET]]
        X = df[feature_cols]
        return X.copy()

    def generate_account_summary(self, account_id):
        """Generate dynamic account summary using AI agent"""
        account = self.accounts[self.accounts['account_id'] == account_id]
        if account.empty:
            return f"Account {account_id} not found."

        account_data = account.iloc[0]
        related_opps = self.opportunities[self.opportunities['account_id'] == account_id]
        related_activities = self.activities[self.activities['account_id'] == account_id] if not self.activities.empty else pd.DataFrame()

        # Use AI agent to generate dynamic summary
        summary = self.summary_agent.generate_account_summary(account_data, related_opps, related_activities)
        return summary

    def draft_email(self, opportunity_id, email_type="follow_up"):
        """Draft intelligent email using AI agent"""
        opp = self.opportunities[self.opportunities['opportunity_id'] == opportunity_id]
        if opp.empty:
            return f"Opportunity {opportunity_id} not found."

        opp_data = opp.iloc[0]
        account = self.accounts[self.accounts['account_id'] == opp_data['account_id']].iloc[0]

        # Get last activity for context
        last_activity = self.activities[
            self.activities['account_id'] == opp_data['account_id']
        ].sort_values('activity_date', ascending=False).iloc[0] if not self.activities.empty else None

        # Use AI agent to draft email
        if email_type == "follow_up":
            email = self.email_agent.draft_follow_up_email(opp_data, account, last_activity)
        elif email_type == "proposal":
            email = self.email_agent.draft_proposal_email(opp_data, account)
        else:
            email = self.email_agent.draft_follow_up_email(opp_data, account, last_activity)

        return email

    # def draft_lead_email(self, lead_id):
    #     """Draft introduction email for leads"""
    #     lead = self.leads[self.leads['lead_id'] == lead_id]
    #     if lead.empty:
    #         return f"Lead {lead_id} not found."

    #     lead_data = lead.iloc[0]
    #     email = self.email_agent.draft_introduction_email(lead_data)
    #     return email

    def generate_insights(self, insight_type="pipeline"):
        """Generate business insights using AI agent"""
        if insight_type == "pipeline":
            insights = self.insight_agent.generate_pipeline_insights(self.opportunities)
            return insights
        else:
            return "Insight type not supported yet."

    def get_ml_insights(self, query, id):
        """Get ML model insights"""
        insights = []
        id = id.split("-")[-1]  

        query_lower = query.lower()

        run_lead_scoring = ('lead' in query_lower or 'score' in query_lower)
        run_account_health = ('account' in query_lower and 'health' in query_lower)
        run_opp_win = ('opportunity' in query_lower and ('win' in query_lower or 'probability' in query_lower))

        if run_lead_scoring:
            # print("Running Lead Scoring Model")
            lead_scoring_df = pd.read_csv(os.path.join(os.getcwd(), "..", "..", "data", "cleaned_data_with_id.csv"))
            lead_scoring_df = lead_scoring_df[lead_scoring_df['opportunity_id'] == id]
            X = self.return_preprocessed_lead_scoring_data(lead_scoring_df)
            X.drop("opportunity_id", axis=1, inplace=True)
            lead_scoring = self.ml_models['lead_scoring_model']
            lead_scoring_result = lead_scoring.predict(X)[0]
            insights.append(f"Lead Scoring Prediction: {'High Quality Lead' if lead_scoring_result == 1 else 'Low Quality Lead'}")

        if run_account_health:
            # print("Running Account Health Model")
            acc_health_df = pd.read_csv(os.path.join(os.getcwd(), "..", "..", "data", "preprocessed_acc_health.csv"))
            X = acc_health_df[acc_health_df['opportunity_id'] == id].copy()
            X.drop("opportunity_id", axis=1, inplace=True)
            acc_health_model = self.ml_models['account_health_model']
            acc_health_result = acc_health_model.predict(X)[0]
            insights.append(f"Account Health Score Prediction: {acc_health_result}")

        if run_opp_win:
            # print("Running Opportunity Win Model")
            opp_win_df = pd.read_csv(os.path.join(os.getcwd(), "..", "..", "data", "preprocessed_opp_win_data.csv"))
            X = opp_win_df[opp_win_df['opportunity_id'] == id].copy()
            X.drop('opportunity_id', axis=1, inplace=True)
            opp_win_model = self.ml_models['opportunity_win_model']
            opp_win_result = opp_win_model.predict(X)[0]
            insights.append(f"Opportunity Win Prediction: {'Win' if opp_win_result == 1 else 'Lose'}")

        if insights:
            paragraph = " \n".join(insights)
            return paragraph
        else:
            return "No ML insights available for this query."
    
    def process_natural_language_query(self, query):
        """Process natural language queries and route to appropriate AI agents"""
        query_lower = query.lower()
        print(query_lower)

        # Account summary requests
        if any(word in query_lower for word in ['account', 'company']) and 'summary' in query_lower:
            # Extract account ID or name from query
            account_match = re.search(r'acc-\d+', query_lower)
            if account_match:
                account_id = account_match.group().upper()
                return self.generate_account_summary(account_id)
            else:
                return "Please specify an account ID (e.g., ACC-00001) for the summary."

        # Opportunity searches (this method was never implemented)
        # elif 'opportunit' in query_lower and any(word in query_lower for word in ['find', 'show', 'list']):
        #     criteria = query_lower
        #     opps = self.find_opportunities_by_criteria(criteria)
        #     return f"Found {len(opps)} opportunities:\n\n" + opps.to_string(index=False)

        # Email drafting with AI agent
        elif 'email' in query_lower or 'draft' in query_lower:
            # Check for opportunity ID
            opp_match = re.search(r'opp-[\da-zA-Z]+', query_lower)
            if opp_match:
                opp_id = opp_match.group().upper()
                email_type = "proposal" if 'proposal' in query_lower else "follow_up"
                return self.draft_email(opp_id, email_type)
            else:
                return "Please specify an opportunity ID (e.g., OPP-PEX0DEA) for email drafting."

        # Business insights generation
        elif any(word in query_lower for word in ['insights', 'analysis', 'pipeline', 'forecast']):
            insight_type = "pipeline" if 'pipeline' in query_lower else "general"
            return self.generate_insights(insight_type)

        # ML insights
        elif any(word in query_lower for word in ['predict', 'score', 'probability', 'ml']):
            opp_match = re.search(r'opp-[\da-zA-Z]+', query_lower)

            if opp_match:
                opp_id = opp_match.group().upper()
            else:
                return "Please specify an opportunity ID (e.g., OPP-PEX0DEA) for ML insights."
            insights = self.get_ml_insights(query, id=opp_id)
            return insights

        # Semantic search fallback
        else:
            """Semantic search fallback using ChromaDB collections"""

            acc_collection = self.client.get_collection("sales_accounts")
            opp_collection = self.client.get_collection("sales_opportunities")

            query_embedding = self.embedder.encode([query]).tolist()

            acc_results = acc_collection.query(query_embeddings=query_embedding, n_results=3)
            opp_results = opp_collection.query(query_embeddings=query_embedding, n_results=3)

            response = "Based on your query, here are the most relevant results:\n\n"

            if acc_results["documents"] and acc_results["documents"][0]:
                response += "**Relevant Accounts:**\n"
                for i, doc in enumerate(acc_results["documents"][0][:2]):  
                    meta = acc_results["metadatas"][0][i]
                    meta_text = ", ".join(f"{k}: {v}" for k, v in meta.items())
                    response += f"- {doc}\n  _({meta_text})_\n"
                response += "\n"

            if opp_results["documents"] and opp_results["documents"][0]:
                response += "**Relevant Opportunities:**\n"
                for i, doc in enumerate(opp_results["documents"][0][:2]): 
                    meta = opp_results["metadatas"][0][i]
                    meta_text = ", ".join(f"{k}: {v}" for k, v in meta.items())
                    response += f"- {doc}\n  _({meta_text})_\n"

            if (not acc_results["documents"] or not acc_results["documents"][0]) and (
                not opp_results["documents"] or not opp_results["documents"][0]):
                response += "No relevant results found."

            return response




GenAIAgent's Role: The GenAIAgent (and its subclasses like AccountSummaryAgent) is designed specifically for Large Language Model (LLM) based text generation. Its __init__ method focuses on loading the LLM's tokenizer and model (microsoft/DialoGPT-medium in this case) into memory so it can generate human-like text responses.

CRMVectorStore's Role: The CRMVectorStore class, which holds self.vector_store, has a completely different purpose: to manage and query vector embeddings for semantic search using SentenceTransformer and ChromaDB.

CRMChatbot Orchestration: It's the CRMChatbot class that brings these different capabilities together. When you initialize chatbot = CRMChatbot(vector_store, ml_models, accounts_clean, ...), you are passing instances of CRMVectorStore (as vector_store), CRMMLModels (as ml_models), and also creating instances of AccountSummaryAgent, EmailDraftingAgent, and InsightGenerationAgent within the CRMChatbot.

The CRMChatbot's process_natural_language_query method then decides which underlying component to call based on the user's query:

If the query is for a summary, it calls self.summary_agent.generate_account_summary().

If the query involves semantic search (e.g., 'Show me accounts in the technology industry'), it would use self.vector_store.semantic_search().

So, while self.tokenizer and self.model are part of the text generation pipeline, they operate independently of the vector store, which handles retrieval.

In [32]:
chromadb.api.client.SharedSystemClient.clear_system_cache() # fixing chromadb  bug?

client = chromadb.PersistentClient(path=os.path.join(os.getcwd(), "..", "..", "data", "chroma_db"))
embedder = SentenceTransformer('all-MiniLM-L6-v2')
accounts = pd.read_csv(os.path.join(os.getcwd(), "..", "..", "data", "accounts_with_id.csv"))
opp = pd.read_csv(os.path.join(os.getcwd(), "..", "..", "data", "opp_data_with_id.csv"))
models_path = os.path.join(os.getcwd(), "..", "..", "models")

import pickle

opp_win = pickle.load(open(os.path.join(os.getcwd(), "..", "..", "models", "opportunity_win_model.pkl"), "rb"))
lead_scoring = pickle.load(open(os.path.join(os.getcwd(), "..", "..", "models", "lead_scoring_model.pkl"), "rb"))['classifier']
account_health = pickle.load(open(os.path.join(os.getcwd(), "..", "..", "models", "account_health_model.pkl"), "rb"))['model']

ml_models = {
    "opportunity_win_model": opp_win,
    "lead_scoring_model": lead_scoring,
    "account_health_model": account_health
}

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [33]:
chatbot = CRMChatbot(client, embedder, ml_models, accounts, opp, leads=pd.DataFrame(), activities=pd.DataFrame())

In [34]:
# ml insights
query = "give me ML predictions for lead scoring, oppportunity win, account health with OPP-Z063OYW0"
response = chatbot.process_natural_language_query(query)
print(response)

give me ml predictions for lead scoring, oppportunity win, account health with opp-z063oyw0
Lead Scoring Prediction: High Quality Lead 
Account Health Score Prediction: 64.78001591196772




In [35]:
# insight agent test
query = "give me pipeline insights"
response = chatbot.process_natural_language_query(query)
print(response)

give me pipeline insights

        You are a senior sales analyst. Analyze the pipeline data and provide actionable insights:

        
        PIPELINE ANALYSIS DATA:
        - Total Pipeline Value: $10,005,534.0
        - Number of Opportunities: 8800
        - Average Deal Size: $1,491
        - Conversion Rate: 48.2%
        - Top Industries: {'retail': 1397, 'technolgy': 1165, 'medical': 1051}
        - Stage Distribution: {'Won': 4238, 'Lost': 2473, 'Engaging': 1589, 'Prospecting': 500}
        - Top Performing Sales Agents: {'Darcel Schlecht': 747, 'Vicki Laflamme': 451, 'Anna Snelling': 448}
        - Regional Breakdown: {'Central': 3512, 'West': 2997, 'East': 2291}
        

        Generate insights covering:
        1. Pipeline Health Assessment
        2. Deal Size and Conversion Trends
        3. Industry and Segment Performance
        4. Stage-Specific Recommendations
        5. Risk Factors and Mitigation Strategies
        6. Growth Opportunities

        Provide speci

In [36]:
# email agent test
query = "give me proposal email for OPP-Z063OYW0"
response = chatbot.process_natural_language_query(query)
print(response)

give me proposal email for opp-z063oyw0
Email Prompt: 
        You are an expert sales professional writing personalized, engaging emails. Generate a professional email based on the context provided.

        EMAIL TYPE: Proposal Presentation
        CONTEXT DATA: 
        OPPORTUNITY INFORMATION:
        - Opportunity ID: OPP-Z063OYW0
        - Product: GTX Pro
        - Deal Stage: Won
        - Sales Agent: Darcel Schlecht
        - Manager: Melvin Marxen
        - Regional Office: Central
        - Series: GTX
        - Engage Date: 2016-10-25
        - Expected Close Date: 2017-03-11
        - Deal Value: $4,514.0
        - Sales Price: $4,821

        ACCOUNT INFORMATION:
        - Account Name: Isdom
        - Industry (Sector): medical
        - Year Established: 2002.0
        - Annual Revenue: $3,178.24
        - Employees: 4540.0
        - Office Location: United States
        - Subsidiary Of: nan
        
        ADDITIONAL CONTEXT: 
        The opportunity is ready for pr

In [37]:
# ml insights
query = "give me pipeline insights"
response = chatbot.process_natural_language_query(query)
print(response)

give me pipeline insights

        You are a senior sales analyst. Analyze the pipeline data and provide actionable insights:

        
        PIPELINE ANALYSIS DATA:
        - Total Pipeline Value: $10,005,534.0
        - Number of Opportunities: 8800
        - Average Deal Size: $1,491
        - Conversion Rate: 48.2%
        - Top Industries: {'retail': 1397, 'technolgy': 1165, 'medical': 1051}
        - Stage Distribution: {'Won': 4238, 'Lost': 2473, 'Engaging': 1589, 'Prospecting': 500}
        - Top Performing Sales Agents: {'Darcel Schlecht': 747, 'Vicki Laflamme': 451, 'Anna Snelling': 448}
        - Regional Breakdown: {'Central': 3512, 'West': 2997, 'East': 2291}
        

        Generate insights covering:
        1. Pipeline Health Assessment
        2. Deal Size and Conversion Trends
        3. Industry and Segment Performance
        4. Stage-Specific Recommendations
        5. Risk Factors and Mitigation Strategies
        6. Growth Opportunities

        Provide speci

In [38]:
# account summary agent test
query = "give me account summary for ACC-00007"
response = chatbot.process_natural_language_query(query)
print(response)

give me account summary for acc-00007

        **Account Summary: Bluth Company**

        **Executive Overview:**
        Bluth Company is a technolgy company with $1,242.32 in annual revenue and 3027 employees. This account is located the United States region and is a parent company of Acme Corporation. 

        **Key Metrics:**
        • Annual Revenue: $1,242.32
        • Company Size: 3027 employees


        **Pipeline Analysis:**
        • Total Opportunities: 59
        • Pipeline Value: $56,903.0
        • Open Deals: 48


        **Recommendations:**
        Based on the account profile and activity level, focus on nurturing the relationship and identifying expansion opportunities.
        
