In [1]:
  !pip install \
  "langchain>=1.1.3" langchain-core langchain-community \
  "langchain-openai>=0.1.0" langchain-text-splitters \
  faiss-cpu langchain-huggingface \
  sentence-transformers transformers bert-score nltk rouge-score \
  sacrebleu scikit-learn

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-openai>=0.1.0
  Downloading langchain_openai-1.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-text-splitters
  Downloading langchain_text_splitters-1.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting langc

In [2]:
import os
import pandas as pd
import torch

import nltk
nltk.download("punkt")
try:
    nltk.download("punkt_tab")
except:
    pass

from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate

from transformers import AutoTokenizer, AutoModelForSequenceClassification

from nltk.tokenize import word_tokenize
import sacrebleu
from rouge_score import rouge_scorer
import bert_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [3]:
# CONFIG

OPENROUTER_API_KEY = "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"  # put your key
CSV_PATH = "/content/amazon_dataset.csv"   # path to your reviews

LLM_MODEL_NAME = "qwen/qwen-2.5-7b-instruct"
LLM_BASE_URL = "https://openrouter.ai/api/v1"
LLM_TEMPERATURE = 0.2

SENTIMENT_MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
SENTIMENT_LABELS = ["negative", "neutral", "positive"]

EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100
TOP_K = 3

# LOAD REFERENCE DATASET FROM CSV
REF_CSV_PATH = "/content/ref_dataset.csv"  # path where you uploaded the file
ref_df = pd.read_csv(REF_CSV_PATH)
REFERENCE_DATASET = list(zip(ref_df["question"].tolist(), ref_df["answer"].tolist()))
print("Loaded reference pairs:", len(REFERENCE_DATASET))

Loaded reference pairs: 30


In [4]:
# SENTIMENT

sent_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_NAME)
sent_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_NAME)
sent_labels = SENTIMENT_LABELS

def get_sentiment(text: str) -> str:
    tokens = sent_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = sent_model(**tokens).logits
    probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
    return sent_labels[probs.argmax()]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [5]:
# RAG (Retrieval-Augmented Generation) MODULE

def load_reviews(csv_path: str = CSV_PATH) -> pd.DataFrame:
    """Load reviews from CSV and return DataFrame with 'reviews.text'."""
    df = pd.read_csv(csv_path, quotechar='"', doublequote=True, on_bad_lines="skip")
    df = df[["reviews.text"]].dropna()
    return df

def build_docs(df: pd.DataFrame):
    """Split review texts into smaller chunks (documents)."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    docs = splitter.create_documents(df["reviews.text"].tolist())
    return docs

def build_retriever(docs):
    """Create FAISS vector store and return retriever."""
    embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    vector_db = FAISS.from_documents(docs, embedder)
    retriever = vector_db.as_retriever(search_kwargs={"k": TOP_K})
    return retriever

In [6]:
# LLM + PROMPT + CHATBOT

os.environ["OPENROUTER_API_KEY"] = OPENROUTER_API_KEY

llm = ChatOpenAI(
    base_url=LLM_BASE_URL,
    api_key=OPENROUTER_API_KEY,
    model=LLM_MODEL_NAME,
    temperature=LLM_TEMPERATURE,
)

def get_system_prompt():
    return ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are a helpful, honest customer support chatbot for an e-commerce platform.\n"
                "You have access to context from previous customer reviews.\n"
                "Use ONLY the provided context and chat history to answer the user's question.\n"
                "If the context is not sufficient to answer, clearly say:\n"
                "\"I’m not sure based on the available information.\" and suggest what the user can do next.\n"
                "If the user seems very upset or the issue is serious, explicitly suggest contacting human support or escalating the case.\n"
                "Be concise, polite, and do NOT make up facts."
            ),
            ("human", "{message}"),
        ]
    )

prompt = get_system_prompt()
chat_history = []
escalation_log = []

def should_escalate(user_input: str, sentiment: str) -> bool:
    """
    Decide whether to actually escalate to a human.
    We only escalate if sentiment is negative AND the message has strong anger words.
    """
    if sentiment != "negative":
        return False

    text = user_input.lower()
    strong_triggers = [
        "angry", "very angry", "furious", "upset", "very upset",
        "disgusted", "hate", "terrible", "horrible", "awful",
        "worst", "extremely disappointed", "really disappointed",
        "this is unacceptable", "bad", "very bad", "disappointed"
    ]

    return any(t in text for t in strong_triggers)

def escalate_to_human(user_message: str) -> str:
    """
    Simple escalation: remember the user's message and return a notice.
    In a real system, this is where you'd create a ticket or notify a human agent.
    """
    escalation_log.append({"user_message": user_message})
    return "Your issue has been escalated to a human support agent for further review."

def format_chat_history(history):
    if not history:
        return "No previous messages."
    lines = []
    for turn in history:
        lines.append(f"User: {turn['user']}")
        lines.append(f"Bot: {turn['bot']}")
    return "\n".join(lines)

def build_context_from_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

def chatbot(user_input: str, retriever) -> str:
    """Full chatbot logic, including sentiment + RAG + history + simple escalation."""

    # 1. Sentiment
    sentiment = get_sentiment(user_input)

    escalation_notice = ""  # default: no escalation

    if should_escalate(user_input, sentiment):
        # Strongly negative → empathetic + escalate
        sentiment_note = (
            "The user seems very upset. Start with an empathetic, apologetic tone and clearly explain what they can do next. "
            "You should also suggest escalating the issue to human support.\n"
        )
        escalation_notice = escalate_to_human(user_input)

    elif sentiment == "negative":
        # Mildly negative concern → softer tone, but no escalation
        sentiment_note = (
            "The user has some concerns. Respond with a polite and understanding tone and address their question clearly.\n"
        )

    else:
        # Neutral or positive → normal tone, no escalation
        sentiment_note = ""

    # 2. RAG retrieval
    retrieved_docs = retriever.invoke(user_input)
    context = build_context_from_docs(retrieved_docs)

    # 3. Build the full message
    history_text = format_chat_history(chat_history)
    full_message = (
        f"{sentiment_note}"
        f"Chat history:\n{history_text}\n\n"
        f"Retrieved context from product reviews:\n{context}\n\n"
        f"User question: {user_input}"
    )

    # 4. LLM call
    messages = prompt.format_messages(message=full_message)
    response = llm.invoke(messages)

    # 5. Combine LLM answer with escalation notice (if any)
    bot_text = response.content
    if escalation_notice:
        bot_text = f"{bot_text}\n\n{escalation_notice}"

    # 6. Update history
    chat_history.append({"user": user_input, "bot": bot_text})

    return bot_text

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [7]:
# BUILD RAG COMPONENTS AND RUN CHAT

# 1. Load reviews
df = load_reviews(CSV_PATH)
print("Columns:", df.columns.tolist())
print("Number of reviews:", len(df))

# 2. Build docs
docs = build_docs(df)
print("Number of chunks:", len(docs))

# 3. Build retriever
retriever = build_retriever(docs)

# 4. Interactive loop
print("Customer Support Chatbot (type 'bye', 'exit', or 'quit' to stop)\n")

while True:
    user_input = input("You: ").strip()
    if user_input.lower() in {"bye", "exit", "quit"}:
        print("Bot: Thank you for chatting with us. Goodbye!")
        break

    bot_reply = chatbot(user_input, retriever)
    print("Bot:", bot_reply)

Columns: ['reviews.text']
Number of reviews: 1597
Number of chunks: 4339


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Customer Support Chatbot (type 'bye', 'exit', or 'quit' to stop)

You: how are the quality of the products
Bot: Based on the review, the sound quality of the headphones is described as great by the reviewer. They mention that they think these headphones sound excellent and compare them favorably to more expensive pairs. However, it's important to note that this is just one review, and opinions can vary.
You: how long does the electronic items are durable
Bot: I’m not sure based on the available information. The reviews mention durability issues with specific products like Kindles and cellphones, but the context doesn’t provide a general statement about the durability of electronic items. 

For a more accurate assessment, you might want to look at the manufacturer’s specifications or read more recent reviews focusing on durability. If the issue is critical or you're concerned about a specific product, consider reaching out to customer support for more detailed information.
You: what are

In [8]:
print("\n--- Running evaluation on reference dataset ---")

# 1. Generate bot answers for REFERENCE_DATASET
references = []
hypotheses = []

for user_query, ref_answer in REFERENCE_DATASET:
    bot_answer = chatbot(user_query, retriever)
    references.append(ref_answer)
    hypotheses.append(bot_answer)

print("Number of reference-hypothesis pairs:", len(references))



--- Running evaluation on reference dataset ---
Number of reference-hypothesis pairs: 30


In [9]:

# 2. BLEU (SacreBLEU)
bleu = sacrebleu.corpus_bleu(hypotheses, [references])
print("BLEU score:", bleu.score)
# 3. ROUGE
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

for ref, hyp in zip(references, hypotheses):
    scores = scorer.score(ref, hyp)
    rouge1_scores.append(scores["rouge1"].fmeasure)
    rouge2_scores.append(scores["rouge2"].fmeasure)
    rougeL_scores.append(scores["rougeL"].fmeasure)

print("ROUGE-1 F1 (avg):", sum(rouge1_scores) / len(rouge1_scores))
print("ROUGE-2 F1 (avg):", sum(rouge2_scores) / len(rouge2_scores))
print("ROUGE-L F1 (avg):", sum(rougeL_scores) / len(rougeL_scores))


BLEU score: 4.391658315656689
ROUGE-1 F1 (avg): 0.28757971930644
ROUGE-2 F1 (avg): 0.09018562688001823
ROUGE-L F1 (avg): 0.193460584824205


In [10]:
# 4. BERTScore
P, R, F1 = bert_score.score(hypotheses, references, lang="en", verbose=True)
print("BERTScore - Precision (avg):", P.mean().item())
print("BERTScore - Recall (avg):", R.mean().item())
print("BERTScore - F1 (avg):", F1.mean().item())

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 75.63 seconds, 0.40 sentences/sec
BERTScore - Precision (avg): 0.8668984770774841
BERTScore - Recall (avg): 0.8888929486274719
BERTScore - F1 (avg): 0.8777225613594055


In [11]:
# 5. Simple token-level Precision/Recall/F1
all_precisions, all_recalls, all_f1s = [], [], []

for ref, hyp in zip(references, hypotheses):
    ref_tokens = word_tokenize(ref.lower())
    hyp_tokens = word_tokenize(hyp.lower())
    ref_set, hyp_set = set(ref_tokens), set(hyp_tokens)

    tp = len(ref_set & hyp_set)
    fp = len(hyp_set - ref_set)
    fn = len(ref_set - hyp_set)

    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    if precision + recall == 0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    all_precisions.append(precision)
    all_recalls.append(recall)
    all_f1s.append(f1)

print("Token-level Precision (avg):", sum(all_precisions) / len(all_precisions))
print("Token-level Recall (avg):", sum(all_recalls) / len(all_recalls))
print("Token-level F1 (avg):", sum(all_f1s) / len(all_f1s))

Token-level Precision (avg): 0.2479555439148583
Token-level Recall (avg): 0.3874561557205475
Token-level F1 (avg): 0.30067839226618476


In [12]:
# =========================
# SENTIMENT F1 EVALUATION (manual small test set)
# =========================
from sklearn.metrics import precision_score, recall_score, f1_score

# Small labeled test set you define yourself
# (text, true_label) where labels are "negative", "neutral", or "positive"
sentiment_eval_data = [
    ("I am very angry, this product broke in two days!", "negative"),
    ("This is awful, I'm extremely disappointed.", "negative"),
    ("I'm upset because the item arrived damaged.", "negative"),

    ("It's okay, not great but not terrible.", "neutral"),
    ("The product is fine, it does what it should.", "neutral"),
    ("I have no strong opinion, it's just average.", "neutral"),

    ("I love this product, it works perfectly!", "positive"),
    ("I'm very happy with this purchase.", "positive"),
    ("This is fantastic, I highly recommend it.", "positive"),
]

y_true = []
y_pred = []

for text, true_label in sentiment_eval_data:
    pred_label = get_sentiment(text)
    y_true.append(true_label)
    y_pred.append(pred_label)

sentiment_precision = precision_score(y_true, y_pred, average="macro", zero_division=0)
sentiment_recall = recall_score(y_true, y_pred, average="macro", zero_division=0)
sentiment_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)

print("Sentiment Precision (macro):", sentiment_precision)
print("Sentiment Recall (macro):", sentiment_recall)
print("Sentiment F1 (macro):", sentiment_f1)


Sentiment Precision (macro): 0.45
Sentiment Recall (macro): 0.6666666666666666
Sentiment F1 (macro): 0.5357142857142857


In [13]:
# =========================
# LATENCY EVALUATION
# =========================
import time

latencies = []

for user_query, ref_answer in REFERENCE_DATASET:
    start = time.time()
    _ = chatbot(user_query, retriever)
    end = time.time()
    latencies.append(end - start)

if latencies:
    avg_latency = sum(latencies) / len(latencies)
    print("Average latency (seconds):", avg_latency)
    print("Min latency (seconds):", min(latencies))
    print("Max latency (seconds):", max(latencies))
else:
    print("No latency data (REFERENCE_DATASET is empty).")

Average latency (seconds): 5.3085521777470905
Min latency (seconds): 1.4972405433654785
Max latency (seconds): 9.53698205947876
