In [30]:
pip install -U langchain-community



In [31]:
pip install chromadb



In [32]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
import time
import json
from datetime import datetime

In [33]:
class Config:
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Faster inference
    LLM_MODEL = "google/flan-t5-xxl" if False else "google/flan-t5-base"  # Toggle for larger model
    CHUNK_SIZE = 256
    CHUNK_OVERLAP = 32
    RETRIEVAL_K = 4
    SCORE_THRESHOLD = 0.65  # Minimum similarity score for retrieval
    TEMPERATURE = 0.3  # Reduces hallucination

In [34]:
class PerformanceMetrics:
    def __init__(self):
        self.metrics = {
            "total_queries": 0,
            "response_times": [],
            "retrieval_scores": [],
            "hallucination_flags": [],
            "error_count": 0
        }

    def log_response(self, duration, scores=None, hallucination=False):
        self.metrics["total_queries"] += 1
        self.metrics["response_times"].append(duration)
        if scores:
            self.metrics["retrieval_scores"].extend(scores)
        if hallucination:
            self.metrics["hallucination_flags"].append(True)

    def save_report(self):
        report = {
            "timestamp": datetime.now().isoformat(),
            "avg_response_time": sum(self.metrics["response_times"])/len(self.metrics["response_times"]) if self.metrics["response_times"] else 0,
            "avg_retrieval_score": sum(self.metrics["retrieval_scores"])/len(self.metrics["retrieval_scores"]) if self.metrics["retrieval_scores"] else 0,
            "hallucination_rate": len(self.metrics["hallucination_flags"])/self.metrics["total_queries"] if self.metrics["total_queries"] else 0,
            "error_rate": self.metrics["error_count"]/self.metrics["total_queries"] if self.metrics["total_queries"] else 0
        }
        with open("performance_metrics.json", "w") as f:
            json.dump(report, f)
        return report

In [35]:
with open("hsuhk_kb.txt", "w", encoding="utf-8") as f:
    f.write("""## University Overview
HSUHK (Hang Seng University of Hong Kong) is a private liberal-arts-oriented university in Hong Kong.
Established: 2010 (Gained university status in 2018)
Location: Shatin, New Territories
Motto: "Erudition • Perseverance • Integrity • Commitment"
Accreditations: Approved by Hong Kong Council for Accreditation of Academic and Vocational Qualifications

## Academic Structure
- School of Business
  Programs: BBA in Financial Analysis, Digital Marketing, Global Business Management
- School of Communication
  Programs: Journalism and Communication, Film and Television Arts
- School of Decision Sciences
  Programs: Data Science, Business Analytics
- School of Humanities & Social Science
  Programs: Chinese History, Applied Psychology

## Admissions
Undergraduate Requirements:
- HKDSE: 3322 in core subjects
- IELTS: 6.0 overall
- Application Deadline: June 30 for September intake
- Tuition Fees: HKD 90,000 - 110,000/year
Scholarships: Academic Excellence Scholarship (Full tuition waiver), Sports Scholarship

## Campus Facilities
- 4 Residential Colleges (1,500 hostel places)
- Library: 250,000+ print/digital resources
- Sports Complex: Olympic-size swimming pool, gymnasium
- Innovation Hub: 3D printing lab, VR studio

## Student Life
Clubs: 50+ student organizations including Debate Team, AI Club
Exchange Programs: Partnered with 100+ universities worldwide
Career Services: 92% graduate employment rate within 6 months

## Contact
Website: https://www.hsu.edu.hk
Admissions Office: +852 3963 5555
Address: 8 Hang Shin Link, Siu Lek Yuen, Shatin""")

In [40]:
class HSUHKChatbot:
    def __init__(self):
        self.metrics = PerformanceMetrics()
        self.qa_chain = self._initialize_model()
        self.kb_version = "1.2"

    def _initialize_model(self):
        # Improved text splitting
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP,
            separators=["\n\n", "\n", ". ", "? ", "! "]
        )

        documents = TextLoader("hsuhk_kb.txt").load_and_split(text_splitter)

        # Optimized embeddings
        embeddings = HuggingFaceEmbeddings(model_name=Config.EMBEDDING_MODEL)

        # Configure vector store with metadata
        vectorstore = Chroma.from_documents(
            documents=documents,
            embedding=embeddings,
            collection_metadata={"hnsw:space": "cosine"}  # Changed parameter name
        )


        # Enhanced retriever
        retriever = vectorstore.as_retriever(
            search_type="mmr",  # Maximal Marginal Relevance
            search_kwargs={
                "k": Config.RETRIEVAL_K,
                "score_threshold": Config.SCORE_THRESHOLD
            }
        )

        # Configure LLM with temperature control
        llm = HuggingFacePipeline(pipeline=pipeline(
            "text2text-generation",
            model=Config.LLM_MODEL,
            temperature=Config.TEMPERATURE,
            max_length=256
        ))

        return RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True
        )

    def _validate_answer(self, answer, sources):
        """Check for potential hallucinations"""
        validation = {
            "contains_numbers": any(char.isdigit() for char in answer),
            "source_support": False,
            "confidence": 0.0
        }

        # Check if answer is supported by sources
        for doc in sources:
            if any(keyword in answer.lower() for keyword in doc.page_content.lower().split()[:10]):
                validation["source_support"] = True
                validation["confidence"] += 0.25

        validation["confidence"] = min(validation["confidence"], 1.0)
        return validation

    def query(self, question):
        start_time = time.time()
        try:
            result = self.qa_chain({"query": question})
            duration = time.time() - start_time

            # Extract scores from source documents
            scores = [doc.metadata.get("score", 0) for doc in result["source_documents"]]

            # Validate answer quality
            validation = self._validate_answer(result["result"], result["source_documents"])

            # Log performance metrics
            self.metrics.log_response(
                duration=duration,
                scores=scores,
                hallucination=not validation["source_support"]
            )

            return {
                "answer": result["result"],
                "sources": result["source_documents"],
                "validation": validation,
                "metrics": {
                    "response_time": duration,
                    "retrieval_scores": scores,
                    "confidence": validation["confidence"]
                }
            }

        except Exception as e:
            self.metrics.log_error()
            return {"error": str(e)}

In [41]:
def run_benchmarks(chatbot):
    test_cases = [
        ("What's the application deadline?", "June 30"),
        ("How many hostel places are there?", "1,500"),
        ("What's the HKDSE requirement?", "3322"),
        ("Invalid question test", None)
    ]

    results = []
    for question, expected in test_cases:
        start = time.time()
        response = chatbot.query(question)
        duration = time.time() - start

        result = {
            "question": question,
            "expected": expected,
            "answer": response.get("answer", ""),
            "match": expected.lower() in response.get("answer", "").lower() if expected else None,
            "confidence": response.get("validation", {}).get("confidence", 0),
            "response_time": duration
        }
        results.append(result)

    accuracy = sum(1 for r in results if r["match"]) / len([r for r in results if r["expected"]])
    avg_time = sum(r["response_time"] for r in results) / len(results)

    print(f"\nBenchmark Results:")
    print(f"Accuracy: {accuracy*100:.1f}%")
    print(f"Average Response Time: {avg_time:.2f}s")
    print(f"Detailed Report Saved to performance_metrics.json")

In [42]:
if __name__ == "__main__":
    bot = HSUHKChatbot()

    # Run performance benchmarks
    run_benchmarks(bot)

    # Interactive mode
    print("\nInteractive Chat Mode:")
    while True:
        question = input("\nQuestion: ").strip()
        if question.lower() == ":exit":
            bot.metrics.save_report()
            break

        response = bot.query(question)

        if "error" in response:
            print(f"Error: {response['error']}")
            continue

        print(f"\nAnswer: {response['answer']}")
        print(f"Confidence: {response['validation']['confidence']*100:.1f}%")
        print("\nSources:")
        for idx, doc in enumerate(response["sources"][:3], 1):
            print(f"{idx}. [{doc.metadata.get('score', 0):.2f}] {doc.page_content[:90]}...")

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=pipeline(
  result = self.qa_chain({"query": question})



Benchmark Results:
Accuracy: 100.0%
Average Response Time: 2.02s
Detailed Report Saved to performance_metrics.json

Interactive Chat Mode:

Question: What's HSUHK's motto?





Answer: "Erudition • Perseverance • Integrity • Commitment"
Confidence: 50.0%

Sources:
1. [0.00] ## University Overview
HSUHK (Hang Seng University of Hong Kong) is a private liberal-arts...
2. [0.00] Motto: "Erudition • Perseverance • Integrity • Commitment"
Accreditations: Approved by Hon...
3. [0.00] ## Academic Structure
- School of Business
  Programs: BBA in Financial Analysis, Digital ...

Question: Is HSUHK a public or private university?





Answer: private
Confidence: 0.0%

Sources:
1. [0.00] ## University Overview
HSUHK (Hang Seng University of Hong Kong) is a private liberal-arts...
2. [0.00] ## Academic Structure
- School of Business
  Programs: BBA in Financial Analysis, Digital ...
3. [0.00] Scholarships: Academic Excellence Scholarship (Full tuition waiver), Sports Scholarship...

Question: How many schools does HSUHK have?





Answer: ##
Confidence: 75.0%

Sources:
1. [0.00] ## University Overview
HSUHK (Hang Seng University of Hong Kong) is a private liberal-arts...
2. [0.00] - School of Decision Sciences
  Programs: Data Science, Business Analytics
- School of Hum...
3. [0.00] ## Admissions
Undergraduate Requirements:
- HKDSE: 3322 in core subjects
- IELTS: 6.0 over...

Question: What undergraduate programs does the School of Business offer





Answer: BBA in Financial Analysis, Digital Marketing, Global Business Management
Confidence: 75.0%

Sources:
1. [0.00] ## Academic Structure
- School of Business
  Programs: BBA in Financial Analysis, Digital ...
2. [0.00] - School of Decision Sciences
  Programs: Data Science, Business Analytics
- School of Hum...
3. [0.00] Scholarships: Academic Excellence Scholarship (Full tuition waiver), Sports Scholarship...

Question: exit





Answer: The school's motto is "Erudition, Perseverance, Integrity, Commitment"
Confidence: 50.0%

Sources:
1. [0.00] Motto: "Erudition • Perseverance • Integrity • Commitment"
Accreditations: Approved by Hon...
2. [0.00] - School of Decision Sciences
  Programs: Data Science, Business Analytics
- School of Hum...
3. [0.00] ## Campus Facilities
- 4 Residential Colleges (1,500 hostel places)
- Library: 250,000+ pr...


KeyboardInterrupt: Interrupted by user