# University Chatbot Development

This notebook develops an AI chatbot for Emirates Aviation University using transformer models.

## Contents:
1. Setup and Data Loading
2. Data Preprocessing
3. Model Development
4. Training
5. Evaluation

In [5]:
# Install required packages
!pip install transformers torch pandas numpy scikit-learn sentence-transformers

Collecting transformers
  Using cached transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting torch
  Using cached torch-2.5.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Using cached huggingface_hub-0.26.1-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Using cached safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Using cached tokenizers-0.20.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Collecting sympy==1.13.1 (from torch)
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Using cached transformers-4.45.2-py3-none-any.whl (9.9 MB)
Downloading torch-2.5.0-cp312-cp312-win_amd64.whl (203.1 MB)
   ---------------------------------------- 0.0/203.1 MB ? eta -:--:--
   ----------------------------

In [15]:
# Import necessary libraries
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
from typing import List, Tuple, Dict
import re


warnings.filterwarnings('ignore')

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


## 1. Data Loading
Load the university data from CSV files

In [7]:
# Load all CSV files
basic_info = pd.read_csv('dataset/university_basic_info.csv', sep=',', encoding='utf-8', skip_blank_lines=True, on_bad_lines='skip')
programs = pd.read_csv('dataset/programs_and_courses.csv', sep=',', encoding='utf-8', skip_blank_lines=True, on_bad_lines='skip')
facilities = pd.read_csv('dataset/facilities.csv', sep=',', encoding='utf-8', skip_blank_lines=True, on_bad_lines='skip')
accreditations = pd.read_csv('dataset/accreditations_partnerships.csv', sep=',', encoding='utf-8', skip_blank_lines=True, on_bad_lines='skip')
faculties = pd.read_csv('dataset/faculties_programs.csv', sep=',', encoding='utf-8', skip_blank_lines=True, on_bad_lines='skip')
events = pd.read_csv('dataset/events_and_news.csv', sep=',', encoding='utf-8', skip_blank_lines=True, on_bad_lines='skip')
faculty_info = pd.read_csv('dataset/faculty_info.csv', sep=',', encoding='utf-8', skip_blank_lines=True, on_bad_lines='skip')
features = pd.read_csv('dataset/key_features.csv', sep=',', encoding='utf-8', skip_blank_lines=True, on_bad_lines='skip')
research = pd.read_csv('dataset/research_and_publications.csv', sep=',', encoding='utf-8', skip_blank_lines=True, on_bad_lines='skip')
services = pd.read_csv('dataset/student_services.csv', sep=',', encoding='utf-8', skip_blank_lines=True, on_bad_lines='skip')

print("Data loaded successfully!")

# Display basic information about the university
print("\nUniversity Basic Information:")
display(basic_info)

Data loaded successfully!

University Basic Information:


Unnamed: 0,name,value
0,university_name,Emirates Aviation University
1,abbreviation,EAU
2,location,Dubai Academic City - Dubai - United Arab Emir...
3,founding_year,1991
4,type,Private university specializing in aviation ed...
5,affiliation,Part of the Emirates Group
6,chancellor,His Highness Sheikh Ahmed bin Saeed Al Maktoum
7,vice_chancellor,Professor Ahmad Al Ali
8,language_of_instruction,English


## 2. Data Preprocessing
Prepare the data for the model by creating structured text representations

In [11]:
def create_knowledge_base():
    knowledge_texts = []
    
    # Process basic information
    university_name = basic_info.loc[basic_info['name'] == 'university_name', 'value'].iloc[0]
    location = basic_info.loc[basic_info['name'] == 'location', 'value'].iloc[0]
    chancellor = basic_info.loc[basic_info['name'] == 'chancellor', 'value'].iloc[0]
    vice_chancellor = basic_info.loc[basic_info['name'] == 'vice_chancellor', 'value'].iloc[0]
    founding_year = basic_info.loc[basic_info['name'] == 'founding_year', 'value'].iloc[0]
    knowledge_texts.append(f"{university_name} is located in {location}")
    knowledge_texts.append(f"The university was founded in {founding_year}")
    knowledge_texts.append(f"The Chancellor is {chancellor}")
    knowledge_texts.append(f"The Vice Chancellor is {vice_chancellor}")
    
    # Process faculty programs
    for faculty in faculties['faculty'].unique():
        faculty_programs = faculties[faculties['faculty'] == faculty]['program_type'].tolist()
        program_text = f"The {faculty} offers the following programs: {', '.join(faculty_programs)}"
        knowledge_texts.append(program_text)
    
    # Process programs
    for _, row in programs.iterrows():
        program_text = f"The {row['program_name']} is a {row['level']} program in {row['department']}. "
        program_text += f"Courses include: {row['courses']}"
        knowledge_texts.append(program_text)
    
    # Process facilities by category
    for category in facilities['category'].unique():
        category_facilities = facilities[facilities['category'] == category]['facility_name'].tolist()
        facility_text = f"The {category} facilities include: {', '.join(category_facilities)}"
        knowledge_texts.append(facility_text)
    
    # Process accreditations and partnerships
    accred_list = accreditations[accreditations['type'] == 'accreditation']['organization'].tolist()
    partner_list = accreditations[accreditations['type'] == 'partnership']['organization'].tolist()
    knowledge_texts.append(f"The university is accredited by: {', '.join(accred_list)}")
    knowledge_texts.append(f"The university has partnerships with: {', '.join(partner_list)}")
    
    # Process key features
    for _, row in features.iterrows():
        knowledge_texts.append(f"Feature: {row['feature']}")
    
    # Process faculty information
    for _, row in faculty_info.iterrows():
        faculty_text = f"Dr. {row['name']} is a {row['title']} in the {row['department']}"
        if pd.notna(row['specialization']):
            faculty_text += f", specializing in {row['specialization']}"
        if pd.notna(row['research_interests']):
            faculty_text += f". Research interests include: {row['research_interests']}"
        knowledge_texts.append(faculty_text)
    
    # Process research and publications
    for _, row in research.iterrows():
        if pd.notna(row['publication_title']) and pd.notna(row['publication_venue']):
            research_text = f"Research: {row['publication_title']} published in {row['publication_venue']}"
            knowledge_texts.append(research_text)
    
    # Process student services
    services_list = services['service_name'].tolist()
    knowledge_texts.append(f"Student services available: {', '.join(services_list)}")
    
    # Process events and news
    for _, row in events.iterrows():
        if pd.notna(row['date']) and pd.to_datetime(row['date']) > pd.Timestamp.now():
            event_text = f"Upcoming {row['type']}: {row['title']} on {row['date']}"
            if pd.notna(row['description']):
                event_text += f" - {row['description']}"
            knowledge_texts.append(event_text)
    
    return knowledge_texts

# Create knowledge base
knowledge_base = create_knowledge_base()

# Display sample entries
print("Sample Knowledge Base Entries:")
for i, text in enumerate(knowledge_base[:20]):
    print(f"\n{i+1}. {text}")

Sample Knowledge Base Entries:

1. Emirates Aviation University is located in Dubai Academic City - Dubai - United Arab Emirates

2. The university was founded in 1991

3. The Chancellor is His Highness Sheikh Ahmed bin Saeed Al Maktoum

4. The Vice Chancellor is Professor Ahmad Al Ali

5. The Faculty of Engineering offers the following programs: Undergraduate, Postgraduate

6. The Faculty of Business Management offers the following programs: Undergraduate, Postgraduate

7. The Faculty of Mathematics and Data Science offers the following programs: Undergraduate, Postgraduate, PhD

8. The Institute of Applied Research and Technology offers the following programs: Research

9. The All Faculties offers the following programs: Professional Training, Foundation and Diploma

10. The Air Transport Management is a Bachelor of Science (Honours) program in Business. Courses include: Introduction to Air Transport Industry, Aviation Strategy and Planning, Airline Route and Fleet Planning, Airline 

## 3. Model Development
Initialize and set up the transformer models

In [31]:
class UniversityChatbot:
    def __init__(self):
        print("Loading embedding model...")
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        
        print("Loading language model...")
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(device)
        
        print("Creating knowledge embeddings...")
        self.knowledge_base = knowledge_base
        self.knowledge_embeddings = self.embedding_model.encode(
            knowledge_base,
            batch_size=32,
            show_progress_bar=True,
            convert_to_tensor=True
        )
        
        # Create category mappings
        self.category_patterns = {
            'location': r'where|location|address|campus',
            'programs': r'program|course|study|degree|major',
            'facilities': r'facility|facilities|amenities|campus|building',
            'faculty': r'professor|teacher|instructor|faculty|staff|teach',
            'admissions': r'admission|apply|enroll|requirement|criteria',
            'research': r'research|publication|paper|study|project',
            'events': r'event|activity|workshop|seminar|conference',
            'services': r'service|support|help|assistance|aid'
        }
        
        print("Chatbot initialized successfully!")

    def preprocess_query(self, query: str) -> str:
        """Clean and standardize the query"""
        query = query.lower().strip()
        query = re.sub(r'[^\w\s]', ' ', query)
        query = re.sub(r'\s+', ' ', query)
        return query

    def get_query_category(self, query: str) -> str:
        """Determine the category of the query"""
        query = self.preprocess_query(query)
        for category, pattern in self.category_patterns.items():
            if re.search(pattern, query):
                return category
        return 'general'

    def get_relevant_knowledge(self, query: str, top_k: int = 5) -> Tuple[List[str], np.ndarray]:
        """Get relevant knowledge based on query category and semantic similarity"""
        query = self.preprocess_query(query)
        category = self.get_query_category(query)
        
        # Get query embedding
        query_embedding = self.embedding_model.encode([query], convert_to_tensor=True)
        
        # Calculate base similarities
        with torch.no_grad():
            similarities = cosine_similarity(
                query_embedding.cpu().numpy(),
                self.knowledge_embeddings.cpu().numpy()
            )[0]
        
        # Boost similarity scores based on category matching
        boosted_similarities = similarities.copy()
        for i, knowledge in enumerate(self.knowledge_base):
            # Boost scores for knowledge entries matching the query category
            if category != 'general':
                if category in knowledge.lower():
                    boosted_similarities[i] *= 1.2  # 20% boost for category match
                
            # Boost scores for knowledge entries with key terms from query
            query_terms = set(query.lower().split())
            knowledge_terms = set(knowledge.lower().split())
            matching_terms = query_terms.intersection(knowledge_terms)
            if matching_terms:
                term_boost = 1 + (len(matching_terms) / len(query_terms)) * 0.3  # Up to 30% boost
                boosted_similarities[i] *= term_boost
        
        # Get top-k relevant knowledge based on boosted similarities
        top_indices = np.argsort(boosted_similarities)[-top_k:][::-1]
        relevant_knowledge = [self.knowledge_base[i] for i in top_indices]
        relevant_similarities = similarities[top_indices]  # Return original similarities
        
        # Sort knowledge by relevance while maintaining context
        context_pairs = list(zip(relevant_knowledge, relevant_similarities))
        context_pairs.sort(key=lambda x: x[1], reverse=True)
        
        # Group related information
        grouped_knowledge = []
        used_indices = set()
        
        for i, (knowledge, sim) in enumerate(context_pairs):
            if i in used_indices:
                continue
                
            related_group = [knowledge]
            knowledge_terms = set(knowledge.lower().split())
            
            # Look for related information
            for j, (other_knowledge, other_sim) in enumerate(context_pairs):
                if j != i and j not in used_indices:
                    other_terms = set(other_knowledge.lower().split())
                    # Check for significant term overlap
                    overlap = len(knowledge_terms.intersection(other_terms)) / len(knowledge_terms)
                    if overlap > 0.3:  # If more than 30% terms overlap
                        related_group.append(other_knowledge)
                        used_indices.add(j)
            
            grouped_knowledge.extend(related_group)
            used_indices.add(i)
        
        # Ensure we don't exceed top_k while keeping groups together
        grouped_knowledge = grouped_knowledge[:top_k]
        
        # Recalculate similarities for final grouped knowledge
        final_similarities = []
        for knowledge in grouped_knowledge:
            knowledge_embedding = self.embedding_model.encode([knowledge], convert_to_tensor=True)
            with torch.no_grad():
                sim = cosine_similarity(
                    query_embedding.cpu().numpy(),
                    knowledge_embedding.cpu().numpy()
                )[0][0]
            final_similarities.append(sim)
        
        return grouped_knowledge, np.array(final_similarities)

    def format_response(self, response: str) -> str:
        """Clean and format the response"""
        # Remove prompt and instruction text
        response = re.sub(r'You are.*?so:', '', response, flags=re.DOTALL)
        response = re.sub(r'Question:.*?Answer:', '', response, flags=re.DOTALL)
        
        # Remove any URLs
        response = re.sub(r'http\S+', '', response)
        
        # Remove any Q: or A: segments
        response = re.sub(r'Q:.*', '', response)
        response = re.sub(r'A:.*', '', response)
        
        # Clean up whitespace
        response = re.sub(r'\s+', ' ', response).strip()
        
        # Ensure proper sentence ending
        if response and not response.endswith(('.', '!', '?')):
            response += '.'
                
        return response


    def generate_response(self, query: str) -> Tuple[str, List[str]]:
        # Get relevant knowledge
        relevant_knowledge, similarities = self.get_relevant_knowledge(query)
        
        # Filter out low-confidence knowledge
        confidence_threshold = 0.3
        relevant_knowledge = [k for k, s in zip(relevant_knowledge, similarities) 
                            if s > confidence_threshold]
        
        if not relevant_knowledge:
            return "I apologize, but I don't have enough information to answer that question accurately.", []
        
        # Prepare context
        context = (
            "You are helping a student with questions about Emirates Aviation University. "
            "Use ONLY the following facts to answer - if you're not sure, say so:\n\n"
            f"{' '.join(relevant_knowledge)}\n\n"
            f"Question: {query}\n"
            "Answer: "
        )
        
        try:
            inputs = self.tokenizer(context, return_tensors='pt', truncation=True, 
                                  max_length=1024).to(device)
            
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,  # Use max_new_tokens instead of max_length
                    num_beams=5,
                    no_repeat_ngram_size=3,
                    num_return_sequences=1,
                    top_k=50,
                    top_p=0.85,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                    early_stopping=True
                )
            
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            response = self.format_response(response)
            
            # Verify response makes sense
            if len(response.split()) < 3:
                response = "Based on the available information, I cannot provide a complete answer. Please contact the university for more details."
                
            return response, relevant_knowledge
                
        except Exception as e:
            print(f"Error generating response: {str(e)}")
            return "I apologize, but I encountered an error while generating the response. Please try rephrasing your question.", relevant_knowledge
            
# Initialize chatbot
chatbot = UniversityChatbot()

Loading embedding model...
Loading language model...
Creating knowledge embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Chatbot initialized successfully!


## 4. Testing the Chatbot
Test the chatbot with sample queries

In [33]:
def test_chatbot(query: str, verbose: bool = True):
    """
    Test the chatbot with a single query and display results
    
    Args:
        query: The question to ask the chatbot
        verbose: Whether to show detailed analysis
    """
    if verbose:
        print("\n" + "="*80)
        print(f"Query: {query}")
        print("="*80)
        print("\nProcessing query...")
        
        # Get query category
        category = chatbot.get_query_category(query)
        print(f"Query Category: {category}")
        
    response, relevant_knowledge = chatbot.generate_response(query)
    
    if verbose:
        print("\nRelevant Knowledge:")
        for i, knowledge in enumerate(relevant_knowledge, 1):
            relevance = cosine_similarity(
                chatbot.embedding_model.encode([query]), 
                chatbot.embedding_model.encode([knowledge])
            )[0][0]
            print(f"{i}. [{relevance:.2f}] {knowledge}")
        
        print("\nResponse Analysis:")
        print(f"Length: {len(response.split())} words")
        print(f"Confidence: {get_response_confidence(response, query):.2%}")
        
    print("\nChatbot Response:", end=" ")
    print(response)
    
    if verbose:
        print("-"*80)
    
    return response, relevant_knowledge

def get_response_confidence(response: str, query: str) -> float:
    """Calculate confidence score for the response"""
    with torch.no_grad():
        response_embedding = chatbot.embedding_model.encode([response])
        query_embedding = chatbot.embedding_model.encode([query])
        confidence = cosine_similarity(response_embedding, query_embedding)[0][0]
    return confidence

def batch_test_chatbot(test_queries: list, verbose: bool = True):
    """
    Test the chatbot with multiple queries and show performance metrics
    
    Args:
        test_queries: List of questions to test
        verbose: Whether to show detailed results for each query
    """
    print("\nStarting Batch Testing...")
    print(f"Number of test queries: {len(test_queries)}")
    print("-"*80)
    
    results = []
    total_time = 0
    
    for i, query in enumerate(test_queries, 1):
        start_time = time.time()
        response, knowledge = test_chatbot(query, verbose)
        end_time = time.time()
        
        response_time = end_time - start_time
        total_time += response_time
        
        confidence = get_response_confidence(response, query)
        
        results.append({
            'query': query,
            'response': response,
            'confidence': confidence,
            'response_time': response_time,
            'knowledge_count': len(knowledge)
        })
    
    # Display summary metrics
    print("\nTest Summary:")
    print("="*80)
    print(f"Average response time: {total_time/len(test_queries):.2f} seconds")
    print(f"Average confidence score: {np.mean([r['confidence'] for r in results]):.2%}")
    print(f"Average knowledge pieces used: {np.mean([r['knowledge_count'] for r in results]):.1f}")
    
    # Display high/low confidence responses
    print("\nHighest Confidence Responses:")
    for r in sorted(results, key=lambda x: x['confidence'], reverse=True)[:2]:
        print(f"\nQ: {r['query']}")
        print(f"A: {r['response']}")
        print(f"Confidence: {r['confidence']:.2%}")
    
    print("\nLowest Confidence Responses:")
    for r in sorted(results, key=lambda x: x['confidence'])[:2]:
        print(f"\nQ: {r['query']}")
        print(f"A: {r['response']}")
        print(f"Confidence: {r['confidence']:.2%}")

# Test queries with varying complexity
test_queries = [
    # Basic information queries
    "What engineering programs do you offer?",
    "Tell me about the sports facilities",
    "What are the admission requirements for Aviation Safety program?",
    "Where is the university located?",
    
    # Complex queries
    "Can you compare the different engineering programs in terms of duration and career prospects?",
    "What research is being conducted in aerospace engineering?",
    
    # Edge cases
    "What is the meaning of life?",  # Out of scope
    "Tell me about underwater basket weaving courses",  # Non-existent program
    
    # Specific detailed queries
    "Who are the professors in the Faculty of Engineering?",
    "What are the upcoming events this month?"
]

# Run batch testing
import time
batch_test_chatbot(test_queries)


Starting Batch Testing...
Number of test queries: 10
--------------------------------------------------------------------------------

Query: What engineering programs do you offer?

Processing query...
Query Category: programs

Relevant Knowledge:
1. [0.73] The Faculty of Engineering offers the following programs: Undergraduate, Postgraduate
2. [0.52] The Engineering Business Management is a Master of Science program in Engineering. Courses include: Operations Management, Project Management, Supply Chain Management, Quality Management Systems
3. [0.49] The Mechanical Engineering is a Master of Science program in Engineering. Courses include: Advanced Thermodynamics, Fluid Mechanics, Mechanical Design

Response Analysis:
Length: 20 words
Confidence: 30.17%

Chatbot Response: Based on the available information, I cannot provide specific admission requirements. Please contact the university admissions office for detailed requirements.
----------------------------------------------------

## 5. Interactive Chat Interface
Create an interactive interface for chatting with the bot

In [48]:
def chat_interface():
    print("Welcome to the Emirates Aviation University Chatbot!")
    print("Type 'exit' to end the conversation\n")
    
    while True:
        query = input("You: ")
        if query.lower() == 'exit':
            print("\nThank you for chatting! Goodbye!")
            break
            
        response, _ = chatbot.generate_response(query)
        print(f"\nBot: {response}\n")

# Start interactive chat
chat_interface()

Welcome to the Emirates Aviation University Chatbot!
Type 'exit' to end the conversation



You:  Who is the chancellor



Bot: The Engineering Business Management is a Master of Science program in Engineering. Courses include: Operations Management, Project Management, Supply Chain Management, Quality Management Systems
The university is accredited by: 
The university has partnerships with: 
Question: Who is the chancellor
Answer: It's me.



You:  Do you have scholarships?



Bot: The Residential facilities include: On-campus student accommodation
The university is accredited by: 
The university has partnerships with: 
Question: Do you have scholarships?
Answer: Yes, I have a scholarship.



You:  exit



Thank you for chatting! Goodbye!


## 6. Model Evaluation (Optional)
Evaluate the chatbot's performance

In [50]:
def evaluate_chatbot(test_cases):
    results = []
    
    for query in test_cases:
        response, knowledge = chatbot.generate_response(query)
        results.append({
            'query': query,
            'response': response,
            'relevant_knowledge': knowledge
        })
    
    return pd.DataFrame(results)

# Add your test cases here
test_cases = [
    "What are your accreditations?",
    "Do you offer PhD programs?",
    "What facilities do you have for students?"
]

# Run evaluation
evaluation_results = evaluate_chatbot(test_cases)
display(evaluation_results)

Unnamed: 0,query,response,relevant_knowledge
0,What are your accreditations?,The Academic facilities include: Engineering l...,[The Academic facilities include: Engineering ...
1,Do you offer PhD programs?,The Data Science is a Bachelor of Science prog...,[The Data Science is a Bachelor of Science pro...
2,What facilities do you have for students?,The Academic facilities include: Engineering l...,[The Academic facilities include: Engineering ...
