In [1]:
import ollama
import json
import re

In [2]:
# Initialize Ollama client
client = ollama.Client()

# Define model and classification prompt
model = "qwen2.5:1.5b"

In [27]:
def classify_query_ollama(user_query):
    """Classifies query type and extracts filters dynamically using Qwen."""
    
    # Dynamic prompt (no fixed query)
    prompt = f"""
        You are a smart AI assistant specialized in analyzing complex queries.
        Your task is to classify the user's query into one of the following categories:
        - **Retrieval:** Direct search for facts or topics.
        - **Ranking:** Prioritizing or comparing multiple courses.
        - **Inference:** Logical reasoning or prerequisites.
        - **Multi-Hop:** Complex multi-step logical queries.

        ### **Instructions:**  
        1. **Break down** the query into smaller sub-questions if possible.  
        2. **Identify dependencies** between sub-questions (multi-hop if >1 step).  
        3. **Check if any sub-question involves comparison or ranking.**  
        4. **Decide** on the final query type **after** reasoning.  

        ### **Query:**  
        {user_query}

        ### **Chain of Thought (Reasoning):**  
        1. What is the main goal of this query?  
        2. Are there multiple logical steps or sub-questions?  
        3. Is there a ranking or comparison required?  
        4. Is logical inference or multi-hop reasoning needed?

        ### **Final JSON Output:**  
        Return ONLY a JSON object:
        ```json
        {{
        "subject": "<subject>",
        "filters": {{
            "type": "<free/paid/unknown>",
            "level": "<beginner/intermediate/advanced/unknown>",
            "duration": "<short/long/unknown>"
        }},
        "query_type": "<retrieval/ranking/inference/multi-hop/unknown>"
        }}"""
    
    # Generate response from Ollama
    response = client.generate(model=model, prompt=prompt)
    print("Raw Response from Ollama:")
    print(response.response)

    # Clean response: Remove backticks and whitespace
    cleaned_response = re.sub(r'```json|```', '', response.response).strip()

    # Parse JSON safely
    try:
        parsed_response = json.loads(cleaned_response)
        return parsed_response
    except json.JSONDecodeError as e:
        print("\n❌ Error: Invalid JSON format.")
        print(f"Error Details: {e}")
        return {"error": "Invalid JSON format", "response": cleaned_response}

# Example Usage (Dynamic Query)
user_query = "Which course is suitable for beginners in Data Science?"
result = classify_query_ollama(user_query)
print("\n✅ Final Parsed Result:")
print(json.dumps(result, indent=4))

Raw Response from Ollama:
```json
{
    "subject": "Data Science courses",
    "filters": {
        "type": "free/paid/unknown",
        "level": "beginner",
        "duration": "short"
    },
    "query_type": "retrieval"
}
```

✅ Final Parsed Result:
{
    "subject": "Data Science courses",
    "filters": {
        "type": "free/paid/unknown",
        "level": "beginner",
        "duration": "short"
    },
    "query_type": "retrieval"
}


In [6]:
import json
import re

def classify_query_qwen(user_query):
    """Classifies query type and extracts filters dynamically using Qwen."""
    
    # Dynamic prompt for Qwen
    prompt = f"""
        You are a smart AI assistant specialized in analyzing complex queries.
        Your task is to classify the user's query into one of the following categories:
        - **Retrieval:** Direct search for facts or topics.
        - **Ranking:** Prioritizing or comparing multiple courses.
        - **Inference:** Logical reasoning or prerequisites.
        - **Multi-Hop:** Complex multi-step logical queries.

        ### **Instructions:**  
        1. **Break down** the query into smaller sub-questions if possible.  
        2. **Identify dependencies** between sub-questions (multi-hop if >1 step).  
        3. **Check if any sub-question involves comparison or ranking.**  
        4. **Decide** on the final query type **after** reasoning.  

        ### **Query:**  
        {user_query}

        ### **Chain of Thought (Reasoning):**  
        1. What is the main goal of this query?  
        2. Are there multiple logical steps or sub-questions?  
        3. Is there a ranking or comparison required?  
        4. Is logical inference or multi-hop reasoning needed?  

        ### **Final JSON Output:**  
        Return ONLY a JSON object:
        ```json
        {{
        "subject": "<subject>",
        "filters": {{
            "type": "<free/paid/unknown>",
            "level": "<beginner/intermediate/advanced/unknown>",
            "duration": "<short/long/unknown>"
        }},
        "query_type": "<retrieval/ranking/inference/multi-hop/unknown>"
        }}"""
    
    # Call Qwen API
    response = client.generate(model=model, prompt=prompt)

    print("\n🔍 Raw Response from Qwen:")
    print(response.response)

    # Clean response
    cleaned_response = re.sub(r'```json|```', '', response.response).strip()

    # Safely parse JSON
    try:
        parsed_response = json.loads(cleaned_response)
        return parsed_response
    except json.JSONDecodeError as e:
        print("\n❌ Error: Invalid JSON format.")
        print(f"Error Details: {e}")
        return {"error": "Invalid JSON format", "response": cleaned_response}

# Example Queries
queries = [
    "List all free Python courses.",
    "Which is better: Course A or Course B?",
    "What are the prerequisites for an advanced AI course?",
    "Find a course that teaches Python and covers machine learning."
]

# Testing Each Query
for q in queries:
    print(f"\n🗂️ Query: {q}")
    result = classify_query_qwen(q)
    print("\n✅ Final Parsed Result:")
    print(json.dumps(result, indent=4))



🗂️ Query: List all free Python courses.

🔍 Raw Response from Qwen:
```json
{
    "subject": "Python courses",
    "filters": {
        "type": "free",
        "level": "beginner",
        "duration": "unknown"
    },
    "query_type": "retrieval"
}
```

✅ Final Parsed Result:
{
    "subject": "Python courses",
    "filters": {
        "type": "free",
        "level": "beginner",
        "duration": "unknown"
    },
    "query_type": "retrieval"
}

🗂️ Query: Which is better: Course A or Course B?

🔍 Raw Response from Qwen:
```json
{
    "subject": "Compare Course A and Course B",
    "filters": {
        "type": "Unknown",
        "level": "Unknown",
        "duration": "Unknown"
    },
    "query_type": "Multi-Hop"
}
```

✅ Final Parsed Result:
{
    "subject": "Compare Course A and Course B",
    "filters": {
        "type": "Unknown",
        "level": "Unknown",
        "duration": "Unknown"
    },
    "query_type": "Multi-Hop"
}

🗂️ Query: What are the prerequisites for an advanced

In [7]:
user_query = "I have 3 months studying data science field with nothing about statistical, which course should i study to has deeper knowledge on statistical?"
result = classify_query_qwen(user_query)
result


🔍 Raw Response from Qwen:
```json
{
  "subject": "Data Science",
  "filters": {
    "type": "programming",
    "level": "intermediate",
    "duration": "long"
  },
  "query_type": "inference"
}
```


{'subject': 'Data Science',
 'filters': {'type': 'programming',
  'level': 'intermediate',
  'duration': 'long'},
 'query_type': 'inference'}

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd
import json

# Load courses data
with open("D:\\Thesis\\Courses-Searching\\src\\db\\processed_courses_detail.json", "r") as f:
    courses = json.load(f)

# Initialize SBERT model
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def combine_course_text(course):
    """Creates a combined text from multiple relevant fields."""
    # Extract key fields
    title = course.get("title", "") or ""
    description = course.get("description", "") or ""
    skills = ", ".join(course.get("knowledge_requirements", {}).get("teaches") or [])
    category = course.get("category", "") or ""
    sub_category = course.get("sub_category", "") or ""
    instructors = ", ".join(course.get("instructors") or [])
    provider = course.get("course_info", {}).get("provider", "") or ""
    duration = str(course.get("duration_months") or "")
    
    # Combine into one text string
    combined_text = f"""
    Title: {title}.
    Description: {description}.
    Skills: {skills}.
    Category: {category}, {sub_category}.
    Instructors: {instructors}.
    Provider: {provider}.
    Duration: {duration} months.
    """
    
    return combined_text

# Create embeddings for all courses
course_texts = [combine_course_text(course) for course in courses]
embeddings = embedding_model.encode(course_texts, convert_to_numpy=True)

# Create FAISS index
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings)
faiss.write_index(faiss_index, "faiss_course_index_full.bin")

print(f"✅ FAISS index built with {len(course_texts)} courses.")


✅ FAISS index built with 8092 courses.


In [9]:
import faiss
import numpy as np

# Load FAISS index
faiss_index = faiss.read_index("faiss_course_index_full.bin")

# Quick Test: Semantic Search
def semantic_search(query_text, top_k=10):
    """Performs semantic search on FAISS index using SBERT."""
    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True)
    distances, indices = faiss_index.search(query_embedding, top_k)
    
    # Retrieve results
    results = [courses[i] for i in indices[0]]
    return results

# Example Test
results = semantic_search("Beginner Data Science courses", top_k=5)
print(f"Retrieved {len(results)} courses.")


Retrieved 5 courses.


In [12]:
results = semantic_search("What are the top 3 Python courses?", top_k=5)
#results
def rank_courses_advanced(courses):
    """Ranks courses considering rating, reviews, duration, and metadata completeness."""
    def score(course):
        # Extract features
        rating = course.get('rating') or 0  # Default to 0 if None
        reviews = course.get('reviews', {}).get('total_reviews') or 0
        duration = course.get('duration_months') or 0
        
        # 1. Penalize missing ratings more severely
        rating_penalty = 0.1 if rating == 0 else 1.0  # Drastic penalty for None
        
        # 2. Penalize excessively long courses (>12 months)
        duration_penalty = 1 if duration <= 12 else 1 / (1 + (duration - 12) * 0.2)
        
        # 3. Boost popular courses with more reviews
        popularity = min(reviews, 1000) / 1000  # Capped at 1000
        
        # Final composite score
        return ((rating * 0.6 + popularity * 0.4) * 
                duration_penalty * rating_penalty)
    
    # Sort by final score
    return sorted(courses, key=score, reverse=True)

# Apply the updated ranking
ranked_courses = rank_courses_advanced(results)


def fill_missing_metadata(course):
    """Fills missing fields with defaults."""
    course['rating'] = course.get('rating', 0)  # Set to 0 if missing
    course['course_info']['language'] = course['course_info'].get('language', 'Unknown')
    course['category'] = course.get('category', 'General')
    return course

# Apply to all results
results = [fill_missing_metadata(c) for c in ranked_courses]
def remove_duplicates(courses):
    """Removes duplicate courses based on title or course ID."""
    seen = set()
    unique_courses = []
    for course in courses:
        if course['title'] not in seen:
            unique_courses.append(course)
            seen.add(course['title'])
    return unique_courses

# Remove duplicates before final output
unique_ranked_courses = remove_duplicates(ranked_courses)

def semantic_rerank(query, results):
    """Reranks results by semantic similarity, prioritizing title and skills."""
    result_texts = []
    for r in results:
        title = r.get('title', "") or ""
        description = r.get('description', "") or ""
        skills = ", ".join(r.get('knowledge_requirements', {}).get('teaches', []) or [])
        
        # Prioritize title and skills more
        combined_text = f"{title} {skills} {description}"
        result_texts.append(combined_text)
    
    # Encode query and results
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    result_embeddings = embedding_model.encode(result_texts, convert_to_numpy=True)
    
    # Compute cosine similarity
    scores = np.dot(result_embeddings, query_embedding.T).flatten()
    
    # Sort by similarity score
    ranked = [r for _, r in sorted(zip(scores, results), reverse=True)]
    return ranked


# Re-rank the current results
reranked_courses = semantic_rerank("Getting started with statistical", unique_ranked_courses)


def format_output(courses):
    """Formats the final ranked list for output."""
    if not courses:
        return "No courses match your criteria."

    response = f"✅ Found {len(courses)} courses:\n"
    for i, course in enumerate(courses[:5], 1):  # Show top 5 results
        response += f"\n{i}. {course['title']} - {course['rating']}⭐"
        response += f"\n   URL: {course['url']}"
        response += f"\n   Duration: {course['duration_months']} months"
        response += f"\n   Suitable for: {', '.join(course['learning_path']['suitable_for'])}"
    return response

# Final formatted output
print(format_output(unique_ranked_courses))



✅ Found 5 courses:

1. Python 3 Programming Specialization - 4.7⭐
   URL: https://www.coursera.org/specializations/python-3-programming
   Duration: 5.0 months
   Suitable for: Advanced Learners
2. Python for Command-and-control, Exfiltration and Impact - 4.6⭐
   URL: https://www.coursera.org/learn/command-and-control-exfiltration-and-impact?specialization=pythonforcybersecurity
   Duration: 3.0 months
   Suitable for: Advanced Learners
3. Introduction to Python Programming - 4.4⭐
   URL: https://www.coursera.org/learn/python-programming-intro
   Duration: 27.0 months
   Suitable for: Advanced Learners
4. Python Certification Course - None⭐
   URL: https://www.simplilearn.com/mobile-and-software-development/python-development-training?tag=
   Duration: None months
   Suitable for: Advanced Learners
5. Learn Intermediate Python - None⭐
   URL: https://www.udacity.com/course/intermediate-python-nanodegree--nd303
   Duration: 2.0 months
   Suitable for: Advanced Learners


In [14]:
import ollama
import json

# Initialize Ollama client
client = ollama.Client()
model = "qwen2.5:1.5b"

def generate_inference_prompt(query, course):
    """Generates a prompt for Qwen to infer relevant details."""
    return f"""
    Analyze the following course and the user query. 
    
    **Course Details:**
    Title: {course['title']}
    Description: {course['description']}
    Skills Taught: {', '.join(course.get('knowledge_requirements', {}).get('teaches', []))}
    Prerequisites: {', '.join(course.get('knowledge_requirements', {}).get('prerequisites', []))}
    Duration: {course.get('duration_months', 'Unknown')} months
    Rating: {course.get('rating', 'Unknown')}
    Suitable For: {', '.join(course.get('learning_path', {}).get('suitable_for', []))}
    Career Paths: {', '.join(course.get('learning_path', {}).get('career_paths', []))}
    
    **User Query:** "{query}"
    
    **Task:** 
    - Determine if this course satisfies the user's query.
    - If the query involves difficulty, infer the level (beginner/intermediate/advanced).
    - If the query involves prerequisites, list the skills required.
    - If the query involves suitability, decide whether the course fits the career or skill needs.
    - If temporal reasoning is needed, explain the sequence of topics.

    Output ONLY a JSON object with this structure:
    {{
      "match": "<yes/no>",
      "difficulty": "<beginner/intermediate/advanced or None>",
      "suitability": "<suitable/not suitable>",
      "prerequisites": "<list of prerequisites or None>",
      "temporal_info": "<before/after/None>"
    }}
    """



### **3. General Inference Function:**  

def perform_inference(query, courses):
    """Performs inference on multiple courses using Qwen."""
    results = []
    
    for course in courses:
        prompt = generate_inference_prompt(query, course)
        response = client.generate(model=model, prompt=prompt)
        
        try:
            # Parse JSON response
            parsed_response = json.loads(response.response)
            # Add course ID to response for tracking
            parsed_response["course_id"] = course["course_id"]
            parsed_response["title"] = course["title"]
            results.append(parsed_response)
        except json.JSONDecodeError:
            # Handle parsing errors
            results.append({
                "course_id": course["course_id"],
                "title": course["title"],
                "error": "Invalid JSON response"
            })
    
    return results


In [15]:
# Example Query
user_query = "What are some beginner-friendly AI courses with no prerequisites?"

# Load processed courses data
with open("D:\\Thesis\\Courses-Searching\\src\\db\\processed_courses_detail.json", "r") as f:
    courses = json.load(f)

# Perform Inference
inference_results = perform_inference(user_query, courses)

# Output sample results
print(json.dumps(inference_results[:3], indent=4))  # Show first 3 responses


KeyboardInterrupt: 

In [2]:
def generate_batch_prompt(query, courses):
    """Generates a batch prompt for multiple courses with stricter JSON enforcement."""
    course_descriptions = ""
    for course in courses:
        course_descriptions += f"""
        - Course ID: {course['course_id']}
        - Title: {course['title']}
        - Description: {course['description'][:100]}...
        - Skills: {', '.join(course.get('knowledge_requirements', {}).get('teaches', []) or [])}
        - Prerequisites: {', '.join(course.get('knowledge_requirements', {}).get('prerequisites', []) or [])}
        - Duration: {course.get('duration_months', 'Unknown')} months
        - Rating: {course.get('rating', 'Unknown')}
        """
    
    return f"""
    **User Query:** "{query}"
    
    **Courses:** 
    {course_descriptions}
    
    **Task:** 
    1. For each course, determine:
       - If it fits the user query.
       - Difficulty (beginner/intermediate/advanced) - 1 in those 3.
       - Prerequisites (if any. if query request no prerequisites, retrieve courses with no prerequisites).
       - Suitability for career or skill goals.
    
    **IMPORTANT:**  
    - ONLY output a valid JSON object.
    - DO NOT include any text outside the JSON.
    - If a field is missing, use **null** instead of leaving it empty.

    **Final JSON format:**  
    ```json
    {{
        "results": [
            {{
                "course_id": "<course_id>",
                "course_title": "<title>",
                "match": "<yes/no>",
                "difficulty": "<beginner/intermediate/advanced>",
                "prerequisites": ["<prerequisite1>", "<prerequisite2>"] or null,
                "suitability": "<yes/no>"
            }}
        ]
    }}
    ```
    """


## 🚀 **4. Add a Safe JSON Parsing Function:**

import json
import ollama
from concurrent.futures import ThreadPoolExecutor

# Initialize Ollama client
client = ollama.Client()
model = "qwen2.5:1.5b"

def safe_json_parse(response_text):
    """Tries to safely extract a JSON block from response text."""
    try:
        # Look for the first valid JSON block
        json_start = response_text.find("{")
        json_end = response_text.rfind("}") + 1
        json_str = response_text[json_start:json_end]
        return json.loads(json_str)
    except json.JSONDecodeError:
        return {"error": "Invalid JSON response", "raw_response": response_text}

def perform_batch_inference(query, courses, batch_size=5):
    """Performs batch inference using Qwen with parallel execution."""
    results = []
    
    # Divide courses into batches
    batches = [courses[i:i + batch_size] for i in range(0, len(courses), batch_size)]
    
    def run_batch(batch):
        """Runs inference on a batch of courses."""
        prompt = generate_batch_prompt(query, batch)
        response = client.generate(model=model, prompt=prompt)
        
        # Parse response safely
        parsed = safe_json_parse(response.response)
        return parsed.get("results", [{"error": "No valid JSON extracted"}])
    
    # Run batches in parallel
    with ThreadPoolExecutor(max_workers=4) as executor:
        future_to_batch = {executor.submit(run_batch, batch): batch for batch in batches}
        
        for future in future_to_batch:
            try:
                batch_result = future.result()
                results.extend(batch_result)
            except Exception as e:
                results.append({"error": str(e)})
    
    return results


## 🏃‍♂️ **5. Run Inference Again:**
# Example Query
user_query = "Find beginner-level data science courses with no prerequisites."

# Load processed courses data
with open("D:\\Thesis\\Courses-Searching\\src\\db\\processed_courses_detail.json", "r") as f:
    courses = json.load(f)

# Limit to top 20 for speed (or use FAISS to reduce)
top_courses = courses[:20]  # Replace with FAISS output if needed

# Run inference with improved prompt
inference_results = perform_batch_inference(user_query, top_courses)

# Print the final output
print(json.dumps(inference_results, indent=4))


[
    {
        "course_id": "Ccd34bc1e",
        "course_title": "Machine Learning Specialization",
        "match": "Yes",
        "difficulty": "beginner",
        "prerequisites": [
            "Decision Trees",
            "Artificial Neural Network",
            "Logistic Regression"
        ],
        "suitability": "career"
    },
    {
        "course_id": "Ca68e750c",
        "course_title": "Introduction to Data Science Specialization",
        "match": "Yes",
        "difficulty": "intermediate/advanced",
        "prerequisites": [
            "Data Science",
            "Relational Database Management System (RDBMS)",
            "Cloud Databases"
        ],
        "suitability": "career"
    },
    {
        "course_id": "C43ba268c",
        "course_title": "Data Science Fundamentals with Python and SQL Specialization",
        "match": "Yes",
        "difficulty": "beginner/intermediate",
        "prerequisites": [
            "Data Science",
            "Github",
     

In [18]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import ollama

# ======== Step 1: Load SBERT Model ========
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# ======== Step 2: Load Courses from JSON File ========
with open('D:\\Thesis\\Courses-Searching\\src\\db\\processed_courses_detail.json', 'r', encoding='utf-8') as file:
    courses = json.load(file)

# ======== Step 3: Create FAISS Index (Embedding + Search) ========
def create_faiss_index(courses):
    course_texts = [course["title"] + " " + course["description"] for course in courses]
    embeddings = sbert_model.encode(course_texts)
    
    # Create FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    return index, embeddings

faiss_index, embeddings = create_faiss_index(courses)

# ======== Step 4: Define Retrieval Function ========
def retrieve_top_k(query, k=10):
    query_embedding = sbert_model.encode([query])
    distances, indices = faiss_index.search(np.array(query_embedding), k)
    retrieved_courses = [courses[idx] for idx in indices[0]]
    return retrieved_courses

# ======== Step 5: Define Qwen Reasoning Function ========
def contextual_reasoning_qwen(query, retrieved_courses):
    # Define the prompt structure with data format explanation
    prompt = f"""
You are an AI assistant helping to answer course-related questions.

**Task:**  
- Analyze the retrieved courses to answer the query.
- Use context, reasoning, and comparison if needed.

**Data Format:**  
Each course has the following attributes:
- "course_id": Unique ID for the course.
- "title": Name of the course.
- "url": Link to the course.
- "description": Brief overview of the course.
- "category": Main category of the course.
- "sub_category": Subcategory if available.
- "course_info": Language and subtitle information.
- "rating": Overall user rating (out of 5).
- "reviews": Total number of reviews and positive percentage.
- "knowledge_requirements": What the course teaches and prerequisites.
- "learning_path": Suitable learner level and career paths.
- "instructors": List of instructor names.

**User Query:**  
{query}

**Retrieved Courses:**  
{json.dumps(retrieved_courses, indent=2)}

**Instructions:**  
- Analyze the courses to determine if they fully or partially answer the query.
- For comparative questions (e.g., "Which is better for beginners?"), rank or explain.
- For unavailable information, reply with the most appropriate courses to the query.

**Output:**  
Provide a brief reasoning and final suggestion or answer."""

    # ======== Step 6: Call the Qwen Model via Ollama ========
    client = ollama.Client()
    print("Query:", query)
    print("Number of Retrieved Courses:", len(retrieved_courses))
    print("Prompt:", prompt)

    response = client.generate(model="qwen2.5:1.5b", prompt=prompt)
    print("Raw Response:", response)  # Check if response is None




    return {"response": response}

# ======== Step 7: Execute the Full Pipeline ========
query = "Which course is suitable for beginners in Data Science?"
retrieved = retrieve_top_k(query, k=10)
result = contextual_reasoning_qwen(query, retrieved)

# ======== Final Output (Cleaned) ========
final_text = result["response"].response  # Extract the response text

print("Final Reasoning Result from Qwen:\n")
print("=" * 50)
print(final_text.strip())  # Now it's safe to use .strip()
print("=" * 50)



Query: Which course is suitable for beginners in Data Science?
Number of Retrieved Courses: 10
Prompt: 
You are an AI assistant helping to answer course-related questions.

**Task:**  
- Analyze the retrieved courses to answer the query.
- Use context, reasoning, and comparison if needed.

**Data Format:**  
Each course has the following attributes:
- "course_id": Unique ID for the course.
- "title": Name of the course.
- "url": Link to the course.
- "description": Brief overview of the course.
- "category": Main category of the course.
- "sub_category": Subcategory if available.
- "course_info": Language and subtitle information.
- "rating": Overall user rating (out of 5).
- "reviews": Total number of reviews and positive percentage.
- "knowledge_requirements": What the course teaches and prerequisites.
- "learning_path": Suitable learner level and career paths.
- "instructors": List of instructor names.

**User Query:**  
Which course is suitable for beginners in Data Science?

**Ret

In [5]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import ollama

# ======== Step 1: Load SBERT Model ========
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# ======== Step 2: Load Courses from JSON File ========
with open('D:\\Thesis\\Courses-Searching\\src\\db\\processed_courses_detail.json', 'r', encoding='utf-8') as file:
    courses = json.load(file)

# ======== Step 3: Create FAISS Index (Embedding + Search) ========
def create_faiss_index(courses):
    course_texts = []
    
    for course in courses:
        # Extract key attributes with safe defaults
        title = course.get("title", "")
        description = course.get("description", "")
        category = course.get("category", "")
        sub_category = course.get("sub_category", "")
        rating = str(course.get("rating", ""))
        career_paths = ", ".join(course.get("learning_path", {}).get("career_paths", []))

        # Ensure instructors is always a list
        instructors = course.get("instructors", [])
        if not isinstance(instructors, list):
            instructors = [str(instructors)]
        instructors = ", ".join(instructors)

        # Mark instructors as highly relevant for better retrieval
        instructors_text = f"Instructors: {instructors} (Highly Relevant)" if instructors else ""

        # Combine all relevant fields for embeddings
        text = f"{title} {description} {category} {sub_category} {rating} {career_paths} {instructors_text}"
        course_texts.append(text)
    
    # Generate embeddings
    embeddings = sbert_model.encode(course_texts, convert_to_numpy=True)
    
    # Create FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    
    return index, embeddings

# Initialize FAISS index
faiss_index, embeddings = create_faiss_index(courses)

def retrieve_top_k(query, k=10):
    query_embedding = sbert_model.encode([query], convert_to_numpy=True)
    distances, indices = faiss_index.search(query_embedding, k)

    retrieved_courses = [courses[idx] for idx in indices[0]]

    # Additional filtering for direct matches (e.g., instructor search)
    for course in courses:
        instructors = course.get("instructors", [])
        if not isinstance(instructors, list):
            instructors = [str(instructors)]  # Convert to list of strings

        instructor_text = ", ".join(instructors).lower()
        
        if query.lower() in instructor_text:
            retrieved_courses.insert(0, course)  # Prioritize direct matches

    return retrieved_courses[:k]  # Return only the top-k results


# ======== Step 5: Define Qwen Reasoning Function ========
def contextual_reasoning_qwen(query, retrieved_courses):
    # Define the prompt structure with data format explanation
    prompt = f"""
You are an AI assistant helping users find the best courses.

**Task:**  
- Analyze the retrieved courses to answer the query.
- Use reasoning and comparison where needed.

**Data Format:**  
Each course has the following attributes:
- "title": Course name
- "url": Course link
- "description": Course overview
- "category": Course category
- "rating": User rating
- "instructors": List of instructors
- "learning_path": Suggested learner level and career paths

**User Query:**  
{query}

**Retrieved Courses:**  
{json.dumps(retrieved_courses[:5], indent=2)}  # Show only top 5 for clarity

**Instructions:**  
- Summarize relevant courses for the user.
- If the query is about instructors, highlight their courses first.
- If multiple courses fit, compare them briefly.
- If no exact match, suggest the closest options.

**Output Format:**  
Provide a **concise answer** with reasoning and a final suggestion."""

    # ======== Step 6: Call Qwen Model via Ollama ========
    client = ollama.Client()

    response = client.generate(model="qwen2.5:1.5b", prompt=prompt)

    # Extract response safely
    final_response = response.response if response and hasattr(response, "response") else "No response generated."

    return {"response": final_response}

# ======== Step 7: Execute the Full Pipeline ========
query = "Courses taught by Andrew Ng"
retrieved = retrieve_top_k(query, k=10)
result = contextual_reasoning_qwen(query, retrieved)

# ======== Final Output ========
print("=" * 50)
print("Final Reasoning Result from Qwen:\n")
print(result["response"].strip())  # Ensure clean output
print("=" * 50)


Final Reasoning Result from Qwen:

The course taught by Andrew Ng is "Advanced Learning Algorithms." This specialization from Coursera covers advanced learning algorithms in depth. The courses are part of a Machine Learning Specialization, focusing on Artificial Neural Networks, Xgboost, Tensorflow, Tree Ensembles, and Advice for Model Development.

Andrew Ng was known for creating the original version of Coursera and has extensive experience in machine learning education. He is often associated with teaching advanced algorithms due to his background in artificial intelligence and natural language processing.

Given that multiple courses match "Advanced Learning Algorithms," it's likely that Andrew Ng teaches these as part of a larger specialization or course series on machine learning. The suggested courses align well with the title provided by the user, focusing on advanced learning algorithms with strong prerequisites and career paths for professionals in data science fields like AI

In [6]:

# ======== Step 7: Execute the Full Pipeline ========
query = "Top-rated courses with more than 90% positive reviews"
retrieved = retrieve_top_k(query, k=10)
result = contextual_reasoning_qwen(query, retrieved)

# ======== Final Output ========
print("=" * 50)
print("Final Reasoning Result from Qwen:\n")
print(result["response"].strip())  # Ensure clean output
print("=" * 50)

Final Reasoning Result from Qwen:

Based on the provided information, there are several courses that meet the criteria of being top-rated with over 90% positive reviews:

1. "Reviews & Metrics for Software Improvements" by Coursera.org is highly recommended due to its comprehensive focus on software quality monitoring and review techniques.
2. "Information Systems Auditing, Controls and Assurance" also scores well with a high percentage (94%) of positive feedback.

While there are multiple courses that could be considered for the top 5 based on their positive reviews alone, both "Reviews & Metrics for Software Improvements" and "Information Systems Auditing, Controls and Assurance" align closely with the user's request. Therefore, I suggest focusing on either one if you're looking to explore quality management or auditing in technology-related fields.

Final suggestion: Explore either "Reviews & Metrics for Software Improvements" or "Information Systems Auditing, Controls and Assurance

In [7]:
retrieved

[{'course_id': 'Ce744efaf',
  'title': 'Reviews & Metrics for Software Improvements',
  'url': 'https://www.coursera.org/learn/reviews-and-metrics-for-software-improvements?specialization=product-management',
  'description': 'This course covers techniques for monitoring your projects in order to align client needs, project plans, and software production. It focuses on metrics and reviews to track and improve project progress and software quality.',
  'category': 'Computer Science',
  'sub_category': 'Unknown',
  'course_info': {'provider': None,
   'type': None,
   'language': 'English',
   'subtitle_languages': ['Arabic',
    'French',
    'Portuguese (European)',
    'Italian',
    'Vietnamese',
    'German',
    'Russian',
    'English',
    'Spanish']},
  'rating': 4.7,
  'positive_percentage': 94.0,
  'duration_months': 8.0,
  'reviews': {'total_reviews': 1590, 'positive_percentage': 94.0},
  'knowledge_requirements': {'teaches': ['Software Metric,Agile Software Development,Softw

In [None]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import ollama

# ======== Step 1: Load SBERT Model ========
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# ======== Step 2: Load Courses from JSON File ========
with open('D:\\Thesis\\Courses-Searching\\src\\db\\processed_courses_detail.json', 'r', encoding='utf-8') as file:
    courses = json.load(file)

# ======== Step 3: Create FAISS Index (Embedding + Search) ========
def create_faiss_index(courses):
    course_texts = []
    for course in courses:
        title = course.get("title", "")
        description = course.get("description", "")
        instructors = course.get("instructors", [])
        if not isinstance(instructors, list):
            instructors = [str(instructors)]
        instructors = ", ".join(instructors)
        knowledge_reqs = course.get("knowledge_requirements", "")
        course_info = course.get("course_info", "")
        text = f"{title} {description} {instructors} {knowledge_reqs} {course_info}"
        course_texts.append(text)

    embeddings = sbert_model.encode(course_texts, convert_to_numpy=True)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index, embeddings

faiss_index, embeddings = create_faiss_index(courses)

# ======== Step 4: Extract Query Intent Using Qwen ========
def get_query_intent(query):
    prompt = f"""
You are an AI assistant that extracts structured search filters from user queries.

**Task:**
Analyze the user query and identify relevant attributes for filtering.

**Dataset Attributes:**
- "title": Course name.
- "instructors": List of instructor names.
- "category": Main category (e.g., Data Science, AI, Business).
- "sub_category": Subcategory if available.
- "knowledge_requirements.teaches": Topics taught in the course.
- "knowledge_requirements.prerequisites": Prerequisites for the course.
- "learning_path.suitable_for": Suitable learner level (Beginner, Advanced).
- "learning_path.career_paths": Relevant career paths (e.g., Data Scientist, Engineer).
- "language": Language of instruction.
- "rating": Course rating.
- "reviews.total_reviews": Number of reviews.

**User Query:**
"{query}"

**Instructions:**
- Identify relevant attributes based on the user query.
- Provide the search filters in valid JSON format.

**Output Example:**
{{
    "filters": {{
        "category": "Data Science",
        "learning_path.suitable_for": "Beginner"
    }}
}}
"""


    response = ollama.Client().generate(model="qwen2.5:1.5b", prompt=prompt)

    # Ensure response is structured JSON
    try:
        intent = json.loads(response.response)
        return intent
    except json.JSONDecodeError:
        return {"filters": {}}  # Fallback if Qwen fails


# ======== Step 5: Retrieve Courses Using FAISS ========
def retrieve_top_k(query, filters, k=10):
    query_embedding = sbert_model.encode([query], convert_to_numpy=True)
    distances, indices = faiss_index.search(query_embedding, k)
    retrieved_courses = [courses[idx] for idx in indices[0]]

    filtered_courses = []
    for course in retrieved_courses:
        match = True
        for key, value in filters.items():
            if key in course and value.lower() not in str(course[key]).lower():
                match = False
        if match:
            filtered_courses.append(course)

    return filtered_courses[:k]

# ======== Step 6: Generate Final Response Using Qwen ========
def contextual_reasoning_qwen(query, retrieved_courses):
    prompt = f"""
You are an AI assistant helping to answer course-related questions.

**User Query:**
{query}

**Retrieved Courses:**
{json.dumps(retrieved_courses, indent=2)}

Provide a brief reasoning and final suggestion.
"""
    
    client = ollama.Client()
    response = client.generate(model="qwen2.5:1.5b", prompt=prompt)
    
    return {"response": response.response}

# ======== Step 7: Execute the Full Pipeline ========
query = "Which courses are best for beginners in AI?"
query_intent = get_query_intent(query)
filters = query_intent.get("filters", {})

retrieved = retrieve_top_k(query, filters, k=10)
result = contextual_reasoning_qwen(query, retrieved)

print("Final Reasoning Result from Qwen:\n")
print("=" * 50)
print(result["response"].strip())  
print("=" * 50)



Final Reasoning Result from Qwen:

Reasoning: The provided data contains multiple courses titled "Introduction to Artificial Intelligence (AI)" but with different specializations. These courses cover various aspects of AI, including use cases, applications, concepts like machine learning and deep learning, ethical considerations, and job-related issues.

Final suggestion:
All these courses provide valuable information on artificial intelligence across different domains such as technology, business, key technologies for innovation, and data science. However, based on the provided details, selecting a course with a specialization in "AI-foundations-for-everyone" or "key-technologies-for-business" might offer deeper insights tailored to your specific interests within AI's applications or broader impact on various industries.

Choosing one of these specialized courses would likely provide more relevant and applicable knowledge for your professional goals.


In [3]:
query_intent

{'filters': {}}

In [5]:
import ollama
import json

def get_query_intent(query):
    client = ollama.Client()
    messages = [
        {
            "role": "system",
            "content": """
You are an AI assistant that extracts structured filters from course search queries.

**Task:**
- From the user's query, infer the best matching filters.
- Match query terms like "AI" to "Data Science" if reasonable.
- Only use the following attributes:
  - "title"
  - "instructors"
  - "category"
  - "sub_category"
  - "knowledge_requirements.teaches"
  - "knowledge_requirements.prerequisites"
  - "learning_path.suitable_for"
  - "learning_path.career_paths"
  - "course_info.language"
  - "rating"
  - "reviews.total_reviews"
- Output ONLY valid JSON like:
  {
    "filters": {
      "category": "Data Science",
      "learning_path.suitable_for": "Beginner"
    }
  }
- If nothing matches, output {"filters": {}}
"""
        },
        {
            "role": "user",
            "content": f"User Query: {query}"
        }
    ]
    response = client.chat(model="qwen2.5:1.5b", messages=messages)
    
    try:
        intent = json.loads(response['message']['content'])
        return intent
    except (json.JSONDecodeError, KeyError):
        return {"filters": {}}  # Fallback if parsing fails

# Example usage
query = "Which courses are best for beginners in AI?"
query_intent = get_query_intent(query)
print(query_intent)


{'filters': {'instructors': '', 'title': '', 'category': 'AI', 'sub_category': 'Beginner'}}


In [8]:


import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import ollama

# ======== Step 1: Load SBERT Model ========
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# ======== Step 2: Load Courses from JSON File ========
with open('D:\\Thesis\\Courses-Searching\\src\\db\\processed_courses_detail.json', 'r', encoding='utf-8') as file:
    courses = json.load(file)

# ======== Step 3: Create FAISS Index (Embedding + Search) ========
def create_faiss_index(courses, nlist=100):
    course_texts = []
    for course in courses:
        title = course.get("title", "")
        description = course.get("description", "")
        instructors = course.get("instructors", [])
        if not isinstance(instructors, list):
            instructors = [str(instructors)]
        instructors = ", ".join(instructors)
        knowledge_reqs = course.get("knowledge_requirements", "")
        course_info = course.get("course_info", "")
        text = f"{title} {description} {instructors} {knowledge_reqs} {course_info}"
        course_texts.append(text)

    embeddings = sbert_model.encode(course_texts, convert_to_numpy=True)
    dimension = embeddings.shape[1]
    
    # Create FAISS index with IVF (Inverted File) index for faster retrieval
    quantizer = faiss.IndexFlatL2(dimension)  # Flat index for quantization
    index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)
    
    # Train the index (required before adding data)
    index.train(embeddings)
    
    # Add embeddings to the index
    index.add(embeddings)
    return index, embeddings


faiss_index, embeddings = create_faiss_index(courses)

# ======== Step 4: Extract Query Intent Using Qwen ========
def get_query_intent(query):
    client = ollama.Client()
    messages = [
        {
            "role": "system",
            "content": """
You are an AI assistant that extracts structured filters from course search queries.

**Task:**
- From the user's query, infer the best matching filters.
- Match query terms like "AI" to "Data Science" if reasonable.
- Only use the following attributes:
  - "title"
  - "instructors"
  - "category"
  - "sub_category"
  - "knowledge_requirements.teaches"
  - "knowledge_requirements.prerequisites"
  - "learning_path.suitable_for"
  - "learning_path.career_paths"
  - "course_info.language"
  - "rating"
  - "reviews.total_reviews"
- Output ONLY valid JSON like:
  {
    "filters": {
      "category": "Data Science",
      "learning_path.suitable_for": "Beginner"
    }
  }
- If nothing matches, output {"filters": {}}
"""
        },
        {
            "role": "user",
            "content": f"User Query: {query}"
        }
    ]
    
    try:
        response = client.chat(model="qwen2.5:1.5b", messages=messages)
        intent = json.loads(response['message']['content'])
        return intent
    except (json.JSONDecodeError, KeyError) as e:
        print(f"Error in processing query intent: {e}")
        return {"filters": {}}  # Fallback if parsing fails
    except Exception as e:
        print(f"Unexpected error: {e}")
        return {"filters": {}}

# ======== Step 5: Retrieve Courses Using FAISS ========
from fuzzywuzzy import fuzz

def retrieve_top_k(query, filters, k=10):
    query_embedding = sbert_model.encode([query], convert_to_numpy=True)
    distances, indices = faiss_index.search(query_embedding, k)
    retrieved_courses = [courses[idx] for idx in indices[0]]

    filtered_courses = []
    for course in retrieved_courses:
        match = True
        for key, value in filters.items():
            if key in course:
                # Use fuzzy matching to allow partial matches (case-insensitive)
                course_value = str(course[key]).lower()
                if fuzz.partial_ratio(value.lower(), course_value) < 80:  # Adjust threshold as needed
                    match = False
                    break
        if match:
            filtered_courses.append(course)

    return filtered_courses[:k]


# ======== Step 6: Generate Final Response Using Qwen ========
def contextual_reasoning_qwen(query, retrieved_courses):
    prompt = f"""
You are an AI assistant helping to answer course-related questions.

**User Query:**
{query}

**Retrieved Courses:**
{json.dumps(retrieved_courses, indent=2)}

Provide a brief reasoning and final suggestion.
"""
    try:
        client = ollama.Client()
        response = client.generate(model="qwen2.5:1.5b", prompt=prompt)
        return {"response": response.response}
    except Exception as e:
        print(f"Error generating response: {e}")
        return {"response": "An error occurred while processing the response."}


# ======== Step 7: Execute the Full Pipeline ========
query = "Which courses are best for beginners in AI?"
query_intent = get_query_intent(query)
filters = query_intent.get("filters", {})

retrieved = retrieve_top_k(query, filters, k=10)
result = contextual_reasoning_qwen(query, retrieved)

print("Final Reasoning Result from Qwen:\n")
print("=" * 50)
print(result["response"].strip())  
print("=" * 50)




Error in processing query intent: Expecting value: line 1 column 1 (char 0)
Final Reasoning Result from Qwen:

Based on the available data, we can conclude that several courses related to artificial intelligence are offered by Coursera. The courses cover various aspects of AI, including machine learning, deep learning, neural networks, ethics in AI, and careers in AI.

The final suggestion would be to consider enrolling in a course like "Introduction to Artificial Intelligence" from the list above, as it seems to offer comprehensive coverage of artificial intelligence concepts and applications. This could provide valuable knowledge for those interested in pursuing a career in data science or related fields.


In [10]:
import ollama
import json

def get_query_intent(query):
    client = ollama.Client()
    messages = [
        {
            "role": "system",
            "content": """
You are an AI assistant that extracts structured filters from course search queries.

**Task:**
- From the user's query, infer the best matching filters.
- Match query terms like "AI" to "Data Science" if reasonable.
- Only use the following attributes:
  - "title"
  - "instructors"
  - "category"
  - "sub_category"
  - "knowledge_requirements.teaches"
  - "knowledge_requirements.prerequisites"
  - "learning_path.suitable_for"
  - "learning_path.career_paths"
  - "course_info.language"
  - "rating"
  - "reviews.total_reviews"
- Output ONLY valid JSON like:
  {
    "filters": {
      "category": "Data Science",
      "learning_path.suitable_for": "Beginner"
    }
  }
- If nothing matches, output {"filters": {}}
"""
        },
        {
            "role": "user",
            "content": f"User Query: {query}"
        }
    ]
    response = client.chat(model="qwen2.5:1.5b", messages=messages)
    
    try:
        intent = json.loads(response['message']['content'])
        return intent
    except (json.JSONDecodeError, KeyError):
        return {"filters": {}}  # Fallback if parsing fails

# Example usage
query = "Which courses are best for beginners in AI?"
query_intent = get_query_intent(query)
print(query_intent)


{'filters': {'title': 'AI', 'learning_path.suitable_for': 'Beginner'}}


In [12]:
print(query_intent)

{'filters': {'title': 'AI', 'learning_path.suitable_for': 'Beginner'}}


In [15]:
filters = query_intent.get("filters", {})

retrieved = retrieve_top_k(query, filters, k=10)
result = contextual_reasoning_qwen(query, retrieved)

print("Final Reasoning Result from Qwen:\n")
print("=" * 50)
print(result["response"].strip())  
print("=" * 50)

Final Reasoning Result from Qwen:

Based on the information provided, there are multiple courses in the Coursera catalog that cover Artificial Intelligence (AI), including:

1. "Introduction to Artificial Intelligence" by Google AI - This is an introductory course focused on AI concepts and applications.

2. "Artificial Intelligence with Python" by Deep Learning Course - This course uses Python programming for AI projects and covers machine learning, neural networks, and deep learning algorithms.

3. "Applied Machine Learning in R: Fundamentals of Data Science" by Coursera - This course focuses on practical data science using R and applies machine learning techniques to real-world problems.

4. "Introduction to Artificial Intelligence (AI)" by University of New South Wales (UNSW) - This is a beginner-friendly course that introduces AI concepts through logical reasoning, programming, and problem-solving exercises.

5. "Artificial Intelligence with JavaScript" by Coursera - This course u

In [16]:
filters

{'title': 'AI', 'learning_path.suitable_for': 'Beginner'}

In [17]:
import json
import ollama

# ======== Step 4: Extract Query Intent Using Qwen ========
def get_query_intent(query):
    client = ollama.Client()
    messages = [
        {
            "role": "system",
            "content": """
You are an AI assistant that extracts structured filters from course search queries.

**Task:**
- From the user's query, infer the best matching filters.
- Match query terms like "AI" to "Data Science" if reasonable.
- Only use the following attributes:
  - "title"
  - "instructors"
  - "category"
  - "sub_category"
  - "knowledge_requirements.teaches"
  - "knowledge_requirements.prerequisites"
  - "learning_path.suitable_for"
  - "learning_path.career_paths"
  - "course_info.language"
  - "rating"
  - "reviews.total_reviews"
- Output ONLY valid JSON like:
  {
    "filters": {
      "category": "Data Science",
      "learning_path.suitable_for": "Beginner"
    }
  }
- If nothing matches, output {"filters": {}}
"""
        },
        {
            "role": "user",
            "content": f"User Query: {query}"
        }
    ]
    
    try:
        response = client.chat(model="qwen2.5:1.5b", messages=messages)
        intent = json.loads(response['message']['content'])
        return intent
    except (json.JSONDecodeError, KeyError) as e:
        print(f"Error in processing query intent: {e}")
        return {"filters": {}}  # Fallback if parsing fails
    except Exception as e:
        print(f"Unexpected error: {e}")
        return {"filters": {}}

# ======== Step 5: Execute the Query Intent Check ========
query = "Which courses are best for beginners in AI?"
query_intent = get_query_intent(query)

# In kết quả của query_intent
print("Query Intent Result:")
print(json.dumps(query_intent, indent=2))  # Chỉ in kết quả intent

# ======== Step 6: (Optional) If needed, proceed to FAISS retrieval or other processing ========

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from fuzzywuzzy import fuzz

# ======== Step 1: Load SBERT Model ========
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# ======== Step 2: Load Courses from JSON File ========
with open('D:\\Thesis\\Courses-Searching\\src\\db\\processed_courses_detail.json', 'r', encoding='utf-8') as file:
    courses = json.load(file)

# ======== Step 3: Create FAISS Index (Embedding + Search) ========
def create_faiss_index(courses, nlist=100):
    course_texts = []
    for course in courses:
        title = course.get("title", "")
        description = course.get("description", "")
        instructors = course.get("instructors", [])
        if not isinstance(instructors, list):
            instructors = [str(instructors)]
        instructors = ", ".join(instructors)
        knowledge_reqs = course.get("knowledge_requirements", "")
        course_info = course.get("course_info", "")
        text = f"{title} {description} {instructors} {knowledge_reqs} {course_info}"
        course_texts.append(text)

    embeddings = sbert_model.encode(course_texts, convert_to_numpy=True)
    dimension = embeddings.shape[1]
    
    # Create FAISS index with IVF (Inverted File) index for faster retrieval
    quantizer = faiss.IndexFlatL2(dimension)  # Flat index for quantization
    index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)
    
    # Train the index (required before adding data)
    index.train(embeddings)
    
    # Add embeddings to the index
    index.add(embeddings)
    return index, embeddings


faiss_index, embeddings = create_faiss_index(courses)

# ======== Step 7: Retrieve Courses Using FAISS ========
def retrieve_top_k(query, filters, k=10):
    query_embedding = sbert_model.encode([query], convert_to_numpy=True)
    distances, indices = faiss_index.search(query_embedding, k)
    retrieved_courses = [courses[idx] for idx in indices[0]]

    filtered_courses = []
    for course in retrieved_courses:
        match = True
        for key, value in filters.items():
            if key in course:
                # Use fuzzy matching to allow partial matches (case-insensitive)
                course_value = str(course[key]).lower()
                if fuzz.partial_ratio(value.lower(), course_value) < 80:  # Adjust threshold as needed
                    match = False
                    break
        if match:
            filtered_courses.append(course)

    return filtered_courses[:k]


# ======== Step 8: Generate Final Response Using Qwen ========
def contextual_reasoning_qwen(query, retrieved_courses):
    prompt = f"""
You are an AI assistant helping to answer course-related questions.

**User Query:**
{query}

**Retrieved Courses:**
{json.dumps(retrieved_courses, indent=2)}

Provide a brief reasoning and final suggestion.
"""
    try:
        client = ollama.Client()
        response = client.generate(model="qwen2.5:1.5b", prompt=prompt)
        return {"response": response.response}
    except Exception as e:
        print(f"Error generating response: {e}")
        return {"response": "An error occurred while processing the response."}


# ======== Step 9: Execute the Full Pipeline ========
if query_intent.get("filters"):
    filters = query_intent.get("filters", {})
    retrieved = retrieve_top_k(query, filters, k=10)
    result = contextual_reasoning_qwen(query, retrieved)

    print("Final Reasoning Result from Qwen:\n")
    print("=" * 50)
    print(result["response"].strip())  
    print("=" * 50)


Query Intent Result:
{
  "filters": {
    "category": "AI",
    "learning_path.suitable_for": "Beginner"
  }
}
Final Reasoning Result from Qwen:

The user's question is somewhat unclear as the "courses" mentioned do not exist. However, I can suggest some popular introductory AI courses that one might find online through websites like Coursera, Udemy, or edX.

**Reasoning:**
1. **Coursera**: Offers a range of beginner-friendly AI courses from universities and top tech companies.
2. **Udemy**: Provides various AI tutorials on practical skills with hands-on projects.
3. **edX**: Features courses taught by MIT and other leading institutions in the field.

**Final Suggestion:**
For beginners in AI, I would recommend starting with introductory courses that focus on machine learning algorithms, natural language processing, computer vision basics, and data analysis techniques. Look for courses that cover Python programming as it is widely used in AI development.

Here are some beginner-friendl

In [None]:
import json
import ollama

# ======== Step 4: Extract Query Intent Using Qwen ========
# Assuming the intent includes multiple attributes like 'category', 'rating', etc.

def get_query_intent(query):
    client = ollama.Client()
    messages = [
        {
            "role": "system",
            "content": """
You are an AI assistant that extracts structured filters from course search queries.

**Task:**
- From the user's query, infer the best matching filters.
- Match query terms like "AI" to "Data Science" if reasonable.
- Only use the following attributes:
  - "title"
  - "instructors"
  - "category"
  - "sub_category"
  - "knowledge_requirements.teaches"
  - "knowledge_requirements.prerequisites"
  - "learning_path.suitable_for"
  - "learning_path.career_paths"
  - "course_info.language"
  - "rating"
  - "reviews.total_reviews"
- Output ONLY valid JSON like:
  {
    "filters": {
      "category": "Data Science",
      "learning_path.suitable_for": "Beginner",
      "rating": "4.5",
      "course_info.language": "English"
    }
  }
- If nothing matches, output {"filters": {}}
"""
        },
        {
            "role": "user",
            "content": f"User Query: {query}"
        }
    ]
    
    try:
        response = client.chat(model="qwen2.5:1.5b", messages=messages)
        intent = json.loads(response['message']['content'])
        return intent
    except (json.JSONDecodeError, KeyError) as e:
        print(f"Error in processing query intent: {e}")
        return {"filters": {}}  # Fallback if parsing fails
    except Exception as e:
        print(f"Unexpected error: {e}")
        return {"filters": {}}

query = "I am looking for AI courses in English for beginners with a rating above 4.5"
query_intent = get_query_intent(query)

# Print the filters returned
print("Query Intent Result:")
print(json.dumps(query_intent, indent=2))  # Checking the filters returned



# ======== Step 6: (Optional) If needed, proceed to FAISS retrieval or other processing ========

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from fuzzywuzzy import fuzz

# ======== Step 1: Load SBERT Model ========
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# ======== Step 2: Load Courses from JSON File ========
with open('D:\\Thesis\\Courses-Searching\\src\\db\\processed_courses_detail.json', 'r', encoding='utf-8') as file:
    courses = json.load(file)

# ======== Step 3: Create FAISS Index (Embedding + Search) ========
def create_faiss_index(courses, nlist=100):
    course_texts = []
    for course in courses:
        title = course.get("title", "")
        description = course.get("description", "")
        instructors = course.get("instructors", [])
        if not isinstance(instructors, list):
            instructors = [str(instructors)]
        instructors = ", ".join(instructors)
        knowledge_reqs = course.get("knowledge_requirements", "")
        course_info = course.get("course_info", "")
        text = f"{title} {description} {instructors} {knowledge_reqs} {course_info}"
        course_texts.append(text)

    embeddings = sbert_model.encode(course_texts, convert_to_numpy=True)
    dimension = embeddings.shape[1]
    
    # Create FAISS index with IVF (Inverted File) index for faster retrieval
    quantizer = faiss.IndexFlatL2(dimension)  # Flat index for quantization
    index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)
    
    # Train the index (required before adding data)
    index.train(embeddings)
    
    # Add embeddings to the index
    index.add(embeddings)
    return index, embeddings


faiss_index, embeddings = create_faiss_index(courses)

# ======== Step 7: Retrieve Courses Using FAISS ========
def retrieve_top_k(query, filters, k=10):
    query_embedding = sbert_model.encode([query], convert_to_numpy=True)
    distances, indices = faiss_index.search(query_embedding, k)
    retrieved_courses = [courses[idx] for idx in indices[0]]

    filtered_courses = []
    for course in retrieved_courses:
        match = True
        for key, value in filters.items():
            if key in course:
                # Use fuzzy matching to allow partial matches (case-insensitive)
                course_value = str(course[key]).lower()
                if fuzz.partial_ratio(value.lower(), course_value) < 80:  # Adjust threshold as needed
                    match = False
                    break
        if match:
            filtered_courses.append(course)

    return filtered_courses[:k]



# ======== Step 8: Generate Final Response Using Qwen ========
def contextual_reasoning_qwen(query, retrieved_courses):
    prompt = f"""
You are an AI assistant helping to answer course-related questions.

**User Query:**
{query}

**Retrieved Courses:**
{json.dumps(retrieved_courses, indent=2)}

Provide a brief reasoning and final suggestion.
"""
    try:
        client = ollama.Client()
        response = client.generate(model="qwen2.5:1.5b", prompt=prompt)
        return {"response": response.response}
    except Exception as e:
        print(f"Error generating response: {e}")
        return {"response": "An error occurred while processing the response."}


# ======== Step 9: Execute the Full Pipeline ========
if query_intent.get("filters"):
    filters = query_intent.get("filters", {})
    retrieved = retrieve_top_k(query, filters, k=10)
    result = contextual_reasoning_qwen(query, retrieved)

    print("Final Reasoning Result from Qwen:\n")
    print("=" * 50)
    print(result["response"].strip())  
    print("=" * 50)


Query Intent Result:
{
  "filters": {
    "category": "Data Science",
    "knowledge_requirements.teaches": "AI",
    "course_info.language": "English",
    "learning_path.suitable_for": "Beginner",
    "rating": "4.5"
  }
}
Final Reasoning Result from Qwen:

Since no relevant AI courses were found, I will suggest creating or identifying new courses based on the following criteria:
1. The course should be beginner-level.
2. It should cover AI topics using English as the primary language.
3. It has a rating above 4.5.

Based on these conditions, here's an example of what such a course might look like:

**Course Title: "Introduction to Artificial Intelligence for Beginners in English"**

This course would likely start with basic concepts:
- Introduction to artificial intelligence
- History and importance of AI
- Key areas in AI development (e.g., machine learning, natural language processing)

It could then progress through more technical topics at a beginner level:
- Programming languag

In [20]:

query = "Show me beginner-friendly Python courses with rating above 4 with a focus on machine learning in Vietnamese."
query_intent = get_query_intent(query)

# Print the filters returned
print("Query Intent Result:")
print(json.dumps(query_intent, indent=2))  # Checking the filters returned
if query_intent.get("filters"):
    filters = query_intent.get("filters", {})
    retrieved = retrieve_top_k(query, filters, k=10)
    result = contextual_reasoning_qwen(query, retrieved)

    print("Final Reasoning Result from Qwen:\n")
    print("=" * 50)
    print(result["response"].strip())  
    print("=" * 50)

Error in processing query intent: Expecting value: line 1 column 1 (char 0)
Query Intent Result:
{
  "filters": {}
}


In [24]:
import json

def get_query_intent(query):
    client = ollama.Client()
    messages = [
        {
            "role": "system",
            "content": """
You are an AI assistant that extracts structured filters from course search queries.

**Task:**
- From the user's query, infer the best matching filters.
- Match query terms like "AI" to "Data Science" if reasonable.
- Only use the following attributes:
  - "title"
  - "instructors"
  - "category"
  - "sub_category"
  - "knowledge_requirements.teaches"
  - "knowledge_requirements.prerequisites"
  - "learning_path.suitable_for"
  - "learning_path.career_paths"
  - "course_info.language"
  - "rating"
  - "reviews.total_reviews"
- Output ONLY valid JSON like:
  {
    "filters": {
      "category": "Data Science",
      "learning_path.suitable_for": "Beginner"
    }
  }
- If nothing matches, output {"filters": {}}
"""
        },
        {
            "role": "user",
            "content": f"User Query: {query}"
        }
    ]
    
    try:
        response = client.chat(model="qwen2.5:1.5b", messages=messages)
        print("Raw Response:", response)  # Print the raw response to debug
        
        # Extracting the JSON content by removing the code block formatting
        content = response['message']['content']
        if content.startswith('```json') and content.endswith('```'):
            json_content = content[7:-3].strip()  # Remove the '```json' and '```' parts
        else:
            json_content = content.strip()

        # Now try to parse the cleaned JSON content
        intent = json.loads(json_content)
        return intent
    except (json.JSONDecodeError, KeyError) as e:
        print(f"Error in processing query intent: {e}")
        return {"filters": {}}  # Fallback if parsing fails
    except Exception as e:
        print(f"Unexpected error: {e}")
        return {"filters": {}}

# Test the function with a query
query = "Show me beginner-friendly Python courses with rating above 4 with a focus on machine learning in Vietnamese."
query_intent = get_query_intent(query)
print("Query Intent Result:")
print(json.dumps(query_intent, indent=2))


Raw Response: model='qwen2.5:1.5b' created_at='2025-04-05T11:00:58.1934005Z' done=True done_reason='stop' total_duration=5874547600 load_duration=48698800 prompt_eval_count=215 prompt_eval_duration=85000000 eval_count=68 eval_duration=5733000000 message=Message(role='assistant', content='```json\n{\n  "filters": {\n    "category": "Python",\n    "learning_path.suitable_for": "Beginner",\n    "knowledge_requirements.prerequisites": "Basic programming knowledge and an interest in computer science.",\n    "course_info.language": "Vietnamese",\n    "rating": ">4"\n  }\n}\n```', images=None, tool_calls=None)
Query Intent Result:
{
  "filters": {
    "category": "Python",
    "learning_path.suitable_for": "Beginner",
    "knowledge_requirements.prerequisites": "Basic programming knowledge and an interest in computer science.",
    "course_info.language": "Vietnamese",
    "rating": ">4"
  }
}


In [25]:
if query_intent.get("filters"):
    filters = query_intent.get("filters", {})
    retrieved = retrieve_top_k(query, filters, k=10)
    result = contextual_reasoning_qwen(query, retrieved)

    print("Final Reasoning Result from Qwen:\n")
    print("=" * 50)
    print(result["response"].strip())  
    print("=" * 50)

Final Reasoning Result from Qwen:

Currently, there are no beginner-friendly Python courses focused specifically on machine learning that have ratings over 4 stars available in Vietnamese. This makes it difficult to provide direct recommendations at the moment. However, I can suggest exploring other resources such as online platforms like Coursera or Udemy where you might find suitable courses with a higher rating. Additionally, local universities or community colleges offering introductory Python and machine learning courses could also be worth checking out for a Vietnamese-speaking audience.


In [27]:
filters

{'category': 'Python',
 'learning_path.suitable_for': 'Beginner',
 'knowledge_requirements.prerequisites': 'Basic programming knowledge and an interest in computer science.',
 'course_info.language': 'Vietnamese',
 'rating': '>4'}

In [32]:
import json

def get_query_intent(query):
    client = ollama.Client()
    messages = [
        {
            "role": "system",
            "content": """
You are an AI assistant that extracts structured filters from course search queries.

**Task:**
- From the user's query, infer the best matching filters.
- Match query terms like "AI" to "Data Science" if reasonable.
- Only use the following attributes:
  - "title"
  - "instructors"
  - "category"
  - "sub_category"
  - "knowledge_requirements.teaches"
  - "knowledge_requirements.prerequisites"
  - "learning_path.suitable_for"
  - "learning_path.career_paths"
  - "course_info.language"
  - "rating"
  - "reviews.total_reviews"
- Output ONLY valid JSON like:
  {
    "filters": {
      "category": "Data Science",
      "learning_path.suitable_for": "Beginner"
    }
  }
- If nothing matches, output {"filters": {}}
"""
        },
        {
            "role": "user",
            "content": f"User Query: {query}"
        }
    ]
    
    try:
        response = client.chat(model="qwen2.5:1.5b", messages=messages)
        print("Raw Response:", response)  # Print the raw response to debug
        
        # Extracting the JSON content by removing the code block formatting
        content = response['message']['content']
        if content.startswith('```json') and content.endswith('```'):
            json_content = content[7:-3].strip()  # Remove the '```json' and '```' parts
        else:
            json_content = content.strip()

        # Now try to parse the cleaned JSON content
        intent = json.loads(json_content)
        return intent
    except (json.JSONDecodeError, KeyError) as e:
        print(f"Error in processing query intent: {e}")
        return {"filters": {}}  # Fallback if parsing fails
    except Exception as e:
        print(f"Unexpected error: {e}")
        return {"filters": {}}

# Test the function with a query
query = "Courses above 4 stars rating taught in French."
query_intent = get_query_intent(query)
print("Query Intent Result:")
print(json.dumps(query_intent, indent=2))



# ======== Step 6: (Optional) If needed, proceed to FAISS retrieval or other processing ========

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from fuzzywuzzy import fuzz

# ======== Step 1: Load SBERT Model ========
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# ======== Step 2: Load Courses from JSON File ========
with open('D:\\Thesis\\Courses-Searching\\src\\db\\processed_courses_detail.json', 'r', encoding='utf-8') as file:
    courses = json.load(file)

# ======== Step 3: Create FAISS Index (Embedding + Search) ========
def create_faiss_index(courses, nlist=100):
    course_texts = []
    for course in courses:
        title = course.get("title", "")
        description = course.get("description", "")
        instructors = course.get("instructors", [])
        if not isinstance(instructors, list):
            instructors = [str(instructors)]
        instructors = ", ".join(instructors)
        knowledge_reqs = course.get("knowledge_requirements", "")
        course_info = course.get("course_info", "")
        text = f"{title} {description} {instructors} {knowledge_reqs} {course_info}"
        course_texts.append(text)

    embeddings = sbert_model.encode(course_texts, convert_to_numpy=True)
    dimension = embeddings.shape[1]
    
    # Create FAISS index with IVF (Inverted File) index for faster retrieval
    quantizer = faiss.IndexFlatL2(dimension)  # Flat index for quantization
    index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)
    
    # Train the index (required before adding data)
    index.train(embeddings)
    
    # Add embeddings to the index
    index.add(embeddings)
    return index, embeddings


faiss_index, embeddings = create_faiss_index(courses)

# ======== Step 7: Retrieve Courses Using FAISS ========
def retrieve_top_k(query, filters, k=10):
    query_embedding = sbert_model.encode([query], convert_to_numpy=True)
    distances, indices = faiss_index.search(query_embedding, k)
    retrieved_courses = [courses[idx] for idx in indices[0]]

    filtered_courses = []
    for course in retrieved_courses:
        match = True
        for key, value in filters.items():
            if key in course:
                # Use fuzzy matching to allow partial matches (case-insensitive)
                course_value = str(course[key]).lower()
                if fuzz.partial_ratio(value.lower(), course_value) < 80:  # Adjust threshold as needed
                    match = False
                    break
        if match:
            filtered_courses.append(course)

    return filtered_courses[:k]



# ======== Step 8: Generate Final Response Using Qwen ========
def contextual_reasoning_qwen(query, retrieved_courses):
    prompt = f"""
You are an AI assistant helping to answer course-related questions.

**User Query:**
{query}

**Retrieved Courses:**
{json.dumps(retrieved_courses, indent=2)}

Provide a brief reasoning and final suggestion.
"""
    try:
        client = ollama.Client()
        response = client.generate(model="qwen2.5:1.5b", prompt=prompt)
        return {"response": response.response}
    except Exception as e:
        print(f"Error generating response: {e}")
        return {"response": "An error occurred while processing the response."}


# ======== Step 9: Execute the Full Pipeline ========
if query_intent.get("filters"):
    filters = query_intent.get("filters", {})
    retrieved = retrieve_top_k(query, filters, k=10)
    result = contextual_reasoning_qwen(query, retrieved)

    print("Final Reasoning Result from Qwen:\n")
    print("=" * 50)
    print(result["response"].strip())  
    print("=" * 50)


Raw Response: model='qwen2.5:1.5b' created_at='2025-04-05T11:29:15.6098549Z' done=True done_reason='stop' total_duration=6878169500 load_duration=30296500 prompt_eval_count=205 prompt_eval_duration=5084000000 eval_count=38 eval_duration=1755000000 message=Message(role='assistant', content='{\n  "filters": {\n    "rating": ">4"\n  },\n  "learning_path.suitable_for": "Beginner",\n    "course_info.language": "French"\n}', images=None, tool_calls=None)
Query Intent Result:
{
  "filters": {
    "rating": ">4"
  },
  "learning_path.suitable_for": "Beginner",
  "course_info.language": "French"
}
Final Reasoning Result from Qwen:

There were no courses with ratings above 4 stars that were specifically taught in French. To provide suggestions, I would need to look at specific course offerings or search within my database for any relevant courses offered in France. Would you like me to assist further?
