# **Main Model**

In [1]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook all live under that mount point (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the CPU build of FAISS into the notebook runtime (IPython shell
# escape), then import it for the similarity indexes built further down.
!pip install faiss-cpu
import faiss

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import requests
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import os
import json
from typing import Dict, List, Tuple
import gc
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on
# NLTK >= 3.8.2; 'punkt' is kept so older NLTK releases keep working.
# nltk.download() returns False (it does not raise) for an unknown resource.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, quiet=True)

True

In [None]:
######## Data Preprocessing #########
class DataPreprocess:
    """Clean the raw course and student CSV files and save normalised copies.

    Free-text columns are cleaned, tokenised, stop-word filtered and
    lemmatised, then written back as space-joined tokens.  The course
    ``Stress Level`` column is standardised to Low/Medium/High and mirrored
    as ``stress_numeric`` (1/2/3).  The student ``Q1_Study_Hours`` column is
    bucketed into low/medium/high.  Results are written next to the raw
    files as ``course_data_cleaned.csv`` and ``student_data_cleaned.csv``.
    """

    # Folder (on the mounted Google Drive) that holds the raw CSVs and
    # receives the cleaned ones unless ``data_dir`` is passed to __init__.
    DEFAULT_DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/Thesis"

    # Course columns that go through the full text-normalisation pipeline.
    COURSE_TEXT_COLUMNS = ['Department', 'Description', 'Type', 'Skill Required',
                           'Field Interest', 'Career Paths', 'Industry Sectors']

    # Numeric encoding of the standardised stress labels.
    STRESS_NUMERIC = {'Low': 1, 'Medium': 2, 'High': 3}

    def __init__(self, device='auto', data_dir=None):
        """Set up the NLP helpers and the preprocessing log.

        Args:
            device: Accepted for call compatibility; this class does not use it.
            data_dir: Folder containing ``course_data.csv`` and
                ``student_data.csv``.  Defaults to ``DEFAULT_DATA_DIR``.
        """
        print(" Initializing First Step: Data Preprocessing")

        self.data_dir = self.DEFAULT_DATA_DIR if data_dir is None else data_dir

        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by','this','these','that','course'])
        self.faiss_index = None
        self.student_profile = {}

        # Data quality tracking
        self.preprocessing_log = {
            'timestamp': datetime.now().isoformat(),
            'issues_found': [],
            'issues_fixed': [],
            'statistics': {}
        }

    def load_and_preprocess_data(self):
        """Read both raw CSV files, clean them and write the cleaned copies."""
        self.raw_course_data = pd.read_csv(os.path.join(self.data_dir, "course_data.csv"))
        self.preprocessing_log['statistics']['course_rows'] = len(self.raw_course_data)

        self.raw_student_data = pd.read_csv(os.path.join(self.data_dir, "student_data.csv"))
        self.preprocessing_log['statistics']['student_rows'] = len(self.raw_student_data)

        # Step 1: Clean course data column by column
        self._clean_course_data_comprehensive()

        # Step 2: Clean student data column by column
        self._clean_student_data_comprehensive()

        print(f"\n✅ Data-driven preprocessing completed successfully!")

    def _clean_course_data_comprehensive(self):
        """Deduplicate and normalise the course table, then save it as CSV."""
        self.course_data = self.raw_course_data.copy()

        # Remove duplicates
        rows_before = len(self.course_data)
        self.course_data = self.course_data.drop_duplicates()
        removed_duplicates = rows_before - len(self.course_data)
        if removed_duplicates > 0:
            self.preprocessing_log['issues_fixed'].append(f"Removed {removed_duplicates} duplicate course rows")

        for col in self.COURSE_TEXT_COLUMNS:
            if col in self.course_data.columns:
                self._normalize_text_column(self.course_data, col)

        # Stress Level is categorical: standardise the label, then encode it.
        if 'Stress Level' in self.course_data.columns:
            levels = (
                self.course_data['Stress Level']
                .apply(self._clean_individual_column)
                .apply(self._standardize_stress_level)
            )
            self.course_data['Stress Level'] = levels
            self.course_data['stress_numeric'] = levels.map(self.STRESS_NUMERIC)
            self.preprocessing_log['issues_fixed'].append("Generated stress_numeric values")

        print(f"\n✅ Course data cleaning completed: {len(self.course_data)} rows")
        self.course_data.to_csv(os.path.join(self.data_dir, 'course_data_cleaned.csv'), index=False)

    def _clean_student_data_comprehensive(self):
        """Bucket study hours, normalise every other Q* column, then save as CSV."""
        self.student_data = self.raw_student_data.copy()

        if 'Q1_Study_Hours' in self.student_data.columns:
            self.student_data['Q1_Study_Hours'] = self.student_data['Q1_Study_Hours'].apply(self._categorize_study_hours)

        # Q1_Study_Hours is already categorical at this point, so it is excluded.
        q_columns = [col for col in self.student_data.columns if col.startswith('Q') and col != 'Q1_Study_Hours']
        for col in q_columns:
            self._normalize_text_column(self.student_data, col)

        print(f"\n✅ Student data cleaning completed: {len(self.student_data)} rows")
        self.student_data.to_csv(os.path.join(self.data_dir, 'student_data_cleaned.csv'), index=False)

    @staticmethod
    def _categorize_study_hours(value):
        """Map a free-text study-hours answer to 'low', 'medium' or 'high'.

        Only the first number in the answer is used (so '8-10 hours' counts
        as 8).  Up to 2 is 'low', up to 6 is 'medium', anything larger is
        'high'.  Answers without a number fall back to 'medium'.
        """
        first_number = re.search(r'\d+(?:\.\d+)?', str(value).strip().lower())
        if first_number is None:
            return 'medium'

        hours = float(first_number.group(0))
        if hours <= 2:
            return 'low'
        if hours <= 6:
            return 'medium'
        return 'high'

    def _normalize_text_column(self, frame, col):
        """Replace ``frame[col]`` in place with its cleaned, lemmatised text."""
        frame[col] = frame[col].apply(self._normalize_text)

    def _normalize_text(self, value):
        """Run one cell through clean -> tokenize -> lemmatize -> space-join."""
        cleaned = self._clean_individual_column(value)
        tokens = self._tokenize_individual_column(cleaned)
        return ' '.join(self._lemmatize_tokens(tokens))

    def _note_issue(self, message):
        """Record ``message`` once in the preprocessing log's 'issues_found' list."""
        issues = self.preprocessing_log['issues_found']
        if message not in issues:
            issues.append(message)

    def _clean_individual_column(self, text):
        """Normalise whitespace, repeated punctuation and list separators.

        Returns an empty string for missing values.
        """
        if pd.isna(text) or text == "":
            return ""

        # Collapse every whitespace run (including newlines) to one space.
        text = re.sub(r'\s+', ' ', str(text)).strip()

        # Remove trailing periods, commas and semicolons
        text = text.rstrip('.,;')

        # Collapse repeated punctuation marks to a single one
        text = re.sub(r'[.]{2,}', '.', text)
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)
        text = re.sub(r'[,]{2,}', ',', text)

        # Turn any run of , ; \ separators into a single ", ".  The surrounding
        # whitespace is consumed too, so "a, b" does not become "a,  b".
        text = re.sub(r'(?:\s*[,;\\])+\s*', ', ', text)
        return text.strip(', ')

    def _tokenize_individual_column(self, text):
        """Return the unique lowercase alphabetic tokens of ``text``.

        Stop words and tokens of two characters or fewer are removed; first
        occurrence order is preserved.  If NLTK's ``word_tokenize`` is not
        usable, a regex split is used instead and the failure is logged once.
        """
        if pd.isna(text) or text == "":
            return []

        lowered = str(text).lower()
        try:
            tokens = word_tokenize(lowered)
        except Exception as exc:  # e.g. LookupError when tokenizer data is missing
            self._note_issue(f"word_tokenize unavailable ({type(exc).__name__}); used regex fallback tokenizer")
            tokens = re.sub(r'[^\w\s]', ' ', lowered).split()

        # Same filters for both tokenizers so their output is consistent.
        kept = [
            token for token in tokens
            if token.isalpha() and len(token) > 2 and token not in self.stop_words
        ]
        return list(dict.fromkeys(kept))

    def _lemmatize_tokens(self, tokens):
        """Lemmatize ``tokens``; return them unchanged if lemmatization fails."""
        if not tokens:
            return []
        try:
            return [self.lemmatizer.lemmatize(token) for token in tokens]
        except Exception:  # best effort: e.g. WordNet data not downloaded
            return tokens

    def _standardize_stress_level(self, stress):
        """Map a free-text stress description to 'High', 'Low' or 'Medium'.

        Matching is by substring, and the 'High' keywords are checked first.
        Missing or unrecognised values become 'Medium'.
        """
        if pd.isna(stress):
            return "Medium"

        stress_str = str(stress).lower().strip()

        if any(word in stress_str for word in ['high', 'difficult', 'challenging', 'intense', 'very high','hard', 'harder','strong']):
            return "High"
        if any(word in stress_str for word in ['low', 'easy', 'light', 'minimal','easier','very low','weak','lighter']):
            return "Low"
        return "Medium"



# Run the preprocessing step: this reads the raw CSVs and writes
# course_data_cleaned.csv / student_data_cleaned.csv, which the
# recommendation class below loads in its constructor.
system_instance = DataPreprocess(device='auto')
system_instance.load_and_preprocess_data()

In [None]:
######## MAIN MODEL #########
class ProductionCourseRecommendationSystem:
    def __init__(self, device='auto'):
        """Load the sentence-embedding model and the cleaned course/student CSVs.

        Args:
            device: 'auto' to pick CUDA, then MPS, then CPU; any other value
                is passed through unchanged as the torch device string.
        """
        self.device = self._setup_device(device)
        print(f" Using device: {self.device}")

        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2', device=self.device, trust_remote_code=True)

        # Defined up front so the attribute exists even if load_mistral_model()
        # is never called.
        self.model_loaded = False

        # Initialize NLP components for preprocessing
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by','this','these','that'])

        # Data components
        self.course_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Thesis/course_data_cleaned.csv")
        self.student_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Thesis/student_data_cleaned.csv")

        # read_csv parses empty cells as NaN, but the methods of this class call
        # str methods such as .lower() on these columns.  Restore empty strings
        # in every non-numeric column; numeric columns are left untouched.
        for frame in (self.course_data, self.student_data):
            text_columns = frame.select_dtypes(exclude='number').columns
            if len(text_columns) > 0:
                frame[text_columns] = frame[text_columns].fillna('')

        self.course_embeddings = None
        self.faiss_index = None
        self.student_profile = {}
        self.rag_context = ""

        # Data quality tracking
        self.preprocessing_log = {
            'timestamp': datetime.now().isoformat(),
            'issues_found': [],
            'issues_fixed': [],
            'statistics': {}
        }

        # Survey questions; answers are stored under the keys 'Q1'..'Q10'.
        self.survey_questions = [
            "How many hours can you dedicate to studying?",
            "From Your previous semesters which course was your favorite?",
            "If you had unlimited resources, what project topic would you work on?",
            "What strategies do you naturally use to find solutions to a problem?",
            "What profession do you want to be in the next five years?",
            "List some of your strongest soft/technical skills?",
            "List some of your weakest points about yourself?",
            "What research areas do you find most motivating outside of your academic discipline?",
            "What kind of course would you like the most?",
            "How do you typically respond when you are under stress?"
        ]

    def _setup_device(self, device):
          """Setup optimal device for computation"""
          if device == 'auto':
              if torch.cuda.is_available():
                  return 'cuda'
              elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
                  return 'mps'
              else:
                  return 'cpu'
          return device
    def load_mistral_model(self, use_quantization=True):
        """Load real Mistral-7B model with optimization"""
        model_name = "mistralai/Mistral-7B-Instruct-v0.1"
        token = "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"

        print(" Fallback: Using embedding-only mode")
        self.model_loaded = False
        


    def conduct_enhanced_survey(self):
        """Conduct enhanced survey with validation"""
        print("\n" + "="*80)
        print("🎓 ADVANCED COURSE RECOMMENDATION Questionaries")
        print("="*80)
        print("Please provide detailed answers for better recommendations.")

        responses = {}

        for i, question in enumerate(self.survey_questions, 1):
            print(f"\n📝 Q{i}: {question}")

            # Add specific guidance for certain questions
            if i == 1:
                print("   💡 Example: '8-10 hours per day' or '40 hours per week'")
            elif i == 6:
                print("   💡 Example: 'Python, Machine Learning, Problem Solving, Communication'")
            elif i == 7:
                print("   💡 Example: 'Perfectionism, Time management, Public speaking'")

            response = input("   Your answer: ").strip()

            # Validate response
            while len(response) < 5:
                print("   ⚠️  Please provide a more detailed answer (minimum 5 characters)")
                response = input("   Your answer: ").strip()

            responses[f'Q{i}'] = response

        self.student_profile = responses
        return responses


    def _create_enhanced_embeddings_and_faiss_index(self):
        """Embed one descriptive sentence per course and index them with FAISS.

        Stores the normalised embeddings in ``self.course_embeddings`` and an
        inner-product index over them in ``self.faiss_index``.
        """
        print("\n🎯 CREATING ULTRA-OPTIMIZED EMBEDDINGS FOR YOUR DATASET")
        print("-" * 50)

        def describe(row):
            # One natural-language sentence per course, built from its columns.
            return f"This course is {row['Course Name'].lower()}. {row['Description'].lower()}. It is a {row['Type'].lower()} course. This course require skill like {row['Skill Required'].lower()}. A student should have interest on {row['Field Interest'].lower()}. A student can be {row['Career Paths'].lower()}, after completingthis course. The stress level of this course is {row['Stress Level'].lower()}."

        combined_texts = [describe(row) for _, row in self.course_data.iterrows()]

        print(f"📚 Encoding {len(combined_texts)} ultra-targeted course descriptions...")

        # Prints every generated sentence, not just a sample.
        print(f"{combined_texts}")

        self.course_embeddings = self.embedding_model.encode(
            combined_texts,
            batch_size=8,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )

        # Embeddings are normalised, so inner product behaves as cosine similarity.
        dimension = self.course_embeddings.shape[1]
        index = faiss.IndexFlatIP(dimension)
        index.add(self.course_embeddings.astype('float32'))
        self.faiss_index = index

        print(f"✅ FAISS index created with {self.faiss_index.ntotal} courses")
        print(f"📏 Embedding dimension: {dimension}")

        self.preprocessing_log['issues_fixed'].append(f"Created ultra-optimized FAISS index with {self.faiss_index.ntotal} courses")


    def create_enhanced_student_profile(self):
        """Embed the surveyed student's answers as one first-person sentence.

        Returns:
            A tuple ``(embedding, components)``: the normalised embedding of
            the sentence and a list that is always empty here.  The Q7 answer
            is not part of the sentence.
        """
        answers = self.student_profile
        components = []

        tolerance = self._assess_enhanced_stress_tolerance(answers['Q10'].lower())
        dedication = self._extract_enhanced_hours_preference(answers['Q1'].lower())

        sentence = f"I can dedicate {dedication} study hours. I loved the {answers['Q2'].lower()}  course from previous semester and want to build projects on {answers['Q3'].lower()}  that can help in practical applications. To solve problems, I {answers['Q4'].lower()}. I want to work as a {answers['Q5'].lower()}. I have skills in {answers['Q6'].lower()}. My other field interests includes {answers['Q8'].lower()}. I prefer courses that are {answers['Q9'].lower()}. My stress management and ability to handle high workload is {tolerance}."

        encoded = self.embedding_model.encode([sentence], normalize_embeddings=True)
        return encoded[0], components


    def advanced_similarity_search(self, student_embedding, k=None):
        """Ultra-enhanced similarity search optimized for your specific dataset"""
        if k is None:
            k = len(self.course_data)

        # Perform FAISS search
        similarities, indices = self.faiss_index.search(student_embedding.reshape(1, -1).astype('float32'), k)

        print("from advanced_similarity_search():")
        print("similarity:", similarities)
        print("indices:", indices)


        # Convert inner product to percentage
        base_similarities = (similarities[0] * 100).clip(0, 100)

        # Apply aggressive intelligent boosting for your student profile
        final_similarities = []

        for i, (base_sim, idx) in enumerate(zip(base_similarities, indices[0])):
            course_row = self.course_data.iloc[idx]
            final_sim = base_sim
            final_similarities.append(final_sim)

        return np.array(final_similarities), indices[0]

    def add_recommended_course(self):
        """Store a recommended course for every student in the cleaned CSV.

        For each student row, the three nearest courses in ``self.faiss_index``
        are scored with ``calculate_dataset_behavioral_matrics`` and the
        best-scoring course name is written to a ``Q11_Course_Recommendation``
        column.  The CSV is re-read at the start and overwritten at the end.
        """
        csv_path = '/content/drive/MyDrive/Colab Notebooks/Thesis/student_data_cleaned.csv'
        self.student_data = pd.read_csv(csv_path)

        k = len(self.course_data)

        # Collected per student and assigned as a column afterwards: rows
        # yielded by iterrows() are copies, so writing to ``row`` would never
        # reach the DataFrame (and therefore never reach the CSV).
        recommended_names = []

        for _, row in self.student_data.iterrows():
            stress_tolerance = self._assess_enhanced_stress_tolerance(row['Q10_Stress_Response'].lower())
            sentence = f"I can dedicate {row['Q1_Study_Hours'].lower()} study hours. I loved the {row['Q2_Favorite_Course'].lower()}  course from previous semester and want to build projects on {row['Q3_Project_Topic'].lower()}  that can help in practical applications. To solve problems, I {row['Q4_Problem_Solving'].lower()}. I want to work as a {row['Q5_Career_Goals'].lower()}. I have skills in {row['Q6_Strengths'].lower()}. My other field interests includes {row['Q8_Research_Interests'].lower()}. I prefer courses that are {row['Q9_Course_Preference'].lower()}. My stress management and ability to handle high workload is {stress_tolerance}."

            profile_embeddings = self.embedding_model.encode(
                [sentence],
                batch_size=8,
                convert_to_numpy=True,
                normalize_embeddings=True,
            )

            similarities, indices = self.faiss_index.search(profile_embeddings[0].reshape(1, -1).astype('float32'), k)

            # Re-rank the three nearest courses by their behavioural score.
            candidates = []
            for i in indices[0][:3]:
                course = self.course_data.iloc[i]
                avg = self.calculate_dataset_behavioral_matrics(course, row)
                candidates.append({'name': course['Course Name'], 'average': avg})

            ranked = sorted(candidates, key=lambda x: x['average'], reverse=True)
            recommended_names.append(ranked[0]['name'] if ranked else '')

        self.student_data['Q11_Course_Recommendation'] = recommended_names
        self.student_data.to_csv(csv_path, index=False)

    def student_similarity_search(self, student_embedding, recommendations):
        """Compare recommendations with those stored for the most similar students.

        All students in the cleaned CSV are embedded, the five closest to
        ``student_embedding`` are looked up, and the share of their stored
        ``Q11_Course_Recommendation`` values that also appear in
        ``recommendations`` is reported.

        Args:
            student_embedding: 1-D embedding vector of the current student.
            recommendations: Iterable of dicts whose ``['course']['Course Name']``
                holds a recommended course name.

        Returns:
            The match percentage (0..100); 0.0 when there are no stored students.

        Raises:
            KeyError: If the CSV has no ``Q11_Course_Recommendation`` column.
        """
        self.student_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Thesis/student_data_cleaned.csv')

        if 'Q11_Course_Recommendation' not in self.student_data.columns:
            raise KeyError("Q11_Course_Recommendation column is missing; run add_recommended_course() first")

        main_recommendation = [rec['course']['Course Name'] for rec in recommendations]

        k = len(self.student_data)
        if k == 0:
            print("main: ",main_recommendation)
            print("historic: ",[])
            print(f"Match Percentage: {0.0}%")
            return 0.0

        combined_texts = []
        for _, row in self.student_data.iterrows():
            stress_tolerance = self._assess_enhanced_stress_tolerance(row['Q10_Stress_Response'].lower())
            sentence = f"I can dedicate {row['Q1_Study_Hours'].lower()} study hours and have strong interest in technical courses. I loved the {row['Q2_Favorite_Course'].lower()}  course from previous semester and want to build projects on {row['Q3_Project_Topic'].lower()}  that can help in practical applications. To solve problems, I {row['Q4_Problem_Solving'].lower()}. I want to work as a {row['Q5_Career_Goals'].lower()}. I have skills in {row['Q6_Strengths'].lower()}. My other field interests includes {row['Q8_Research_Interests'].lower()}. I prefer courses that are {row['Q9_Course_Preference'].lower()}. My stress management and ability to handle high workload is {stress_tolerance}."
            combined_texts.append(sentence)

        profile_embeddings = self.embedding_model.encode(
            combined_texts,
            batch_size=8,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True
        )

        # Use a local index for the students.  ``self.faiss_index`` holds the
        # course index that the course searches rely on, so it must not be
        # reset or replaced here.
        dimension = profile_embeddings.shape[1]
        student_index = faiss.IndexFlatIP(dimension)
        student_index.add(profile_embeddings.astype('float32'))

        print(f"✅ FAISS index created with {student_index.ntotal} students")
        print(f"📏 Embedding dimension: {dimension}")

        # Perform FAISS search
        _, indices = student_index.search(student_embedding.reshape(1, -1).astype('float32'), k)

        # Courses stored for the five nearest students.
        historic_courses = [
            self.student_data.iloc[i]['Q11_Course_Recommendation']
            for i in indices[0][:5]
        ]

        # Count each neighbour at most once so that a duplicated
        # recommendation cannot push the result above 100 %.
        recommended = set(main_recommendation)
        matches = sum(1 for course in historic_courses if course in recommended)

        # Calculate percentage
        percentage = (matches / len(historic_courses)) * 100 if historic_courses else 0.0

        print("main: ",main_recommendation)
        print("historic: ",historic_courses)
        print(f"Match Percentage: {percentage}%")

        return percentage

    def calculate_dataset_behavioral_matrics(self, course, student_data):
        """Return the mean of the five compatibility scores for one course/student pair.

        Args:
            course: A course row (mapping of course column names to values).
            student_data: A student row carrying the ``Q1_...`` to ``Q10_...`` columns.

        Returns:
            The unweighted average of the stress, type, description, skill
            and field compatibility scores.
        """
        answer_columns = (
            'Q1_Study_Hours', 'Q2_Favorite_Course', 'Q3_Project_Topic',
            'Q4_Problem_Solving', 'Q5_Career_Goals', 'Q6_Strengths',
            'Q7_Weaknesses', 'Q8_Research_Interests', 'Q9_Course_Preference',
            'Q10_Stress_Response',
        )
        # Every column is read (in order) even though Q4 is not used below.
        (hours, favourite, project, _strategy, career,
         strong_points, weak_points, research, preference,
         stress_answer) = (student_data[column] for column in answer_columns)

        tolerance = self._assess_enhanced_stress_tolerance(stress_answer)
        dedication = self._extract_enhanced_hours_preference(hours)

        component_scores = [
            self._calculate_stress_compatibility(tolerance, dedication, course),
            self._calculate_type_compatibility(preference, course),
            self._calculate_description_compatibility(favourite, project, career, course),
            self._calculate_skill_compatibility(strong_points, weak_points, course),
            self._calculate_field_compatibility(research, career, course),
        ]

        return sum(component_scores) / 5



    def calculate_advanced_behavioral_metrics(self):
        """Score every course against the surveyed student on five metrics.

        Returns:
            A dict with the keys 'stress_matching', 'type_matching',
            'description_matching', 'skill_matching' and 'field_matching';
            each value is a list with one score per row of ``self.course_data``.
        """
        # All ten answers are read (in order) even though Q4 is not used below.
        (hours, favourite, project, _strategy, career,
         strong_points, weak_points, research, preference,
         stress_answer) = (self.student_profile[f'Q{n}'] for n in range(1, 11))

        tolerance = self._assess_enhanced_stress_tolerance(stress_answer)
        dedication = self._extract_enhanced_hours_preference(hours)

        # Metric name -> scorer for a single course row.
        stress_fn = self._calculate_stress_compatibility
        type_fn = self._calculate_type_compatibility
        description_fn = self._calculate_description_compatibility
        skill_fn = self._calculate_skill_compatibility
        field_fn = self._calculate_field_compatibility
        scorers = {
            'stress_matching': lambda course: stress_fn(tolerance, dedication, course),
            'type_matching': lambda course: type_fn(preference, course),
            'description_matching': lambda course: description_fn(favourite, project, career, course),
            'skill_matching': lambda course: skill_fn(strong_points, weak_points, course),
            'field_matching': lambda course: field_fn(research, career, course),
        }

        metrics = {}
        for metric_name, score_course in scorers.items():
            metrics[metric_name] = [
                score_course(course) for _, course in self.course_data.iterrows()
            ]
        return metrics

    def _calculate_stress_compatibility(self, stress_tolerance, study_dedication, course):
        """Enhanced stress compatibility calculation"""
        course_stress = course['stress_numeric']

        # Base compatibility matrix
        compatibility_matrix = {
            ('high', 3): 95, ('high', 2): 85, ('high', 1): 70,
            ('medium', 3): 60, ('medium', 2): 90, ('medium', 1): 85,
            ('low', 3): 25, ('low', 2): 70, ('low', 1): 95
        }

        base_score = compatibility_matrix.get((stress_tolerance, course_stress), 50)

        # Adjust based on study dedication
        if study_dedication == 'high':
            base_score += 5
        elif study_dedication == 'low' and course_stress >= 2:
            base_score -= 10

        return min(100, max(0, base_score))

    def _calculate_type_compatibility(self, course_preference, course):
        """Score (0..100) how well the course type fits the stated preference.

        The text similarity between the preference and the course's ``Type``
        is increased by 5 points per matching keyword (at most 20).  The
        keywords are those listed for each word of the course type, and a
        keyword matches when it occurs in the preference text.
        """
        declared_type = course['Type']
        type_words = declared_type.split()
        score = self._calculate_enhanced_text_similarity(course_preference, declared_type)

        # Keywords looked for in the student's preference, per course-type word.
        keywords_by_type = {
          'technical': ['test','code', 'program', 'technical', 'algorithm', 'system', 'software','application','hands-on','hands on'],
          'practical': ['hands-on', 'practical', 'build', 'create', 'implement', 'project'],
          'analytical': ['analytical','design','analyze', 'data', 'research', 'statistical', 'study', 'investigate','hands-on','hands on'],
          'creative': ['creative', 'design', 'innovative', 'artistic', 'visual', 'original'],
          'theoretical': ['theory', 'theoretical', 'concept', 'abstract', 'academic', 'principle', 'framework'],
          'research': ['research', 'investigate', 'explore', 'discover', 'academic', 'scholarly']
        }

        preference_text = course_preference.lower()
        hits = sum(
            1
            for type_word in type_words
            for keyword in keywords_by_type.get(type_word, ())
            if keyword in preference_text
        )

        score += min(20, hits * 5)
        return min(100, max(0, int(score)))

    def _calculate_description_compatibility(self, favourite_course, project_topic, career_goals, course):
        """Score (0..100) how well a course matches the student's interests.

        The result is the plain sum of three text similarities, truncated to
        an int and clamped to 0..100:
          * favourite course vs. course description,
          * project topic vs. the better of course description and field interest,
          * career goals vs. course description.

        NOTE(review): this method used to also build a 30/40/30 weighted
        average that included the career keyword boost, but it returned the
        unweighted sum instead.  That returned value is kept as is; confirm
        which formula is intended.
        """
        description = course['Description']

        favourite_score = self._calculate_enhanced_text_similarity(favourite_course, description)

        project_vs_description = self._calculate_enhanced_text_similarity(project_topic, description)
        project_vs_field = self._calculate_enhanced_text_similarity(project_topic, course['Field Interest'])
        project_score = max(project_vs_description, project_vs_field)

        career_score = self._calculate_enhanced_text_similarity(career_goals, description)
        # Still evaluated as before, but its value does not affect the result.
        _unused_career_boost = self._calculate_career_alignment(career_goals, description)

        combined = favourite_score + project_score + career_score
        return min(100, max(0, int(combined)))


    def _calculate_career_alignment(self, career_goals, course_desc):
        """Calculate career-specific alignment boost"""
        career_keywords = {
            'data scientist': ['data science', 'machine learning', 'analytics', 'statistical', 'python', 'data analysis'],
            'software engineer': ['software development', 'programming', 'coding', 'software engineering', 'system design'],
            'cybersecurity': ['security', 'cryptography', 'network security', 'ethical hacking', 'cybersecurity'],
            'ai researcher': ['artificial intelligence', 'machine learning', 'neural networks', 'deep learning', 'AI'],
            'web developer': ['web development', 'frontend', 'backend', 'javascript', 'html', 'css'],
            'mobile developer': ['mobile', 'android', 'ios', 'app development', 'mobile computing'],
            'game developer': ['game development', 'computer graphics', 'gaming', 'unity', 'unreal'],
            'database administrator': ['database', 'sql', 'data management', 'database design'],
            'cloud engineer': ['cloud computing', 'aws', 'azure', 'devops', 'cloud architecture'],
            'robotics engineer': ['robotics', 'embedded systems', 'automation', 'sensors', 'control systems']
        }

        career_lower = career_goals.lower()
        course_lower = course_desc.lower()

        boost = 0
        for career, keywords in career_keywords.items():
            if career in career_lower:
                matching_keywords = sum(1 for keyword in keywords if keyword in course_lower)
                boost += matching_keywords * 3  # 3 points per matching keyword

        return boost

    def _calculate_skill_compatibility(self, strengths, weaknesses, course):
        """Enhanced skill compatibility calculation"""
        skills_required = course['Skill Required']
        strength_match = self._calculate_enhanced_text_similarity(strengths, skills_required)

        # Check for weakness conflicts
        weakness_penalty = 0
        weakness_lower = weaknesses.lower()
        skills_lower = skills_required.lower()

        conflict_terms = {
            'math': ['mathematics', 'statistical', 'analytics'],
            'programming': ['python', 'javascript', 'coding', 'software'],
            'communication': ['presentation', 'writing', 'teamwork'],
            'time': ['deadline', 'project management', 'organization']
        }

        for weakness_key, skill_terms in conflict_terms.items():
            if weakness_key in weakness_lower:
                if any(term in skills_lower for term in skill_terms):
                    weakness_penalty += 5

        final_score = strength_match - weakness_penalty
        return min(100, max(0, int(final_score)))


    def _calculate_field_compatibility(self, research_interests, career_goals, course):
        """Enhanced field compatibility calculation"""
        career_interest = course['Career Paths'].lower()
        industry = course['Industry Sectors'].lower()
        field_interest = course['Field Interest'].lower()
        base_similarity = self._calculate_enhanced_text_similarity(research_interests, field_interest)

        career_similarity = self._calculate_enhanced_text_similarity(career_goals, career_interest)

        industry_similarity = self._calculate_enhanced_text_similarity(career_goals, industry)

        total = base_similarity+career_similarity+industry_similarity

        if total <= 100:
          return total
        else:
          if 100 < total <= 110:
            new_total = (total * 0.85)
          elif 110 < total <= 120:
            new_total = (total * 0.80)
          elif 120 < total <= 130:
            new_total = (total * 0.75)
          else:
            new_total = (total * 0.70)

          return min(100, new_total)

    def _assess_enhanced_stress_tolerance(self, stress_response):
        """Enhanced stress tolerance assessment"""
        response_lower = stress_response.lower()

        high_indicators = ['calm', 'organized', 'handle', 'manage', 'control', 'systematic',
                          'planned', 'structured', 'methodical', 'efficient']
        medium_indicators = ['break', 'pause', 'time', 'step back', 'breathe', 'moderate']
        low_indicators = ['overwhelmed', 'panic', 'stressed', 'anxious', 'difficult',
                         'struggle', 'freeze', 'shutdown']

        high_score = sum(1 for indicator in high_indicators if indicator in response_lower)
        medium_score = sum(1 for indicator in medium_indicators if indicator in response_lower)
        low_score = sum(1 for indicator in low_indicators if indicator in response_lower)

        if high_score >= 2 or (high_score > 0 and medium_score == 0 and low_score == 0):
            return 'high'
        elif low_score >= 2 or (low_score > 0 and high_score == 0):
            return 'low'
        else:
            return 'medium'

    def _extract_enhanced_hours_preference(self, hours_text):
        """Enhanced study hours preference extraction"""
        hours_lower = hours_text.lower()

        # Extract numerical values
        numbers = re.findall(r'\d+', hours_text)

        if numbers:
            max_hours = max(int(num) for num in numbers)
            if max_hours >= 7:
                return 'high'
            elif 3 <= max_hours <= 6:
                return 'medium'
            else:
                return 'low'

        # Fallback to keyword analysis
        if any(word in hours_lower for word in ['many', 'lot', 'intensive', 'dedicated','six', 'seven', 'eight', 'nine', 'very']):
            return 'high'
        elif any(word in hours_lower for word in ['moderate', 'average', 'three', 'four','five', 'not much','about']):
            return 'medium'
        else:
            return 'low'

    def _calculate_enhanced_text_similarity(self, text1, text2):
        """Ultra-optimized text similarity for your specific domain"""

        if not text1 or not text2:
            return 30

        text1 = str(text1).lower().strip()
        text2 = str(text2).lower().strip()

        if not text1 or not text2:
            return 30

        if text1 == text2:
            return 100

        # Method 1: Semantic similarity using embeddings
        try:
            embeddings = self.embedding_model.encode([text1, text2])
            semantic_similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
            semantic_score = semantic_similarity * 100
        except:
            semantic_score = 0

        # Method 2: Enhanced domain-specific matching
        domain_synonyms = {
            'data_analysis': ['data analytics', 'data analysis', 'data science', 'analytics', 'data mining', 'business intelligence'],
            'programming': ['programming', 'coding', 'development', 'software', 'python', 'sql'],
            'prediction': ['prediction', 'forecasting', 'stock market', 'machine learning', 'modeling'],
            'practical': ['practical', 'hands-on', 'applied', 'real-world', 'implementation'],
            'quantum': ['quantum computing', 'quantum', 'architecture', 'nanotechnology', 'advanced computing'],
            'corporate': ['corporate', 'business', 'professional', 'industry', 'enterprise'],
            'technical': ['technical', 'programming', 'software', 'system', 'computer']
        }

        # Calculate domain-specific similarity boost
        domain_boost = 0
        for category, synonyms in domain_synonyms.items():
            text1_has = any(syn in text1 for syn in synonyms)
            text2_has = any(syn in text2 for syn in synonyms)
            if text1_has and text2_has:
                domain_boost += 20  # Significant boost for domain matches

        # Method 3: Direct keyword matching with weights

        high_value_keywords = {
            'python': 25, 'sql': 25, 'data': 20, 'analytics': 20, 'practical': 20,
            'machine learning': 25, 'prediction': 20, 'quantum': 25, 'programming': 15,
            'hands-on': 20, 'corporate': 15, 'development': 15
        }

        keyword_boost = 0
        for keyword, weight in high_value_keywords.items():
            if keyword in text1 and keyword in text2:
                keyword_boost += weight

        # Method 4: TF-IDF with n-grams
        try:
            vectorizer = TfidfVectorizer(
                stop_words='english',
                lowercase=True,
                min_df=1,
                ngram_range=(1, 3),
                analyzer='word'
            )

            tfidf_matrix = vectorizer.fit_transform([text1, text2])
            if tfidf_matrix.shape[0] >= 2:
                tfidf_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
                tfidf_score = tfidf_similarity * 100
            else:
                tfidf_score = 0
        except:
            tfidf_score = 0

        # Intelligent score combination with emphasis on domain relevance
        final_score = max(
            semantic_score * 0.3 + domain_boost * 0.3 + keyword_boost * 0.2 + tfidf_score * 0.2,
            max(semantic_score, tfidf_score) + domain_boost * 0.5 + keyword_boost * 0.3
        )

        # print(int(final_score))
        return min(100, max(0, int(final_score)))

    def generate_real_rag_recommendations(self, top_course_indices, similarity_scores, behavioral_metrics):
        """Build recommendation records for the top three courses.

        When ``self.model_loaded`` is false the work is delegated to
        ``_generate_fallback_recommendations``. Otherwise a prompt is created
        per course and passed to ``_generate_with_mistral``; if that call (or
        reading its result) raises, the record falls back to the embedding
        similarity as its confidence.

        Args:
            top_course_indices: Positional course indices, best match first.
            similarity_scores: Similarity scores aligned with
                ``top_course_indices``.
            behavioral_metrics: Mapping of metric name to per-course scores.

        Returns:
            list[dict]: One record per course with the keys ``'course'``,
            ``'confidence'``, ``'index'``, ``'ai_analysis'``,
            ``'base_confidence'`` and ``'avg_bhvr_score'``. (In fallback mode
            the list comes from ``_generate_fallback_recommendations``.)
        """
        print("\n🤖 Generating AI-powered recommendations using Mistral-7B...")

        if not self.model_loaded:
            print("⚠️  Mistral-7B not loaded. Using enhanced embedding-based recommendations.")
            return self._generate_fallback_recommendations(top_course_indices, similarity_scores, behavioral_metrics)

        # Student answers, keyed by meaning, for prompt construction.
        student_context = {
            'study_hours': self.student_profile['Q1'],
            'favorite_course': self.student_profile['Q2'],
            'project_interests': self.student_profile['Q3'],
            'problem_solving': self.student_profile['Q4'],
            'career_goals': self.student_profile['Q5'],
            'strengths': self.student_profile['Q6'],
            'weaknesses': self.student_profile['Q7'],
            'research_interests': self.student_profile['Q8'],
            'course_preferences': self.student_profile['Q9'],
            'stress_management': self.student_profile['Q10']
        }
        recommendations = []

        for i, course_idx in enumerate(top_course_indices[:3]):
            course = self.course_data.iloc[course_idx]
            base_confidence = similarity_scores[i]

            # _display_production_results reads 'avg_bhvr_score' from every
            # record, so provide it on this path as well (the fallback path
            # already does); otherwise displaying raises KeyError.
            avg_bhvr_score = self._generate_basic_analysis(course, behavioral_metrics, course_idx)

            # Prepare prompt for this specific course
            prompt = self._create_recommendation_prompt(student_context, course, behavioral_metrics, course_idx, base_confidence)

            try:
                ai_confidence = self._generate_with_mistral(prompt)
                # NOTE(review): the result is indexed by the loop position;
                # presumably _generate_with_mistral returns a sequence of
                # scores - confirm against its implementation.
                recommendation = {
                    'course': course,
                    'confidence': ai_confidence[i],
                    'index': course_idx,
                    'ai_analysis': ai_confidence[i],
                    'base_confidence': base_confidence
                }
            except Exception as e:
                # Keep going with the embedding score, but say what failed.
                print(f"Operation Failed: {e}")
                recommendation = {
                    'course': course,
                    'confidence': base_confidence,
                    'index': course_idx,
                    'ai_analysis': "Analysis unavailable - using embedding-based recommendation.",
                    'base_confidence': base_confidence
                }

            recommendation['avg_bhvr_score'] = avg_bhvr_score
            recommendations.append(recommendation)

        return recommendations

    def _prepare_course_context(self, course_indices):
        """Prepare course context for RAG"""
        courses = []
        for idx in course_indices:
            course = self.course_data.iloc[idx]
            courses.append({
                'name': course['Course Name'],
                'description': course['Description'],
                'type': course['Type'],
                'skills': course['Skill Required'],
                'field': course['Field Interest'],
                'stress_level': course['Stress Level']
            })
        return courses

    def _generate_fallback_recommendations(self, top_course_indices, similarity_scores, behavioral_metrics):
        """Generate enhanced recommendations without Mistral-7B"""
        recommendations = []

        for i, course_idx in enumerate(top_course_indices[:3]):
            course = self.course_data.iloc[course_idx]
            base_confidence = similarity_scores[i]
            print(f"confidence {i}:{base_confidence}")


            # Calculate enhanced confidence using behavioral metrics
            behavior_scores = [
                behavioral_metrics['stress_matching'][course_idx],
                behavioral_metrics['type_matching'][course_idx],
                behavioral_metrics['description_matching'][course_idx],
                behavioral_metrics['skill_matching'][course_idx],
                behavioral_metrics['field_matching'][course_idx]
            ]

            avg_behavior_score = np.mean(behavior_scores)
            enhanced_confidence = (base_confidence * 0.4 + avg_behavior_score * 0.6)

            # Generate basic analysis
            avg_bhvr_score = self._generate_basic_analysis(course, behavioral_metrics, course_idx)

            recommendations.append({
                'course': course,
                'confidence': enhanced_confidence,
                'index': course_idx,
                'avg_bhvr_score': avg_bhvr_score,
                'base_confidence': base_confidence
            })

        sorted_recommendations = sorted(recommendations, key=lambda x: x['avg_bhvr_score'], reverse=True)

        return sorted_recommendations

    def _generate_basic_analysis(self, course, behavioral_metrics, course_idx):
        """Generate basic analysis without AI model"""
        stress_score = behavioral_metrics['stress_matching'][course_idx]
        type_score = behavioral_metrics['type_matching'][course_idx]
        desc_score = behavioral_metrics['description_matching'][course_idx]
        skill_score = behavioral_metrics['skill_matching'][course_idx]
        field_score = behavioral_metrics['field_matching'][course_idx]

        avg_score = np.mean([stress_score, type_score, desc_score, skill_score, field_score])

        return avg_score

    def create_user_course_correlation_matrix(self):
        """Build a matrix of mean text similarity between answers and course fields.

        Cell ``[i, j]`` holds the average of
        ``_calculate_enhanced_text_similarity(answer_i, str(course[field_j]))``
        over all rows of ``self.course_data``. Answers missing from
        ``self.student_profile`` are treated as empty strings; a cell stays 0
        when there are no courses to average over.

        Returns:
            tuple: ``(matrix, user_labels, course_labels)`` where ``matrix``
            is a NumPy array of shape (10, 5) and the two lists hold the
            human-readable row and column labels.
        """
        print("\n🔍 Creating User Input vs Course Field Correlation Analysis...")

        # Survey question key -> row label
        user_questions = {
            'Q1': 'Study Hours',
            'Q2': 'Favorite Course',
            'Q3': 'Project Interest',
            'Q4': 'Problem Solving',
            'Q5': 'Career Goals',
            'Q6': 'Strengths',
            'Q7': 'Weaknesses',
            'Q8': 'Research Interests',
            'Q9': 'Course Preference',
            'Q10': 'Stress Response'
        }

        # Course table column -> column label
        course_fields = {
            'Description': 'Course Description',
            'Type': 'Course Type',
            'Skill Required': 'Required Skills',
            'Field Interest': 'Field Interest',
            'Stress Level': 'Stress Level'
        }

        correlation_matrix = np.zeros((len(user_questions), len(course_fields)))

        print("📊 Calculating cross-correlations...")

        for i, q_key in enumerate(user_questions):
            user_response = self.student_profile.get(q_key, '')

            # Iterate over the column names only. Enumerating .items() bound
            # field_key to a (column, label) tuple, which is not a valid
            # column lookup on a course row.
            for j, field_key in enumerate(course_fields):
                similarities = [
                    self._calculate_enhanced_text_similarity(user_response, str(course[field_key]))
                    for _, course in self.course_data.iterrows()
                ]

                # Average similarity across all courses (0 if there are none,
                # instead of NaN from the mean of an empty list).
                if similarities:
                    correlation_matrix[i, j] = np.mean(similarities)

        return correlation_matrix, list(user_questions.values()), list(course_fields.values())


    def _print_correlation_analysis_results(self, correlation_matrix, user_labels, course_labels, top_correlations):
        """Print detailed correlation analysis results"""
        print("\n📊 CORRELATION ANALYSIS RESULTS:")
        print("=" * 60)

        print("\n🏆 TOP 5 STRONGEST CORRELATIONS:")
        for i, item in enumerate(top_correlations[:5], 1):
            print(f"{i}. {item['user_input']} ↔ {item['course_field']}: {item['correlation']:.1f}%")

        print(f"\n📈 OVERALL STATISTICS:")
        print(f"Average Correlation: {np.mean(correlation_matrix):.1f}%")
        print(f"Highest Correlation: {np.max(correlation_matrix):.1f}%")
        print(f"Lowest Correlation: {np.min(correlation_matrix):.1f}%")
        print(f"Standard Deviation: {np.std(correlation_matrix):.1f}%")

        # Analyze which user inputs are most predictive
        user_avg_correlations = np.mean(correlation_matrix, axis=1)
        best_user_input_idx = np.argmax(user_avg_correlations)

        print(f"\n🎯 MOST PREDICTIVE USER INPUT:")
        print(f"{user_labels[best_user_input_idx]}: {user_avg_correlations[best_user_input_idx]:.1f}% avg correlation")

        # Analyze which course fields are most correlated
        course_avg_correlations = np.mean(correlation_matrix, axis=0)
        best_course_field_idx = np.argmax(course_avg_correlations)

        print(f"\n🎯 MOST CORRELATED COURSE FIELD:")
        print(f"{course_labels[best_course_field_idx]}: {course_avg_correlations[best_course_field_idx]:.1f}% avg correlation")

        # Find strongest positive correlations
        high_correlations = [(i, j, correlation_matrix[i, j])
                            for i in range(len(user_labels))
                            for j in range(len(course_labels))
                            if correlation_matrix[i, j] > 70]

        if high_correlations:
            print(f"\n🔥 HIGH CORRELATIONS (>70%):")
            for i, j, corr in sorted(high_correlations, key=lambda x: x[2], reverse=True):
                print(f"   {user_labels[i]} ↔ {course_labels[j]}: {corr:.1f}%")

        # Find potential weak spots
        low_correlations = [(i, j, correlation_matrix[i, j])
                           for i in range(len(user_labels))
                           for j in range(len(course_labels))
                           if correlation_matrix[i, j] < 30]

        if low_correlations:
            print(f"\n⚠️  LOW CORRELATIONS (<30%) - Potential Improvement Areas:")
            for i, j, corr in sorted(low_correlations, key=lambda x: x[2]):
                print(f"   {user_labels[i]} ↔ {course_labels[j]}: {corr:.1f}%")

    def _run_production_system(self):
        """Run the complete production recommendation system"""
        print("🚀 Initializing Production Course Recommendation System...")
        print("=" * 80)

        # Initialize system components
        self.load_mistral_model(use_quantization=True)
        self.load_and_preprocess_data()

        # Conduct enhanced survey
        self.conduct_enhanced_survey()

        # Create enhanced student profile
        student_embedding, student_profile_sections = self.create_enhanced_student_profile()

        # Perform advanced similarity search
        similarity_scores, course_indices = self.advanced_similarity_search(student_embedding)

        # Calculate comprehensive behavioral metrics
        behavioral_metrics = self.calculate_advanced_behavioral_metrics()

        # Generate AI-powered recommendations
        recommendations = self.generate_real_rag_recommendations(
            course_indices, similarity_scores, behavioral_metrics
        )

        # Display comprehensive results
        self._display_production_results(recommendations, behavioral_metrics, similarity_scores)

        # Generate advanced visualizations
        self._create_production_visualizations(recommendations, behavioral_metrics, similarity_scores)

        # Cleanup GPU memory if needed
        if self.device == 'cuda':
            torch.cuda.empty_cache()
            gc.collect()

        return recommendations, behavioral_metrics

    def _display_production_results(self, recommendations, metrics, all_similarity_scores):
        """Display comprehensive production results"""
        print("\n" + "="*80)
        print("🎯 PRODUCTION RECOMMENDATION RESULTS")
        print("="*80)
        print(all_similarity_scores)
        best_course = recommendations[0]
        best_idx = best_course['index']

        print(f"\n🏆 TOP 3 RECOMMENDATIONS:")
        for i, rec in enumerate(recommendations, 1):
            confidence_icon = "🥇" if i == 1 else "🥈" if i == 2 else "🥉"
            print(f"\n\n{confidence_icon} {i}. {rec['course']['Course Name']}")
            # print(f"    Confidence: {rec['confidence']:.1f}% | Type: {rec['course']['Type']} | Field: {rec['course']['Field Interest']}")
            print(f"Confidence: {rec['confidence']:.1f}%")

            print(f"avg behavior score: {rec['avg_bhvr_score']:.1f}%")
            print(f"Base Similarity: {rec['base_confidence']:.1f}%")
            print(f"Stress Compatibility: {metrics['stress_matching'][rec['index']]:.1f}%")
            print(f"Learning Preference: {metrics['type_matching'][rec['index']]:.1f}%")
            print(f"Interest Alignment: {metrics['description_matching'][rec['index']]:.1f}%")
            print(f"Skill Compatibility: {metrics['skill_matching'][rec['index']]:.1f}%")
            print(f"Career Interest: {metrics['field_matching'][rec['index']]:.1f}%")


        # Statistics
        avg_confidence = np.mean([rec['confidence'] for rec in recommendations])
        print(f"\n📈 SYSTEM STATISTICS:")
        print(f"Average Top-3 Confidence: {avg_confidence:.1f}%")
        print(f"Total Courses Analyzed: {len(self.course_data)}")
        print(f"AI Model Status: {'✅ Active' if self.model_loaded else '⚠️ Fallback Mode'}")

    def _create_production_visualizations(self, recommendations, metrics, all_similarity_scores):
        """Draw two matplotlib bar charts for the recommendation results.

        Figure 1 shows the five behavioral metrics of the first (best)
        recommendation. Figure 2 shows the same five metrics as grouped bars
        for every entry in ``recommendations``.

        Args:
            recommendations: List of recommendation dicts; each must provide
                ``'index'`` and ``'course'`` (with a ``'Course Name'``). The
                list must not be empty because element 0 is read.
            metrics: Mapping of metric name (``'stress_matching'``,
                ``'type_matching'``, ``'description_matching'``,
                ``'skill_matching'``, ``'field_matching'``) to per-course
                scores, indexed by the recommendation's ``'index'``.
            all_similarity_scores: Not used by this method.

        Side effects:
            Updates ``plt.rcParams`` globally and creates two figures. The
            figures are neither shown nor saved here.
            NOTE(review): presumably the notebook's inline backend renders
            them at the end of the cell - confirm for other environments.
        """
        print("\n📊 Generating modern individual visualization reports...")

        # Color palette and styling; one palette entry per metric.
        modern_colors = ['#FF6B9D', '#45B7D1', '#96CEB4', '#FECA57', '#FF9FF3']
        accent_color = '#2C3E50'  # NOTE(review): assigned but not used below
        background_color = '#FAFAFA'
        text_color = '#2C3E50'
        grid_color = '#E8E8E8'

        # Font dictionaries passed to title/label calls via ``fontdict``.
        title_font = {'family': 'sans-serif', 'weight': 'bold', 'size': 16}
        label_font = {'family': 'sans-serif', 'weight': 'normal', 'size': 12}
        tick_font = {'family': 'sans-serif', 'weight': 'normal', 'size': 10}  # NOTE(review): not used below

        # Global matplotlib defaults; these persist after this method returns.
        plt.rcParams.update({
            'font.family': 'sans-serif',
            'font.sans-serif': ['Arial', 'Helvetica', 'DejaVu Sans'],
            'axes.facecolor': background_color,
            'figure.facecolor': 'white',
            'axes.edgecolor': '#CCCCCC',
            'axes.linewidth': 1,
            'axes.spines.top': False,
            'axes.spines.right': False,
            'axes.grid': True,
            'grid.color': grid_color,
            'grid.alpha': 0.6,
            'grid.linewidth': 0.8
        })

        # Data for figure 1: the five metric values of the first recommendation.
        best_idx = recommendations[0]['index']
        best_course_name = recommendations[0]['course']['Course Name']
        metric_names = ['Stress\nCompatibility', 'Learning Style\nAlignment', 'Interest\nConvergence',
                      'Skill\nCompatibility', 'Career\nRelevance']
        metric_values = [
            metrics['stress_matching'][best_idx],
            metrics['type_matching'][best_idx],
            metrics['description_matching'][best_idx],
            metrics['skill_matching'][best_idx],
            metrics['field_matching'][best_idx]
        ]

        # 1. Behavioral analysis of the best match (one bar per metric)
        fig1, ax1 = plt.subplots(figsize=(14, 8), facecolor='white')

        # One palette color per bar; zorder=3 keeps the bars above the grid.
        bars1 = ax1.bar(metric_names, metric_values, color=modern_colors,
                      edgecolor='white', linewidth=2, alpha=0.9, zorder=3)

        ax1.set_title(f'Five-Dimensional Behavioral Analysis\nBest Match: {best_course_name}',
                    fontdict=title_font, pad=25, color=text_color)
        ax1.set_ylabel('Compatibility Score (%)', fontdict=label_font, color=text_color)
        # Upper limit of 110 leaves room for the value labels above the bars.
        ax1.set_ylim(0, 110)

        # Horizontal grid lines only, drawn behind the bars.
        ax1.grid(True, axis='y', color=grid_color, linewidth=0.8, alpha=0.7, zorder=0)
        ax1.set_axisbelow(True)

        # Value label slightly above each bar.
        for bar, value in zip(bars1, metric_values):
            height = bar.get_height()
            # Percentage label
            ax1.text(bar.get_x() + bar.get_width()/2, height + 2,
                    f'{value:.1f}%', ha='center', va='bottom',
                    fontweight='bold', fontsize=12, color=text_color,
                    bbox=dict(boxstyle="round,pad=0.3", facecolor='white',
                            edgecolor='none', alpha=0.8))


        # Clean up axes: hide top/right spines, soften the remaining ones.
        ax1.spines['top'].set_visible(False)
        ax1.spines['right'].set_visible(False)
        ax1.spines['left'].set_color('#CCCCCC')
        ax1.spines['bottom'].set_color('#CCCCCC')
        ax1.tick_params(colors=text_color, labelsize=10)


        # 2. Comparison of all recommendations (grouped bars, one group per course)
        fig3, ax3 = plt.subplots(figsize=(16, 9), facecolor='white')

        # X tick labels: "<rank>: <course name>".
        course_names = [f"{i+1}: {rec['course']['Course Name']}"
                      for i, rec in enumerate(recommendations)]

        # Short metric label -> that metric's value for every recommendation.
        metrics_data = {
            'Stress': [metrics['stress_matching'][rec['index']] for rec in recommendations],
            'Learning\nStyle': [metrics['type_matching'][rec['index']] for rec in recommendations],
            'Interest': [metrics['description_matching'][rec['index']] for rec in recommendations],
            'Skills': [metrics['skill_matching'][rec['index']] for rec in recommendations],
            'Career': [metrics['field_matching'][rec['index']] for rec in recommendations]
        }

        x = np.arange(len(course_names))
        width = 0.15

        # One bar series per metric, placed side by side within each group.
        for i, (metric, values) in enumerate(metrics_data.items()):
            # (i - 2) centers the five bars of a group on its x tick.
            offset = (i - 2) * width
            bars = ax3.bar(x + offset, values, width,
                          color=modern_colors[i], alpha=0.9,
                          edgecolor='white', linewidth=1.5, zorder=3)

            # Faint gray copy, shifted slightly right and drawn underneath,
            # as a shadow. The returned container is not used afterwards.
            shadow_bars = ax3.bar(x + offset + 0.01, values, width,
                                color='gray', alpha=0.15, zorder=1)

            for bar, value in zip(bars, values):
                height = bar.get_height()
                # Value label slightly above the bar
                ax3.text(bar.get_x() + bar.get_width()/2, height + 2,
                        f'{value:.1f}%', ha='center', va='bottom',
                        fontweight='bold', fontsize=9, color=text_color,
                        bbox=dict(boxstyle="round,pad=0.2", facecolor='white',
                                edgecolor='none', alpha=0.9))

                # Metric name centered inside the bar; rotated by 90 degrees
                # when the bar is shorter than 30.
                ax3.text(bar.get_x() + bar.get_width()/2, height/2,
                        metric, ha='center', va='center',
                        fontweight='bold', fontsize=8, color='black',
                        rotation=90 if height < 30 else 0)

        ax3.set_title('Detailed Course Recommendations Comparison',
                    fontdict=title_font, pad=25, color=text_color)
        ax3.set_ylabel('Compatibility Score (%)', fontdict=label_font, color=text_color)
        ax3.set_xlabel('Recommended Courses', fontdict=label_font, color=text_color)
        ax3.set_xticks(x)
        ax3.set_xticklabels(course_names, rotation=0, ha='center', fontsize=10)
        ax3.grid(True, axis='y', color=grid_color, linewidth=0.8, alpha=0.7, zorder=0)
        ax3.set_ylim(0, 110)
        ax3.set_axisbelow(True)

        # Hide top/right spines on the comparison chart as well.
        ax3.spines['top'].set_visible(False)
        ax3.spines['right'].set_visible(False)
        ax3.tick_params(colors=text_color)



# Sample questionnaire answers (keys 'Q1'..'Q10') for exercising the demo with
# different kinds of students. Pass any of these to run_production_demo(), e.g.
# run_production_demo(DEMO_STUDENT_PROFILES['weak']).
DEMO_STUDENT_PROFILES = {
    # Weak student
    'weak': {
        'Q1': "Maybe 2-3 hours a day if I'm being optimistic. I get distracted easily and find it hard to focus for long periods.",
        'Q2': "I guess Introduction to Programming because it was the easiest and the professor was lenient with deadlines.",
        'Q3': "I don't really know. maybe something simple like a basic website or mobile app. Nothing too complicated.",
        'Q4': "I usually Google everything or ask classmates for help. Sometimes I just copy solutions and try to understand them later.",
        'Q5': "I'm not really sure. Maybe just any software job that pays okay and doesn't require too much overtime.",
        'Q6': "HTML, CSS",
        'Q7': "I procrastinate a lot, have trouble concentrating, and I'm not very good at math or complex algorithms.",
        'Q8': "I don't really do much research. Maybe watching YouTube videos about tech trends?",
        'Q9': "Easy courses with lots of practical work and minimal theory. I hate courses with heavy math or complex concepts.",
        'Q10': "I usually panic and either give up or do the bare minimum to pass. Sometimes I just skip classes when it gets overwhelming.",
    },
    # Specific domain based student (Biology) -- the demo's default profile
    'biology': {
        'Q1': '8-10',
        'Q2': 'Introduction to Biology',
        'Q3': 'AI-powered medical diagnosis system that can help doctors',
        'Q4': 'I research to understand the application requirements first, then design test cases systematically and document results clearly',
        'Q5': 'Biological researcher',
        'Q6': 'python programming, molecular biology, biological pattern, analytical skill',
        'Q7': 'Sometimes perfectionist which slows me down, need to improve time management',
        'Q8': 'Artificial Intelligence in healthcare, computer vision, natural language processing',
        'Q9': 'theoretical, analytical and research based',
        'Q10': 'I stay very organized, break tasks into manageable pieces, and maintain work-life balance',
    },
    # Business student
    'business': {
        'Q1': "About 2 hours daily. I'm interested in how technology can improve business operations and customer experience.",
        'Q2': "Strategic Management. I enjoyed learning how companies make decisions and compete in different markets.",
        'Q3': "A customer analytics platform that helps small businesses understand their customer behavior and optimize marketing strategies.",
        'Q4': "I start by understanding the business problem then research what solutions already exist and adapt them to specific needs.",
        'Q5': "Business analyst",
        'Q6': "Analytical thinking, presentation skills, understanding business processes, Excel.",
        'Q7': "I struggle with complex programming and sometimes get overwhelmed by technical details when working with developers.",
        'Q8': "Consumer behavior digital marketing strategy and how technology disrupts traditional business models.",
        'Q9': "Case study based course with real business scenarios and practical applications of technology solutions.",
        'Q10': "I organize my tasks prioritize based on deadlines and discuss challenges with classmates to get different perspectives.",
    },
    # Average student
    'average': {
        'Q1': "medium study time around 7 hours.",
        'Q2': "Object Oriented Programming - concepts were clear and practical",
        'Q3': "Web-based inventory management system for small businesses.",
        'Q4': "Research existing solutions online then adapt to my needs",
        'Q5': "Full-stack web developer working on web applications",
        'Q6': "Strong logical thinking, basic web development, teamwork skills, python, html, css",
        'Q7': "Advanced mathematics can be challenging, time management problems.",
        'Q8': "Mobile computing and cross-platform development tools.",
        'Q9': "Practical Courses with clear examples and step-by-step guidance.",
        'Q10': "Talk to friends or family, break work into smaller chunks",
    },
    # Null inputs -- exercises the "unanswered questions" early exit
    'null': {
        'Q1': '',
        'Q2': '',
        'Q3': '',
        'Q4': '',
        'Q5': '',
        'Q6': '',
        'Q7': '',
        'Q8': '',
        'Q9': '',
        'Q10': '',
    },
}


def run_production_demo(student_profile=None):
    """Run production demo with real Mistral-7B and comprehensive preprocessing.

    Builds a ProductionCourseRecommendationSystem, installs the questionnaire
    answers on it, and, if every question is answered, runs the pipeline in
    this order: model loading, embedding/FAISS index creation, student profile
    embedding, similarity search, behavioral metrics, RAG recommendations,
    console report and visualizations.

    Args:
        student_profile: Optional mapping of questionnaire keys
            ('Q1' .. 'Q10') to answers. When omitted, the 'biology' entry of
            DEMO_STUDENT_PROFILES is used, which is the profile this demo
            used before the parameter existed.

    Returns:
        A 3-tuple.

        * Complete run:
          ``(recommendations, behavioral_metrics, student_embedding)``.
        * Profile with unanswered questions: ``(system, 0, 0)``. Note that the
          first element is then the system object, not recommendations; this
          shape is kept unchanged for backward compatibility.
    """
    print(" Initializing Personalized Course Recommendation System")

    # Initialize system
    system = ProductionCourseRecommendationSystem(device='auto')

    if student_profile is None:
        student_profile = DEMO_STUDENT_PROFILES['biology']
    # Shallow copy: the system gets its own dict, so reassigning or adding
    # keys on it cannot alter the shared sample or the caller's mapping.
    system.student_profile = dict(student_profile)

    # An answer counts as missing when it is None, empty or whitespace-only;
    # a profile with no answers at all is treated as unanswered too.
    has_null = not system.student_profile or any(
        answer is None or not str(answer).strip()
        for answer in system.student_profile.values()
    )
    if has_null:
        print("You Have not Answered some Questions")
        return system, 0, 0

    # Load system components with comprehensive preprocessing
    system.load_mistral_model(use_quantization=True)

    system._create_enhanced_embeddings_and_faiss_index()
    student_embedding, student_profile_sections = system.create_enhanced_student_profile()
    # Similarity search over the course index (FAISS, per the original note).
    similarity_scores, course_indices = system.advanced_similarity_search(student_embedding)

    behavioral_metrics = system.calculate_advanced_behavioral_metrics()
    recommendations = system.generate_real_rag_recommendations(
        course_indices, similarity_scores, behavioral_metrics
    )

    system._display_production_results(recommendations, behavioral_metrics, similarity_scores)
    system._create_production_visualizations(recommendations, behavioral_metrics, similarity_scores)

    # Cleanup: collect unreachable Python objects first, so that memory held by
    # dead tensors is returned to the caching allocator before
    # empty_cache() releases the unused cached blocks.
    gc.collect()
    if system.device == 'cuda':
        torch.cuda.empty_cache()

    print("\n🎯 Production demo completed successfully!")

    return recommendations, behavioral_metrics, student_embedding

if __name__ == "__main__":
    # Script entry point: run the production demo and bind its three return
    # values to module-level names.
    (
        recommendations,
        metrics,
        student_embedding,
    ) = run_production_demo()
