In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Install required packages
# !pip install transformers datasets sentencepiece

# Global variables
vectorizer = None
tfidf_matrix = None
tokenizer = None
model = None

# Load the raw text data directly - handle the case where these are actual Quranic verses
# instead of a properly formatted CSV
try:
    # First, check if the data has already been processed from a previous run
    try:
        urdu_data = pd.read_csv('/kaggle/working/processed_quran.csv')
        print("Found previously processed data. Loading it...")
    except:
        # Try loading from multiple possible kaggle paths
        possible_paths = [
            '/kaggle/input/urdu-quran-dataset/Urdu.csv',
            '/kaggle/input/urdu-quran/Urdu.csv',
            '/kaggle/input/urdu-csv/Urdu.csv',
            '/kaggle/input/urdu-quran-translation/Urdu.csv',
            '/kaggle/working/Urdu.csv'
        ]
        
        for path in possible_paths:
            try:
                # Try reading the file as raw text first
                with open(path, 'r', encoding='utf-8') as f:
                    lines = f.readlines()
                
                print(f"Successfully read {len(lines)} lines from {path}")
                
                # Create a DataFrame with the verses
                verses = []
                for i, line in enumerate(lines):
                    line = line.strip()
                    if line:  # Skip empty lines
                        verses.append({
                            "Surah": i // 10 + 1,  # Assign artificial surah numbers
                            "Ayah": i % 10 + 1,    # Assign artificial ayah numbers
                            "Translation": line
                        })
                
                urdu_data = pd.DataFrame(verses)
                
                # Save the processed data for future use
                urdu_data.to_csv('/kaggle/working/processed_quran.csv', index=False)
                
                print(f"Created DataFrame with {len(verses)} verses")
                break
            except Exception as e:
                print(f"Error reading {path}: {str(e)}")
                continue
        else:
            # If we get here, none of the paths worked
            # As a fallback, create a DataFrame from the error text itself
            print("Using error messages as data source...")
            
            # Extract verse text from error messages
            pattern = r"Error processing row: (.*?), Error:"
            with open('/kaggle/working/error_log.txt', 'r', encoding='utf-8') as f:
                error_text = f.read()
            
            matches = re.findall(pattern, error_text)
            
            if matches:
                verses = []
                for i, verse in enumerate(matches):
                    verses.append({
                        "Surah": i // 10 + 1,  # Arbitrary surah assignment
                        "Ayah": i % 10 + 1,    # Arbitrary ayah assignment
                        "Translation": verse
                    })
                urdu_data = pd.DataFrame(verses)
                print(f"Created DataFrame with {len(verses)} verses from error log")
            else:
                # Last resort - create a small dummy dataset
                test_verses = [
                    "بسم اللہ الرحمن الرحیم",
                    "الحمد للہ رب العالمین",
                    "الرحمن الرحیم",
                    "مالک یوم الدین",
                    "ایاک نعبد و ایاک نستعین",
                    "اھدنا الصراط المستقیم",
                    "صراط الذین انعمت علیھم غیر المغضوب علیھم و لا الضالین"
                ]
                verses = []
                for i, verse in enumerate(test_verses):
                    verses.append({
                        "Surah": 1,
                        "Ayah": i + 1,
                        "Translation": verse
                    })
                urdu_data = pd.DataFrame(verses)
                print("Created small sample dataset with Surah Al-Fatiha")
except Exception as e:
    print(f"Error during data loading: {str(e)}")
    # Create a minimal dataset as a fallback
    urdu_data = pd.DataFrame([
        {"Surah": 1, "Ayah": 1, "Translation": "بسم اللہ الرحمن الرحیم"},
        {"Surah": 1, "Ayah": 2, "Translation": "الحمد للہ رب العالمین"},
        {"Surah": 1, "Ayah": 3, "Translation": "الرحمن الرحیم"}
    ])
    print("Created minimal fallback dataset")

# Check if we have any data to work with
if urdu_data.empty:
    print("Error: No data available. Please check the input files.")
    exit()

# Check if 'Translation' column exists
if 'Translation' not in urdu_data.columns:
    print("Error: 'Translation' column not found. Check data format.")
    exit()

# Now we can proceed with the chatbot setup
print("\nProcessed data sample:")
print(urdu_data.head())
print(f"Total verses: {len(urdu_data)}")

# Initialize NLP components
try:
    global tokenizer, model, vectorizer, tfidf_matrix
    
    # Initialize tokenizer and model
    print("\nInitializing NLP components...")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    model = AutoModelForQuestionAnswering.from_pretrained("bert-base-multilingual-cased")
    
    # Create the TF-IDF vectorizer and fit it on the translations
    print("Fitting TF-IDF vectorizer...")
    vectorizer = TfidfVectorizer(stop_words=None)  # Don't use English stopwords for Urdu
    tfidf_matrix = vectorizer.fit_transform(urdu_data['Translation'])
    print("TF-IDF vectorizer fitted successfully.")
except Exception as e:
    print(f"Error initializing NLP components: {str(e)}")
    print("Will attempt to continue with limited functionality.")

def find_relevant_context(question, dataset, top_k=3):
    """
    Find relevant Quranic verses for a given question.
    Uses both exact matching and TF-IDF similarity if available.
    
    Args:
        question: The question in Urdu
        dataset: The processed Quran dataset
        top_k: Number of verses to return
        
    Returns:
        String containing relevant verses with references
    """
    global vectorizer, tfidf_matrix
    
    # Step 1: Try an exact match or substring match for critical words
    exact_matches = []
    for _, row in dataset.iterrows():
        # Check if question is contained in the verse or vice versa
        if question in row['Translation'] or any(word in row['Translation'] for word in question.split()):
            exact_matches.append(f"Surah {row['Surah']}, Ayah {row['Ayah']}: {row['Translation']}")
            if len(exact_matches) >= top_k:
                break

    if exact_matches:
        print(f"Found {len(exact_matches)} matches for question: '{question}'")
        return "\n".join(exact_matches)

    # Step 2: If no exact matches and vectorizer is available, use TF-IDF
    if vectorizer is not None and tfidf_matrix is not None:
        try:
            query_tfidf = vectorizer.transform([question])
            cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
            
            # Get the indices of the top_k most similar translations
            most_similar_indices = cosine_similarities.argsort()[-top_k:][::-1]
            
            results = []
            for idx in most_similar_indices:
                row = dataset.iloc[idx]
                similarity_score = cosine_similarities[idx]
                # Only include results with some minimal similarity
                if similarity_score > 0.01:  # Lower threshold for Urdu
                    results.append(f"Surah {row['Surah']}, Ayah {row['Ayah']}: {row['Translation']} (similarity: {similarity_score:.2f})")
            
            if results:
                print(f"Found {len(results)} similar verses using TF-IDF")
                return "\n".join(results)
        except Exception as e:
            print(f"Error during TF-IDF similarity: {str(e)}")
    
    # Step 3: Last resort - return a few random verses if no matches
    import random
    if len(dataset) > 0:
        random_indices = random.sample(range(min(len(dataset), 20)), min(top_k, len(dataset)))
        results = []
        for idx in random_indices:
            row = dataset.iloc[idx]
            results.append(f"Surah {row['Surah']}, Ayah {row['Ayah']}: {row['Translation']} (random selection)")
        
        print(f"No matches found. Returning {len(results)} random verses.")
        return "\n".join(results)
    
    return "معذرت، کوئی متعلقہ آیت نہیں مل سکی۔"

def urdu_chatbot():
    """Interactive Urdu Quran chatbot function"""
    global tokenizer, model
    
    print("Quranic Urdu Chatbot: قرآن سے متعلق سوال پوچھیں۔ (exit لکھ کر باہر نکلیں)")
    
    # Check if we can create the QA pipeline
    try:
        qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
        pipeline_available = True
    except Exception as e:
        print(f"Warning: Could not initialize QA pipeline: {str(e)}")
        print("Will proceed with basic context retrieval only.")
        pipeline_available = False

    while True:
        user_question = input("آپ: ")
        if user_question.lower() == "exit":
            print("چَیٹ بوٹ: اللہ حافظ!")
            break

        # Find context
        try:
            context = find_relevant_context(user_question, urdu_data)
            if not context:
                print("چَیٹ بوٹ: معذرت، میں اس سوال کا جواب نہیں دے سکتا۔")
                continue
            
            # If we have a pipeline, use it for QA
            if pipeline_available:
                try:
                    result = qa_pipeline(question=user_question, context=context, max_length=512)
                    print(f"چَیٹ بوٹ: {result['answer']}")
                except Exception as e:
                    print(f"چَیٹ بوٹ: {context}\n\n(معذرت، QA پائپ لائن میں مسئلہ: {str(e)})")
            else:
                # Just return the context if no pipeline
                print(f"چَیٹ بوٹ: {context}")
        except Exception as e:
            print(f"چَیٹ بوٹ: معذرت، میں اس سوال کا جواب دینے سے قاصر ہوں۔ (Error: {str(e)})")

# Test the function with a sample question
print("\n--- Testing Chatbot ---")
test_question = "رحمن"
print(f"Testing with question: '{test_question}'")

try:
    test_context = find_relevant_context(test_question, urdu_data)
    print(f"Context for '{test_question}':")
    print(test_context)
except Exception as e:
    print(f"Error during testing: {str(e)}")

# Uncomment to run the interactive chatbot
# urdu_chatbot()

print("\nScript completed successfully.")

Error reading /kaggle/input/urdu-quran-dataset/Urdu.csv: [Errno 2] No such file or directory: '/kaggle/input/urdu-quran-dataset/Urdu.csv'
Error reading /kaggle/input/urdu-quran/Urdu.csv: [Errno 2] No such file or directory: '/kaggle/input/urdu-quran/Urdu.csv'
Successfully read 6237 lines from /kaggle/input/urdu-csv/Urdu.csv
Created DataFrame with 6237 verses

Processed data sample:
   Surah  Ayah                                        Translation
0      1     1                            ﻿Surah,Ayah,Translation
1      1     2  1,1,شروع الله کا نام لے کر جو بڑا مہربان نہایت...
2      1     3  1,2,سب طرح کی تعریف خدا ہی کو (سزاوار) ہے جو ت...
3      1     4                      1,3,بڑا مہربان نہایت رحم والا
4      1     5                            1,4,انصاف کے دن کا حاکم
Total verses: 6237

Initializing NLP components...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fitting TF-IDF vectorizer...
TF-IDF vectorizer fitted successfully.

--- Testing Chatbot ---
Testing with question: 'رحمن'
No matches found. Returning 3 random verses.
Context for 'رحمن':
Surah 1, Ayah 8: 1,7,ان لوگوں کے رستے جن پر تو اپنا فضل وکرم کرتا رہا نہ ان کے جن پر غصے ہوتا رہا اور نہ گمراہوں کے (random selection)
Surah 1, Ayah 6: 1,5,(اے پروردگار) ہم تیری ہی عبادت کرتے ہیں اور تجھ ہی سے مدد مانگتے ہیں (random selection)
Surah 2, Ayah 3: 2,5,یہی لوگ اپنے پروردگار (کی طرف) سے ہدایت پر ہیں اور یہی نجات پانے والے ہیں (random selection)

Script completed successfully.


In [3]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import os
import json
import pickle
from pathlib import Path
import difflib
from collections import defaultdict
import logging
import warnings
import unicodedata

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger('QuranSearchEngine')

# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning)

class EnhancedQuranSearchEngine:
    """
    Enhanced Quran Search Engine with multi-technique search capabilities
    and comprehensive reference handling.
    """
    
    def __init__(self, cache_dir="/kaggle/working/quran_cache"):
        """Initialize the search engine with various search techniques"""
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True, parents=True)
        
        self.data = None
        self.surah_names = {}
        self.vectorizer = None
        self.tfidf_matrix = None
        self.sentence_transformer = None
        self.sentence_embeddings = None
        self.qa_model = None
        self.qa_tokenizer = None
        self.index_map = {}  # For mapping between different verse numbering systems
        
        logger.info("Initializing Enhanced Quran Search Engine")
        
    def normalize_arabic_text(self, text):
        """Normalize Arabic/Urdu text by removing diacritics and standardizing characters"""
        # Handle None or non-string inputs
        if not isinstance(text, str):
            return ""
            
        # Remove diacritics (harakat)
        text = ''.join(c for c in unicodedata.normalize('NFKD', text) 
                      if not unicodedata.combining(c))
        
        # Standardize certain characters
        text = text.replace('ی', 'ي')  # Standardize ya
        text = text.replace('ۃ', 'ة')  # Standardize ta marbutah
        text = text.replace('ك', 'ک')  # Standardize kaf
        
        return text
    
    def load_data_from_multiple_sources(self):
        """Load Quran data from multiple possible sources, with fallbacks"""
        # 1. Try to load from cache first
        cache_path = self.cache_dir / "processed_quran.pkl"
        if cache_path.exists():
            try:
                logger.info(f"Loading data from cache: {cache_path}")
                with open(cache_path, 'rb') as f:
                    self.data = pickle.load(f)
                return True
            except Exception as e:
                logger.warning(f"Failed to load from cache: {e}")
        
        # 2. Try multiple possible file locations
        possible_paths = [
            # Standard Kaggle paths
            '/kaggle/input/urdu-quran-dataset/Urdu.csv',
            '/kaggle/input/urdu-quran/Urdu.csv',
            '/kaggle/input/urdu-csv/Urdu.csv',
            '/kaggle/input/quran-urdu-translation/quran-urdu.csv',
            '/kaggle/input/urdu-quran-translation/Urdu.csv',
            '/kaggle/working/Urdu.csv',
            # Common local paths
            'data/quran-urdu.csv',
            'quran-data/Urdu.csv',
            # Plain text file possibilities
            '/kaggle/input/urdu-quran-dataset/quran.txt',
            '/kaggle/input/urdu-quran/quran.txt'
        ]
        
        for path in possible_paths:
            try:
                # Try to determine the file type and read appropriately
                if path.endswith('.csv'):
                    logger.info(f"Attempting to read CSV file: {path}")
                    # First try with standard CSV format
                    try:
                        df = pd.read_csv(path)
                        if all(col in df.columns for col in ['Surah', 'Ayah', 'Translation']):
                            self.data = df
                            logger.info(f"Successfully loaded standard CSV: {path}")
                            break
                    except Exception as e:
                        logger.debug(f"Failed to read as standard CSV: {e}")
                    
                    # Try with delimiters
                    for delimiter in [',', '|', '\t']:
                        try:
                            df = pd.read_csv(path, delimiter=delimiter, header=None)
                            if df.shape[1] >= 3:
                                # Assume first 3 columns are Surah, Ayah, Translation
                                df.columns = ['Surah', 'Ayah', 'Translation'] + [f'Extra{i}' for i in range(df.shape[1]-3)]
                                self.data = df
                                logger.info(f"Successfully loaded delimited file: {path} with delimiter: {delimiter}")
                                break
                        except Exception as e:
                            logger.debug(f"Failed to read with delimiter {delimiter}: {e}")
                    
                    # Try as plain text with no header
                    if self.data is None:
                        try:
                            df = pd.read_csv(path, header=None, names=['Text'], encoding='utf-8')
                            verses = []
                            for i, row in df.iterrows():
                                if not isinstance(row['Text'], str):
                                    continue
                                parts = row['Text'].split('|')
                                if len(parts) >= 3:
                                    try:
                                        surah, ayah = int(parts[0]), int(parts[1])
                                        translation = parts[2]
                                        verses.append({"Surah": surah, "Ayah": ayah, "Translation": translation})
                                    except (ValueError, TypeError):
                                        continue
                            
                            if verses:
                                self.data = pd.DataFrame(verses)
                                logger.info(f"Successfully parsed plain text CSV: {path}")
                                break
                        except Exception as e:
                            logger.debug(f"Failed to read as plain text: {e}")
                
                # Try as plain text file
                elif path.endswith('.txt'):
                    logger.info(f"Attempting to read text file: {path}")
                    try:
                        with open(path, 'r', encoding='utf-8') as f:
                            lines = f.readlines()
                        
                        verses = []
                        for i, line in enumerate(lines):
                            line = line.strip()
                            if not line:
                                continue
                                
                            # Check if line has a format like "1:1 - Translation"
                            match = re.match(r'(\d+):(\d+)\s*[-–]\s*(.*)', line)
                            if match:
                                surah, ayah, translation = match.groups()
                                verses.append({
                                    "Surah": int(surah),
                                    "Ayah": int(ayah),
                                    "Translation": translation.strip()
                                })
                            else:
                                # Assign artificial numbers
                                verses.append({
                                    "Surah": (i // 10) + 1, 
                                    "Ayah": (i % 10) + 1,
                                    "Translation": line
                                })
                        
                        if verses:
                            self.data = pd.DataFrame(verses)
                            logger.info(f"Successfully parsed text file: {path}")
                            break
                    except Exception as e:
                        logger.warning(f"Failed to read text file {path}: {e}")
            
            except Exception as e:
                logger.warning(f"Error processing {path}: {e}")
                continue
        
        # 3. Try to extract from error messages if needed
        if self.data is None:
            logger.info("Attempting to extract data from error messages")
            try:
                error_files = [
                    '/kaggle/working/error_log.txt',
                    'error_log.txt'
                ]
                
                for error_file in error_files:
                    if os.path.exists(error_file):
                        with open(error_file, 'r', encoding='utf-8') as f:
                            error_text = f.read()
                        
                        pattern = r"Error processing row: (.*?), Error:"
                        matches = re.findall(pattern, error_text)
                        
                        if matches:
                            verses = []
                            for i, verse in enumerate(matches):
                                verses.append({
                                    "Surah": (i // 10) + 1,
                                    "Ayah": (i % 10) + 1,
                                    "Translation": verse.strip()
                                })
                            
                            self.data = pd.DataFrame(verses)
                            logger.info(f"Created dataset from error log: {error_file}")
                            break
            except Exception as e:
                logger.warning(f"Failed to extract from error log: {e}")
        
        # 4. Create dataset from direct text input
        if self.data is None and 'paste.txt' in possible_paths:
            logger.info("Attempting to create dataset from paste.txt content")
            try:
                with open('paste.txt', 'r', encoding='utf-8') as f:
                    text = f.read()
                
                # Extract error messages containing verses
                pattern = r"Error processing row: (.*?), Error: invalid literal for int\(\)"
                matches = re.findall(pattern, text)
                
                if matches:
                    verses = []
                    for i, verse in enumerate(matches):
                        verses.append({
                            "Surah": (i // 10) + 1,
                            "Ayah": (i % 10) + 1,
                            "Translation": verse.strip()
                        })
                    
                    self.data = pd.DataFrame(verses)
                    logger.info(f"Created dataset from paste.txt content with {len(verses)} verses")
            except Exception as e:
                logger.warning(f"Failed to extract from paste.txt: {e}")
        
        # 5. Use hard-coded fallback data if all else fails
        if self.data is None:
            logger.warning("All data loading methods failed. Using fallback dataset.")
            self.data = pd.DataFrame([
                {"Surah": 1, "Ayah": 1, "Translation": "بسم اللہ الرحمن الرحیم"},
                {"Surah": 1, "Ayah": 2, "Translation": "الحمد للہ رب العالمین"},
                {"Surah": 1, "Ayah": 3, "Translation": "الرحمن الرحیم"},
                {"Surah": 1, "Ayah": 4, "Translation": "مالک یوم الدین"},
                {"Surah": 1, "Ayah": 5, "Translation": "ایاک نعبد و ایاک نستعین"},
                {"Surah": 1, "Ayah": 6, "Translation": "اھدنا الصراط المستقیم"},
                {"Surah": 1, "Ayah": 7, "Translation": "صراط الذین انعمت علیھم غیر المغضوب علیھم و لا الضالین"},
                {"Surah": 2, "Ayah": 1, "Translation": "الم"},
                {"Surah": 2, "Ayah": 2, "Translation": "ذٰلِکَ الْکِتٰبُ لَا رَیْبَ ۚ فِیْہِ ۚ ہُدًی لِّلْمُتَّقِیْنَ"},
                {"Surah": 112, "Ayah": 1, "Translation": "قل هو الله احد"},
                {"Surah": 112, "Ayah": 2, "Translation": "الله الصمد"},
                {"Surah": 112, "Ayah": 3, "Translation": "لم يلد ولم يولد"},
                {"Surah": 112, "Ayah": 4, "Translation": "ولم يكن له كفوا احد"}
            ])
        
        # Load surah names
        self.load_surah_names()
        
        # Cache the processed data
        try:
            with open(cache_path, 'wb') as f:
                pickle.dump(self.data, f)
            logger.info(f"Cached processed data to {cache_path}")
        except Exception as e:
            logger.warning(f"Failed to cache data: {e}")
        
        # Add normalized text column for improved matching
        if 'Translation' in self.data.columns:
            self.data['NormalizedText'] = self.data['Translation'].apply(self.normalize_arabic_text)
            
            # Add combined reference string for convenience
            self.data['Reference'] = self.data.apply(
                lambda x: f"Surah {x['Surah']}:{x['Ayah']}" + 
                        (f" ({self.surah_names.get(x['Surah'], '')})" if x['Surah'] in self.surah_names else ""), 
                axis=1
            )
        else:
            logger.error("Data loaded but 'Translation' column not found. Check data format.")
            return False
        
        return True
    
    def load_surah_names(self):
        """Load Surah names from various possible sources"""
        # First try to load from cache
        cache_path = self.cache_dir / "surah_names.json"
        if cache_path.exists():
            try:
                with open(cache_path, 'r', encoding='utf-8') as f:
                    self.surah_names = json.load(f)
                return
            except:
                pass
        
        # Hardcoded names as fallback
        self.surah_names = {
            1: "الفاتحة (Al-Fatiha)",
            2: "البقرة (Al-Baqara)",
            3: "آل عمران (Aal-Imran)",
            4: "النساء (An-Nisa)",
            5: "المائدة (Al-Ma'ida)",
            6: "الأنعام (Al-An'am)",
            112: "الإخلاص (Al-Ikhlas)",
            113: "الفلق (Al-Falaq)",
            114: "الناس (An-Nas)"
        }
        
        # Try to find a surah names file
        possible_paths = [
            '/kaggle/input/quran-metadata/surah_names.json',
            '/kaggle/input/quran-urdu-translation/surah_names.json',
            'data/surah_names.json'
        ]
        
        for path in possible_paths:
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    self.surah_names = json.load(f)
                logger.info(f"Loaded surah names from {path}")
                break
            except:
                continue
        
        # Cache the names
        try:
            with open(cache_path, 'w', encoding='utf-8') as f:
                json.dump(self.surah_names, f, ensure_ascii=False, indent=2)
        except:
            pass
    
    def initialize_search_methods(self):
        """Initialize multiple search methods for robust text retrieval"""
        if self.data is None or len(self.data) == 0:
            logger.error("No data available. Please load data first.")
            return False
        
        # 1. Initialize TF-IDF vectorizer
        logger.info("Initializing TF-IDF vectorizer...")
        try:
            self.vectorizer = TfidfVectorizer(
                min_df=1, 
                max_df=0.9,
                ngram_range=(1, 3),
                sublinear_tf=True
            )
            self.tfidf_matrix = self.vectorizer.fit_transform(self.data['NormalizedText'])
            logger.info(f"TF-IDF matrix shape: {self.tfidf_matrix.shape}")
        except Exception as e:
            logger.warning(f"Failed to initialize TF-IDF: {e}")
        
        # 2. Try to initialize sentence transformer (if available)
        try:
            from sentence_transformers import SentenceTransformer
            
            # Try loading cached embeddings first
            embeddings_path = self.cache_dir / "sentence_embeddings.npy"
            if embeddings_path.exists():
                try:
                    self.sentence_embeddings = np.load(embeddings_path)
                    logger.info(f"Loaded sentence embeddings from cache: {embeddings_path}")
                    self.sentence_transformer = True  # Just a flag that we have embeddings
                except Exception as e:
                    logger.warning(f"Failed to load cached embeddings: {e}")
            
            # If not loaded from cache, try to generate them
            if self.sentence_embeddings is None:
                try:
                    # Try multilingual model first
                    model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
                    self.sentence_transformer = SentenceTransformer(model_name)
                    
                    # Compute embeddings (may take time for large datasets)
                    logger.info(f"Computing sentence embeddings using {model_name}...")
                    self.sentence_embeddings = self.sentence_transformer.encode(
                        self.data['Translation'].tolist(), 
                        show_progress_bar=True,
                        batch_size=32
                    )
                    
                    # Cache the embeddings
                    try:
                        np.save(embeddings_path, self.sentence_embeddings)
                        logger.info(f"Cached sentence embeddings to {embeddings_path}")
                    except Exception as e:
                        logger.warning(f"Failed to cache embeddings: {e}")
                        
                except Exception as e:
                    logger.warning(f"Failed to initialize sentence transformer: {e}")
                    self.sentence_transformer = None
        except ImportError:
            logger.info("SentenceTransformer not available. Skipping semantic search capabilities.")
            self.sentence_transformer = None
        
        # 3. Initialize QA model if transformers is available
        try:
            from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
            
            # Only initialize if data is large enough to be useful
            if len(self.data) > 10:
                logger.info("Initializing QA model...")
                try:
                    model_name = "bert-base-multilingual-cased"
                    self.qa_tokenizer = AutoTokenizer.from_pretrained(model_name)
                    self.qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
                    logger.info("QA model initialized successfully")
                except Exception as e:
                    logger.warning(f"Failed to initialize QA model: {e}")
        except ImportError:
            logger.info("Transformers not available. Skipping QA capabilities.")
        
        logger.info("Search methods initialization complete")
        return True
    
    def search(self, query, top_k=5, include_partial=True, include_similar=True, threshold=0.1):
        """
        Search for verses matching the query using multiple techniques
        
        Args:
            query: The search query text
            top_k: Number of results to return
            include_partial: Whether to include partial matches
            include_similar: Whether to include semantically similar results
            threshold: Similarity threshold for including results
            
        Returns:
            Dictionary with primary match and other matches
        """
        if self.data is None or len(self.data) == 0:
            return {"error": "No data available. Please load data first."}
        
        # Handle empty query
        if not query or not isinstance(query, str):
            return {"error": "Empty or invalid query", "primary_match": None, "other_matches": [], "total_matches": 0}
        
        # Normalize the query
        normalized_query = self.normalize_arabic_text(query)
        
        # Dictionary to store all matches with their scores and methods
        all_matches = defaultdict(lambda: {"score": 0, "methods": []})
        
        # 1. Look for exact matches first (highest priority)
        exact_indices = []
        for idx, row in self.data.iterrows():
            if query in row['Translation'] or normalized_query in row['NormalizedText']:
                match_key = f"{row['Surah']}:{row['Ayah']}"
                all_matches[match_key]["verse"] = row['Translation']
                all_matches[match_key]["reference"] = row['Reference']
                all_matches[match_key]["score"] += 10  # High score for exact match
                all_matches[match_key]["methods"].append("exact")
                exact_indices.append(idx)
        
        # 2. Use TF-IDF for partial keyword matching
        if self.vectorizer is not None and (include_partial or len(all_matches) < top_k):
            try:
                query_vec = self.vectorizer.transform([normalized_query])
                similarity_scores = cosine_similarity(query_vec, self.tfidf_matrix).flatten()
                
                # Get indices of top matches
                top_indices = similarity_scores.argsort()[-top_k*2:][::-1]
                
                for idx in top_indices:
                    if similarity_scores[idx] > threshold:
                        row = self.data.iloc[idx]
                        match_key = f"{row['Surah']}:{row['Ayah']}"
                        
                        # Only add if not already an exact match or update score if better
                        if match_key not in all_matches or all_matches[match_key]["score"] < similarity_scores[idx] * 5:
                            all_matches[match_key]["verse"] = row['Translation']
                            all_matches[match_key]["reference"] = row['Reference']
                            all_matches[match_key]["score"] = max(all_matches[match_key]["score"], similarity_scores[idx] * 5)
                            all_matches[match_key]["methods"].append("tfidf")
            except Exception as e:
                logger.warning(f"TF-IDF search failed: {e}")
        
        # 3. Use semantic search with sentence embeddings
        if self.sentence_transformer is not None and self.sentence_embeddings is not None and include_similar:
            try:
                # If it's just a flag, we only have cached embeddings
                if isinstance(self.sentence_transformer, bool):
                    # Use a simpler approach with dot product
                    from sentence_transformers import SentenceTransformer
                    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
                    query_embedding = model.encode([query])[0]
                    
                    # Calculate similarities
                    similarities = np.dot(self.sentence_embeddings, query_embedding)
                    top_indices = similarities.argsort()[-top_k*2:][::-1]
                    
                    for idx in top_indices:
                        if similarities[idx] > threshold:
                            row = self.data.iloc[idx]
                            match_key = f"{row['Surah']}:{row['Ayah']}"
                            
                            all_matches[match_key]["verse"] = row['Translation']
                            all_matches[match_key]["reference"] = row['Reference']
                            all_matches[match_key]["score"] = max(all_matches[match_key]["score"], similarities[idx] * 3)
                            all_matches[match_key]["methods"].append("semantic")
                else:
                    # Use the model directly
                    query_embedding = self.sentence_transformer.encode([query])[0]
                    
                    # Calculate similarities
                    similarities = np.dot(self.sentence_embeddings, query_embedding)
                    top_indices = similarities.argsort()[-top_k*2:][::-1]
                    
                    for idx in top_indices:
                        if similarities[idx] > threshold:
                            row = self.data.iloc[idx]
                            match_key = f"{row['Surah']}:{row['Ayah']}"
                            
                            all_matches[match_key]["verse"] = row['Translation']
                            all_matches[match_key]["reference"] = row['Reference']
                            all_matches[match_key]["score"] = max(all_matches[match_key]["score"], similarities[idx] * 3)
                            all_matches[match_key]["methods"].append("semantic")
            except Exception as e:
                logger.warning(f"Semantic search failed: {e}")
        
        # 4. Use fuzzy matching for query with typos
        if include_partial and len(all_matches) < top_k:
            for idx, row in self.data.iterrows():
                # Skip already matched verses
                match_key = f"{row['Surah']}:{row['Ayah']}"
                if match_key in all_matches:
                    continue
                
                # Calculate fuzzy match ratio
                ratio = difflib.SequenceMatcher(None, normalized_query, row['NormalizedText']).ratio()
                
                if ratio > max(0.6, threshold * 2):  # Higher threshold for fuzzy matching
                    all_matches[match_key]["verse"] = row['Translation']
                    all_matches[match_key]["reference"] = row['Reference']
                    all_matches[match_key]["score"] = max(all_matches[match_key]["score"], ratio * 2)
                    all_matches[match_key]["methods"].append("fuzzy")
        
        # Sort matches by score
        sorted_matches = sorted(all_matches.items(), key=lambda x: x[1]["score"], reverse=True)
        
        # Prepare results
        results = {
            "primary_match": None,
            "other_matches": [],
            "total_matches": len(sorted_matches)
        }
        
        # Set primary match (the one with highest score)
        if sorted_matches:
            primary = sorted_matches[0][1]
            results["primary_match"] = {
                "verse": primary["verse"],
                "reference": primary["reference"],
                "score": primary["score"],
                "methods": primary["methods"]
            }
            
            # Add other matches
            other_matches = []
            for _, match in sorted_matches[1:top_k]:
                other_matches.append({
                    "verse": match["verse"],
                    "reference": match["reference"],
                    "score": match["score"],
                    "methods": match["methods"]
                })
            
            results["other_matches"] = other_matches
        
        return results
    
    def answer_question(self, question, context=None, max_length=512):
        """
        Answer a question using the QA model.
        
        Args:
            question: The question to answer
            context: Optional context to use, otherwise will search for relevant verses
            
        Returns:
            Dictionary with answer and reference
        """
        if self.qa_model is None or self.qa_tokenizer is None:
            return {"error": "QA model not available"}
        
        try:
            from transformers import pipeline
            qa_pipeline = pipeline("question-answering", model=self.qa_model, tokenizer=self.qa_tokenizer)
            
            # Get context if not provided
            if context is None:
                search_results = self.search(question, top_k=3)
                
                if search_results["primary_match"]:
                    context = search_results["primary_match"]["verse"]
                    
                    # Add some additional context if available
                    for match in search_results["other_matches"][:2]:
                        context += " " + match["verse"]
                else:
                    return {"error": "No relevant context found for the question"}
            
            # Use the QA pipeline to get an answer
            result = qa_pipeline(question=question, context=context, max_length=max_length)
            
            return {
                "answer": result["answer"],
                "score": result["score"],
                "context": context
            }
            
        except Exception as e:
            logger.error(f"Error in answer_question: {e}")
            return {"error": f"Failed to answer question: {str(e)}"}

    def formatted_search_results(self, query, top_k=5):
        """
        Return search results in a nicely formatted string.
        
        Args:
            query: The search query
            top_k: Number of results to return
            
        Returns:
            Formatted string with search results
        """
        results = self.search(query, top_k=top_k)
        
        if "error" in results:
            return f"Error: {results['error']}"
        
        output = [f"Search results for: '{query}'"]
        output.append("=" * 50)
        
        if results["primary_match"]:
            primary = results["primary_match"]
            output.append("Primary Match:")
            output.append(f"📖 {primary['reference']}")
            output.append(f"📝 {primary['verse']}")
            output.append(f"✓ Match score: {primary['score']:.2f} using {', '.join(primary['methods'])}")
            output.append("-" * 50)
        else:
            output.append("No primary match found.")
            output.append("-" * 50)
        
        if results["other_matches"]:
            output.append("Other Relevant Matches:")
            for i, match in enumerate(results["other_matches"], 1):
                output.append(f"{i}. 📖 {match['reference']}")
                output.append(f"   📝 {match['verse']}")
                output.append(f"   ✓ Match score: {match['score']:.2f} using {', '.join(match['methods'])}")
                output.append("   " + "-" * 40)
        else:
            output.append("No other matches found.")
        
        output.append(f"\nTotal matches: {results['total_matches']}")
        
        return "\n".join(output)

def initialize_search_engine():
    """Helper function to initialize the search engine"""
    engine = EnhancedQuranSearchEngine()
    success = engine.load_data_from_multiple_sources()
    
    if success:
        engine.initialize_search_methods()
        logger.info(f"Engine initialized with {len(engine.data)} verses")
        return engine
    else:
        logger.error("Failed to initialize search engine")
        return None

def main():
    """Main function to demonstrate search capabilities"""
    logger.info("Starting Quran Search Engine")
    
    # Initialize engine
    engine = initialize_search_engine()
    
    if not engine:
        logger.error("Engine initialization failed. Exiting.")
        return
    
    # Example search
    test_queries = [
        "رحمن",
        "نماز",
        "جنت",
        "توبه",
        "صبر"
    ]
    
    for query in test_queries:
        logger.info(f"\nTesting search for: '{query}'")
        results = engine.formatted_search_results(query)
        print(results)
    
    # Interactive mode
    print("\n" + "=" * 60)
    print("Interactive Quran Search Engine")
    print("=" * 60)
    print("Enter your search query (or 'exit' to quit):")
    
    while True:
        query = input("\nSearch: ")
        if query.lower() in ['exit', 'quit', 'q']:
            break
        
        if not query.strip():
            continue
            
        results = engine.formatted_search_results(query)
        print(results)

if __name__ == "__main__":
    main()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/195 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Search results for: 'رحمن'
Primary Match:
📖 Surah 32:1
📝 الٓمٓ
✓ Match score: 28.49 using semantic
--------------------------------------------------
Other Relevant Matches:
1. 📖 Surah 31:1
   📝 الٓمٓ
   ✓ Match score: 28.49 using semantic
   ----------------------------------------
2. 📖 Surah 30:1
   📝 الٓمٓ
   ✓ Match score: 28.49 using semantic
   ----------------------------------------
3. 📖 Surah 42:1
   📝 حٰمٓ
   ✓ Match score: 28.00 using semantic
   ----------------------------------------
4. 📖 Surah 40:1
   📝 حٰم
   ✓ Match score: 27.60 using semantic
   ----------------------------------------

Total matches: 22


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Search results for: 'نماز'
Primary Match:
📖 Surah 70:22
📝 مگر نماز گزار
✓ Match score: 57.42 using exact, semantic
--------------------------------------------------
Other Relevant Matches:
1. 📖 Surah 96:10
   📝 (یعنی) ایک بندے کو جب وہ نماز پڑھنے لگتا ہے
   ✓ Match score: 56.05 using exact, semantic
   ----------------------------------------
2. 📖 Surah 108:2
   📝 تو اپنے پروردگار کے لیے نماز پڑھا کرو اور قربانی دیا کرو
   ✓ Match score: 55.77 using exact, semantic
   ----------------------------------------
3. 📖 Surah 23:2
   📝 جو نماز میں عجزو نیاز کرتے ہیں
   ✓ Match score: 53.31 using exact, semantic
   ----------------------------------------
4. 📖 Surah 70:34
   📝 اور جو اپنی نماز کی خبر رکھتے ہیں
   ✓ Match score: 51.55 using exact, semantic
   ----------------------------------------

Total matches: 101


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Search results for: 'جنت'
Primary Match:
📖 Surah 74:3
📝 اور اپنے پروردگار کی بڑائی کرو
✓ Match score: 38.05 using semantic
--------------------------------------------------
Other Relevant Matches:
1. 📖 Surah 53:15
   📝 اسی کے پاس رہنے کی جنت ہے
   ✓ Match score: 36.42 using exact, semantic
   ----------------------------------------
2. 📖 Surah 70:22
   📝 مگر نماز گزار
   ✓ Match score: 35.99 using semantic
   ----------------------------------------
3. 📖 Surah 43:72
   📝 اور یہ جنت جس کے تم مالک کر دیئے گئے ہو تمہارے اعمال کا صلہ ہے
   ✓ Match score: 35.45 using exact, semantic
   ----------------------------------------
4. 📖 Surah 36:58
   📝 پروردگار مہربان کی طرف سے سلام (کہا جائے گا)
   ✓ Match score: 35.27 using semantic
   ----------------------------------------

Total matches: 27


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Search results for: 'توبه'
Primary Match:
📖 Surah 2:52 (البقرة (Al-Baqara))
📝 پھر اس کے بعد ہم نے تم کو معاف کر دیا، تاکہ تم شکر کرو
✓ Match score: 36.72 using semantic
--------------------------------------------------
Other Relevant Matches:
1. 📖 Surah 42:43
   📝 اور جو صبر کرے اور قصور معاف کردے تو یہ ہمت کے کام ہیں
   ✓ Match score: 34.46 using semantic
   ----------------------------------------
2. 📖 Surah 23:106
   📝 اے ہمارے پروردگار! ہم پر ہماری کم بختی غالب ہوگئی اور ہم رستے سے بھٹک گئے
   ✓ Match score: 31.36 using semantic
   ----------------------------------------
3. 📖 Surah 2:160 (البقرة (Al-Baqara))
   📝 ہاں جو توبہ کرتے ہیں اور اپنی حالت درست کرلیتے اور (احکام الہیٰ کو) صاف صاف بیان کردیتے ہیں تو میں ان کے قصور معاف کردیتا ہوں اور میں بڑا معاف کرنے والا (اور) رحم والا ہوں
   ✓ Match score: 31.21 using semantic
   ----------------------------------------
4. 📖 Surah 75:35
   📝 پھر افسوس ہے تجھ پر پھر افسوس ہے
   ✓ Match score: 30.97 using semantic
   ---------------------

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Search results for: 'صبر'
Primary Match:
📖 Surah 44:59
📝 پس تم بھی انتظار کرو یہ بھی انتظار کر رہے ہیں
✓ Match score: 39.91 using semantic
--------------------------------------------------
Other Relevant Matches:
1. 📖 Surah 74:7
   📝 اور اپنے پروردگار کے لئے صبر کرو
   ✓ Match score: 38.87 using exact, semantic
   ----------------------------------------
2. 📖 Surah 11:122
   📝 اور (نتیجہٴ اعمال کا) تم بھی انتظار کرو، ہم بھی انتظار کرتے ہیں
   ✓ Match score: 36.30 using semantic
   ----------------------------------------
3. 📖 Surah 52:31
   📝 کہہ دو کہ انتظار کئے جاؤ میں بھی تمہارے ساتھ انتظار کرتا ہوں
   ✓ Match score: 30.51 using semantic
   ----------------------------------------
4. 📖 Surah 73:2
   📝 رات کو قیام کیا کرو مگر تھوڑی سی رات
   ✓ Match score: 29.43 using semantic
   ----------------------------------------

Total matches: 77

Interactive Quran Search Engine
Enter your search query (or 'exit' to quit):



Search:  exit


In [5]:
# kaggle_create_model.py
"""
This script is designed to run on Kaggle to:
1. Create the Quran search engine model
2. Save it in a format that can be downloaded
"""
import pickle
import os
import sys
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('KaggleModelCreation')



def create_and_save_model():
    """Create and save the search engine model on Kaggle"""
    # Create output directory for the model
    output_dir = Path('/kaggle/working/model_output')
    output_dir.mkdir(exist_ok=True, parents=True)
    
    # Initialize the search engine
    logger.info("Initializing search engine...")
    engine = EnhancedQuranSearchEngine(cache_dir=str(output_dir))
    
    # Load data and initialize search methods
    success = engine.load_data_from_multiple_sources()
    if not success:
        logger.error("Failed to load data for search engine")
        return False
    
    engine.initialize_search_methods()
    
    # Save the model to a file that can be downloaded
    model_path = output_dir / "quran_search_engine.pkl"
    try:
        with open(model_path, 'wb') as f:
            pickle.dump(engine, f)
        logger.info(f"Model saved to {model_path}")
        
        # Get model stats for reporting
        data_count = len(engine.data) if hasattr(engine, 'data') else 0
        has_tfidf = engine.vectorizer is not None if hasattr(engine, 'vectorizer') else False
        has_st = engine.sentence_transformer is not None if hasattr(engine, 'sentence_transformer') else False
        
        print("\n" + "="*50)
        print("MODEL CREATION SUCCESSFUL")
        print("="*50)
        print(f"Model saved to: {model_path}")
        print(f"Verses loaded: {data_count}")
        print(f"TF-IDF vectorizer: {'Enabled' if has_tfidf else 'Disabled'}")
        print(f"Semantic search: {'Enabled' if has_st else 'Disabled'}")
        print("\nIMPORTANT: Download this file from the Kaggle output")
        print("="*50)
        
        return True
    except Exception as e:
        logger.error(f"Failed to save model: {e}")
        return False

# Run the model creation
if __name__ == "__main__":
    create_and_save_model()

Batches:   0%|          | 0/195 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



MODEL CREATION SUCCESSFUL
Model saved to: /kaggle/working/model_output/quran_search_engine.pkl
Verses loaded: 6236
TF-IDF vectorizer: Enabled
Semantic search: Enabled

IMPORTANT: Download this file from the Kaggle output
