# Mô hình LSI (Latent Semantic Indexing)

Notebook này triển khai mô hình LSI hoàn chỉnh theo yêu cầu:

1. **Giới thiệu mô hình**: Phương pháp biểu diễn tài liệu và truy vấn, nguyên tắc tính toán độ liên quan
2. **Chọn term**: Phương pháp xác định term với ví dụ minh họa
3. **Công thức tính trọng số term**: TF-IDF và các thành phần
4. **Lập chỉ mục**: Cấu trúc chỉ mục và quá trình xử lý tài liệu
5. **Xử lý truy vấn**: Phân tích truy vấn và tính toán độ tương đồng
6. **Đánh giá mô hình**: Đánh giá trên ngữ liệu Cranfield theo P, R và MAP nội suy 11 điểm TREC


In [None]:
# Import th∆∞ vi·ªán c·∫ßn thi·∫øt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import os
import glob
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import warnings
warnings.filterwarnings('ignore')

# Make sure the NLTK resources needed for tokenisation and stop-word removal
# are available locally, downloading whichever ones are missing.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load inside
# word_tokenize(); 'punkt' is kept for older releases.  On a release that does
# not know 'punkt_tab', nltk.download() just reports the failure and returns.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

print("ƒê√£ import th√†nh c√¥ng c√°c th∆∞ vi·ªán c·∫ßn thi·∫øt!")


In [None]:
# Load the Cranfield data
def load_cranfield_documents(doc_path=None):
    """Read the Cranfield documents from a directory of ``*.txt`` files.

    Args:
        doc_path: Directory to scan. When omitted, "../dataset/Crandfield/Cranfield/"
            is used if it exists, otherwise "dataset/Crandfield/Cranfield/".

    Returns:
        A dict mapping each file's name without its final extension to the
        file's stripped content. Files whose stripped content is empty, and
        files that cannot be read or decoded as UTF-8, are left out (a message
        is printed for the latter).
    """
    if doc_path is None:
        doc_path = "../dataset/Crandfield/Cranfield/"
        if not os.path.exists(doc_path):
            doc_path = "dataset/Crandfield/Cranfield/"

    documents = {}
    # glob() returns paths in an OS-dependent order; sorting makes the
    # document order (and everything derived from it) reproducible.
    for file_path in sorted(glob.glob(os.path.join(doc_path, "*.txt"))):
        # splitext() drops only the trailing extension, unlike
        # str.replace('.txt', '') which removes the substring anywhere.
        doc_id = os.path.splitext(os.path.basename(file_path))[0]
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read().strip()
        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and keep going.
            print(f"L·ªói khi ƒë·ªçc file {file_path}: {e}")
            continue
        if content:
            documents[doc_id] = content

    return documents

def load_cranfield_queries(query_path=None):
    """Read the Cranfield queries from a tab-separated text file.

    Each usable line has the form ``<query_id>\\t<query_text>[\\t...]``; lines
    with fewer than two tab-separated fields are ignored.

    Args:
        query_path: File to read. When omitted, the first existing path among
            "../dataset/Crandfield/query.txt" and "dataset/Crandfield/query.txt"
            is used.

    Returns:
        A dict mapping query id to query text. It is empty when no file is
        found, and holds whatever was parsed before a read/decoding error
        (a message is printed in both cases).
    """
    if query_path is not None:
        candidates = [query_path]
    else:
        candidates = ["../dataset/Crandfield/query.txt", "dataset/Crandfield/query.txt"]

    queries = {}
    found_path = next((path for path in candidates if os.path.exists(path)), None)
    if found_path is None:
        print("Kh√¥ng t√¨m th·∫•y file query.txt")
        return queries

    try:
        with open(found_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) >= 2:
                    queries[parts[0]] = parts[1]
    except (OSError, UnicodeError) as e:
        # Best effort: report the problem and return what was read so far.
        print(f"L·ªói khi ƒë·ªçc queries: {e}")

    return queries

# Load the corpus and the queries, then preview the first entry of each
documents = load_cranfield_documents()
queries = load_cranfield_queries()

print(f"ƒê√£ load {len(documents)} t√†i li·ªáu")
print(f"ƒê√£ load {len(queries)} truy v·∫•n")

if documents:
    # Dicts preserve insertion order, so this is the first document loaded.
    first_doc_id = next(iter(documents))
    print(f"\nV√≠ d·ª• t√†i li·ªáu ƒë·∫ßu ti√™n (ID: {first_doc_id}):")
    print(documents[first_doc_id][:200] + "...")

if queries:
    first_query_id = next(iter(queries))
    print(f"\nV√≠ d·ª• truy v·∫•n ƒë·∫ßu ti√™n (ID: {first_query_id}):")
    print(queries[first_query_id])


In [None]:
# Text preprocessing
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (stemmer, stop-word set) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['stemmer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Turn raw text into a list of index terms.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic, are shorter than three characters, or are
    English stop words are dropped; the remaining tokens are Porter-stemmed.

    Args:
        text: The raw string to process.

    Returns:
        The list of stemmed terms, in their order of appearance.
    """
    # The stemmer and the stop-word set are built once and reused: this
    # function is called for every document and every query.
    stemmer, stop_words = _get_preprocess_resources()
    return [
        stemmer.stem(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and len(token) > 2 and token not in stop_words
    ]

# Worked example of the term-selection pipeline, one step at a time
sample_text = (
    list(documents.values())[0][:100]  # first 100 characters of the first document
    if documents
    else "experimental investigation of the aerodynamics of a wing in a slipstream"
)

print("=== V√ç D·ª§ MINH H·ªåA QU√Å TR√åNH CH·ªåN TERM ===")
print(f"VƒÉn b·∫£n g·ªëc:\n'{sample_text}'")
print(f"\nD·ªô d√†i: {len(sample_text)} k√Ω t·ª±\n")

# Step 1: lower-case and tokenize
tokens = word_tokenize(sample_text.lower())
print("B∆∞·ªõc 1 - Tokenization:")
print(f"Tokens: {tokens}")
print(f"S·ªë l∆∞·ª£ng tokens: {len(tokens)}\n")

# Step 2: keep purely alphabetic tokens (drops punctuation and numbers)
alpha_tokens = list(filter(str.isalpha, tokens))
print("B∆∞·ªõc 2 - L·ªçc t·ª´ (ch·ªâ gi·ªØ k√Ω t·ª± ch·ªØ c√°i):")
print(f"Tokens: {alpha_tokens}")
print(f"S·ªë l∆∞·ª£ng: {len(alpha_tokens)}\n")

# Step 3: drop tokens shorter than three characters
long_tokens = [token for token in alpha_tokens if len(token) >= 3]
print("B∆∞·ªõc 3 - Lo·∫°i b·ªè t·ª´ ng·∫Øn (< 3 k√Ω t·ª±):")
print(f"Tokens: {long_tokens}")
print(f"S·ªë l∆∞·ª£ng: {len(long_tokens)}\n")

# Step 4: drop English stop words
stop_words = set(stopwords.words('english'))
removed_stopwords = [token for token in long_tokens if token in stop_words]
filtered_tokens = [token for token in long_tokens if token not in stop_words]
print("B∆∞·ªõc 4 - Lo·∫°i b·ªè stopwords:")
print(f"Stopwords ƒë∆∞·ª£c lo·∫°i: {removed_stopwords}")
print(f"Tokens c√≤n l·∫°i: {filtered_tokens}")
print(f"S·ªë l∆∞·ª£ng: {len(filtered_tokens)}\n")

# Step 5: Porter stemming
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print("B∆∞·ªõc 5 - Stemming:")
print("T·ª´ g·ªëc -> T·ª´ sau stemming:")
for original, stemmed in zip(filtered_tokens, stemmed_tokens):
    # Only show the words that stemming actually changed.
    if original == stemmed:
        continue
    print(f"  {original} -> {stemmed}")
print(f"\nTerms cu·ªëi c√πng: {stemmed_tokens}")
print(f"S·ªë l∆∞·ª£ng terms: {len(stemmed_tokens)}")

# Term statistics over (a sample of) the collection
print("\n=== PH√ÇN T√çCH TH·ªêNG K√ä TR√äN TO√ÄN B·ªò COLLECTION ===")

if documents:
    all_terms = []
    doc_count = 0
    # Only the first 100 documents are analysed.
    for content in list(documents.values())[:100]:
        all_terms.extend(preprocess_text(content))
        doc_count += 1

    term_counter = Counter(all_terms)
    vocabulary_size = len(term_counter)
    total_terms = len(all_terms)
    # A sample that yields no terms at all would otherwise divide by zero.
    unique_ratio = vocabulary_size / total_terms if total_terms else 0.0

    print(f"S·ªë t√†i li·ªáu ƒë√£ ph√¢n t√≠ch: {doc_count}")
    print(f"T·ªïng s·ªë terms: {total_terms:,}")
    print(f"K√≠ch th∆∞·ªõc t·ª´ v·ª±ng (unique terms): {vocabulary_size:,}")
    print(f"T·ª∑ l·ªá unique terms: {unique_ratio:.2%}")

    print("\n10 terms xu·∫•t hi·ªán nhi·ªÅu nh·∫•t:")
    for term, count in term_counter.most_common(10):
        print(f"  '{term}': {count} l·∫ßn")

    print("\n10 terms xu·∫•t hi·ªán √≠t nh·∫•t:")
    # most_common() is sorted by descending count, so the tail holds the rarest terms.
    rare_terms = term_counter.most_common()[-10:]
    for term, count in rare_terms:
        print(f"  '{term}': {count} l·∫ßn")
else:
    print("Kh√¥ng c√≥ d·ªØ li·ªáu ƒë·ªÉ ph√¢n t√≠ch")


In [None]:
# Custom TF-IDF vectorizer
class CustomTfidfVectorizer:
    """TF-IDF vectorizer that runs ``preprocess_text`` before sklearn's TfidfVectorizer.

    Args:
        max_features: Passed to ``TfidfVectorizer(max_features=...)``.
        min_df: Passed to ``TfidfVectorizer(min_df=...)``.
        max_df: Passed to ``TfidfVectorizer(max_df=...)``.

    After ``fit_transform``:
        vectorizer: the fitted sklearn ``TfidfVectorizer`` (``None`` before fitting).
        vocabulary_: the fitted vectorizer's ``vocabulary_``.
        idf_: the fitted vectorizer's ``idf_``.
    """

    def __init__(self, max_features=1000, min_df=2, max_df=0.8):
        self.max_features = max_features
        self.min_df = min_df
        self.max_df = max_df
        # Set explicitly so "not fitted yet" is a detectable state.
        self.vectorizer = None
        self.vocabulary_ = {}
        self.idf_ = {}

    @staticmethod
    def _preprocess(documents):
        """Preprocess each raw document and re-join its terms with single spaces."""
        return [' '.join(preprocess_text(doc)) for doc in documents]

    def fit_transform(self, documents):
        """Fit the vectorizer on raw ``documents`` and return their TF-IDF matrix."""
        processed_docs = self._preprocess(documents)

        # The input is already lower-cased, stop-word filtered and stemmed,
        # so those sklearn steps are switched off.
        self.vectorizer = TfidfVectorizer(
            max_features=self.max_features,
            min_df=self.min_df,
            max_df=self.max_df,
            lowercase=False,
            stop_words=None,
            token_pattern=r'\b\w+\b'
        )

        matrix = self.vectorizer.fit_transform(processed_docs)
        self.vocabulary_ = self.vectorizer.vocabulary_
        self.idf_ = self.vectorizer.idf_

        return matrix

    def transform(self, documents):
        """Return the TF-IDF matrix of raw ``documents`` using the fitted vocabulary.

        Raises:
            sklearn.exceptions.NotFittedError: if ``fit_transform`` has not been
                called yet.
        """
        if self.vectorizer is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "CustomTfidfVectorizer is not fitted yet; call fit_transform() first."
            )
        return self.vectorizer.transform(self._preprocess(documents))

# Worked example: computing TF-IDF by hand
print("=== V√ç D·ª§ T√çNH TO√ÅN TF-IDF TH·ª¶ C√îNG ===")

if documents:
    # The first three documents serve as the demo collection
    sample_docs = list(documents.items())[:3]
    print("T√†i li·ªáu m·∫´u:")
    for doc_id, content in sample_docs:
        print(f"Doc {doc_id}: {content[:80]}...")

    # Preprocess every sample document into its list of terms
    processed_sample_docs = []
    for doc_id, content in sample_docs:
        terms = preprocess_text(content)
        processed_sample_docs.append(terms)
        print(f"\nDoc {doc_id} - Terms sau x·ª≠ l√Ω: {terms[:10]}...")

    # The term whose weight is computed (stemmed like the document terms)
    target_term = "wing"
    target_term_stemmed = PorterStemmer().stem(target_term)

    print(f"\n=== T√çNH TF-IDF CHO TERM: '{target_term}' (stemmed: '{target_term_stemmed}') ===")

    # Term frequency per document, in three variants
    tfs = []
    for (doc_id, _), terms in zip(sample_docs, processed_sample_docs):
        doc_length = len(terms)
        tf_raw = terms.count(target_term_stemmed)                      # raw count
        tf_normalized = tf_raw / doc_length if doc_length > 0 else 0   # count / document length
        tf_log = 1 + np.log(tf_raw) if tf_raw > 0 else 0               # 1 + ln(count)

        tfs.append(dict(
            doc_id=doc_id,
            tf_raw=tf_raw,
            tf_normalized=tf_normalized,
            tf_log=tf_log,
            doc_length=doc_length,
        ))

        print(f"Doc {doc_id}:")
        print(f"  S·ªë l·∫ßn xu·∫•t hi·ªán: {tf_raw}")
        print(f"  ƒê·ªô d√†i document: {doc_length} terms")
        print(f"  TF th√¥: {tf_raw}")
        print(f"  TF chu·∫©n h√≥a: {tf_normalized:.4f}")
        print(f"  TF logarithm: {tf_log:.4f}")

    # Inverse document frequency
    N = len(processed_sample_docs)  # number of documents
    df = sum(target_term_stemmed in terms for terms in processed_sample_docs)  # documents containing the term

    idf_basic = np.log(N / df) if df > 0 else 0
    idf_smooth = np.log(N / (1 + df)) + 1

    print("\n=== T√çNH IDF ===")
    print(f"T·ªïng s·ªë documents (N): {N}")
    print(f"S·ªë documents ch·ª©a term '{target_term_stemmed}' (df): {df}")
    print(f"IDF c∆° b·∫£n: log({N}/{df}) = {idf_basic:.4f}")
    print(f"IDF smoothed: log({N}/{1+df}) + 1 = {idf_smooth:.4f}")

    # Final TF-IDF table (both TF variants are combined with the basic IDF)
    print("\n=== TF-IDF CU·ªêI C√ôNG ===")
    print("Doc ID | TF th√¥ | TF norm | TF log | IDF    | TF-IDF(norm) | TF-IDF(log)")
    print("-------|--------|---------|--------|--------|--------------|------------")

    for tf_data in tfs:
        tfidf_norm = tf_data['tf_normalized'] * idf_basic
        tfidf_log = tf_data['tf_log'] * idf_basic
        print(" | ".join([
            f"{tf_data['doc_id']:6}",
            f"{tf_data['tf_raw']:6}",
            f"{tf_data['tf_normalized']:7.4f}",
            f"{tf_data['tf_log']:6.4f}",
            f"{idf_basic:6.4f}",
            f"{tfidf_norm:12.4f}",
            f"{tfidf_log:11.4f}",
        ]))

# Build the TF-IDF matrix for the whole collection
if not documents:
    print("Kh√¥ng c√≥ d·ªØ li·ªáu ƒë·ªÉ t·∫°o ma tr·∫≠n TF-IDF")
else:
    print("\n=== T·∫†O MA TR·∫¨N TF-IDF CHO TO√ÄN B·ªò COLLECTION ===")

    doc_list = list(documents.values())
    doc_ids = list(documents.keys())

    # Fit the vectorizer and vectorize every document in one go
    tfidf_vectorizer = CustomTfidfVectorizer(max_features=500, min_df=2, max_df=0.8)
    tfidf_matrix = tfidf_vectorizer.fit_transform(doc_list)

    n_rows, n_cols = tfidf_matrix.shape
    print(f"K√≠ch th∆∞·ªõc ma tr·∫≠n TF-IDF: {tfidf_matrix.shape}")
    print(f"S·ªë t√†i li·ªáu: {n_rows}")
    print(f"S·ªë features (terms): {n_cols}")
    print(f"Density (t·ª∑ l·ªá non-zero): {tfidf_matrix.nnz / (n_rows * n_cols):.4f}")

    # A peek at the vocabulary
    feature_names = tfidf_vectorizer.vectorizer.get_feature_names_out()
    print(f"\nM·ªôt s·ªë terms trong vocabulary: {feature_names[:20]}")

    # Highest-weighted terms of the first document
    first_doc_tfidf = tfidf_matrix[0].toarray().flatten()
    non_zero_indices = np.nonzero(first_doc_tfidf)[0]

    print("\nT√†i li·ªáu ƒë·∫ßu ti√™n - Top 10 terms v·ªõi TF-IDF cao nh·∫•t:")
    sorted_indices = non_zero_indices[np.argsort(first_doc_tfidf[non_zero_indices])[::-1]]
    for position, idx in enumerate(sorted_indices[:10], start=1):
        term = feature_names[idx]
        tfidf_val = first_doc_tfidf[idx]
        print(f"  {position:2d}. '{term}': {tfidf_val:.4f}")


In [None]:
# LSI index construction
class LSIModel:
    """LSI retrieval model: TF-IDF weighting followed by a truncated SVD.

    Args:
        n_components: Number of SVD components, passed to ``TruncatedSVD``.
        max_features: ``max_features`` of the ``CustomTfidfVectorizer`` built in ``fit``.
        min_df: ``min_df`` of that vectorizer.
        max_df: ``max_df`` of that vectorizer.

    The vectorizer defaults (500, 2, 0.8) are the values ``fit`` previously
    hard-coded, so existing callers get the same index.
    """

    def __init__(self, n_components=100, max_features=500, min_df=2, max_df=0.8):
        self.n_components = n_components
        self.max_features = max_features
        self.min_df = min_df
        self.max_df = max_df
        self.svd = TruncatedSVD(n_components=n_components, random_state=42)
        self.tfidf_vectorizer = None
        self.doc_ids = None
        self.lsi_matrix = None

    def _require_fitted(self):
        """Raise NotFittedError when fit() has not been called yet."""
        if self.tfidf_vectorizer is None or self.lsi_matrix is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("LSIModel is not fitted yet; call fit() first.")

    def fit(self, documents, doc_ids):
        """Build the LSI index from a collection of raw documents.

        Args:
            documents: Raw document texts.
            doc_ids: Identifiers; ``doc_ids[i]`` is reported for ``documents[i]``.

        Returns:
            self, to allow chaining.
        """
        self.doc_ids = doc_ids

        print("B∆∞·ªõc 1: T·∫°o ma tr·∫≠n TF-IDF...")
        self.tfidf_vectorizer = CustomTfidfVectorizer(
            max_features=self.max_features, min_df=self.min_df, max_df=self.max_df
        )
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)

        print(f"Ma tr·∫≠n TF-IDF g·ªëc: {tfidf_matrix.shape}")
        print(f"Density: {tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1]):.4f}")

        print("B∆∞·ªõc 2: √Åp d·ª•ng SVD...")
        self.lsi_matrix = self.svd.fit_transform(tfidf_matrix)

        print(f"Ma tr·∫≠n LSI sau SVD: {self.lsi_matrix.shape}")
        print(f"Explained variance ratio (10 th√†nh ph·∫ßn ƒë·∫ßu): {self.svd.explained_variance_ratio_[:10]}")
        print(f"T·ªïng explained variance: {sum(self.svd.explained_variance_ratio_):.4f}")

        return self

    def transform_query(self, query):
        """Project a raw query string into the LSI space (one-row result)."""
        self._require_fitted()
        query_tfidf = self.tfidf_vectorizer.transform([query])
        return self.svd.transform(query_tfidf)

    def search(self, query, top_k=10):
        """Return the ``top_k`` documents most similar to ``query``.

        Returns:
            A list of dicts with keys 'doc_id', 'similarity' (cosine similarity
            in the LSI space) and 'rank' (1-based), best match first.
        """
        query_lsi = self.transform_query(query)
        similarities = cosine_similarity(query_lsi, self.lsi_matrix).flatten()
        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [
            {
                'doc_id': self.doc_ids[idx],
                'similarity': similarities[idx],
                'rank': rank,
            }
            for rank, idx in enumerate(top_indices, start=1)
        ]

# Build the LSI model, inspect the SVD, run a sample query and compare storage
if documents:
    print("=== X√ÇY D·ª∞NG CH·ªà M·ª§C LSI ===")

    doc_list = list(documents.values())
    doc_ids = list(documents.keys())

    # Create and fit the LSI model with 50 components
    lsi_model = LSIModel(n_components=50)
    lsi_model.fit(doc_list, doc_ids)

    # Inspect the SVD components
    print("\n=== PH√ÇN T√çCH C√ÅC TH√ÄNH PH·∫¶N SVD ===")
    print(f"S·ªë singular values: {len(lsi_model.svd.singular_values_)}")
    print(f"10 singular values l·ªõn nh·∫•t: {lsi_model.svd.singular_values_[:10]}")

    # Plot the explained variance (per component, then cumulative)
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    # Only the first 20 components are drawn in the per-component plot.
    plt.plot(range(1, min(21, len(lsi_model.svd.explained_variance_ratio_) + 1)),
             lsi_model.svd.explained_variance_ratio_[:20], 'bo-')
    plt.title('Explained Variance Ratio theo Component')
    plt.xlabel('Component')
    plt.ylabel('Explained Variance Ratio')
    plt.grid(True)

    plt.subplot(1, 2, 2)
    cumulative_variance = np.cumsum(lsi_model.svd.explained_variance_ratio_)
    plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, 'ro-')
    plt.title('Cumulative Explained Variance')
    plt.xlabel('S·ªë l∆∞·ª£ng Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.grid(True)

    plt.tight_layout()
    plt.show()

    print(f"\nCumulative explained variance v·ªõi {lsi_model.n_components} components: {cumulative_variance[-1]:.4f}")

    # Show how a document is represented in the LSI space
    print("\n=== BI·ªÇU DI·ªÑN DOCUMENTS TRONG KH√îNG GIAN LSI ===")
    print("Document ƒë·∫ßu ti√™n trong kh√¥ng gian LSI (10 chi·ªÅu ƒë·∫ßu):")
    print(lsi_model.lsi_matrix[0][:10])
    print(f"K√≠ch th∆∞·ªõc bi·ªÉu di·ªÖn cho m·ªói document: {lsi_model.lsi_matrix[0].shape}")

    # Try the index with a sample query
    sample_query = "aerodynamic wing flow"
    print("\n=== TEST CH·ªà M·ª§C V·ªöI TRUY V·∫§N M·∫™U ===")
    print(f"Truy v·∫•n: '{sample_query}'")

    results = lsi_model.search(sample_query, top_k=5)
    print("\nTop 5 t√†i li·ªáu t∆∞∆°ng t·ª±:")
    for result in results:
        if result['doc_id'] in documents:
            doc_content = documents[result['doc_id']][:100] + "..."
            print(f"H·∫°ng {result['rank']}: Doc {result['doc_id']} (ƒë·ªô t∆∞∆°ng t·ª±: {result['similarity']:.4f})")
            print(f"N·ªôi dung: {doc_content}")
            print()

    # Compare storage sizes.  The sparse TF-IDF matrix needs its index arrays
    # (indices, indptr) as well as its values, so all three are counted;
    # counting .data alone understates the sparse matrix's footprint.
    tfidf_sparse = lsi_model.tfidf_vectorizer.transform(doc_list)
    original_size = (
        tfidf_sparse.data.nbytes
        + tfidf_sparse.indices.nbytes
        + tfidf_sparse.indptr.nbytes
    )
    lsi_size = lsi_model.lsi_matrix.nbytes
    compression_ratio = original_size / lsi_size if lsi_size > 0 else 0

    print(f"=== SO S√ÅNH K√çCH TH∆Ø·ªöC LUU TR·ªÆ ===")
    print(f"K√≠ch th∆∞·ªõc ma tr·∫≠n TF-IDF g·ªëc: {original_size / 1024:.2f} KB")
    print(f"K√≠ch th∆∞·ªõc ma tr·∫≠n LSI: {lsi_size / 1024:.2f} KB")
    print(f"T·ª∑ l·ªá n√©n: {compression_ratio:.2f}x")

else:
    print("Kh√¥ng c√≥ d·ªØ li·ªáu ƒë·ªÉ x√¢y d·ª±ng ch·ªâ m·ª•c LSI")


In [None]:
# Step-by-step demonstration of query processing
def detailed_query_processing(lsi_model, query, show_steps=True):
    """Run a query through the LSI pipeline, optionally printing every stage.

    Args:
        lsi_model: A fitted model exposing ``tfidf_vectorizer``, ``svd`` and ``lsi_matrix``.
        query: The raw query string.
        show_steps: When true, print the intermediate result of each stage.

    Returns:
        ``(query_lsi, similarities, ranked_indices)``: the query in LSI space,
        its cosine similarity to every indexed document, and the document
        indices ordered from most to least similar.
    """
    verbose = show_steps

    if verbose:
        print("=== X·ª¨ L√ù TRUY V·∫§N CHI TI·∫æT ===")
        print(f"Truy v·∫•n g·ªëc: '{query}'")

    # Stage 1: preprocessing (shown for illustration; the vectorizer below
    # preprocesses the raw query again on its own)
    preprocessed_terms = preprocess_text(query)
    preprocessed_query = ' '.join(preprocessed_terms)

    if verbose:
        print(f"1. Truy v·∫•n sau ti·ªÅn x·ª≠ l√Ω: '{preprocessed_query}'")
        print(f"   Terms: {preprocessed_terms}")

    # Stage 2: TF-IDF vector
    query_tfidf = lsi_model.tfidf_vectorizer.transform([query])
    weights = query_tfidf.toarray().flatten()

    if verbose:
        print(f"2. TF-IDF vector shape: {query_tfidf.shape}")
        active = np.nonzero(weights)[0]
        print(f"   Terms c√≥ tr·ªçng s·ªë non-zero: {len(active)}")

        if len(active) > 0:
            vocabulary = lsi_model.tfidf_vectorizer.vectorizer.get_feature_names_out()
            print("   Top terms v·ªõi tr·ªçng s·ªë cao nh·∫•t:")
            by_weight = active[np.argsort(weights[active])[::-1]]
            for position, term_idx in enumerate(by_weight[:5], start=1):
                print(f"     {position}. '{vocabulary[term_idx]}': {weights[term_idx]:.4f}")

    # Stage 3: projection into the LSI space
    query_lsi = lsi_model.svd.transform(query_tfidf)
    lsi_vector = query_lsi.flatten()

    if verbose:
        print(f"3. LSI vector shape: {query_lsi.shape}")
        print(f"   LSI representation (5 chi·ªÅu ƒë·∫ßu): {lsi_vector[:5]}")
        print(f"   LSI vector norm: {np.linalg.norm(lsi_vector):.4f}")

    # Stage 4: cosine similarity against every indexed document
    similarities = cosine_similarity(query_lsi, lsi_model.lsi_matrix).flatten()

    if verbose:
        print("4. T√≠nh ƒë·ªô t∆∞∆°ng ƒë·ªìng:")
        print(f"   S·ªë documents ƒë∆∞·ª£c so s√°nh: {len(similarities)}")
        print(f"   Similarity min: {similarities.min():.4f}")
        print(f"   Similarity max: {similarities.max():.4f}")
        print(f"   Similarity trung b√¨nh: {similarities.mean():.4f}")

    # Stage 5: ranking, most similar first
    return query_lsi, similarities, np.argsort(similarities)[::-1]

# Run the detailed query processing on the first three queries
if 'lsi_model' in locals() and queries:
    test_queries = list(queries.items())[:3]

    print("=== DEMO X·ª¨ L√ù TRUY V·∫§N ===\n")

    for i, (query_id, query_text) in enumerate(test_queries):
        print(f"Truy v·∫•n {i+1} (ID: {query_id}): {query_text}")
        query_lsi, similarities, ranked_indices = detailed_query_processing(lsi_model, query_text)

        # Show the three best matches (fewer if the index is smaller)
        print("Top 3 k·∫øt qu·∫£:")
        for j, doc_idx in enumerate(ranked_indices[:3]):
            doc_id = lsi_model.doc_ids[doc_idx]
            sim_score = similarities[doc_idx]
            if doc_id not in documents:
                continue
            doc_content = documents[doc_id][:80] + "..."
            print(f"  H·∫°ng {j+1}: Doc {doc_id} (similarity: {sim_score:.4f})")
            print(f"           {doc_content}")
        print("\n" + "="*60 + "\n")

# Compare the LSI ranking with a plain TF-IDF ranking
if 'lsi_model' in locals() and documents:
    print("=== SO S√ÅNH LSI VS TF-IDF BASELINE ===")
    test_query = "aerodynamic wing design"
    print(f"Truy v·∫•n test: '{test_query}'")

    # LSI ranking
    query_lsi, lsi_similarities, lsi_ranked = detailed_query_processing(lsi_model, test_query, show_steps=False)

    # TF-IDF baseline ranking (same vectorizer, no SVD)
    query_tfidf = lsi_model.tfidf_vectorizer.transform([test_query])
    processed_docs_for_baseline = [' '.join(preprocess_text(doc)) for doc in doc_list]
    doc_tfidf_matrix = lsi_model.tfidf_vectorizer.vectorizer.transform(processed_docs_for_baseline)
    tfidf_similarities = cosine_similarity(query_tfidf, doc_tfidf_matrix).flatten()
    tfidf_ranked = np.argsort(tfidf_similarities)[::-1]

    print("\nSo s√°nh Top 5 k·∫øt qu·∫£:")
    print("H·∫°ng | LSI Model              | TF-IDF Baseline")
    print("-----|------------------------|------------------------")
    for i, (lsi_idx, tfidf_idx) in enumerate(zip(lsi_ranked[:5], tfidf_ranked[:5])):
        lsi_doc_id = lsi_model.doc_ids[lsi_idx]
        lsi_sim = lsi_similarities[lsi_idx]

        tfidf_doc_id = lsi_model.doc_ids[tfidf_idx]
        tfidf_sim = tfidf_similarities[tfidf_idx]

        print(f"{i+1:4d} | Doc {lsi_doc_id:>3} (sim: {lsi_sim:.3f}) | Doc {tfidf_doc_id:>3} (sim: {tfidf_sim:.3f})")

    # Overlap between the two top-10 lists
    top_10_lsi = {lsi_model.doc_ids[idx] for idx in lsi_ranked[:10]}
    top_10_tfidf = {lsi_model.doc_ids[idx] for idx in tfidf_ranked[:10]}
    overlap = len(top_10_lsi & top_10_tfidf)

    print(f"\nDocuments chung trong top 10: {overlap}/10")
    tendency = 'c√≥ th·ªÉ' if overlap < 7 else '√≠t'
    print(f"ƒêi·ªÅu n√†y cho th·∫•y LSI {tendency} t√¨m ra documents kh√°c bi·ªát so v·ªõi TF-IDF thu·∫ßn")
    
# Semantic similarity analysis
if 'lsi_model' not in locals():
    print("M√¥ h√¨nh LSI ch∆∞a ƒë∆∞·ª£c kh·ªüi t·∫°o. Vui l√≤ng ch·∫°y cell tr∆∞·ªõc ƒë√≥.")
else:
    print("\n=== PH√ÇN T√çCH SEMANTIC SIMILARITY ===")

    # Queries phrased with near-synonyms
    semantic_queries = [
        "aircraft wing design",
        "airplane wing structure",
        "plane aerodynamic surface",
    ]

    print("Test kh·∫£ nƒÉng t√¨m ki·∫øm semantic v·ªõi c√°c queries t∆∞∆°ng t·ª±:")
    query_vectors = []

    for query in semantic_queries:
        query_lsi = lsi_model.transform_query(query)
        query_vectors.append(query_lsi.flatten())

        results = lsi_model.search(query, top_k=3)
        print(f"\nQuery: '{query}'")
        print("Top 3 results:")
        for result in results:
            if result['doc_id'] not in documents:
                continue
            print(f"  Doc {result['doc_id']} (sim: {result['similarity']:.3f})")

    # Pairwise cosine similarity between the query vectors themselves
    print("\nƒê·ªô t∆∞∆°ng ƒë·ªìng gi·ªØa c√°c queries:")
    n_semantic = len(semantic_queries)
    for i in range(n_semantic):
        for j in range(i + 1, n_semantic):
            sim = cosine_similarity([query_vectors[i]], [query_vectors[j]])[0][0]
            print(f"  '{semantic_queries[i]}' vs '{semantic_queries[j]}': {sim:.3f}")


In [None]:
# Evaluation helpers
def calculate_precision_recall(relevant_docs, retrieved_docs, k=None):
    """Return ``(precision, recall)`` of a result list, optionally cut off at ``k``.

    Both lists are compared as sets of ids. Precision is 0 when nothing is
    retrieved and recall is 0 when there are no relevant documents.
    """
    considered = retrieved_docs if k is None else retrieved_docs[:k]

    wanted = set(relevant_docs)
    found = set(considered)
    hits = len(wanted & found)

    precision = hits / len(found) if found else 0
    recall = hits / len(wanted) if wanted else 0
    return precision, recall

def calculate_average_precision(relevant_docs, retrieved_docs):
    """Compute Average Precision (AP) of a ranked result list for one query.

    AP is the mean, over the distinct relevant documents, of the precision at
    the rank where each one is first retrieved; relevant documents that are
    never retrieved contribute 0.

    Args:
        relevant_docs: Ids of the relevant documents (duplicates are ignored).
        retrieved_docs: Retrieved ids, best first.

    Returns:
        AP in [0, 1]; 0.0 when there are no relevant documents.
    """
    relevant_set = set(relevant_docs)
    if not relevant_set:
        return 0.0

    precision_sum = 0.0
    hits = 0
    credited = set()

    for rank, doc_id in enumerate(retrieved_docs, start=1):
        # A repeated id still occupies a rank position but is credited only
        # once; otherwise the score could exceed 1.
        if doc_id in relevant_set and doc_id not in credited:
            credited.add(doc_id)
            hits += 1
            precision_sum += hits / rank

    # Normalise by the number of distinct relevant documents.
    return precision_sum / len(relevant_set)

def interpolate_precision_recall(precision_recall_pairs):
    """Interpolate precision at the 11 standard recall levels 0.0, 0.1, ..., 1.0.

    The interpolated precision at a level is the highest precision among the
    pairs whose recall is at least that level, or 0.0 when there is none.

    Args:
        precision_recall_pairs: Iterable of ``(precision, recall)`` tuples.
            It is not modified.

    Returns:
        A list of 11 ``(recall_level, interpolated_precision)`` tuples.
    """
    # i / 10 gives the correctly rounded value of each level, so a measured
    # recall such as 3/10 compares equal to the 0.3 level.  Accumulating a
    # 0.1 step (np.arange(0.0, 1.1, 0.1)) yields 0.30000000000000004 instead
    # and silently excludes such points.
    recall_levels = np.arange(11) / 10.0
    pairs = list(precision_recall_pairs)

    interpolated_precisions = []
    for target_recall in recall_levels:
        reachable = [precision for precision, recall in pairs if recall >= target_recall]
        interpolated_precisions.append(max([0.0, *reachable]))

    return list(zip(recall_levels, interpolated_precisions))

# Synthetic relevance judgments for the demo
def create_synthetic_relevance_judgments(queries, documents, lsi_model, num_queries=10):
    """Build pseudo relevance judgments from plain TF-IDF cosine similarity.

    For each of the first ``num_queries`` queries, the documents whose TF-IDF
    similarity reaches the 95th percentile of that query's similarities (and
    is non-zero) are marked relevant. This stands in for judgments that would
    normally come from human assessors.

    Args:
        queries: Dict mapping query id to query text.
        documents: Either a dict mapping doc id to text (texts are then taken
            in ``lsi_model.doc_ids`` order) or a sequence of texts already
            aligned with ``lsi_model.doc_ids``.
        lsi_model: Fitted model providing ``tfidf_vectorizer`` and ``doc_ids``.
        num_queries: Number of queries to generate judgments for.

    Returns:
        Dict mapping query id to the list of pseudo-relevant doc ids.
    """
    # Use the documents that were passed in (not a module-level global), in
    # the same order as lsi_model.doc_ids so matrix rows map back to ids.
    if isinstance(documents, dict):
        texts = [documents[doc_id] for doc_id in lsi_model.doc_ids]
    else:
        texts = list(documents)

    # The document matrix does not depend on the query: compute it once.
    doc_tfidf_matrix = lsi_model.tfidf_vectorizer.transform(texts)

    qrels = {}
    for query_id, query_text in list(queries.items())[:num_queries]:
        query_tfidf = lsi_model.tfidf_vectorizer.transform([query_text])
        similarities = cosine_similarity(query_tfidf, doc_tfidf_matrix).flatten()

        # Top 5% by similarity.  When the percentile itself is 0 (the query
        # matches few or no documents), requiring a positive similarity keeps
        # the whole collection from being marked relevant.
        threshold = np.percentile(similarities, 95)
        relevant_indices = np.where((similarities >= threshold) & (similarities > 0))[0]

        qrels[query_id] = [lsi_model.doc_ids[idx] for idx in relevant_indices]

    return qrels

# TF-IDF baseline model
class TFIDFModel:
    """Plain TF-IDF retrieval model used as a comparison baseline.

    It reuses an already fitted ``tfidf_vectorizer`` and ranks documents by
    cosine similarity in the TF-IDF space.
    """

    def __init__(self, tfidf_vectorizer):
        self.tfidf_vectorizer = tfidf_vectorizer
        self.doc_ids = None
        self.doc_matrix = None

    def fit(self, documents, doc_ids):
        """Vectorize ``documents`` with the existing vocabulary; returns self."""
        self.doc_ids = doc_ids
        joined_docs = []
        for raw_doc in documents:
            joined_docs.append(' '.join(preprocess_text(raw_doc)))
        self.doc_matrix = self.tfidf_vectorizer.vectorizer.transform(joined_docs)
        return self

    def search(self, query, top_k=10):
        """Return the ``top_k`` best matches as dicts with 'doc_id', 'similarity', 'rank'."""
        scores = cosine_similarity(
            self.tfidf_vectorizer.transform([query]), self.doc_matrix
        ).flatten()
        best_first = np.argsort(scores)[::-1][:top_k]

        return [
            {'doc_id': self.doc_ids[doc_idx], 'similarity': scores[doc_idx], 'rank': position}
            for position, doc_idx in enumerate(best_first, start=1)
        ]

def evaluate_model(model, queries, qrels, model_name, top_k=50, pr_curve_depth=20):
    """Evaluate a retrieval model with precision, recall, MAP and 11-point precision.

    Args:
        model: object with ``search(query_text, top_k=...)`` returning a ranked
            list of dicts that each contain a ``'doc_id'`` key.
        queries: mapping of query id -> query text.
        qrels: mapping of query id -> relevant document ids. Queries that are
            missing from ``qrels`` or have no relevant documents are skipped.
        model_name: label used in the progress message.
        top_k: number of documents requested from ``model.search`` and the
            cutoff used for precision/recall.
        pr_curve_depth: deepest rank sampled when building the precision-recall
            curve that feeds the interpolation. The default (20) keeps the
            historical behaviour; pass ``None`` to use every retrieved
            document, e.g. to match ``top_k``.

    Returns:
        Dict with keys ``'precision'`` and ``'recall'`` (means at ``top_k``),
        ``'map'`` (mean average precision), ``'interpolated_precisions'``
        (11 mean values, one per recall level 0.0, 0.1, ..., 1.0) and
        ``'num_queries'`` (number of queries actually evaluated).
    """
    num_recall_levels = 11  # recall = 0.0, 0.1, ..., 1.0

    precisions_at_k = []
    recalls_at_k = []
    average_precisions = []
    interpolated_curves = []

    print(f"\nƒêang ƒë√°nh gi√° {model_name}...")

    evaluated_queries = 0
    for query_id, query_text in queries.items():
        # A single lookup covers both "no judgments" and "empty judgments".
        relevant_docs = qrels.get(query_id)
        if not relevant_docs:
            continue

        results = model.search(query_text, top_k=top_k)
        retrieved_docs = [hit['doc_id'] for hit in results]

        precision, recall = calculate_precision_recall(relevant_docs, retrieved_docs, k=top_k)
        ap = calculate_average_precision(relevant_docs, retrieved_docs)

        precisions_at_k.append(precision)
        recalls_at_k.append(recall)
        average_precisions.append(ap)

        # Sample (precision, recall) at every rank up to the requested depth.
        depth = len(retrieved_docs)
        if pr_curve_depth is not None:
            depth = min(depth, pr_curve_depth)

        pr_pairs = []
        for rank in range(1, depth + 1):
            p, r = calculate_precision_recall(relevant_docs, retrieved_docs, k=rank)
            pr_pairs.append((p, r))

        interpolated_curves.append(interpolate_precision_recall(pr_pairs))
        evaluated_queries += 1

    avg_precision = np.mean(precisions_at_k) if precisions_at_k else 0.0
    avg_recall = np.mean(recalls_at_k) if recalls_at_k else 0.0
    map_score = np.mean(average_precisions) if average_precisions else 0.0

    if interpolated_curves:
        # Assumes interpolate_precision_recall yields one entry per recall
        # level whose element [1] is the precision -- TODO confirm.
        mean_interpolated_precisions = [
            np.mean([curve[level][1] for curve in interpolated_curves])
            for level in range(num_recall_levels)
        ]
    else:
        mean_interpolated_precisions = [0.0] * num_recall_levels

    return {
        'precision': avg_precision,
        'recall': avg_recall,
        'map': map_score,
        'interpolated_precisions': mean_interpolated_precisions,
        'num_queries': evaluated_queries
    }

# Run the evaluation, but only when the LSI model and the data are available.
if not ('lsi_model' in locals() and documents and queries):
    print("Kh√¥ng th·ªÉ th·ª±c hi·ªán ƒë√°nh gi√°. Vui l√≤ng ƒë·∫£m b·∫£o ƒë√£ load d·ªØ li·ªáu v√† train m√¥ h√¨nh LSI.")
else:
    banner = "="*70

    print("=== ƒê√ÅNH GI√Å M√î H√åNH TR√äN NG·ªÆ LI·ªÜU CRANFIELD ===")

    # Build the synthetic relevance judgments (limited to 15 queries).
    print("T·∫°o relevance judgments gi·∫£ l·∫≠p...")
    qrels = create_synthetic_relevance_judgments(queries, documents, lsi_model, num_queries=15)

    print(f"ƒê√£ t·∫°o relevance judgments cho {len(qrels)} queries")
    # Preview the first three judged queries.
    for query_id in list(qrels)[:3]:
        relevant_docs = qrels[query_id]
        print(f"Query {query_id}: {len(relevant_docs)} relevant documents")
        if query_id in queries:
            print(f"  Text: {queries[query_id][:60]}...")

    # Build the TF-IDF baseline on top of the LSI model's vectorizer.
    print("\nT·∫°o TF-IDF baseline model...")
    tfidf_baseline = TFIDFModel(lsi_model.tfidf_vectorizer)
    tfidf_baseline.fit(doc_list, doc_ids)

    # Score both models against the same judgments.
    lsi_results = evaluate_model(lsi_model, queries, qrels, "LSI Model")
    tfidf_results = evaluate_model(tfidf_baseline, queries, qrels, "TF-IDF Baseline")

    # Summary of the headline metrics.
    print("\n" + banner)
    print("K·∫æT QU·∫¢ ƒê√ÅNH GI√Å")
    print(banner)

    print(f"\nüìä LSI Model:")
    print(f"  Average Precision@50: {lsi_results['precision']:.4f}")
    print(f"  Average Recall@50:    {lsi_results['recall']:.4f}")
    print(f"  MAP:                  {lsi_results['map']:.4f}")
    print(f"  S·ªë queries ƒë√°nh gi√°:  {lsi_results['num_queries']}")

    print(f"\nüìä TF-IDF Baseline:")
    print(f"  Average Precision@50: {tfidf_results['precision']:.4f}")
    print(f"  Average Recall@50:    {tfidf_results['recall']:.4f}")
    print(f"  MAP:                  {tfidf_results['map']:.4f}")
    print(f"  S·ªë queries ƒë√°nh gi√°:  {tfidf_results['num_queries']}")

    # Relative change of LSI over the baseline (skipped when baseline MAP is 0).
    print(f"\nüîç So s√°nh hi·ªáu su·∫•t:")
    baseline_map = tfidf_results['map']
    if baseline_map > 0:
        baseline_precision = tfidf_results['precision']
        map_improvement = (lsi_results['map'] - baseline_map) / baseline_map * 100
        precision_improvement = (lsi_results['precision'] - baseline_precision) / baseline_precision * 100

        print(f"  MAP improvement:       {map_improvement:+.2f}%")
        print(f"  Precision improvement: {precision_improvement:+.2f}%")

        verdict = (
            "  ‚úÖ LSI model c√≥ hi·ªáu su·∫•t t·ªët h∆°n TF-IDF baseline"
            if map_improvement > 0
            else "  ‚ö†Ô∏è  TF-IDF baseline c√≥ hi·ªáu su·∫•t t·ªët h∆°n LSI model"
        )
        print(verdict)

    # Plot the 11-point interpolated precision-recall curves of both models.
    plt.figure(figsize=(10, 6))
    recall_levels = np.arange(0.0, 1.1, 0.1)
    lsi_curve = lsi_results['interpolated_precisions']
    tfidf_curve = tfidf_results['interpolated_precisions']

    for curve, line_style, series_label in (
        (lsi_curve, 'bo-', 'LSI Model'),
        (tfidf_curve, 'ro-', 'TF-IDF Baseline'),
    ):
        plt.plot(recall_levels, curve, line_style,
                 label=series_label, linewidth=2, markersize=6)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('11-Point Interpolated Precision-Recall Curves')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.xlim(0, 1)
    # Leave a little headroom above the highest point of either curve.
    highest_precision = max(max(lsi_curve), max(tfidf_curve))
    plt.ylim(0, highest_precision + 0.1)

    plt.tight_layout()
    plt.show()

    # Per-recall-level comparison table.
    print("\nüìã 11-Point Interpolated Precision Values:")
    print("Recall | LSI Model | TF-IDF   | Ch√™nh l·ªách")
    print("-------|-----------|----------|----------")
    for recall, lsi_prec, tfidf_prec in zip(recall_levels, lsi_curve, tfidf_curve):
        diff = lsi_prec - tfidf_prec
        print(f"{recall:6.1f} | {lsi_prec:8.4f}  | {tfidf_prec:8.4f} | {diff:+8.4f}")

    print("\n" + banner)
    print("ƒê√ÅNH GI√Å HO√ÄN TH√ÄNH")
    print(banner)

    # Closing remarks about the run.
    print(f"\nüí° Nh·∫≠n x√©t:")
    print(f"- LSI model s·ª≠ d·ª•ng {lsi_model.n_components} components t·ª´ SVD")
    print(f"- Explained variance: {sum(lsi_model.svd.explained_variance_ratio_):.2%}")
    print(f"- LSI c√≥ th·ªÉ t√¨m ra semantic relationships m√† TF-IDF b·ªè l·ª°")
    print(f"- Trade-off: complexity tƒÉng nh∆∞ng c√≥ th·ªÉ c·∫£i thi·ªán recall")
