In [None]:
import os
import json

# Root of the dataset mount on Kaggle; the local fallbacks below apply when it is absent.
KAGGLE_INPUT_PATH = '/kaggle/input'

# Resolved location of each dataset file, keyed by role.
data_paths = dict.fromkeys(('corpus', 'queries'))

# Expected file name -> role it fills in `data_paths`.
_DATASET_FILES = {
    'vn_plagiarism_corpus.json': 'corpus',
    'vn_plagiarism_queries.json': 'queries',
}

# 1) Kaggle: scan the whole input tree; when a name occurs more than once the
#    last occurrence visited by os.walk wins.
if os.path.exists(KAGGLE_INPUT_PATH):
    for root, dirs, files in os.walk(KAGGLE_INPUT_PATH):
        for file in files:
            _role = _DATASET_FILES.get(file)
            if _role is not None:
                data_paths[_role] = os.path.join(root, file)

# 2) Local fallback: prefer the dataset folder, otherwise the working directory.
for _file_name, _role in _DATASET_FILES.items():
    if data_paths[_role] is not None:
        continue
    _in_dataset_dir = 'dataset_plagiarism_detection/' + _file_name
    data_paths[_role] = _in_dataset_dir if os.path.exists(_in_dataset_dir) else _file_name

# Report what was resolved; a missing file is only reported here, not raised.
_report = [
    "Data file paths:",
    f"   Corpus: {data_paths['corpus']}",
    f"   Queries: {data_paths['queries']}",
    f"   Corpus exists: {os.path.exists(data_paths['corpus'])}",
    f"   Queries exists: {os.path.exists(data_paths['queries'])}",
]
for _line in _report:
    print(_line)

In [None]:
import json 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

print("="*60)
print("LOADING VIETNAMESE PLAGIARISM DATASET")
print("="*60)

corpus_data = None
queries_data = None

# Load both JSON files; any failure is reported and then re-raised so the
# notebook stops here instead of failing later with a confusing error.
try:
    with open(data_paths['corpus'], 'r', encoding='utf-8') as f:
        corpus_data = json.load(f)
    with open(data_paths['queries'], 'r', encoding='utf-8') as f:
        queries_data = json.load(f)

    # A file containing the JSON literal `null` parses successfully to None.
    if corpus_data is None or queries_data is None:
        raise ValueError("Data loaded but is None")

    print(f"Successfully loaded data!")
    print(f"   Corpus: {len(corpus_data)} documents")
    print(f"   Queries: {len(queries_data)} queries")

except FileNotFoundError as e:
    print(f"ERROR: Data files not found!")
    print(f"   {e}")
    print(f"\nInstructions:")
    print(f"   1. Upload vn_plagiarism_corpus.json and vn_plagiarism_queries.json")
    print(f"   2. Or add them as a Kaggle Dataset")
    print(f"   3. Restart the kernel")
    raise
except Exception as e:
    print(f"ERROR loading data: {e}")
    raise

# Count queries per plagiarism type; a missing or null type counts as 'unknown'.
plagiarism_counts = {}
for query in queries_data:
    ptype = query.get('plagiarism_type', 'unknown')
    if ptype is None:
        ptype = 'unknown'
    plagiarism_counts[ptype] = plagiarism_counts.get(ptype, 0) + 1

print(f"\nPlagiarism type distribution:")
total_queries = len(queries_data)
for ptype, count in sorted(plagiarism_counts.items(), key=lambda x: x[1], reverse=True):
    # Every type other than the literal 'original' is reported as plagiarism.
    is_plag = "Plagiarism" if ptype != "original" else "Original"
    percentage = (count / total_queries * 100) if total_queries else 0
    print(f"   {is_plag:<12} | {ptype:<20} | {count:>4} ({percentage:>5.1f}%)")

# Word counts per record. Records whose 'text' is missing or not a string are
# left out rather than crashing on `.split()`.
query_lengths = [len(q['text'].split()) for q in queries_data if isinstance(q.get('text'), str)]
corpus_lengths = [len(c['text'].split()) for c in corpus_data if isinstance(c.get('text'), str)]

print(f"\nText length statistics (words):")
for label, lengths in (("Query ", query_lengths), ("Corpus", corpus_lengths)):
    if lengths:
        print(f"   {label} - Min: {min(lengths):>4}, Max: {max(lengths):>4}, Avg: {np.mean(lengths):>6.1f}")
    else:
        # min()/max() raise ValueError on an empty sequence; report it instead.
        print(f"   {label} - no records with a usable 'text' field")

print(f"\n{'='*60}")
print("Data ready for next step!")
print("="*60)

In [None]:
print("Installing required libraries...")

!pip install -q -U sentence-transformers
!pip install -q faiss-cpu
!pip install -q xgboost

print("Installation completed!")

In [None]:
import json
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import re
import time
import warnings

from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score
)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

print("Import thu vien thanh cong!")

# Plotting defaults: matplotlib's stock style and seaborn's 'husl' colour palette.
plt.style.use('default')
sns.set_palette('husl')

In [None]:
# Identifier of the sentence-embedding model passed to SentenceTransformer below.
model_name = "bkai-foundation-models/vietnamese-bi-encoder"

print("="*60)
print("LOADING BI-ENCODER MODEL")
print("="*60)
print(f"Model: {model_name}")

# Load the model and report its sequence limit and embedding size.
# Any failure is printed and re-raised so the notebook does not continue
# without `bi_encoder`.
try:
    bi_encoder = SentenceTransformer(model_name)
    print("Load mo hinh thanh cong!")
    print(f"   Max sequence length: {bi_encoder.max_seq_length}")
    print(f"   Embedding dimension: {bi_encoder.get_sentence_embedding_dimension()}")
except Exception as e:
    print(f"Loi khi tai mo hinh '{model_name}': {e}")
    raise   

In [None]:
# CHUNKING: split the corpus into chunks and cache them on disk

import re
import pickle
import os

class TextChunker:
    """Split raw document text into chunk dictionaries.

    Each chunk is a dict with the keys ``chunk_id`` (``"<doc_id>_chunk_<i>"``),
    ``doc_id``, ``text``, ``position`` (0-based index within the document) and
    ``length`` (whitespace-separated word count).

    Args:
        chunk_type: ``"adaptive"`` (default), ``"paragraph"``; any other value
            selects sentence chunking.
        max_chunk_words: Word budget per chunk. It is applied by paragraph
            chunking only; sentence chunking emits one chunk per sentence
            regardless of length.
    """

    def __init__(self, chunk_type="adaptive", max_chunk_words=100):
        self.chunk_type = chunk_type  # "sentence", "paragraph", or "adaptive"
        self.max_chunk_words = max_chunk_words  # Max words per chunk

    @staticmethod
    def _split_sentences(text):
        """Split on runs of '.', '!' or '?'; terminators are dropped and empty pieces removed."""
        return [piece.strip() for piece in re.split(r'[.!?]+', text) if piece.strip()]

    @staticmethod
    def _build_chunks(texts, doc_id):
        """Wrap each text in a chunk dict, numbering positions from 0."""
        return [
            {
                'chunk_id': f"{doc_id}_chunk_{position}",
                'doc_id': doc_id,
                'text': chunk_text,
                'position': position,
                'length': len(chunk_text.split())
            }
            for position, chunk_text in enumerate(texts)
        ]

    def chunk_text(self, text, doc_id):
        """Chunk one document.

        In adaptive mode, texts longer than 500 words use paragraph chunking
        and shorter texts use sentence chunking.

        Args:
            text: Document text. ``None``, non-string or blank input yields ``[]``.
            doc_id: Identifier copied into every chunk and used in ``chunk_id``.

        Returns:
            List of chunk dicts in document order (possibly empty).
        """
        # Guard: records with a missing/null text must not crash the pipeline.
        if not isinstance(text, str) or not text.strip():
            return []

        word_count = len(text.split())

        if self.chunk_type == "adaptive":
            if word_count > 500:
                return self._chunk_by_paragraph(text, doc_id)
            return self._chunk_by_sentence(text, doc_id)

        if self.chunk_type == "paragraph":
            return self._chunk_by_paragraph(text, doc_id)

        return self._chunk_by_sentence(text, doc_id)

    def _chunk_by_sentence(self, text, doc_id):
        """Fine-grained chunking: one chunk per sentence (terminators removed)."""
        return self._build_chunks(self._split_sentences(text), doc_id)

    def _chunk_by_paragraph(self, text, doc_id):
        """Coarse-grained chunking driven by ``max_chunk_words``.

        1. Split on blank lines or on runs of four or more whitespace characters.
        2. Consecutive paragraphs are merged while their combined length stays
           within ``max_chunk_words``.
        3. A paragraph longer than ``max_chunk_words`` is split into sentences,
           which are then packed greedily up to the budget. A single sentence
           that alone exceeds the budget is kept whole, so such a chunk can be
           larger than ``max_chunk_words``.
        """
        paragraphs = [p.strip() for p in re.split(r'\n\n+|\s{4,}', text) if p.strip()]

        chunk_texts = []
        pending = []        # paragraphs accumulated for the chunk being built
        pending_words = 0

        for para in paragraphs:
            para_words = len(para.split())

            if para_words > self.max_chunk_words:
                # Oversized paragraph: flush what is pending, then pack its sentences.
                if pending:
                    chunk_texts.append(' '.join(pending))
                    pending = []
                    pending_words = 0

                packed = []
                packed_words = 0
                for sentence in self._split_sentences(para):
                    sentence_words = len(sentence.split())
                    if packed and packed_words + sentence_words > self.max_chunk_words:
                        chunk_texts.append(' '.join(packed))
                        packed = [sentence]
                        packed_words = sentence_words
                    else:
                        packed.append(sentence)
                        packed_words += sentence_words
                if packed:
                    chunk_texts.append(' '.join(packed))

            elif pending_words + para_words <= self.max_chunk_words:
                # Still within budget: merge into the chunk being built.
                pending.append(para)
                pending_words += para_words

            else:
                # Budget exceeded: close the current chunk and start a new one.
                if pending:
                    chunk_texts.append(' '.join(pending))
                pending = [para]
                pending_words = para_words

        if pending:
            chunk_texts.append(' '.join(pending))

        return self._build_chunks(chunk_texts, doc_id)

    def chunk_corpus(self, corpus_data):
        """Chunk every document of the corpus.

        Args:
            corpus_data: Iterable of dicts with an ``id`` and a ``text`` key.
                Documents whose ``text`` is missing or not a string are skipped.

        Returns:
            Flat list of chunk dicts, documents in input order.
        """
        all_chunks = []

        for doc in corpus_data:
            text = doc.get('text')
            if not isinstance(text, str):
                continue
            all_chunks.extend(self.chunk_text(text, doc['id']))

        return all_chunks

# ===============================================
# SAVE/LOAD CORPUS CHUNKS
# ===============================================
# Pickle file that caches the chunk list between runs.
CORPUS_CHUNKS_FILE = 'corpus_chunks.pkl'

# A single chunker instance serves the (re)build below and any later use.
chunker = TextChunker()
corpus_chunks = None

if os.path.exists(CORPUS_CHUNKS_FILE):
    print("="*60)
    print("üìÇ LOADING SAVED CORPUS CHUNKS")
    print("="*60)

    start_time = time.time()
    # NOTE: unpickling executes arbitrary code; only load a cache file that
    # this notebook wrote itself.
    with open(CORPUS_CHUNKS_FILE, 'rb') as f:
        corpus_chunks = pickle.load(f)
    end_time = time.time()

    # A cache built from a different corpus refers to documents that are not
    # loaded now; using it would silently mix two datasets, so rebuild.
    known_doc_ids = {doc['id'] for doc in corpus_data}
    if any(chunk['doc_id'] not in known_doc_ids for chunk in corpus_chunks):
        print("WARNING: cached chunks reference documents missing from the loaded corpus; rebuilding...")
        corpus_chunks = None
    else:
        print(f"‚úÖ Loaded {len(corpus_chunks)} chunks")
        print(f"‚è±Ô∏è  Time: {end_time - start_time:.2f}s")
        # max(..., 1) avoids ZeroDivisionError on an empty corpus.
        print(f"   Avg chunks/doc: {len(corpus_chunks)/max(len(corpus_data), 1):.1f}")

if corpus_chunks is None:
    print("="*60)
    print("üî® CREATING NEW CORPUS CHUNKS")
    print("="*60)

    # Build the chunks from the loaded corpus.
    corpus_chunks = chunker.chunk_corpus(corpus_data)

    print(f"‚úÖ Created {len(corpus_chunks)} chunks from {len(corpus_data)} documents")
    print(f"   Avg: {len(corpus_chunks)/max(len(corpus_data), 1):.1f} chunks/doc")

    # Persist the chunks so the next run can skip chunking.
    print("\nüíæ Saving corpus chunks...")
    with open(CORPUS_CHUNKS_FILE, 'wb') as f:
        pickle.dump(corpus_chunks, f, protocol=pickle.HIGHEST_PROTOCOL)

    file_size = os.path.getsize(CORPUS_CHUNKS_FILE) / 1024 / 1024
    print(f"‚úÖ Saved to {CORPUS_CHUNKS_FILE} (~{file_size:.1f} MB)")

# Show the first few chunks as a sanity check.
print("\nüìù Sample chunks:")
for i, chunk in enumerate(corpus_chunks[:3]):
    print(f"{i+1}. [{chunk['chunk_id']}] {chunk['text'][:60]}...")

print("\n" + "="*60)
print("‚úÖ CORPUS CHUNKS READY!")
print("="*60)

In [None]:
# Create embeddings for the corpus chunks

import os
import pickle
import numpy as np
import faiss

# ===============================================
# FILE PATHS
# ===============================================
# Cache artifacts written and read by the load-or-create step below.
EMBEDDINGS_FILE = 'chunk_embeddings_normalized.npy'  # normalized chunk embeddings (np.save / np.load)
FAISS_INDEX_FILE = 'chunk_faiss_index.faiss'  # serialized FAISS index
CHUNK_METADATA_FILE = 'chunk_metadata.pkl'  # pickled dict holding the chunk ids and related metadata

# ===============================================
# HELPER: VALIDATE EMBEDDINGS
# ===============================================
def validate_embeddings(embeddings, name="embeddings"):
    """Check that an embedding matrix is usable and print a short summary.

    Args:
        embeddings: Array-like of shape (n_vectors, dim).
        name: Label used in the printed messages.

    Returns:
        bool: ``False`` when the input is empty, is not 2-D, or contains NaN
        or Inf values; ``True`` otherwise. Near-zero-norm rows only trigger a
        warning and do not fail validation.
    """
    embeddings = np.asarray(embeddings)

    # The reductions below (min/max, norm over axis 1) raise on empty or
    # non-2-D input, so reject it up front with a clear message.
    if embeddings.ndim != 2 or embeddings.size == 0:
        print(f"ERROR: {name} is empty or not a 2-D array (shape: {embeddings.shape})!")
        return False

    # Check for NaN
    nan_count = np.isnan(embeddings).sum()
    if nan_count > 0:
        print(f"‚ùå ERROR: {name} contains {nan_count} NaN values!")
        return False

    # Check for Inf
    inf_count = np.isinf(embeddings).sum()
    if inf_count > 0:
        print(f"‚ùå ERROR: {name} contains {inf_count} Inf values!")
        return False

    # Rows with a (near-)zero norm cannot be normalized meaningfully.
    norms = np.linalg.norm(embeddings, axis=1)
    zero_norm_count = (norms < 1e-8).sum()
    if zero_norm_count > 0:
        print(f"‚ö†Ô∏è  WARNING: {zero_norm_count} embeddings have near-zero norm!")

    # Summary of shape, value range and norm range.
    min_val = embeddings.min()
    max_val = embeddings.max()
    print(f"‚úÖ {name} validation:")
    print(f"   Shape: {embeddings.shape}")
    print(f"   Range: [{min_val:.6f}, {max_val:.6f}]")
    print(f"   Norm: [{norms.min():.6f}, {norms.max():.6f}]")

    return True

# ===============================================
# LOAD OR CREATE
# ===============================================
def _discard_embedding_cache(message):
    """Delete the three cache files and abort, so that a re-run rebuilds them."""
    for cache_file in (EMBEDDINGS_FILE, FAISS_INDEX_FILE, CHUNK_METADATA_FILE):
        if os.path.exists(cache_file):
            os.remove(cache_file)
    raise ValueError(message)


if (os.path.exists(EMBEDDINGS_FILE) and
    os.path.exists(FAISS_INDEX_FILE) and
    os.path.exists(CHUNK_METADATA_FILE)):

    print("="*60)
    print("üìÇ LOADING SAVED EMBEDDINGS + FAISS INDEX")
    print("="*60)

    start_time = time.time()

    # Load embeddings
    chunk_embeddings_normalized = np.load(EMBEDDINGS_FILE)
    print(f"‚úÖ Loaded embeddings: {chunk_embeddings_normalized.shape}")

    # Reject a cache that contains NaN/Inf values.
    if not validate_embeddings(chunk_embeddings_normalized, "Loaded embeddings"):
        print("\n‚ö†Ô∏è  CORRUPTED FILE DETECTED!")
        print("   Deleting and recreating...")
        _discard_embedding_cache("Corrupted files deleted. Please re-run this cell.")

    # Load FAISS index
    embedding_dim = chunk_embeddings_normalized.shape[1]
    chunk_faiss_index = faiss.read_index(FAISS_INDEX_FILE)
    print(f"‚úÖ Loaded FAISS index: {chunk_faiss_index.ntotal} vectors")

    # Load metadata.
    # NOTE: unpickling executes arbitrary code; only load a file this notebook wrote.
    with open(CHUNK_METADATA_FILE, 'rb') as f:
        metadata = pickle.load(f)
    chunk_ids = metadata['chunk_ids']
    print(f"‚úÖ Loaded metadata: {len(chunk_ids)} chunk IDs")

    # Row i of the embeddings / FAISS index must describe corpus_chunks[i].
    # A cache built from another corpus or chunking would otherwise map
    # search hits to the wrong chunks without any error.
    expected_chunk_ids = [chunk['chunk_id'] for chunk in corpus_chunks]
    cache_is_consistent = (
        chunk_embeddings_normalized.shape[0] == chunk_faiss_index.ntotal == len(expected_chunk_ids)
        and chunk_faiss_index.d == embedding_dim
        and list(chunk_ids) == expected_chunk_ids
    )
    if not cache_is_consistent:
        print("\nSTALE CACHE DETECTED: cached embeddings do not match the current corpus chunks.")
        print("   Deleting and recreating...")
        _discard_embedding_cache("Stale embedding cache deleted. Please re-run this cell.")

    # Smoke-test the index with the first vector as the query.
    print(f"\nüß™ Testing FAISS index...")
    test_query = chunk_embeddings_normalized[0:1].astype('float32')
    test_similarities, test_indices = chunk_faiss_index.search(
        test_query, k=max(1, min(5, chunk_faiss_index.ntotal))
    )

    if np.isfinite(test_similarities).all():
        print(f"‚úÖ FAISS test passed! Similarity range: [{test_similarities[0].min():.6f}, {test_similarities[0].max():.6f}]")
    else:
        print(f"‚ùå FAISS index CORRUPTED! Deleting...")
        _discard_embedding_cache("Corrupted FAISS index deleted. Please re-run this cell.")

    end_time = time.time()
    print(f"\n‚è±Ô∏è  Loading time: {end_time - start_time:.2f}s")
    print(f"üöÄ Ready!")

else:
    print("="*60)
    print("üî® CREATING NEW EMBEDDINGS + FAISS INDEX")
    print("="*60)

    # Extract texts and ids in the same order, so rows stay aligned with corpus_chunks.
    chunk_texts = [chunk['text'] for chunk in corpus_chunks]
    chunk_ids = [chunk['chunk_id'] for chunk in corpus_chunks]

    if not chunk_texts:
        raise ValueError("No corpus chunks to embed; check the corpus and the chunking step.")

    print(f"üìä Creating embeddings for {len(chunk_texts)} chunks...")

    # Create embeddings
    start_time = time.time()
    chunk_embeddings = bi_encoder.encode(
        chunk_texts,
        show_progress_bar=True,
        batch_size=32,
        convert_to_numpy=True
    )

    print(f"\nüîç Validating raw embeddings...")
    if not validate_embeddings(chunk_embeddings, "Raw embeddings"):
        raise ValueError("Raw embeddings are invalid! Check bi_encoder model.")

    # L2-normalize, so that inner product equals cosine similarity.
    print(f"\nüîÑ Normalizing embeddings...")
    norms = np.linalg.norm(chunk_embeddings, axis=1, keepdims=True)

    # Replace zero norms with 1.0 to avoid division by zero
    zero_norm_mask = (norms < 1e-8)
    if zero_norm_mask.any():
        print(f"‚ö†Ô∏è  Found {zero_norm_mask.sum()} zero-norm embeddings, fixing...")
        norms[zero_norm_mask] = 1.0

    chunk_embeddings_normalized = chunk_embeddings / norms

    # Validate normalized embeddings
    print(f"\nüîç Validating normalized embeddings...")
    if not validate_embeddings(chunk_embeddings_normalized, "Normalized embeddings"):
        raise ValueError("Normalized embeddings are invalid!")

    end_time = time.time()
    print(f"\n‚úÖ Embeddings created and validated!")
    print(f"   Time: {end_time - start_time:.2f}s")

    # Create FAISS index
    print("\nüîß Creating FAISS index...")
    embedding_dim = chunk_embeddings_normalized.shape[1]

    # Exact (brute-force) inner-product index.
    chunk_faiss_index = faiss.IndexFlatIP(embedding_dim)

    # The index is fed float32 vectors; convert explicitly.
    vectors_to_add = chunk_embeddings_normalized.astype('float32')

    # Final validation before adding
    if not np.isfinite(vectors_to_add).all():
        raise ValueError("Non-finite values detected before adding to FAISS!")

    chunk_faiss_index.add(vectors_to_add)

    print(f"‚úÖ FAISS index created: {chunk_faiss_index.ntotal} vectors")

    # Smoke-test the index with the first vector as the query.
    print("\nüß™ Testing FAISS index...")
    test_query = chunk_embeddings_normalized[0:1].astype('float32')
    test_similarities, test_indices = chunk_faiss_index.search(
        test_query, k=max(1, min(5, chunk_faiss_index.ntotal))
    )

    print(f"   Test similarities: {test_similarities[0][:5]}")

    if np.isfinite(test_similarities).all():
        print(f"‚úÖ FAISS index test PASSED!")
    else:
        print(f"‚ùå FAISS index test FAILED!")
        raise ValueError("FAISS index is corrupted!")

    # Save everything
    print("\nüíæ Saving to disk...")

    # Save embeddings
    np.save(EMBEDDINGS_FILE, chunk_embeddings_normalized)
    emb_size = os.path.getsize(EMBEDDINGS_FILE) / 1024 / 1024
    print(f"‚úÖ Saved: {EMBEDDINGS_FILE} (~{emb_size:.1f} MB)")

    # Save FAISS index
    faiss.write_index(chunk_faiss_index, FAISS_INDEX_FILE)
    faiss_size = os.path.getsize(FAISS_INDEX_FILE) / 1024 / 1024
    print(f"‚úÖ Saved: {FAISS_INDEX_FILE} (~{faiss_size:.1f} MB)")

    # Save metadata; the model name comes from the cell that loaded bi_encoder,
    # so it cannot drift from the model actually used.
    metadata = {
        'chunk_ids': chunk_ids,
        'num_chunks': len(chunk_ids),
        'embedding_dim': embedding_dim,
        'model_name': model_name
    }
    with open(CHUNK_METADATA_FILE, 'wb') as f:
        pickle.dump(metadata, f)
    print(f"‚úÖ Saved: {CHUNK_METADATA_FILE}")

    print(f"\nüéâ All files saved! Next run will be faster!")

print("\n" + "="*60)
print("‚úÖ CHUNK EMBEDDINGS + FAISS INDEX READY!")
print("="*60)

In [None]:
# Document scoring with soft (logistic) normalization to reduce score saturation
class DocumentScorer:
    """Aggregate chunk-level similarities into one score per source document.

    Args:
        corpus_chunks: List of chunk dicts (``chunk_id``, ``doc_id``,
            ``position``, ...). Search results refer to chunks by their index
            in this list.
        weights: Optional dict overriding the default feature weights. It must
            contain the same keys as the default.
        score_center: Raw score that maps to a final score of 0.5.
        score_scale: Steepness of the logistic squashing.
    """

    def __init__(self, corpus_chunks, weights=None, score_center=0.55, score_scale=4.0):
        self.corpus_chunks = corpus_chunks
        self.weights = weights or {
            'doc_max': 0.40,
            'doc_mean': 0.20,
            'doc_count': 0.15,
            'doc_contiguous': 0.10,
            'doc_coverage': 0.10,
            'chunk_density': 0.05,
            'span_penalty': 0.05
        }
        self.score_center = score_center
        self.score_scale = score_scale
        self.chunk_map = {chunk['chunk_id']: chunk for chunk in corpus_chunks}
        # doc_id -> all chunks of that document (used for coverage ratios).
        self.doc_chunks_map = {}
        for chunk in corpus_chunks:
            self.doc_chunks_map.setdefault(chunk['doc_id'], []).append(chunk)

    def calculate_doc_scores(self, top_k_results):
        """Score every document that owns at least one retrieved chunk.

        Args:
            top_k_results: Iterable of ``(similarity, chunk_idx)`` pairs, where
                ``chunk_idx`` indexes ``corpus_chunks``. Indices outside
                ``[0, len(corpus_chunks))`` are ignored; this covers the ``-1``
                that FAISS returns when it has fewer than ``k`` results.

        Returns:
            List of per-document dicts sorted by ``final_score`` (descending).
            ``final_score`` is the logistic transform of ``raw_score``.
        """
        n_chunks = len(self.corpus_chunks)
        doc_similarities = {}
        for similarity, chunk_idx in top_k_results:
            chunk_idx = int(chunk_idx)
            # A negative index would silently wrap around to the end of the list.
            if not 0 <= chunk_idx < n_chunks:
                continue
            chunk = self.corpus_chunks[chunk_idx]
            doc_id = chunk['doc_id']
            doc_similarities.setdefault(doc_id, []).append({
                'similarity': similarity,
                'chunk': chunk,
                'chunk_idx': chunk_idx
            })

        doc_scores = []
        for doc_id, chunk_sims in doc_similarities.items():
            similarities = [cs['similarity'] for cs in chunk_sims]
            doc_max = max(similarities)
            doc_mean = np.mean(similarities)
            total_doc_chunks = len(self.doc_chunks_map[doc_id])
            # Share of the document's chunks that were retrieved.
            coverage_ratio = min(len(similarities) / total_doc_chunks, 1.0)
            doc_count = coverage_ratio  # same value, kept under the older feature name

            positions = sorted(cs['chunk']['position'] for cs in chunk_sims)
            doc_contiguous, max_group_len = self._calculate_contiguous_score(positions)
            # Extent of the document covered between the first and last hit.
            span = positions[-1] - positions[0] + 1 if len(positions) > 1 else 1
            span_ratio = min(span / total_doc_chunks, 1.0)
            # How densely the hits fill that extent.
            chunk_density = min(coverage_ratio / max(span_ratio, 1e-6), 1.0)
            chunk_similarity_std = float(np.std(similarities)) if len(similarities) > 1 else 0.0

            raw_score = (
                self.weights['doc_max'] * doc_max +
                self.weights['doc_mean'] * doc_mean +
                self.weights['doc_count'] * doc_count +
                self.weights['doc_contiguous'] * doc_contiguous +
                self.weights['doc_coverage'] * coverage_ratio +
                self.weights['chunk_density'] * chunk_density -
                self.weights['span_penalty'] * (1.0 - span_ratio)
            )

            # Logistic squashing centred on score_center.
            logistic_input = self.score_scale * (raw_score - self.score_center)
            final_score = 1.0 / (1.0 + np.exp(-logistic_input))

            doc_scores.append({
                'doc_id': doc_id,
                'doc_max': doc_max,
                'doc_mean': doc_mean,
                'doc_count': doc_count,
                'doc_contiguous': doc_contiguous,
                'final_score': final_score,
                'raw_score': raw_score,
                'chunk_similarity_std': chunk_similarity_std,
                'position_span_ratio': span_ratio,
                'chunk_density': chunk_density,
                'contiguous_len': max_group_len,
                'chunks': chunk_sims,
                'num_chunks': len(chunk_sims)
            })

        doc_scores.sort(key=lambda x: x['final_score'], reverse=True)
        return doc_scores

    def _calculate_contiguous_score(self, positions):
        """Measure how much of the hits form one contiguous run.

        Args:
            positions: Sorted chunk positions of the hits within one document.

        Returns:
            ``(score, longest_run)``. Neighbouring positions at most 2 apart
            belong to the same run. ``score`` is the longest run divided by the
            number of hits, and is ``0.0`` when there are fewer than two hits.
        """
        if len(positions) <= 1:
            return 0.0, len(positions)
        contiguous_groups = []
        current_group = [positions[0]]
        for i in range(1, len(positions)):
            if positions[i] - positions[i-1] <= 2:
                current_group.append(positions[i])
            else:
                contiguous_groups.append(len(current_group))
                current_group = [positions[i]]
        contiguous_groups.append(len(current_group))
        max_contiguous = max(contiguous_groups)
        score = min(max_contiguous / len(positions), 1.0)
        return score, max_contiguous


# Shared scorer instance built over the corpus chunks, using the default weights.
doc_scorer = DocumentScorer(corpus_chunks)
print("Document Scorer ready!")
print(f" Weights: {doc_scorer.weights}")

In [None]:
 
# BI-ENCODER PIPELINE
# Purpose: find the top 100 corpus chunks most similar to the chunks of the query

def bi_encoder_search_chunked(query_text, query_id="query_temp", top_k=100):
    """Retrieve the corpus chunks most similar to any chunk of the query.

    The query is chunked with the global ``chunker`` and embedded with the
    global ``bi_encoder``. It is then compared against the global
    ``chunk_embeddings_normalized`` by dot product. Each corpus chunk is scored
    by its maximum similarity over all query chunks.

    Args:
        query_text: Raw query text.
        query_id: Identifier used for the temporary query chunks.
        top_k: Maximum number of corpus chunks to return; values <= 0 yield ``[]``.

    Returns:
        List of ``(similarity, chunk_idx)`` tuples sorted by similarity
        (descending). ``chunk_idx`` is the row index into the corpus
        embeddings. Returns ``[]`` when the query yields no chunks, no query
        embedding is valid, or the corpus is empty.
    """
    # Step 1: split the query into chunks.
    query_chunks = chunker.chunk_text(query_text, doc_id=query_id)
    print(f"   Query ƒë∆∞·ª£c chia th√†nh {len(query_chunks)} chunks")

    # Nothing to embed. Bail out before the axis-1 norm below, which fails on
    # an empty embedding array.
    if not query_chunks:
        print(f"   ‚ùå No valid query embeddings!")
        return []

    # Step 2: embed all query chunks.
    query_texts = [chunk['text'] for chunk in query_chunks]
    query_embeddings = bi_encoder.encode(query_texts, convert_to_numpy=True)

    # L2-normalize; zero norms are replaced by 1.0 to avoid division by zero.
    query_norms = np.linalg.norm(query_embeddings, axis=1, keepdims=True)
    query_norms[query_norms < 1e-8] = 1.0
    query_embeddings_normalized = query_embeddings / query_norms

    # Drop query chunks whose embedding contains NaN/Inf.
    if not np.isfinite(query_embeddings_normalized).all():
        print(f"   ‚ö†Ô∏è Some query chunks have invalid embeddings!")
        valid_mask = np.isfinite(query_embeddings_normalized).all(axis=1)
        query_embeddings_normalized = query_embeddings_normalized[valid_mask]
        print(f"   Keeping {valid_mask.sum()}/{len(query_chunks)} valid query chunks")

    if len(query_embeddings_normalized) == 0:
        print(f"   ‚ùå No valid query embeddings!")
        return []

    # Step 3: similarity of every query chunk against every corpus chunk.
    # Shape: (num_query_chunks, num_corpus_chunks).
    # copy=False avoids duplicating the whole corpus matrix on every call when
    # it is already float32.
    similarity_matrix = np.dot(
        query_embeddings_normalized.astype('float32', copy=False),
        chunk_embeddings_normalized.T.astype('float32', copy=False)
    )

    # Replace any non-finite similarity with a neutral value.
    if not np.isfinite(similarity_matrix).all():
        print(f"   ‚ö†Ô∏è Similarity matrix contains invalid values!")
        similarity_matrix = np.nan_to_num(similarity_matrix, nan=0.0, posinf=1.0, neginf=0.0)

    # Step 4: aggregate per corpus chunk with MAX over query chunks. A corpus
    # chunk matching ANY part of the query is a plagiarism candidate.
    corpus_scores = np.max(similarity_matrix, axis=0)  # Shape: (num_corpus_chunks,)

    # Empty corpus: the min/max reductions below would raise.
    if corpus_scores.size == 0:
        print(f"   ‚ùå No valid results found!")
        return []

    print(f"   Computed similarity v·ªõi {len(corpus_scores)} corpus chunks")
    print(f"   Similarity range: [{corpus_scores.min():.6f}, {corpus_scores.max():.6f}]")

    # Step 5: keep the top-K corpus chunks. Clamp at 0 so a negative top_k
    # cannot turn into an "all but the last" slice.
    top_k_actual = max(0, min(top_k, len(corpus_scores)))
    top_k_indices = np.argsort(corpus_scores)[::-1][:top_k_actual]

    # Convert to the result format: [(similarity, chunk_idx), ...]
    results = [(float(corpus_scores[idx]), int(idx)) for idx in top_k_indices]

    print(f"   Selected top {len(results)} corpus chunks")

    return results

# Smoke test of the retrieval + document scoring pipeline on a single query.
# The sample index is arbitrary; it is clamped so that datasets with fewer
# queries still run. queries_data must be non-empty.
TEST_QUERY_INDEX = 33
test_query = queries_data[min(TEST_QUERY_INDEX, len(queries_data) - 1)]

# Label fields are optional in the data, so read them defensively.
test_is_plagiarism = bool(test_query.get('is_plagiarism'))
test_source_doc_id = test_query.get('source_doc_id')

print(f"üß™ TEST BI-ENCODER PIPELINE WITH QUERY CHUNKING")
print(f"="*60)
print(f"Query ID: {test_query['id']}")
print(f"Query: {test_query['text'][:100]}...")
print(f"Query length: {len(test_query['text'].split())} words")
print(f"True label: {'PLAGIARISM' if test_is_plagiarism else 'ORIGINAL'}")
print(f"True type: {test_query.get('plagiarism_type')}")

# Retrieve the 100 corpus chunks most similar to any chunk of the query.
print(f"\nüîç Chunking query and searching top 100 similar corpus chunks...")
bi_results = bi_encoder_search_chunked(test_query['text'], query_id=test_query['id'], top_k=100)

if len(bi_results) > 0:
    print(f"\n‚úÖ Found {len(bi_results)} unique chunks")
    print(f"   Similarity range: [{bi_results[-1][0]:.6f}, {bi_results[0][0]:.6f}]")

    # Aggregate chunk hits into per-document scores.
    doc_scores = doc_scorer.calculate_doc_scores(bi_results)
    print(f"\nüìä Document scores (Top 10):")
    print("="*60)

    for i, doc_score in enumerate(doc_scores[:10]):
        print(f"{i+1}. Doc: {doc_score['doc_id']} | Chunks: {doc_score['num_chunks']}")
        print(f"    Final: {doc_score['final_score']:.3f}")
        print(f"    Max: {doc_score['doc_max']:.3f} | Mean: {doc_score['doc_mean']:.3f}")
        print(f"    Count: {doc_score['doc_count']:.3f} | Contiguous: {doc_score['doc_contiguous']:.3f}")
        print()

    print(f"üéØ Best match: {doc_scores[0]['doc_id']} (score: {doc_scores[0]['final_score']:.3f})")

    # For a plagiarised query, check where its true source document was ranked.
    if test_is_plagiarism and test_source_doc_id:
        source_rank = next(
            (rank for rank, doc_score in enumerate(doc_scores, start=1)
             if doc_score['doc_id'] == test_source_doc_id),
            None
        )

        if source_rank:
            print(f"‚úÖ Source document {test_source_doc_id} found at rank {source_rank}")
        else:
            print(f"‚ùå Source document {test_source_doc_id} NOT in top results")
    elif test_is_plagiarism:
        # Labelled as plagiarism but without a source id: not an original text.
        print("   Plagiarised query without a 'source_doc_id'; source rank cannot be checked")
    else:
        print(f"‚ÑπÔ∏è  This is an original text (no plagiarism)")
else:
    print(f"‚ùå No valid results found!")

print("="*60)


In [None]:
# EXPAND CONTEXT FOR CHUNKS
# (Kept for looking up a chunk's surrounding context when debugging; it no longer feeds the cross-encoder)
class ContextExpander:
    """Look up the neighbouring chunks of a chunk and pick the best chunks per document."""

    def __init__(self, corpus_chunks, corpus_data):
        """Index documents by id and group chunks per document, ordered by position."""
        self.corpus_chunks = corpus_chunks
        self.corpus_data = corpus_data

        # doc_id -> full document text
        self.doc_text_map = {}
        for document in corpus_data:
            key = document['id']
            self.doc_text_map[key] = document['text']

        # doc_id -> chunks of that document, sorted by their position
        grouped = {}
        for piece in corpus_chunks:
            grouped.setdefault(piece['doc_id'], []).append(piece)
        self.doc_chunks_map = {
            key: sorted(pieces, key=lambda piece: piece['position'])
            for key, pieces in grouped.items()
        }

    def expand_chunk_context(self, chunk, context_window=1):
        """Return the chunk's text joined with up to ``context_window`` neighbours on each side.

        When no chunk of the document has the same position, the chunk's own
        text is returned unchanged.
        """
        owner = chunk['doc_id']
        wanted = chunk['position']
        siblings = self.doc_chunks_map[owner]

        for idx, sibling in enumerate(siblings):
            if sibling['position'] == wanted:
                first = max(0, idx - context_window)
                last = min(len(siblings), idx + context_window + 1)
                return " ".join(neighbour['text'] for neighbour in siblings[first:last])

        return chunk['text']

    def get_best_chunks_per_doc(self, doc_scores, top_n_docs=10, chunks_per_doc=1):
        """Collect the highest-similarity chunks of the first ``top_n_docs`` documents.

        ``chunks_per_doc=-1`` keeps every retrieved chunk of a document.
        Each entry carries the chunk plus the document-level scores.
        """
        picked = []
        for entry in doc_scores[:top_n_docs]:
            ranked = sorted(entry['chunks'], key=lambda hit: hit['similarity'], reverse=True)
            if chunks_per_doc != -1:
                ranked = ranked[:chunks_per_doc]
            picked.extend(
                {
                    'doc_id': entry['doc_id'],
                    'chunk': hit['chunk'],
                    'chunk_similarity': hit['similarity'],
                    'doc_final_score': entry['final_score'],
                    'doc_max': entry['doc_max'],
                    'doc_mean': entry['doc_mean'],
                    'doc_count': entry['doc_count'],
                    'doc_contiguous': entry['doc_contiguous']
                }
                for hit in ranked
            )
        return picked

    def get_best_chunk_per_doc(self, doc_scores, top_n=15):
        """Shortcut: the single best chunk of each of the first ``top_n`` documents."""
        return self.get_best_chunks_per_doc(doc_scores, top_n_docs=top_n, chunks_per_doc=1)

# Shared expander instance built over the corpus chunks and the raw corpus documents.
context_expander = ContextExpander(corpus_chunks, corpus_data)
print("‚úÖ Context Expander ready!")

In [None]:

class CompletePlagiarismDetector:
    """End-to-end plagiarism detector built on bi-encoder chunk similarity.

    :meth:`detect` chunks the query, embeds and L2-normalises the chunks,
    retrieves the most similar corpus chunks (FAISS search or a dense dot
    product), turns them into document scores with ``doc_scorer`` and compares
    the best document's ``final_score`` against ``threshold``.
    """

    def __init__(self, bi_encoder, chunk_faiss_index, corpus_chunks, corpus_data,
                 doc_scorer, context_expander,
                 query_chunker=None, max_query_chunks=10, threshold=0.7,
                 chunk_embeddings=None):
        """Store the pipeline components.

        Args:
            bi_encoder: Object whose ``encode(texts, show_progress_bar=...,
                convert_to_numpy=...)`` returns one embedding row per text.
            chunk_faiss_index: Optional index exposing ``ntotal`` and
                ``search``; only used when ``detect(use_faiss=True)``.
            corpus_chunks: Corpus chunk records (stored, not read here).
            corpus_data: Corpus documents (stored, not read here).
            doc_scorer: Object with ``calculate_doc_scores(bi_results)``.
            context_expander: Object with ``get_best_chunk_per_doc``.
            query_chunker: Object with ``chunk_text(text, doc_id=...)``;
                defaults to ``TextChunker()``.
            max_query_chunks: Upper bound on query chunks that get embedded;
                a falsy value disables the cap.
            threshold: Minimum best-document ``final_score`` to flag plagiarism.
            chunk_embeddings: Optional matrix of normalised corpus chunk
                embeddings (one row per chunk) for the dense path. When omitted
                the notebook-level ``chunk_embeddings_normalized`` is used.
        """
        self.bi_encoder = bi_encoder
        self.chunk_faiss_index = chunk_faiss_index  # optional, not required for dot-product path
        self.corpus_chunks = corpus_chunks
        self.corpus_data = corpus_data
        self.doc_scorer = doc_scorer
        self.context_expander = context_expander
        self.query_chunker = query_chunker if query_chunker is not None else TextChunker()
        self.max_query_chunks = max_query_chunks
        self.threshold = threshold
        self.chunk_embeddings = chunk_embeddings

    def _empty_result(self, word_count):
        """Result returned when the query yields no chunks to embed."""
        return {
            'prediction': False,
            'confidence': 0.0,
            'threshold': self.threshold,
            'best_match': None,
            'top_results': [],
            'doc_scores': [],
            'method': 'bi-encoder',
            'query_chunks': 0,
            'query_words': word_count,
            'corpus_matches': 0
        }

    def _corpus_embeddings(self):
        """Return the corpus embedding matrix used by the dense path."""
        explicit = getattr(self, 'chunk_embeddings', None)
        if explicit is not None:
            return explicit
        try:
            # Backward-compatible fallback to the notebook-level global.
            return chunk_embeddings_normalized
        except NameError:
            raise NameError(
                "chunk_embeddings_normalized is not defined: pass chunk_embeddings=... "
                "to CompletePlagiarismDetector or run the embedding cell first"
            ) from None

    def _search_faiss(self, q_emb_norm, top_k):
        """Search the FAISS index once per query chunk; keep the best score per corpus chunk."""
        per_chunk_k = min(top_k, self.chunk_faiss_index.ntotal)
        if per_chunk_k <= 0:
            return []

        # Rows containing NaN/inf cannot be searched meaningfully; skip them.
        finite_rows = np.isfinite(q_emb_norm).all(axis=1)
        queries = np.ascontiguousarray(q_emb_norm[finite_rows], dtype='float32')
        if len(queries) == 0:
            return []

        sims, idxs = self.chunk_faiss_index.search(queries, per_chunk_k)
        sims = np.nan_to_num(sims, nan=0.0, posinf=1.0, neginf=0.0)

        best_by_chunk = {}
        for sim, corpus_idx in zip(sims.ravel().tolist(), idxs.ravel().tolist()):
            # FAISS pads missing neighbours with label -1; treating it as an
            # index would silently alias the last corpus chunk.
            if corpus_idx < 0:
                continue
            if corpus_idx not in best_by_chunk or sim > best_by_chunk[corpus_idx]:
                best_by_chunk[corpus_idx] = sim

        results = [(sim, int(idx)) for idx, sim in best_by_chunk.items()]
        results.sort(reverse=True, key=lambda x: x[0])
        return results[:top_k]

    def _search_dense(self, q_emb_norm, top_k):
        """Score every corpus chunk with a dot product; return the top_k as (sim, idx)."""
        corpus_emb = self._corpus_embeddings()
        # copy=False avoids duplicating the corpus matrix on every query when
        # it is already float32.
        similarity_matrix = np.dot(
            q_emb_norm.astype('float32', copy=False),
            corpus_emb.T.astype('float32', copy=False),
        )
        if not np.isfinite(similarity_matrix).all():
            similarity_matrix = np.nan_to_num(similarity_matrix, nan=0.0, posinf=1.0, neginf=0.0)
        corpus_scores = np.max(similarity_matrix, axis=0)  # max over query chunks
        top_k_actual = max(0, min(top_k, len(corpus_scores)))
        top_k_indices = np.argsort(corpus_scores)[::-1][:top_k_actual]
        return [(float(corpus_scores[idx]), int(idx)) for idx in top_k_indices]

    def detect(self, query_text, top_k=100, top_n_docs=15, use_faiss=False, verbose=False):
        """Run the detection pipeline on one query text.

        Args:
            query_text: Text to check.
            top_k: Maximum number of corpus chunks passed on to document scoring.
            top_n_docs: Number of top documents reported in 'top_results'.
            use_faiss: Use the FAISS index when one was supplied; otherwise
                (or when no index is set) use the dense dot-product path.
            verbose: Print a step-by-step trace.

        Returns:
            Dict with keys 'prediction', 'confidence', 'threshold',
            'best_match', 'top_results', 'doc_scores' (first 5 documents),
            'method', 'query_chunks', 'query_words' and 'corpus_matches'.

        Raises:
            NameError: On the dense path when no ``chunk_embeddings`` was given
                and ``chunk_embeddings_normalized`` is not defined.
        """
        if verbose:
            print("="*60)
            print("PLAGIARISM DETECTION PIPELINE (BI-ENCODER ONLY)")
            print("="*60)

        # Step 1: chunk query
        query_chunks = self.query_chunker.chunk_text(query_text, doc_id="query")
        if self.max_query_chunks and len(query_chunks) > self.max_query_chunks:
            query_chunks = query_chunks[:self.max_query_chunks]
        query_texts = [c['text'] for c in query_chunks]
        word_count = len(query_text.split())

        if len(query_texts) == 0:
            return self._empty_result(word_count)

        if verbose:
            print(f"Step 1: Query words: {word_count}, chunks: {len(query_texts)}")

        # Step 2: embed query chunks and normalise each row to unit length
        q_emb = self.bi_encoder.encode(query_texts, show_progress_bar=False, convert_to_numpy=True)
        q_norms = np.linalg.norm(q_emb, axis=1, keepdims=True)
        q_norms[q_norms < 1e-8] = 1.0  # avoid dividing by (near-)zero norms
        q_emb_norm = q_emb / q_norms

        if use_faiss and getattr(self, 'chunk_faiss_index', None) is not None:
            bi_results = self._search_faiss(q_emb_norm, top_k)
        else:
            bi_results = self._search_dense(q_emb_norm, top_k)

        if verbose:
            if len(bi_results) > 0:
                print(f"Step 2: Selected top {len(bi_results)} corpus chunks | sim range [{bi_results[-1][0]:.6f}, {bi_results[0][0]:.6f}]")
            else:
                print("Step 2: No matching corpus chunks found")

        # Step 3: document scoring
        doc_scores = self.doc_scorer.calculate_doc_scores(bi_results)
        if verbose:
            print(f"Step 3: Scored {len(doc_scores)} documents")

        # Step 4: select best chunk per doc (only reported in the verbose trace)
        best_chunks = self.context_expander.get_best_chunk_per_doc(doc_scores, top_n=top_n_docs)
        if verbose:
            print(f"Step 4: Selected {len(best_chunks)} best chunks from top-{top_n_docs} docs")

        # Final: use document final_score as confidence
        best_doc = doc_scores[0] if len(doc_scores) > 0 else None
        confidence = float(best_doc['final_score']) if best_doc is not None else 0.0
        is_plagiarism = confidence >= self.threshold

        if verbose:
            print("\n" + "="*60)
            print("‚≠ê FINAL RESULT:")
            print(f"   Prediction: {'PLAGIARISM' if is_plagiarism else 'ORIGINAL'}")
            print(f"   Confidence: {confidence:.3f}")
            print(f"   Threshold: {self.threshold}")
            if best_doc:
                print(f"   Best match doc: {best_doc['doc_id']} (final_score: {best_doc['final_score']:.3f})")
            print("="*60)

        return {
            'prediction': bool(is_plagiarism),
            'confidence': confidence,
            'threshold': self.threshold,
            'best_match': best_doc,
            'top_results': doc_scores[:top_n_docs],
            'doc_scores': doc_scores[:5],
            'method': 'bi-encoder',
            'query_chunks': len(query_chunks),
            'query_words': word_count,
            'corpus_matches': len(bi_results)
        }


# Announce the detector-construction step.
print("="*60)
print("CREATING PLAGIARISM DETECTOR")
print("="*60)

# Wire the notebook-level components into one detector instance. The FAISS
# index is passed in, but detect() only uses it when called with use_faiss=True.
complete_detector_v2 = CompletePlagiarismDetector(
    bi_encoder=bi_encoder,
    chunk_faiss_index=chunk_faiss_index,
    corpus_chunks=corpus_chunks,
    corpus_data=corpus_data,
    doc_scorer=doc_scorer,
    context_expander=context_expander,
    query_chunker=chunker,
    max_query_chunks=10,
    threshold=0.6   # adjust as needed (overrides the class default of 0.7)
)

# Echo the effective configuration so the run is self-describing.
print("‚úÖ Complete Plagiarism Detector (bi-encoder) ready!")
print(f"\nüìä Configuration:")
print(f"   Threshold: {complete_detector_v2.threshold}")
print(f"   Max query chunks: {complete_detector_v2.max_query_chunks}")
print(f"   Method: {complete_detector_v2.__class__.__name__} -> bi-encoder only")

In [None]:
# STEP 13A: ML CLASSIFIER CLASS

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle
import os

class MLClassifier:
    """
    Machine-learning classifier for plagiarism detection.

    Uses document-level features taken from the bi-encoder / document-scoring
    result, plus coverage statistics over the top-ranked documents.
    """

    def __init__(self, model_type='xgboost'):
        """Create an untrained classifier.

        Args:
            model_type: Only ``'xgboost'`` is supported.

        Raises:
            ValueError: For any other ``model_type``.
        """
        self.model_type = model_type
        if model_type == 'xgboost':
            self.model = XGBClassifier(
                n_estimators=120,
                max_depth=6,
                learning_rate=0.1,
                subsample=0.9,
                colsample_bytree=0.9,
                random_state=42,
                eval_metric='logloss'
            )
        else:
            raise ValueError(f"Model type {model_type} not supported")

        # Order matters: extract_features() returns values in this order.
        self.feature_names = [
            'best_final',
            'raw_score',
            'doc_max',
            'doc_mean',
            'doc_count',
            'doc_contiguous',
            'chunk_similarity_std',
            'position_span_ratio',
            'chunk_density',
            'top_final_std',
            'top_final_gap',
            'best_chunks_ratio'
        ]
        self.is_trained = False

    @staticmethod
    def _as_float(value, default=0.0):
        """Convert ``value`` to float, mapping ``None`` to ``default``."""
        return float(default) if value is None else float(value)

    def extract_features(self, result):
        """Turn a detector result dict into the feature vector.

        Args:
            result: Dict with 'best_match' (dict or None) and 'top_results'
                (list of per-document dicts), as returned by the detector.

        Returns:
            A list of floats ordered like ``self.feature_names``; all zeros
            when there is no best match or no top results. Missing or ``None``
            values count as 0.0 ('raw_score' falls back to 'final_score').
        """
        best_match = result.get('best_match')
        top_results = result.get('top_results', []) or []
        if not best_match or len(top_results) == 0:
            return [0.0] * len(self.feature_names)

        as_float = self._as_float
        best_final = as_float(best_match.get('final_score'))
        raw_score = as_float(best_match.get('raw_score'), best_final)
        doc_max = as_float(best_match.get('doc_max'))
        doc_mean = as_float(best_match.get('doc_mean'))
        doc_count = as_float(best_match.get('doc_count'))
        doc_contiguous = as_float(best_match.get('doc_contiguous'))
        chunk_similarity_std = as_float(best_match.get('chunk_similarity_std'))
        position_span_ratio = as_float(best_match.get('position_span_ratio'))
        chunk_density = as_float(best_match.get('chunk_density'))

        # Spread of final scores across the top documents and the gap between
        # the first two entries.
        final_scores = [as_float(d.get('final_score')) for d in top_results]
        top_final_std = float(np.std(final_scores)) if len(final_scores) > 1 else 0.0
        top_final_gap = float(final_scores[0] - final_scores[1]) if len(final_scores) > 1 else 0.0

        # Share of the matched chunks that belong to the best document.
        best_num_chunks = as_float(best_match.get('num_chunks'))
        total_top_chunks = float(sum(as_float(d.get('num_chunks')) for d in top_results))
        if total_top_chunks <= 0:
            total_top_chunks = 1.0  # avoid division by zero
        best_chunks_ratio = float(best_num_chunks / total_top_chunks)

        return [
            best_final,
            raw_score,
            doc_max,
            doc_mean,
            doc_count,
            doc_contiguous,
            chunk_similarity_std,
            position_span_ratio,
            chunk_density,
            top_final_std,
            top_final_gap,
            best_chunks_ratio
        ]

    def train(self, detector, queries_data, test_size=0.2, verbose=True):
        """Run the detector on every query, then fit and evaluate the model.

        Args:
            detector: Object whose ``detect(text, verbose=False)`` returns a
                result dict accepted by :meth:`extract_features`.
            queries_data: Iterable of dicts with 'text' and 'is_plagiarism'.
            test_size: Fraction held out by the stratified train/test split.
            verbose: Print progress, metrics and feature importances.

        Returns:
            Dict with train/test accuracy and F1, the split data and the
            predictions on both splits.
        """
        if verbose:
            print(f"Extracting features t·ª´ {len(queries_data)} queries...")
        X, y = [], []
        for i, query in enumerate(queries_data):
            if verbose and (i + 1) % 20 == 0:
                print(f"   Progress: {i+1}/{len(queries_data)}")
            result = detector.detect(query['text'], verbose=False)
            X.append(self.extract_features(result))
            y.append(query['is_plagiarism'])

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )
        if verbose:
            print(f"\nüìä Dataset split:")
            print(f"   Train: {len(X_train)} samples")
            print(f"   Test:  {len(X_test)} samples")
            print(f"\nüî® Training {self.model_type.upper()} classifier...")

        self.model.fit(X_train, y_train)

        train_pred = self.model.predict(X_train)
        test_pred = self.model.predict(X_test)
        train_acc = accuracy_score(y_train, train_pred)
        test_acc = accuracy_score(y_test, test_pred)
        train_f1 = f1_score(y_train, train_pred)
        test_f1 = f1_score(y_test, test_pred)
        self.is_trained = True

        if verbose:
            print(f"\n‚úÖ Training completed!")
            print(f"   Train Accuracy: {train_acc*100:.2f}% | F1: {train_f1*100:.2f}%")
            print(f"   Test  Accuracy: {test_acc*100:.2f}% | F1: {test_f1*100:.2f}%")

        # Honour verbose=False: the importance table is diagnostic output too.
        if verbose and hasattr(self.model, 'feature_importances_'):
            print(f"\n‚≠ê Feature Importance:")
            for name, imp in sorted(zip(self.feature_names, self.model.feature_importances_),
                                    key=lambda x: x[1], reverse=True):
                print(f"   {name:<20} {imp:>6.3f}")

        return {
            'train_accuracy': train_acc,
            'test_accuracy': test_acc,
            'train_f1': train_f1,
            'test_f1': test_f1,
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test,
            'train_pred': train_pred,
            'test_pred': test_pred
        }

    def predict(self, result):
        """Classify one detector result.

        Args:
            result: Detector result dict (see :meth:`extract_features`).

        Returns:
            Dict with 'prediction' (bool), 'confidence' (probability of the
            positive class), 'method' and the named 'features'.

        Raises:
            ValueError: If the model has not been trained.
        """
        if not self.is_trained:
            raise ValueError("Model ch∆∞a ƒë∆∞·ª£c train! Ch·∫°y .train() tr∆∞·ªõc.")
        features = self.extract_features(result)
        prediction = self.model.predict([features])[0]
        confidence = self.model.predict_proba([features])[0][1]
        return {
            'prediction': bool(prediction),
            'confidence': float(confidence),
            'method': 'ml_classifier',
            'features': dict(zip(self.feature_names, features))
        }

    def save(self, filepath='ml_classifier.pkl'):
        """Pickle the whole classifier object to ``filepath``.

        Raises:
            ValueError: If the model has not been trained.
        """
        if not self.is_trained:
            raise ValueError("Model ch∆∞a ƒë∆∞·ª£c train! Kh√¥ng th·ªÉ l∆∞u.")
        with open(filepath, 'wb') as f:
            pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)
        file_size = os.path.getsize(filepath) / 1024 / 1024
        print(f"‚úÖ ƒê√£ l∆∞u ML classifier v√†o: {filepath}")
        print(f"   File size: {file_size:.2f} MB")
        print(f"   Features: {len(self.feature_names)}")

    @staticmethod
    def load(filepath='ml_classifier.pkl'):
        """Load a classifier previously written by :meth:`save`.

        Raises:
            FileNotFoundError: If ``filepath`` does not exist.
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"File kh√¥ng t·ªìn t·∫°i: {filepath}")
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only load files this notebook wrote itself, never untrusted input.
        with open(filepath, 'rb') as f:
            classifier = pickle.load(f)
        print(f"‚úÖ ƒê√£ load ML classifier t·ª´: {filepath}")
        print(f"   Model type: {classifier.model_type}")
        print(f"   Is trained: {classifier.is_trained}")
        print(f"   Features: {len(classifier.feature_names)}")
        return classifier

# Confirmation banner printed once the MLClassifier class definition has run.
print("="*60)
print("‚úÖ MLClassifier class created!")
print("="*60)
print("="*60)

In [None]:
# STEP 13B: TRAIN ML CLASSIFIER (WITH SAVE/LOAD)

import os
import time  # time.time() is used below; imported here so the cell runs on its own

ML_CLASSIFIER_FILE = 'ml_classifier.pkl'

# Bind the detector that produces the features; fail loudly when the detector
# cell has not been run in this kernel.
try:
    active_detector = complete_detector_v2
    print("‚úÖ S·ª≠ d·ª•ng CompletePlagiarismDetectorV2 (bi-encoder + doc scoring)")
except NameError:
    raise ValueError("DETECTOR NOT FOUND!\n  Variable 'complete_detector_v2' does not exist!\n")

if os.path.exists(ML_CLASSIFIER_FILE):
    # Cached model on disk: load it instead of re-running feature extraction.
    print("\n" + "="*60)
    print("üìÇ LOADING SAVED ML CLASSIFIER")
    print("="*60)
    start_time = time.time()
    ml_classifier = MLClassifier.load(ML_CLASSIFIER_FILE)
    end_time = time.time()
    print(f"‚è±Ô∏è  Time: {end_time - start_time:.2f}s")
else:
    print("\n" + "="*60)
    print("üî® TRAINING NEW ML CLASSIFIER")
    print("="*60)
    ml_classifier = MLClassifier(model_type='xgboost')
    try:
        training_results = ml_classifier.train(
            detector=active_detector,
            queries_data=queries_data,
            test_size=0.2,
            verbose=True
        )
        print(f"\n{'='*60}")
        print("‚úÖ ML Classifier trained successfully!")
        print("="*60)
        print("\nüíæ Saving ML classifier...")
        ml_classifier.save(ML_CLASSIFIER_FILE)
        print("üéâ Next run will load instantly!")
    except Exception as e:
        # Best effort: keep the notebook running; later cells check
        # ml_classifier.is_trained before using the model.
        print(f"\n‚ùå Error during training: {e}")
        import traceback
        traceback.print_exc()
        print("\n‚ö†Ô∏è  SKIPPING ML CLASSIFIER TRAINING")

# Only claim readiness when a trained model is actually available.
print("\n" + "="*60)
if ml_classifier.is_trained:
    print("‚úÖ ML CLASSIFIER READY!")
else:
    print("WARNING: ML CLASSIFIER IS NOT TRAINED (training failed above)")
print("="*60)

In [None]:
# STEP 13C: QUICK EVALUATION OF THE ML CLASSIFIER (NO THRESHOLD)
print("ML CLASSIFIER")
print("="*60)

if ml_classifier.is_trained:
    # Spot-check every 10th query up to index 90, keeping only indices that
    # exist so that a smaller query file does not raise IndexError.
    # NOTE(review): these queries may overlap the data the classifier was
    # trained on, so treat this as a sanity check, not a held-out estimate.
    test_indices = [idx for idx in range(0, 100, 10) if idx < len(queries_data)]
    detailed_indices = {0, 20, 40, 60, 80}  # queries printed in full
    comparison_results = []

    print(f"\n Testing tr√™n {len(test_indices)} queries...")
    print("-"*60)

    for idx in test_indices:
        query = queries_data[idx]
        result = active_detector.detect(query['text'], verbose=False)

        ml_result = ml_classifier.predict(result)
        ml_pred = ml_result['prediction']
        ml_conf = ml_result['confidence']
        true_label = query['is_plagiarism']
        ml_correct = (ml_pred == true_label)

        comparison_results.append({
            'query_id': query['id'],
            'plag_type': query['plagiarism_type'],
            'true_label': true_label,
            'ml_pred': ml_pred,
            'ml_conf': ml_conf,
            'ml_correct': ml_correct
        })

        if idx in detailed_indices:
            print(f"\nQuery {query['id']} ({query['plagiarism_type']}):")
            print(f"  True: {'PLAGIARISM' if true_label else 'ORIGINAL'}")
            print(f"  ML Classifier: {'PLAGIARISM' if ml_pred else 'ORIGINAL'} (conf: {ml_conf:.3f}) {'‚úÖ' if ml_correct else '‚ùå'}")

    if comparison_results:
        ml_accuracy = sum(r['ml_correct'] for r in comparison_results) / len(comparison_results)
        print(f"\n{'='*60}")
        print(" ML CLASSIFIER SUMMARY")
        print("="*60)
        print(f"Accuracy tr√™n t·∫≠p nh·ªè: {ml_accuracy*100:.2f}%")

        wrong_cases = [r for r in comparison_results if not r['ml_correct']]
        if wrong_cases:
            print(f"\nC√°c tr∆∞·ªùng h·ª£p ƒëo√°n sai ({len(wrong_cases)} cases):")
            for case in wrong_cases:
                print(f"  Query {case['query_id']} ({case['plag_type']}): true={'PLAG' if case['true_label'] else 'ORG'}, pred={'PLAG' if case['ml_pred'] else 'ORG'}, conf={case['ml_conf']:.3f}")
        else:
            print("\nML classifier ƒëo√°n ƒë√∫ng to√†n b·ªô c√°c m·∫´u th·ª≠ n√†y!")
    else:
        # Empty query set: nothing to average, so skip the summary.
        print("\nNo queries available for the quick check.")
else:
    print(" ML Classifier ch∆∞a ƒë∆∞·ª£c train!")

In [None]:
# STEP 13D: FULL EVALUATION WITH THE ML CLASSIFIER
import time  # time.time() is used below; imported here so the cell runs on its own

print(" FULL EVALUATION: ML CLASSIFIER")
print("="*60)

if ml_classifier.is_trained:
    # NOTE(review): this scores every query, which may include samples the
    # classifier was fitted on; the numbers are then optimistic compared with
    # the held-out split reported by train().
    print(f" Evaluating tr√™n to√†n b·ªô {len(queries_data)} queries...")
    print("(Qu√° tr√¨nh n√†y m·∫•t ~30-60 gi√¢y...)")

    ml_predictions = []
    true_labels_full = []
    ml_confidences = []

    start_time = time.time()

    for i, query in enumerate(queries_data):
        if (i + 1) % 20 == 0:
            print(f"   Progress: {i+1}/{len(queries_data)}")

        result = active_detector.detect(query['text'], verbose=False)
        ml_result = ml_classifier.predict(result)

        ml_predictions.append(int(ml_result['prediction']))
        true_labels_full.append(int(query['is_plagiarism']))
        ml_confidences.append(float(ml_result['confidence']))

    end_time = time.time()

    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

    ml_acc = accuracy_score(true_labels_full, ml_predictions)
    ml_prec = precision_score(true_labels_full, ml_predictions)
    ml_rec = recall_score(true_labels_full, ml_predictions)
    ml_f1 = f1_score(true_labels_full, ml_predictions)
    # Pin both classes so the matrix is always 2x2, even when only one class
    # occurs in the labels and predictions (the table below indexes [0..1][0..1]).
    cm_ml = confusion_matrix(true_labels_full, ml_predictions, labels=[0, 1])

    print(f"\n Evaluation completed! Time: {end_time - start_time:.2f}s")
    print(f"\n{'='*60}")
    print(" ML CLASSIFIER PERFORMANCE")
    print("="*60)
    print(f"Accuracy : {ml_acc*100:.2f}%")
    print(f"Precision: {ml_prec*100:.2f}%")
    print(f"Recall   : {ml_rec*100:.2f}%")
    print(f"F1-Score : {ml_f1*100:.2f}%")

    print(f"\nConfusion Matrix:")
    print(f"                 Predicted")
    print(f"              Original  Plagiarism")
    print(f"Actual Original     {cm_ml[0][0]:3d}       {cm_ml[0][1]:3d}")
    print(f"       Plagiarism   {cm_ml[1][0]:3d}       {cm_ml[1][1]:3d}")

    # One row per query; reused by the visualization cell.
    ml_results_df = pd.DataFrame({
        'query_id': [q['id'] for q in queries_data],
        'true_label': true_labels_full,
        'ml_pred': ml_predictions,
        'ml_confidence': ml_confidences,
        'plagiarism_type': [q['plagiarism_type'] for q in queries_data],
        'ml_correct': [t == p for t, p in zip(true_labels_full, ml_predictions)]
    })

    ml_results_df.to_csv('ml_results.csv', index=False, encoding='utf-8-sig')
    print(f"\n ƒê√£ l∆∞u k·∫øt qu·∫£ v√†o ml_results.csv (shape: {ml_results_df.shape})")

    print(f"\n{'='*60}")
    print(" PERFORMANCE BY PLAGIARISM TYPE")
    print("="*60)
    print(f"{'Type':<20} {'ML Accuracy':<15}")
    print("-"*35)
    for ptype in ml_results_df['plagiarism_type'].unique():
        subset = ml_results_df[ml_results_df['plagiarism_type'] == ptype]
        ml_acc_type = subset['ml_correct'].mean()
        print(f"{ptype:<20} {ml_acc_type*100:>6.2f}%")

    print(f"\n{'='*60}")
    print("üéâ ML CLASSIFIER EVALUATION COMPLETED!")
    print("="*60)
else:
    print(" ML Classifier ch∆∞a ƒë∆∞·ª£c train!")

In [None]:
# STEP 13E: VISUALIZATION - ML CLASSIFIER (NO THRESHOLD NEEDED)
print(" VISUALIZATION: ML CLASSIFIER")
print("="*60)

if ml_classifier.is_trained and 'ml_results_df' in locals():
    # 2x2 dashboard built from the metrics and frame produced by the full
    # evaluation cell (ml_acc, ml_prec, ml_rec, ml_f1, cm_ml, ml_results_df).
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    (ax_metrics, ax_cm), (ax_by_type, ax_conf) = axes

    # Panel 1: headline metrics as bars with their values written above.
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
    ml_scores = [ml_acc, ml_prec, ml_rec, ml_f1]
    ax_metrics.bar(metrics, ml_scores, color='coral', alpha=0.85)
    ax_metrics.set_ylabel('Score')
    ax_metrics.set_title('ML Classifier Performance Metrics')
    ax_metrics.set_ylim([0, 1.05])
    ax_metrics.grid(axis='y', alpha=0.3)
    for bar_pos, bar_value in enumerate(ml_scores):
        ax_metrics.text(bar_pos, bar_value + 0.02, f'{bar_value:.3f}', ha='center', fontsize=10)

    # Panel 2: confusion matrix heatmap.
    import seaborn as sns
    class_labels = ['Original', 'Plagiarism']
    sns.heatmap(
        cm_ml,
        annot=True,
        fmt='d',
        cmap='Reds',
        xticklabels=class_labels,
        yticklabels=class_labels,
        cbar=False,
        ax=ax_cm,
        linewidths=1,
        linecolor='gray',
    )
    ax_cm.set_title(f'Confusion Matrix (Accuracy: {ml_acc*100:.2f}%)')
    ax_cm.set_xlabel('Predicted')
    ax_cm.set_ylabel('Actual')

    # Panel 3: accuracy broken down by plagiarism type.
    plag_types = ml_results_df['plagiarism_type'].unique()
    ml_accs_by_type = [
        ml_results_df.loc[ml_results_df['plagiarism_type'] == p, 'ml_correct'].mean()
        for p in plag_types
    ]
    ax_by_type.bar(plag_types, ml_accs_by_type, color='skyblue', alpha=0.85)
    ax_by_type.set_ylabel('Accuracy')
    ax_by_type.set_title('Accuracy by Plagiarism Type')
    ax_by_type.set_ylim([0, 1.05])
    ax_by_type.tick_params(axis='x', rotation=45)
    ax_by_type.grid(axis='y', alpha=0.3)
    for type_name, type_acc in zip(plag_types, ml_accs_by_type):
        ax_by_type.text(type_name, type_acc + 0.02, f'{type_acc:.2f}', ha='center', fontsize=9)

    # Panel 4: histogram of the classifier's confidence values.
    ax_conf.hist(ml_results_df['ml_confidence'], bins=20, color='mediumseagreen', alpha=0.8)
    ax_conf.set_title('ML Confidence Distribution')
    ax_conf.set_xlabel('Confidence')
    ax_conf.set_ylabel('Count')
    ax_conf.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig('ml_classifier_visualization.png', dpi=300, bbox_inches='tight')
    print(" ƒê√£ l∆∞u visualization: ml_classifier_visualization.png")
    plt.show()

    print(f"\n{'='*60}")
    print(" VISUALIZATION COMPLETED!")
    print("="*60)
else:
    print(" Ch∆∞a c√≥ k·∫øt qu·∫£ ML Classifier ƒë·ªÉ visualize! Ch·∫°y B∆∞·ªõc 13D tr∆∞·ªõc.")

In [None]:
# STEP 13F: FEATURE IMPORTANCE VISUALIZATION
print("üìä FEATURE IMPORTANCE ANALYSIS")
print("="*60)

if ml_classifier.is_trained and hasattr(ml_classifier.model, 'feature_importances_'):
    importances = ml_classifier.model.feature_importances_
    feature_names = ml_classifier.feature_names

    # Sort by importance, most important first
    indices = np.argsort(importances)[::-1]
    sorted_features = [feature_names[i] for i in indices]
    sorted_importances = [importances[i] for i in indices]

    # Create figure
    plt.figure(figsize=(10, 6))
    bars = plt.barh(range(len(sorted_features)), sorted_importances, color='lightgreen', edgecolor='darkgreen')
    plt.yticks(range(len(sorted_features)), sorted_features)
    plt.xlabel('Importance Score')
    plt.ylabel('Feature')
    plt.title('Feature Importance in XGBoost Classifier', fontsize=14, fontweight='bold')
    plt.grid(axis='x', alpha=0.3)

    # Add value labels next to each bar
    for i, imp in enumerate(sorted_importances):
        plt.text(imp + 0.005, i, f'{imp:.3f}', va='center', fontsize=10)

    # Fade less important bars. The maximum is computed once, and an all-zero
    # importance vector falls back to the minimum alpha instead of producing
    # NaN (which set_alpha rejects).
    top_importance = max(sorted_importances, default=0.0)
    for bar, imp in zip(bars, sorted_importances):
        relative = float(imp / top_importance) if top_importance > 0 else 0.0
        bar.set_alpha(0.5 + 0.5 * relative)

    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    print("‚úÖ ƒê√£ l∆∞u visualization: feature_importance.png")
    plt.show()

    # Print ranking with a text bar proportional to the importance
    print(f"\nüìà FEATURE IMPORTANCE RANKING:")
    print("-"*60)
    for i, (feature, imp) in enumerate(zip(sorted_features, sorted_importances), 1):
        print(f"{i}. {feature:<20} {imp:.4f} {'‚ñà' * int(imp * 50)}")

    print(f"\n{'='*60}")
    print("‚úÖ FEATURE IMPORTANCE ANALYSIS COMPLETED!")
    print("="*60)

else:
    print("‚ö†Ô∏è  ML Classifier ch∆∞a ƒë∆∞·ª£c train ho·∫∑c kh√¥ng c√≥ feature_importances_!")

In [None]:
# DEMO: HOW TO USE THE DETECTOR + ML CLASSIFIER

example_query = """T√¥i ƒëi h·ªçc"""

# Header banner followed by the query being checked.
for banner_line in ("="*60, "üß™ DEMO: PLAGIARISM DETECTION", "="*60):
    print(banner_line)
print(f"Query text:\n{example_query}\n")

# Stage 1: similarity-based detector.
result = complete_detector_v2.detect(example_query, verbose=False)
best_match_label = result['best_match']['doc_id'] if result['best_match'] else "None"

print(f"üìä DETECTOR RESULT:")
print(f"   Confidence: {result['confidence']:.3f}")
print(f"   Best match: {best_match_label}")

# Stage 2: ML classifier on top of the detector result (only when trained).
if not ml_classifier.is_trained:
    print("\n‚ö†Ô∏è  ML Classifier ch∆∞a ƒë∆∞·ª£c train!")
else:
    ml_result = ml_classifier.predict(result)
    verdict = 'PLAGIARISM' if ml_result['prediction'] else 'ORIGINAL'
    print(f"\n ML CLASSIFIER RESULT:")
    print(f"   Prediction: {verdict}")
    print(f"   Confidence: {ml_result['confidence']:.3f}")
    print(f" FEATURE VALUES:")
    for feature, value in ml_result['features'].items():
        print(f"   {feature:<20} {value:.4f}")

print(f"\n{'='*60}")
print(" DEMO COMPLETED!")
print("="*60)