In [None]:
# Install the third-party packages used by this notebook (quiet mode).
# NOTE(review): sentence-transformers, scikit-learn, scipy, matplotlib and seaborn are
# imported in the next cell but are not installed here — presumably they are
# preinstalled in the runtime; confirm, or add them to this list.
!pip install transformers torch fuzzywuzzy python-levenshtein pandas numpy tqdm --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/153.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m2.7/3.2 MB[0m [31m79.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
import re
from typing import List, Dict, Tuple, Optional, Set
from collections import defaultdict
from fuzzywuzzy import fuzz
import warnings
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings('ignore')

import json

from tqdm import tqdm
tqdm.pandas()

# Load Dataset

In [None]:
# Read the article dataset (CSV served from a Google Drive download link) into a DataFrame.
download_url = 'https://drive.google.com/uc?id=1VeaXUW5d34siLszKIDZ1xN1NWzLt2kT5'
df = pd.read_csv(download_url)
# Quick sanity check: row count, column names and a preview of the first rows.
print(f"Dataset loaded: {len(df)} articles")
print(f"Columns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())

Dataset loaded: 15240 articles
Columns: ['title', 'content', 'label']

First few rows:
                                               title  \
0  Anak Penjaga Kebersihan Masjid Lantamal 1 Bela...   
1  Wakil Menkeu : Kenaikan Iuran BPJS Tidak Akan ...   
2  Pendukung Jokowi Ngaku Diculik di Masjid, Ini ...   
3  Sebentar Lagi, Minyak Curah Dilarang, Warga Ha...   
4  Ya Tuhan... Hubungan Sedarah di Sumut Meningka...   

                                             content          label  
0  Setelah dihebohkan dengan kemunculan calon tar...      clickbait  
1  Jelang akhir tahun kabar  kenaikan tarif iuran...  non-clickbait  
2  Pengurus Dewan Kemakmuran Masjid (DKM) Al-Fala...      clickbait  
3  Menteri Perdagangan Enggartiasto Lukita melara...      clickbait  
4  Miris, meski dianggap tabu dalam agama maupun ...      clickbait  


# Entity Extractor

In [None]:
class EntityExtractor:
    COMMON_WORDS_NOT_ENTITIES = [
        'ya', 'tuhan', 'ini', 'itu', 'ada', 'tidak', 'bukan',
        'warga', 'pihak', 'orang', 'masyarakat', 'rakyat',
        'akan', 'sudah', 'telah', 'sedang', 'masih',
        'yang', 'dari', 'untuk', 'dengan', 'pada', 'ke', 'di'
    ]

    GENERIC_LOCATION_WORDS = [
        'masjid', 'gedung', 'kantor', 'rumah', 'sekolah',
        'pasar', 'toko', 'mall', 'plaza', 'jalan', 'jln'
    ]

    ROLE_TITLE_EXPANSIONS = {
        'wakil menkeu': 'wakil menteri keuangan',
        'wamenkeu': 'wakil menteri keuangan',
        'menkeu': 'menteri keuangan',
        'wapres': 'wakil presiden',
        'mensesneg': 'menteri sekretaris negara',
        'menhub': 'menteri perhubungan',
        'mendagri': 'menteri dalam negeri',
        'menlu': 'menteri luar negeri',
        'menhan': 'menteri pertahanan',
        'kapolri': 'kepala kepolisian republik indonesia',
        'kapolda': 'kepala kepolisian daerah',
        'kapolres': 'kepala kepolisian resort',
        'danrem': 'komandan resort militer',
        'pangdam': 'panglima komando daerah militer',
        'dirut': 'direktur utama',
        'kadis': 'kepala dinas',
        'wali kota': 'walikota',
        'bupati': 'bupati'
    }

    def __init__(self, model_name: str = "cahya/NusaBert-ner-v1.3"):
        print(f"Loading NER model: {model_name}...")

        self.min_confidence = 0.75

        try:
            from transformers import AutoTokenizer, AutoModelForTokenClassification
            from transformers import pipeline

            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForTokenClassification.from_pretrained(model_name)

            self.ner_pipeline = pipeline(
                "ner",
                model=self.model,
                tokenizer=self.tokenizer,
                aggregation_strategy="simple"
            )

            print(f"NER model {model_name} loaded successfully!")

        except Exception as e:
            print(f"Error loading NER model: {e}")
            print("Fallback: Using rule-based entity extraction...")
            self.ner_pipeline = None

    def extract_entities(self, text: str) -> List[Dict]:
        if not text or pd.isna(text):
            return []

        text = str(text).strip()
        if len(text) == 0:
            return []

        if self.ner_pipeline:
            try:
                # multichunk processing for long texts
                results = self._ner_long_text(text)

                entities = []

                for entity in results:
                    if not self._is_valid_entity(entity):
                        continue

                    entity_type = entity['entity_group']

                    entities.append({
                        'text': entity['word'],
                        'type': entity_type,
                        'score': entity['score']
                    })

                entities = self._deduplicate_entities(entities)
                entities = self._add_role_titles(text, entities)

                return entities

            except Exception as e:
                return self._rule_based_extraction(text)
        else:
            return self._rule_based_extraction(text)

    def _ner_long_text(self, text: str, max_length: int = 510, overlap: int = 50) -> List[Dict]:
        tokens = text.split()

        if len(tokens) <= max_length:
            return self.ner_pipeline(text)

        entities = []
        seen_entities = set()

        for start_idx in range(0, len(tokens), max_length - overlap):
            end_idx = min(start_idx + max_length, len(tokens))
            chunk_tokens = tokens[start_idx:end_idx]
            chunk_text = " ".join(chunk_tokens)

            try:
                chunk_entities = self.ner_pipeline(chunk_text)

                for entity in chunk_entities:
                    entity_key = (entity['word'].lower(), entity['entity_group'])

                    if entity_key not in seen_entities:
                        entities.append(entity)
                        seen_entities.add(entity_key)
                    else:
                        for i, existing in enumerate(entities):
                            if (existing['word'].lower() == entity['word'].lower() and
                                existing['entity_group'] == entity['entity_group']):
                                if entity['score'] > existing['score']:
                                    entities[i] = entity
                                break

            except Exception as e:
                print(f"Error processing chunk at position {start_idx}: {e}")
                continue

            if end_idx >= len(tokens):
                break

        return entities

    def _deduplicate_entities(self, entities: List[Dict]) -> List[Dict]:
        unique_entities = {}

        for entity in entities:
            key = (entity['text'].lower(), entity['type'])

            if key not in unique_entities:
                unique_entities[key] = entity
            else:
                if entity['score'] > unique_entities[key]['score']:
                    unique_entities[key] = entity

        return list(unique_entities.values())

    def _add_role_titles(self, text: str, entities: List[Dict]) -> List[Dict]:
        text_lower = text.lower()

        for abbrev, full_title in self.ROLE_TITLE_EXPANSIONS.items():
            if abbrev in text_lower:
                start_idx = text_lower.find(abbrev)
                actual_text = text[start_idx:start_idx + len(abbrev)]

                if not any(e['text'].lower() == abbrev for e in entities):
                    entities.append({
                        'text': actual_text,
                        'type': 'PER',
                        'score': 0.90,
                        'is_role': True
                    })

        return entities

    def _is_valid_entity(self, entity: Dict) -> bool:
        text = entity['word'].strip()
        text_lower = text.lower()
        score = entity['score']
        entity_type = entity['entity_group'].upper()

        # Filter by confidence threshold
        if score < self.min_confidence:
            return False

        # Filter by common words that are not entities
        if text_lower in self.COMMON_WORDS_NOT_ENTITIES:
            return False

        # Filter by very short entities (< 3 chars) without high confidence
        if len(text) < 3 and score < 0.90:
            return False

        # Filter by generic location words
        if text_lower in self.GENERIC_LOCATION_WORDS:
            return False

        return True

    def _rule_based_extraction(self, text: str) -> List[Dict]:
        entities = []

        pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
        matches = re.finditer(pattern, text)

        for match in matches:
            entity_text = match.group()
            entity_lower = entity_text.lower()

            if entity_lower in self.COMMON_WORDS_NOT_ENTITIES:
                continue

            if entity_lower in self.GENERIC_LOCATION_WORDS:
                continue

            if entity_lower in ['yang', 'ini', 'itu', 'ada']:
                continue

            if any(keyword in entity_lower for keyword in ['pak', 'bu', 'presiden', 'gubernur', 'lurah', 'bupati', 'camat', 'artis', 'aktor']):
                entity_type = 'PER'
            if any(keyword in entity_lower for keyword in ['pt ', 'cv ', 'bank ', 'dinas ', 'kementerian']):
                entity_type = 'ORGANIZATION'
            if any(keyword in entity_lower for keyword in ['kota ', 'kabupaten ', 'provinsi ']):
                entity_type = 'LOCATION'

            entities.append({
                'text': entity_text,
                'type': entity_type,
                'score': 0.5
            })

        print(f"\nExtracted Entities (first 5, rule-based): {entities[:5]}")
        return entities

# Entity Normalization

In [None]:
class EntityNormalizer:
    """String normalisation helpers applied before entity texts are compared."""

    # Role/title abbreviation -> full title. Mirrors
    # EntityExtractor.ROLE_TITLE_EXPANSIONS so that every abbreviation the
    # extractor can emit as an entity is also expanded during matching.
    ROLE_EXPANSIONS = {
        'wakil menkeu': 'wakil menteri keuangan',
        'wamenkeu': 'wakil menteri keuangan',
        'menkeu': 'menteri keuangan',
        'wapres': 'wakil presiden',
        'mensesneg': 'menteri sekretaris negara',
        'menhub': 'menteri perhubungan',
        'mendagri': 'menteri dalam negeri',
        'menlu': 'menteri luar negeri',
        'menhan': 'menteri pertahanan',
        'kapolri': 'kepala kepolisian republik indonesia',
        'kapolda': 'kepala kepolisian daerah',
        'kapolres': 'kepala kepolisian resort',
        'danrem': 'komandan resort militer',
        'pangdam': 'panglima komando daerah militer',
        'dirut': 'direktur utama',
        'kadis': 'kepala dinas',
        'wali kota': 'walikota',
        'bupati': 'bupati'
    }

    # Institution acronym -> full name.
    COMMON_ABBREVIATIONS = {
        'dpr': 'dewan perwakilan rakyat',
        'kpk': 'komisi pemberantasan korupsi',
        'bpjs': 'badan penyelenggara jaminan sosial',
        'tni': 'tentara nasional indonesia',
        'polri': 'kepolisian republik indonesia'
    }

    # Trailing run of upper-case tokens; not referenced by the methods of this class.
    DEGREE_PATTERN = r',?\s*[A-Z]+\.?(?:\s*,?\s*[A-Z]+\.?)*$'

    @staticmethod
    def normalize(text: str, expand_abbreviations: bool = True) -> str:
        """Lower-case ``text``, drop punctuation and collapse whitespace.

        Args:
            text: Entity text; ``None``, NaN and empty values give ``""``.
            expand_abbreviations: When true and the whole cleaned string is a
                key of ``ROLE_EXPANSIONS`` or ``COMMON_ABBREVIATIONS``, the
                expansion is returned instead. Abbreviations embedded in a
                longer string are left as they are.

        Returns:
            The normalised string.
        """
        if not text or pd.isna(text):
            return ""

        text = str(text).lower().strip()
        # Keep word characters, whitespace and hyphens only.
        text = re.sub(r'[^\w\s-]', '', text)
        text = ' '.join(text.split())

        if expand_abbreviations:
            if text in EntityNormalizer.ROLE_EXPANSIONS:
                text = EntityNormalizer.ROLE_EXPANSIONS[text]
            elif text in EntityNormalizer.COMMON_ABBREVIATIONS:
                text = EntityNormalizer.COMMON_ABBREVIATIONS[text]

        return text.strip()

    @staticmethod
    def get_normalized_variants(text: str) -> Set[str]:
        """Return the set of comparable spellings of ``text``.

        The set contains the normalised text without expansion, the
        normalised text with expansion and, when that is non-empty, its
        tokens sorted alphabetically (so word order does not matter).
        """
        variants = set()
        variants.add(EntityNormalizer.normalize(text, expand_abbreviations=False))
        expanded = EntityNormalizer.normalize(text, expand_abbreviations=True)
        variants.add(expanded)
        if expanded:
            variants.add(' '.join(sorted(expanded.split())))
        return variants

    @staticmethod
    def get_tokens(text: str) -> set:
        """Return the set of whitespace-separated tokens of the expanded, normalised text."""
        normalized = EntityNormalizer.normalize(text, expand_abbreviations=True)
        return set(normalized.split()) if normalized else set()

# Entity Matching

In [None]:
class EntityMatcher:
    """Match entities found in a title against entities found in the content."""

    # Sentence-embedding model used to score substring matches.
    SUBSTRING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L6-v2"

    # Loaded on first use and then shared by every call; constructing the
    # model once per match_entities call is needlessly expensive.
    _substring_model = None

    @classmethod
    def _get_substring_model(cls):
        """Return the shared SentenceTransformer, loading it on first use."""
        if cls._substring_model is None:
            cls._substring_model = SentenceTransformer(cls.SUBSTRING_MODEL_NAME)
        return cls._substring_model

    @staticmethod
    def match_entities(title_entity: Dict, content_entities: List[Dict], threshold: float = 0.75) -> Dict:
        """Find the content entity that best corresponds to ``title_entity``.

        Four strategies are tried for every content entity:

        1. exact: the normalised variant sets intersect; returns immediately
           with similarity 1.0.
        2. substring: one normalised text contains the other; scored by the
           cosine similarity of the two sentence embeddings.
        3. token overlap: Jaccard index of the token sets, accepted from 0.6.
        4. fuzzy: ``fuzz.token_set_ratio`` / 100, accepted from ``threshold``.

        For strategies 2-4 a candidate replaces the current best only when
        its similarity is strictly higher.

        Args:
            title_entity: Dict with at least ``'text'`` and ``'type'``.
            content_entities: Dicts with at least ``'text'`` and ``'type'``.
            threshold: Minimum fuzzy score (0-1) for a fuzzy match.

        Returns:
            Dict with keys ``matched``, ``match_type``, ``content_entity``,
            ``similarity`` and ``type_match``.
        """
        title_text = title_entity['text']
        title_type = title_entity['type']

        title_variants = EntityNormalizer.get_normalized_variants(title_text)
        title_norm = EntityNormalizer.normalize(title_text, expand_abbreviations=True)
        title_tokens = EntityNormalizer.get_tokens(title_text)

        best_match = {
            'matched': False,
            'match_type': None,
            'content_entity': None,
            'similarity': 0.0,
            'type_match': False
        }

        # A title entity that normalises to nothing cannot match anything.
        if not title_norm:
            return best_match

        # Title embedding, computed at most once and only if a substring
        # candidate actually shows up.
        title_vec = None

        for content_entity in content_entities:
            content_text = content_entity['text']
            content_type = content_entity['type']

            content_variants = EntityNormalizer.get_normalized_variants(content_text)
            content_norm = EntityNormalizer.normalize(content_text, expand_abbreviations=True)
            content_tokens = EntityNormalizer.get_tokens(content_text)

            if not content_norm:
                continue

            type_match = (title_type == content_type)

            # Method 1 : Exact Word Entity Matching
            if title_variants & content_variants:
                return {
                    'matched': True,
                    'match_type': 'exact',
                    'content_entity': content_entity,
                    'similarity': 1.0,
                    'type_match': type_match
                }

            # Method 2 : Substring Matching
            if title_norm in content_norm or content_norm in title_norm:
                model = EntityMatcher._get_substring_model()
                if title_vec is None:
                    title_vec = model.encode(title_norm)
                content_vec = model.encode(content_norm)

                # float(): store a plain Python number rather than a numpy scalar.
                similarity = float(cosine_similarity([title_vec], [content_vec])[0][0])
                if similarity > best_match['similarity']:
                    best_match = {
                        'matched': True,
                        'match_type': 'substring',
                        'content_entity': content_entity,
                        'similarity': similarity,
                        'type_match': type_match
                    }

            # Method 3 : Token Overlap Matching
            if title_tokens and content_tokens:
                overlap_token = title_tokens & content_tokens
                if overlap_token:
                    jaccard = len(overlap_token) / len(title_tokens | content_tokens)
                    if jaccard >= 0.6 and jaccard > best_match['similarity']:
                        best_match = {
                            'matched': True,
                            'match_type': 'token_overlap',
                            'content_entity': content_entity,
                            'similarity': jaccard,
                            'type_match': type_match
                        }

            # Method 4 : Fuzzy Matching
            fuzzy_score = fuzz.token_set_ratio(title_norm, content_norm) / 100.0
            if fuzzy_score >= threshold and fuzzy_score > best_match['similarity']:
                best_match = {
                    'matched': True,
                    'match_type': 'fuzzy',
                    'content_entity': content_entity,
                    'similarity': fuzzy_score,
                    'type_match': type_match
                }

        return best_match

    @staticmethod
    def compute_match_features(title_entities: List[Dict], content_entities: List[Dict], content_text: str = "") -> Dict:
        """Summarise how well the title entities are covered by the content.

        Args:
            title_entities: Entities extracted from the title.
            content_entities: Entities extracted from the content.
            content_text: Accepted for interface compatibility; not used.

        Returns:
            Dict with ``num_title_entities``, ``num_content_entities``,
            ``num_matched_entities``, ``match_ratio``, ``avg_similarity`` and
            ``entity_consistency_score`` (match ratio times average
            similarity). The same keys are returned when there are no title
            entities, so rows built from this dict always share one schema.
        """
        if not title_entities:
            return {
                'num_title_entities': 0,
                'num_content_entities': len(content_entities),
                'num_matched_entities': 0,
                'match_ratio': 0.0,
                'avg_similarity': 0.0,
                'entity_consistency_score': 0.0
            }

        matches = [
            EntityMatcher.match_entities(title_entity, content_entities)
            for title_entity in title_entities
        ]

        similarities = [m['similarity'] for m in matches if m['matched']]
        num_matched = len(similarities)

        match_ratio = num_matched / len(title_entities)
        avg_similarity = sum(similarities) / len(similarities) if similarities else 0.0

        return {
            'num_title_entities': len(title_entities),
            'num_content_entities': len(content_entities),
            'num_matched_entities': num_matched,
            'match_ratio': match_ratio,
            'avg_similarity': avg_similarity,
            'entity_consistency_score': match_ratio * avg_similarity
        }

# Feature Extraction

In [None]:
class FeatureExtractor:
    @staticmethod
    def extract_features(
        title_entities: List[Dict],
        content_entities: List[Dict],
        matching_results: List[Dict],
        content_text: str
    ) -> Dict:
        num_title_entities = len(title_entities)
        num_content_entities = len(content_entities)

        if num_title_entities == 0:
            return FeatureExtractor._default_features()

        # Count matches
        matched_results = [r for r in matching_results if r['matched']]
        num_matched = len(matched_results)
        num_missing = num_title_entities - num_matched

        # Match ratio
        match_ratio = num_matched / num_title_entities if num_title_entities > 0 else 0.0

        # Similarity scores
        similarities = [r['similarity'] for r in matched_results]
        avg_similarity = np.mean(similarities) if similarities else 0.0
        max_similarity = max(similarities) if similarities else 0.0
        min_similarity = min(similarities) if similarities else 0.0

        # Type matching
        type_matches = [r['type_match'] for r in matched_results]
        type_match_ratio = sum(type_matches) / len(type_matches) if type_matches else 0.0

        # Frequency analysis (count mentions in content)
        mention_counts = FeatureExtractor._count_mentions(
            matched_results, content_text
        )
        avg_mention_freq = np.mean(mention_counts) if mention_counts else 0.0
        max_mention_freq = max(mention_counts) if mention_counts else 0.0

        # Position analysis
        paragraphs = content_text.split('\n')
        in_lead_paragraph = FeatureExtractor._check_lead_paragraph(
            matched_results, paragraphs
        )

        # Entity consistency score (weighted aggregate)
        entity_consistency_score = FeatureExtractor._calculate_consistency_score(
            match_ratio=match_ratio,
            avg_mention_freq=avg_mention_freq,
            in_lead_paragraph=in_lead_paragraph,
            num_missing=num_missing,
            num_title_entities=num_title_entities
        )

        features = {
            'num_title_entities': num_title_entities,
            'num_content_entities': num_content_entities,
            'num_matched_entities': num_matched,
            'num_missing_entities': num_missing,
            'match_ratio': round(match_ratio, 3),
            'type_match_ratio': round(type_match_ratio, 3),
            'avg_similarity': round(avg_similarity, 3),
            'max_similarity': round(max_similarity, 3),
            'min_similarity': round(min_similarity, 3),
            'avg_mention_freq': round(avg_mention_freq, 3),
            'max_mention_freq': max_mention_freq,
            'entities_in_lead': in_lead_paragraph,
            'entity_consistency_score': round(entity_consistency_score, 3)
        }

        return features

    @staticmethod
    def _default_features() -> Dict:
        """Return default features when no entities in title"""
        return {
            'num_title_entities': 0,
            'num_content_entities': 0,
            'num_matched_entities': 0,
            'num_missing_entities': 0,
            'match_ratio': 0.0,
            'type_match_ratio': 0.0,
            'avg_similarity': 0.0,
            'max_similarity': 0.0,
            'min_similarity': 0.0,
            'avg_mention_freq': 0.0,
            'max_mention_freq': 0,
            'entities_in_lead': 0,
            'entity_consistency_score': 0.5
        }

    @staticmethod
    def _count_mentions(matching_results: List[Dict], content_text: str) -> List[int]:
        """Count how many times each matched entity appears in content"""
        content_lower = content_text.lower()
        mention_counts = []

        for result in matching_results:
            if result['matched'] and result['content_entity']:
                entity_text = result['content_entity']['text']
                entity_norm = EntityNormalizer.normalize(entity_text)

                # Count occurrences (case-insensitive)
                count = content_lower.count(entity_norm)
                mention_counts.append(count)

        return mention_counts

    @staticmethod
    def _check_lead_paragraph(matching_results: List[Dict], paragraphs: List[str]) -> int:
        """Check how many matched entities appear in first 2 paragraphs"""
        if len(paragraphs) < 2:
            lead_text = ' '.join(paragraphs).lower()
        else:
            lead_text = ' '.join(paragraphs[:2]).lower()

        count = 0
        for result in matching_results:
            if result['matched'] and result['content_entity']:
                entity_text = result['content_entity']['text']
                entity_norm = EntityNormalizer.normalize(entity_text)

                if entity_norm in lead_text:
                    count += 1

        return count

    @staticmethod
    def _calculate_consistency_score(
        match_ratio: float,
        avg_mention_freq: float,
        in_lead_paragraph: int,
        num_missing: int,
        num_title_entities: int
    ) -> float:
        """
        Calculate entity consistency score

        Formula:
        score = w1*match_ratio + w2*freq_norm + w3*lead_norm - w4*missing_penalty

        Weights:
        w1 = 0.45 (match ratio - most important)
        w2 = 0.25 (mention frequency)
        w3 = 0.20 (lead paragraph presence)
        w4 = 0.30 (missing entities penalty)
        """
        # Normalize frequency (cap at 10 mentions)
        freq_norm = min(avg_mention_freq / 10.0, 1.0)

        # Normalize lead presence
        lead_norm = in_lead_paragraph / num_title_entities if num_title_entities > 0 else 0.0

        # Missing entities penalty
        missing_penalty = num_missing / num_title_entities if num_title_entities > 0 else 0.0

        # Weighted sum
        score = (
            0.50 * match_ratio +
            0.20 * freq_norm +
            0.20 * lead_norm -
            0.40 * missing_penalty
        )

        # Clip to [0, 1]
        score = max(0.0, min(1.0, score))

        return score

# Main Entity Detector

In [None]:
class MainEntityDetector:
    """Pick the most important entity of a title."""

    # Importance weight per entity type label; labels not listed here fall
    # back to 0.5 in get_main_entity.
    TYPE_WEIGHTS = {
        'PER': 1.0,
        'ORG': 0.9,
        'GPE': 0.8,
        'LOC': 0.8,
        'NOR': 0.9,
        'EVT': 0.7,
        'LAW': 0.7,
        'FAC': 0.6,
        'PRD': 0.6,
        'REG': 0.6,
        'WOA': 0.5,
        'LAN': 0.4,
        'MON': 0.3,
        'PRC': 0.3,
        'CRD': 0.3,
        'QTY': 0.3,
        'ORD': 0.2,
        'DAT': 0.2,
        'TIM': 0.2,
    }

    @staticmethod
    def get_main_entity(title: str, entities: List[Dict]) -> Optional[Dict]:
        """Return the highest-scoring entity of ``title``.

        Each entity is scored as
        ``0.40 * position + 0.30 * type weight + 0.30 * prominence`` where
        position favours entities that start early in the title and
        prominence is the share of the title the entity text covers
        (doubled, capped at 1). An entity whose text is not found in the
        title is treated as starting at position 0.

        Args:
            title: Article title. A non-string or empty title is handled
                without raising.
            entities: Dicts with at least ``'text'`` and ``'type'``.

        Returns:
            ``None`` when ``entities`` is empty; otherwise a copy of the
            winning entity extended with ``importance_score``,
            ``position_in_title`` and ``prominence``. The first entity wins ties.
        """
        if not entities:
            return None

        if not isinstance(title, str):
            title = ""

        title_lower = title.lower()
        # Never divide by zero for an empty title.
        title_len = max(len(title), 1)

        scored_entities = []

        for entity in entities:
            entity_text = entity['text']
            entity_type = entity['type']

            # 1. Position score (earlier = better)
            position = title_lower.find(entity_text.lower())
            if position == -1:
                position = 0
            position_score = 1.0 - (position / title_len)

            # 2. Entity type weight
            type_score = MainEntityDetector.TYPE_WEIGHTS.get(entity_type, 0.5)

            # 3. Prominence score (how much of title does entity take)
            prominence = len(entity_text) / title_len
            prominence_score = min(prominence * 2, 1.0)

            final_score = (
                0.40 * position_score +
                0.30 * type_score +
                0.30 * prominence_score
            )

            scored_entities.append({
                'entity': entity,
                'importance_score': final_score,
                'position': position,
                'prominence': prominence
            })

        main_entity_info = max(scored_entities, key=lambda x: x['importance_score'])

        return {
            **main_entity_info['entity'],
            'importance_score': main_entity_info['importance_score'],
            'position_in_title': main_entity_info['position'],
            'prominence': main_entity_info['prominence']
        }


# Main Entity Analyzer

In [None]:
class MainEntityAnalyzer:
    @staticmethod
    def analyze_main_entity(
        main_entity: Dict,
        content: str,
        content_entities: List[Dict],
        normalizer
    ) -> Dict:
        # Check if main entity exists in content entities
        matched_entity = MainEntityAnalyzer._find_entity_match(
            main_entity, content_entities, normalizer
        )

        if not matched_entity:
            return MainEntityAnalyzer._default_analysis(is_present=False)

        # Analyze entity distribution in content
        paragraphs = [p.strip() for p in content.split('\n') if p.strip()]
        if not paragraphs:
            return MainEntityAnalyzer._default_analysis(is_present=True)

        # Get all paragraph numbers where entity appears
        entity_paragraphs = MainEntityAnalyzer._get_entity_paragraphs(
            main_entity, paragraphs, normalizer
        )

        if not entity_paragraphs:
            # Entity matched but not found in text (edge case)
            return MainEntityAnalyzer._default_analysis(is_present=False)

        # Calculate metrics
        num_paragraphs = len(paragraphs)
        num_paragraphs_with_entity = len(entity_paragraphs)

        # Coverage:  percentage of paragraphs containing entity
        coverage = num_paragraphs_with_entity / num_paragraphs

        # Position distribution: average paragraph index
        avg_position = np.mean(entity_paragraphs) / num_paragraphs
        position_score = 1.0 - avg_position

        # Frequency: count all mentions
        frequency = MainEntityAnalyzer._count_mentions(main_entity, content, normalizer)
        freq_normalized = min(frequency / 5.0, 1.0)

        # Lead paragraph presence (first 2 paragraphs)
        in_lead = any(idx < 2 for idx in entity_paragraphs)
        lead_score = 1.0 if in_lead else 0.2

        # Distribution consistency
        if len(entity_paragraphs) > 1:
            position_std = np.std(entity_paragraphs)
            distribution_score = min(position_std / num_paragraphs, 1.0)
        else:
            distribution_score = 0.0

        main_entity_consistency = (
            0.30 * coverage +
            0.25 * position_score +
            0.20 * freq_normalized +
            0.15 * lead_score +
            0.10 * distribution_score
        )

        return {
            'is_present': True,
            'main_entity_text': main_entity['text'],
            'main_entity_type': main_entity['type'],
            'main_entity_coverage': round(coverage, 3),
            'main_entity_frequency': frequency,
            'main_entity_avg_position': round(avg_position, 3),
            'main_entity_in_lead': in_lead,
            'main_entity_distribution_score': round(distribution_score, 3),
            'main_entity_consistency_score': round(main_entity_consistency, 3),
            'main_entity_manipulation_likelihood': round(1.0 - main_entity_consistency, 3)
        }

    @staticmethod
    def _find_entity_match(
        main_entity: Dict,
        content_entities: List[Dict],
        normalizer,
        threshold: float = 0.75
    ) -> Optional[Dict]:
        main_text = main_entity['text']
        main_norm = normalizer.normalize(main_text)
        main_tokens = normalizer.get_tokens(main_text)

        best_match = None
        best_score = 0.0

        for content_entity in content_entities:
            content_text = content_entity['text']
            content_norm = normalizer.normalize(content_text)
            content_tokens = normalizer.get_tokens(content_text)

            # Exact match
            if main_norm == content_norm:
                return content_entity

            # Substring match
            if main_norm in content_norm or content_norm in main_norm:
                score = min(len(main_norm), len(content_norm)) / max(len(main_norm), len(content_norm))
                if score > best_score:
                    best_score = score
                    best_match = content_entity

            # Token overlap
            if main_tokens and content_tokens:
                overlap = main_tokens & content_tokens
                if overlap:
                    jaccard = len(overlap) / len(main_tokens | content_tokens)
                    if jaccard > best_score:
                        best_score = jaccard
                        best_match = content_entity

        if best_score >= threshold:
            return best_match
        return None

    @staticmethod
    def _get_entity_paragraphs(
        main_entity: Dict,
        paragraphs: List[str],
        normalizer
    ) -> List[int]:
        entity_text = main_entity['text']
        entity_norm = normalizer.normalize(entity_text)

        if not entity_norm:
            return []

        indices = []
        for idx, paragraph in enumerate(paragraphs):
            paragraph_lower = paragraph.lower()
            if entity_norm in paragraph_lower:
                indices.append(idx)

        return indices

    @staticmethod
    def _count_mentions(main_entity: Dict, content: str, normalizer) -> int:
        entity_text = main_entity['text']
        entity_norm = normalizer.normalize(entity_text)

        if not entity_norm:
            return 0

        content_lower = content.lower()
        return content_lower.count(entity_norm)

    @staticmethod
    def _default_analysis(is_present: bool = False) -> Dict:
        if not is_present:
            return {
                'is_present': False,
                'main_entity_text': None,
                'main_entity_type': None,
                'main_entity_coverage': 0.0,
                'main_entity_frequency': 0,
                'main_entity_avg_position': 1.0,
                'main_entity_in_lead': False,
                'main_entity_distribution_score': 0.0,
                'main_entity_consistency_score': 0.0,
                'main_entity_manipulation_likelihood': 1.0
            }
        else:
            return {
                'is_present': True,
                'main_entity_text': None,
                'main_entity_type': None,
                'main_entity_coverage': 0.0,
                'main_entity_frequency': 0,
                'main_entity_avg_position': 1.0,
                'main_entity_in_lead': False,
                'main_entity_distribution_score': 0.0,
                'main_entity_consistency_score': 0.0,
                'main_entity_manipulation_likelihood': 1.0
            }

# Named Entity Matcher Pipeline

In [None]:
class NamedEntityMatcher:
    """Pipeline that compares the named entities of a title with its article body.

    For each article it extracts entities from title and content, matches
    every title entity against the content entities, derives matching
    features, picks a main entity from the title and analyzes how that entity
    is covered in the content.
    """

    def __init__(self, model_name: str = "cahya/NusaBert-ner-v1.3"):
        """Create the pipeline components.

        Args:
            model_name: Model identifier handed to ``EntityExtractor``.
        """
        self.extractor = EntityExtractor(model_name)
        self.normalizer = EntityNormalizer()
        self.matcher = EntityMatcher()
        self.feature_extractor = FeatureExtractor()
        self.main_entity_detector = MainEntityDetector()
        self.main_entity_analyzer = MainEntityAnalyzer()

    @staticmethod
    def _neutral_main_entity_features() -> Dict:
        """Return the placeholder main-entity features for articles without a usable main entity.

        Used both when no main entity is detected and when processing an
        article fails. The two score fields are set to 0.5 (neutral).
        A fresh dict is returned on every call so callers may mutate it.
        """
        return {
            'is_present': False,
            'main_entity_text': None,
            'main_entity_type': None,
            'main_entity_coverage': 0.0,
            'main_entity_frequency': 0,
            'main_entity_avg_position': 1.0,
            'main_entity_in_lead': False,
            'main_entity_distribution_score': 0.0,
            'main_entity_consistency_score': 0.5,  # Neutral
            'main_entity_manipulation_likelihood': 0.5
        }

    def process_article(
        self,
        title: str,
        content: str
    ) -> Dict:
        """Run the full pipeline on one article.

        Args:
            title: Article headline.
            content: Article body.

        Returns:
            Dict with the keys ``'title_entities'``, ``'content_entities'``,
            ``'main_entity'``, ``'matching_results'`` and ``'features'``.
            ``'features'`` merges the matching features with the main-entity
            features; on a key clash the main-entity value wins.
        """
        # Step 1: Extract entities
        title_entities = self.extractor.extract_entities(title)
        content_entities = self.extractor.extract_entities(content)

        # Step 2: Match every title entity against the content entities
        matching_results = [
            {
                'title_entity': title_entity,
                **self.matcher.match_entities(title_entity, content_entities)
            }
            for title_entity in title_entities
        ]

        # Step 3: Extract features
        features = self.feature_extractor.extract_features(
            title_entities,
            content_entities,
            matching_results,
            content
        )

        # Step 4: Detect main entity
        main_entity = self.main_entity_detector.get_main_entity(
            title, title_entities
        )

        # Step 5: Analyze main entity (neutral placeholders when there is none)
        if main_entity:
            main_entity_features = self.main_entity_analyzer.analyze_main_entity(
                main_entity,
                content,
                content_entities,
                self.normalizer
            )
        else:
            main_entity_features = self._neutral_main_entity_features()

        # Step 6: Compile output; later keys override earlier ones
        combined_features = {
            **features,  # Matching features including entity_consistency_score
            **main_entity_features  # Main entity features
        }

        return {
            'title_entities': title_entities,
            'content_entities': content_entities,
            'main_entity': main_entity,
            'matching_results': matching_results,
            'features': combined_features
        }

    def process_dataset(
        self,
        df: pd.DataFrame,
        title_col: str = 'title',
        content_col: str = 'content',
        batch_size: int = 100
    ) -> pd.DataFrame:
        """Run ``process_article`` on every row and collect the features.

        Rows are processed one at a time. When an article raises, the error
        is printed and a row of default features is recorded instead, so the
        result always has one row per input row, in input order.

        Args:
            df: Input articles.
            title_col: Name of the column holding the title.
            content_col: Name of the column holding the body text.
            batch_size: Currently not used by the implementation; kept so
                existing callers that pass it keep working.

        Returns:
            DataFrame with a ``'title'`` column followed by the feature
            columns. It is built from a list of records and therefore has a
            fresh 0..n-1 index rather than the index of ``df``.
        """
        print(f"\n{'='*60}")
        print(f"Processing {len(df)} articles...")
        print(f"{'='*60}\n")

        results = []

        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing articles"):
            try:
                title = row[title_col]
                content = row[content_col]

                output = self.process_article(title, content)

                results.append({
                    'title': title,
                    **output['features']
                })

            except Exception as e:
                # Best effort: keep going and record defaults for this row.
                print(f"\nError processing article {idx}: {e}")
                results.append({
                    'title': row[title_col],
                    **self.feature_extractor._default_features(),
                    **self._neutral_main_entity_features()
                })

        results_df = pd.DataFrame(results)

        print("Named Entity Matcher processing complete!")

        return results_df

# Utility

In [None]:
def save_features_to_csv(df_features: pd.DataFrame, filename: str):
    """Write the feature table to a CSV file and print a confirmation.

    Args:
        df_features: Feature table to persist.
        filename: Destination path; the file is written without the index
            column.
    """
    df_features.to_csv(path_or_buf=filename, index=False)
    print(f"✅ Features saved to: {filename}")

In [None]:
def analyze_features(df_features: pd.DataFrame):
    """Print mean, standard deviation, minimum and maximum of the key features.

    Only the fixed list of key feature names is reported, and any of them
    that is not a column of ``df_features`` is silently skipped.

    Args:
        df_features: Feature table, one row per article.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("FEATURE STATISTICS")
    print(banner + "\n")

    # Key features
    key_features = (
        'match_ratio',
        'entity_consistency_score',
        'num_title_entities',
        'num_missing_entities',
        'avg_mention_freq',
    )

    for name in key_features:
        if name not in df_features.columns:
            continue

        column = df_features[name]
        print(f"{name}:")
        print(f"  Mean: {column.mean():.3f}")
        print(f"  Std:  {column.std():.3f}")
        print(f"  Min:  {column.min():.3f}")
        print(f"  Max:  {column.max():.3f}")
        print()

# Run Pipeline

In [None]:
# Build the pipeline once and reuse it in the cells below; the model name is
# passed explicitly even though it equals the constructor's default.
nem = NamedEntityMatcher(model_name="cahya/NusaBert-ner-v1.3")

Loading NER model: cahya/NusaBert-ner-v1.3...


tokenizer_config.json:   0%|          | 0.00/19.3k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.64k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/644M [00:00<?, ?B/s]

Device set to use cuda:0


NER model cahya/NusaBert-ner-v1.3 loaded successfully!


# Test on Articles

In [None]:
# Inspect the first few articles in detail. The count is capped at the
# dataset size so that df.iloc never indexes past the last row.
num_samples = min(10, len(df))

print("="*80)
print(f"ANALYZING FIRST {num_samples} ARTICLES")
print("="*80)

for sample_idx in range(num_samples):
    # Fetch the row once instead of re-indexing the frame for every column.
    sample_row = df.iloc[sample_idx]
    sample_title = sample_row['title']
    sample_content = sample_row['content']
    sample_label = sample_row['label']

    print(f"\n{'='*80}")
    print(f"SAMPLE #{sample_idx + 1}")
    print(f"{'='*80}")
    print(f"\nTitle: {sample_title}")
    print(f"Label: {sample_label}")
    print(f"\nContent preview: {sample_content[:200]}...")

    # Process single article
    result = nem.process_article(sample_title, sample_content)
    sample_features = result['features']

    # Print results
    print("\nEXTRACTION RESULTS:")
    print(f"\nTitle Entities ({len(result['title_entities'])}):")
    for i, entity in enumerate(result['title_entities'][:5], 1):  # Show first 5
        print(f"    {i}. {entity['text']} ({entity['type']}) - score: {entity['score']:.2f}")

    if not result['title_entities']:
        print("    (No entities found)")

    print(f"\n  Content Entities ({len(result['content_entities'])}):")
    for i, entity in enumerate(result['content_entities'][:10], 1):  # Show first 10
        print(f"    {i}. {entity['text']} ({entity['type']}) - score: {entity['score']:.2f}")

    if not result['content_entities']:
        print("    (No entities found)")

    print("\nMATCHING RESULTS:")
    if result['matching_results']:
        for i, match in enumerate(result['matching_results'], 1):
            title_entity = match['title_entity']['text']
            matched = match['matched']

            if matched:
                content_entity = match['content_entity']['text']
                match_type = match['match_type']
                similarity = match['similarity']
                print(f"    {i}. '{title_entity}' → '{content_entity}' ({match_type}, sim={similarity:.2f})")
            else:
                print(f"    {i}. '{title_entity}' → NOT FOUND")
    else:
        print("    (No title entities to match)")

    print("\nFEATURES:")
    for key, value in sample_features.items():
        if isinstance(value, float):
            print(f"{key}: {value:.3f}")
        else:
            print(f"{key}: {value}")

    # Verdict bands on the overall consistency score: <0.4 high, <0.7 medium.
    consistency = sample_features['entity_consistency_score']
    print(f"\nEntity Consistency Score: {consistency:.3f}")
    if consistency < 0.4:
        print("HIGH likelihood of clickbait (entity manipulation)")
    elif consistency < 0.7:
        print("MEDIUM likelihood of clickbait")
    else:
        print("LOW likelihood of clickbait (entities well-supported)")

    print("\n" + "-"*80)


    # Print main entity information
    print("\nMAIN ENTITY INFORMATION:")
    if sample_features.get("main_entity_text"):
        print(f"  Main Entity: {sample_features['main_entity_text']}")
        print(f"  Entity Type: {sample_features['main_entity_type']}")
        print(f"  Coverage: {sample_features['main_entity_coverage']:.3f} (% of paragraphs)")
        print(f"  Frequency: {sample_features['main_entity_frequency']} mentions")
        print(f"  Avg Position: {sample_features['main_entity_avg_position']:.3f} (0=early, 1=late)")
        print(f"  In Lead: {sample_features['main_entity_in_lead']}")
        print(f"  Main Entity Consistency Score: {sample_features['main_entity_consistency_score']:.3f}")
    else:
        print("  No main entity detected or entity not present in content")

print("\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)

ANALYZING FIRST 10 ARTICLES

SAMPLE #1

Title: Anak Penjaga Kebersihan Masjid Lantamal 1 Belawan Lulus TNI, Hafal Quran dan Punya Suara Merdu
Label: clickbait

Content preview: Setelah dihebohkan dengan kemunculan calon taruna Enzo Zenz yang mahir berbahasa Prancis dan hafal Quran, kini TNI Angkatat Laut merekrut anak penjaga kebersihan Masjid Lantamal 1 Belawan.,,Pria berna...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


EXTRACTION RESULTS:

Title Entities (1):
    1.  Haf (REG) - score: 0.92

  Content Entities (16):
    1.  En (PER) - score: 0.96
    2. zo Zenz (PER) - score: 0.88
    3.  Quran (REG) - score: 1.00
    4.  TNI Angkatat Laut (NOR) - score: 0.97
    5.  1 Belawan (FAC) - score: 0.84
    6.  Jumanto (PER) - score: 0.97
    7.  TNI AL (NOR) - score: 0.86
    8.  ayat (REG) - score: 0.77
    9.  Alquran (REG) - score: 0.80
    10. 51 detik (QTY) - score: 0.97

MATCHING RESULTS:
    1. ' Haf' → NOT FOUND

FEATURES:
num_title_entities: 1
num_content_entities: 16
num_matched_entities: 0
num_missing_entities: 1
match_ratio: 0.000
type_match_ratio: 0.000
avg_similarity: 0.000
max_similarity: 0.000
min_similarity: 0.000
avg_mention_freq: 0.000
max_mention_freq: 0.000
entities_in_lead: 0
entity_consistency_score: 0.000
is_present: False
main_entity_text: None
main_entity_type: None
main_entity_coverage: 0.000
main_entity_frequency: 0
main_entity_avg_position: 1.000
main_entity_in_lead: False
mai

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



EXTRACTION RESULTS:

Title Entities (2):
    1.  Minyak Curah (PRD) - score: 1.00
    2.  Minyak Kemasan (PRD) - score: 0.80

  Content Entities (39):
    1. Menteri Perdagangan (NOR) - score: 1.00
    2.  Enggartiasto Lukita (PER) - score: 0.93
    3.  minyak curah (PRD) - score: 1.00
    4.   (DAT) - score: 0.99
    5. 1 Januari 2020 (DAT) - score: 0.96
    6.  minyak goreng (PRD) - score: 0.99
    7.  minyak goreng curah (PRD) - score: 1.00
    8.  Engg (PER) - score: 0.97
    9.  kawasan Sarinah (LOC) - score: 0.92
    10.  Jakarta (GPE) - score: 0.99

MATCHING RESULTS:
    1. ' Minyak Curah' → ' minyak curah' (exact, sim=1.00)
    2. ' Minyak Kemasan' → ' minyak' (fuzzy, sim=1.00)

FEATURES:
num_title_entities: 2
num_content_entities: 39
num_matched_entities: 2
num_missing_entities: 0
match_ratio: 1.000
type_match_ratio: 1.000
avg_similarity: 1.000
max_similarity: 1.000
min_similarity: 1.000
avg_mention_freq: 25.500
max_mention_freq: 34
entities_in_lead: 2
entity_consistency_scor

## Process Dataset

In [None]:
# Extract features for every article in the dataset.
# NOTE: this is the expensive cell. The captured progress bar below reports
# about 2.29 s per article, i.e. several hours for the full 15240 rows.
df_features = nem.process_dataset(df, title_col='title', content_col='content')



Processing 15240 articles...



Processing articles:  10%|▉         | 1470/15240 [1:02:44<8:46:31,  2.29s/it]

In [None]:
# Join the original articles with their extracted features, column-wise.
# df_features repeats the 'title' column that df already has; drop it first so
# df_all (and the CSV, which would otherwise reload it as 'title.1') does not
# end up with two 'title' columns.
# Rows are paired by index label, so both frames are expected to share the
# same index (both use the default 0..n-1 index in this notebook).
df_all = pd.concat(
    [df, df_features.drop(columns=['title'], errors='ignore')],
    axis=1
)

df_all.to_csv('df_all.csv', index=False)
print("Saved: df_all.csv")

In [None]:
# Compact result table: the original article columns plus the four headline
# scores, selected from df_features in a single column selection.
df_named_entity_matcher_result = pd.concat(
    [
        df,
        df_features[[
            'entity_consistency_score',
            'main_entity_coverage',
            'main_entity_avg_position',
            'main_entity_consistency_score',
        ]],
    ],
    axis=1
)

df_named_entity_matcher_result.to_csv('named_entity_matcher_result.csv', index=False)
print("Saved: named_entity_matcher_result.csv")


In [None]:
# Entity-matching feature columns compared between clickbait and
# non-clickbait articles in the first statistical analysis below. Names that
# are absent from the loaded data are filtered out before use.
FEATURES = [
    'num_matched_entities',
    'num_missing_entities',
    'match_ratio',
    'type_match_ratio',
    'avg_similarity',
    'max_similarity',
    'min_similarity',
    'avg_mention_freq',
    'max_mention_freq',
    'entities_in_lead'
]

In [None]:
# Main-entity feature columns used by the second (main entity) statistical
# analysis below.
MAIN_FEATURES = [
    'main_entity_coverage',
    'main_entity_frequency',
    'main_entity_avg_position',
    'main_entity_distribution_score'
]

In [None]:
# CSV read by load_data() for the statistical analysis; same file name that
# the feature-extraction step above writes df_all to.
DATA_PATH = "df_all.csv"

In [None]:
def load_data(filepath):
    """Load the feature-extraction results from a CSV file.

    Prints the row count and column names on success. When the file does not
    exist, prints a warning and returns ``create_sample_data()`` instead.

    NOTE(review): ``create_sample_data`` is not defined in the part of the
    notebook visible here; confirm it exists, otherwise the fallback raises
    ``NameError``.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or the sample data on a missing file.
    """
    try:
        loaded = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"⚠ File {filepath} tidak ditemukan!")
        print("Membuat contoh data untuk demonstrasi...")
        return create_sample_data()

    print(f"✓ Data loaded: {len(loaded)} rows")
    print(f"✓ Columns: {loaded.columns.tolist()}")
    return loaded

In [None]:
def calculate_effect_size(group1, group2):
    """Compute Cohen's d between two samples.

    The difference of the means (``group1`` minus ``group2``) is divided by
    the pooled standard deviation built from the sample variances (ddof=1).

    Args:
        group1: First sample of numeric values.
        group2: Second sample of numeric values.

    Returns:
        Cohen's d, or 0 when the pooled standard deviation is exactly zero.
    """
    size_a, size_b = len(group1), len(group2)
    var_a = np.var(group1, ddof=1)
    var_b = np.var(group2, ddof=1)

    pooled_variance = ((size_a-1)*var_a + (size_b-1)*var_b) / (size_a+size_b-2)
    pooled_std = np.sqrt(pooled_variance)

    # Identical, constant samples: no spread to scale by.
    if pooled_std == 0:
        return 0

    mean_gap = np.mean(group1) - np.mean(group2)
    return mean_gap / pooled_std

In [None]:
def calculate_feature_importance(df, features):
    """Compute per-feature separation metrics between the two label groups.

    For every feature the clickbait and non-clickbait rows (``df['label']``)
    are compared with a two-sided Mann-Whitney U test and Cohen's d, both on
    the non-missing values, plus the differences of the group means and
    medians.

    Args:
        df: Table with a ``'label'`` column and one column per feature.
        features: Names of the feature columns to evaluate.

    Returns:
        DataFrame with one row per feature and the columns ``feature``,
        ``p_value``, ``effect_size``, ``mean_diff``, ``median_diff`` (the
        last three as absolute values) and the per-group means and medians.
    """
    clickbait = df[df['label'] == 'clickbait']
    non_clickbait = df[df['label'] == 'non-clickbait']

    def summarize(feature):
        cb_values = clickbait[feature]
        ncb_values = non_clickbait[feature]
        cb_observed = cb_values.dropna()
        ncb_observed = ncb_values.dropna()

        # Mann-Whitney U test; only the p-value is reported.
        _, p_value = stats.mannwhitneyu(
            cb_observed,
            ncb_observed,
            alternative='two-sided'
        )

        # Effect size (Cohen's d), non-clickbait minus clickbait.
        cohens_d = calculate_effect_size(ncb_observed, cb_observed)

        cb_mean, ncb_mean = cb_values.mean(), ncb_values.mean()
        cb_median, ncb_median = cb_values.median(), ncb_values.median()

        return {
            'feature': feature,
            'p_value': p_value,
            'effect_size': abs(cohens_d),
            'mean_diff': abs(ncb_mean - cb_mean),
            'median_diff': abs(ncb_median - cb_median),
            'clickbait_mean': cb_mean,
            'non_clickbait_mean': ncb_mean,
            'clickbait_median': cb_median,
            'non_clickbait_median': ncb_median
        }

    return pd.DataFrame([summarize(feature) for feature in features])

In [None]:
def create_boxplot_grid(df, features):
    """Draw one clickbait vs non-clickbait boxplot per feature on a three-column grid.

    Args:
        df: Table with a ``'label'`` column and one column per feature.
        features: Names of the feature columns to plot; must not be empty.

    Returns:
        The matplotlib Figure holding the grid.

    Raises:
        ValueError: If ``features`` is empty.
    """
    n_features = len(features)
    if n_features == 0:
        raise ValueError("features must contain at least one column name")

    n_cols = 3
    n_rows = (n_features + n_cols - 1) // n_cols

    # squeeze=False always yields a 2-D array of Axes, so flattening works for
    # any number of features, including exactly one.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5*n_rows), squeeze=False)
    axes = axes.flatten()

    # Color palette
    colors = {'clickbait': '#FF6B6B', 'non-clickbait': '#4ECDC4'}
    group_labels = ['clickbait', 'non-clickbait']

    for idx, feature in enumerate(features):
        ax = axes[idx]

        # Prepare data: one column per label group, missing values removed
        # for plotting only.
        group_columns = [df[df['label'] == label][feature] for label in group_labels]
        data_to_plot = [column.dropna() for column in group_columns]

        # Create boxplot. Tick labels are set separately because the `labels`
        # keyword of boxplot is deprecated in newer matplotlib releases.
        bp = ax.boxplot(data_to_plot,
                       patch_artist=True,
                       widths=0.6,
                       showmeans=True,
                       meanprops=dict(marker='D', markerfacecolor='yellow',
                                    markeredgecolor='black', markersize=8))
        ax.set_xticklabels(group_labels)

        # Color boxes
        for patch, label in zip(bp['boxes'], group_labels):
            patch.set_facecolor(colors[label])
            patch.set_alpha(0.7)

        # Styling
        ax.set_title(feature.replace('_', ' ').title(),
                    fontsize=12, fontweight='bold', pad=10)
        ax.set_ylabel('Value', fontsize=10)
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        ax.set_facecolor('#f8f9fa')

        # Add mean values as text near the top of the axes (boxes sit at x=1, 2)
        for position, column in enumerate(group_columns, 1):
            ax.text(position, ax.get_ylim()[1] * 0.95,
                   f'μ={column.mean():.2f}',
                   ha='center', fontsize=9,
                   bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    # Hide empty subplots
    for idx in range(n_features, len(axes)):
        axes[idx].axis('off')

    plt.suptitle('Perbandingan Semua Fitur: Clickbait vs Non-Clickbait',
                fontsize=18, fontweight='bold', y=1.00)
    plt.tight_layout()

    return fig

In [None]:
def create_significance_chart(importance_df):
    """Plot effect size and mean difference per feature as horizontal bars.

    The left panel shows ``effect_size`` with bars colored by ``p_value``
    band and reference lines at 0.2, 0.5 and 0.8; the right panel shows
    ``mean_diff``. Features are ordered by ascending effect size.

    Args:
        importance_df: Table with the columns ``feature``, ``effect_size``,
            ``p_value`` and ``mean_diff``.

    Returns:
        The matplotlib Figure holding both panels.
    """
    def significance_color(p_value):
        # Same bands as described in the left panel's title.
        if p_value < 0.001:
            return '#FF6B6B'
        if p_value < 0.01:
            return '#FFA07A'
        if p_value < 0.05:
            return '#FFD700'
        return '#A9A9A9'

    fig, (effect_ax, diff_ax) = plt.subplots(1, 2, figsize=(16, 8))

    # Sort by effect size
    ranked = importance_df.sort_values('effect_size', ascending=True)
    feature_names = ranked['feature']
    effect_sizes = ranked['effect_size']
    mean_diffs = ranked['mean_diff']

    # Chart 1: effect size, colored by significance
    effect_ax.barh(feature_names, effect_sizes,
                   color=[significance_color(p) for p in ranked['p_value']],
                   edgecolor='black', linewidth=1.5, alpha=0.8)

    effect_ax.set_xlabel('Effect Size (|Cohen\'s d|)', fontsize=12, fontweight='bold')
    effect_ax.set_title('Effect Size Setiap Fitur\n(Merah: p<0.001, Orange: p<0.01, Kuning: p<0.05, Abu: ns)',
                        fontsize=14, fontweight='bold', pad=15)
    for threshold, line_color, line_label in (
        (0.2, 'gray', 'Small effect'),
        (0.5, 'gray', 'Medium effect'),
        (0.8, 'red', 'Large effect'),
    ):
        effect_ax.axvline(threshold, color=line_color, linestyle='--', alpha=0.5, label=line_label)
    effect_ax.legend(loc='lower right')
    effect_ax.grid(axis='x', alpha=0.3)

    # Add values on bars
    for position, value in enumerate(effect_sizes):
        effect_ax.text(value + 0.02, position, f'{value:.3f}',
                       va='center', fontsize=9, fontweight='bold')

    # Chart 2: mean difference
    diff_ax.barh(feature_names, mean_diffs,
                 color='#4ECDC4', edgecolor='black', linewidth=1.5, alpha=0.8)

    diff_ax.set_xlabel('|Mean Difference| (Non-Clickbait - Clickbait)',
                       fontsize=12, fontweight='bold')
    diff_ax.set_title('Perbedaan Rata-rata Antar Grup',
                      fontsize=14, fontweight='bold', pad=15)
    diff_ax.grid(axis='x', alpha=0.3)

    # Add values on bars
    for position, value in enumerate(mean_diffs):
        diff_ax.text(value + 0.01, position, f'{value:.3f}',
                     va='center', fontsize=9, fontweight='bold')

    plt.tight_layout()
    return fig


In [None]:
def print_feature_ranking(importance_df):
    """Print a ranked feature report and return the ranked table.

    Features are ranked by a composite score, ``effect_size * (1 - p_value)``
    with the p-value clipped to [0, 1]. The report contains the ranking, a
    recommended weight per feature (share of the total composite score, in
    percent), a suggested weighted-sum formula, and the per-group means and
    medians.

    The input frame is not modified; all added columns live on the returned
    copy.

    Args:
        importance_df: Table with the columns ``feature``, ``effect_size``,
            ``p_value``, ``mean_diff``, ``clickbait_mean``,
            ``non_clickbait_mean``, ``clickbait_median`` and
            ``non_clickbait_median``.

    Returns:
        Copy of the input sorted by descending composite score, with the
        extra columns ``composite_score`` and ``recommended_weight``.
    """
    # Work on a copy so the columns added below do not leak into the caller's frame.
    ranking = importance_df.copy()

    # Sort by composite score (combination of effect size and significance)
    ranking['composite_score'] = (
        ranking['effect_size'] *
        (1 - ranking['p_value'].clip(0, 1))
    )

    ranking = ranking.sort_values('composite_score', ascending=False)

    print("\n" + "="*100)
    print("RANKING KONTRIBUSI & SIGNIFICANCE FITUR")
    print("="*100)
    print(f"\n{'Rank':<6} {'Feature':<30} {'Effect Size':<15} {'p-value':<12} {'Mean Diff':<12} {'Score':<10}")
    print("-"*100)

    for rank, (_, row) in enumerate(ranking.iterrows(), 1):
        sig_marker = "***" if row['p_value'] < 0.001 else "**" if row['p_value'] < 0.01 else "*" if row['p_value'] < 0.05 else "ns"

        print(f"{rank:<6} {row['feature']:<30} {row['effect_size']:<15.4f} {row['p_value']:<12.6f} "
              f"{row['mean_diff']:<12.4f} {row['composite_score']:<10.4f} {sig_marker}")

    print("\nSignificance: *** p<0.001, ** p<0.01, * p<0.05, ns = not significant")

    # Weight recommendation
    print("\n" + "="*100)
    print("REKOMENDASI BOBOT UNTUK ENTITY_CONSISTENCY_SCORE")
    print("="*100)

    # Normalize composite score to get weights. When every score is zero
    # there is nothing to distribute; use 0.0 instead of dividing 0 by 0.
    total_score = ranking['composite_score'].sum()
    if total_score > 0:
        ranking['recommended_weight'] = ranking['composite_score'] / total_score * 100
    else:
        ranking['recommended_weight'] = 0.0

    print(f"\n{'Feature':<30} {'Weight (%)':<15} {'Interpretasi'}")
    print("-"*100)

    for _, row in ranking.iterrows():
        weight = row['recommended_weight']

        if weight > 15:
            interp = "Sangat Penting"
        elif weight > 10:
            interp = "Penting"
        elif weight > 5:
            interp = "Cukup Penting"
        else:
            interp = "Minor contribution"

        print(f"{row['feature']:<30} {weight:<15.2f} {interp}")

    # Formula suggestion
    print("\n" + "="*100)
    print("FORMULA YANG DISARANKAN untuk entity_consistency_score:")
    print("="*100)
    print("\nBerdasarkan analisis statistik, berikut formula yang disarankan:\n")

    # Group features by importance: high (>15 %), medium (10-15 %], low (<=10 %)
    weights = ranking['recommended_weight']
    tiers = [
        ("    # Fitur dengan kontribusi TINGGI:", ranking[weights > 15]),
        ("    # Fitur dengan kontribusi SEDANG:", ranking[(weights > 10) & (weights <= 15)]),
        ("    # Fitur dengan kontribusi RENDAH:", ranking[weights <= 10]),
    ]

    print("entity_consistency_score = (")

    # Every term except the very last one gets a trailing "+", whichever tier
    # that last term happens to fall in.
    remaining_terms = sum(len(tier) for _, tier in tiers)
    for heading, tier in tiers:
        if tier.empty:
            continue
        print(heading)
        for feat, weight in zip(tier['feature'], tier['recommended_weight']):
            remaining_terms -= 1
            suffix = " +" if remaining_terms else ""
            print(f"    {weight / 100:.3f} * {feat}{suffix}")

    print(")")

    # Additional stats table
    print("\n" + "="*100)
    print("STATISTIK DETAIL SETIAP FITUR")
    print("="*100)
    print(f"\n{'Feature':<30} {'CB Mean':<12} {'NCB Mean':<12} {'CB Median':<12} {'NCB Median':<12}")
    print("-"*100)

    for _, row in ranking.iterrows():
        print(f"{row['feature']:<30} {row['clickbait_mean']:<12.4f} "
              f"{row['non_clickbait_mean']:<12.4f} {row['clickbait_median']:<12.4f} "
              f"{row['non_clickbait_median']:<12.4f}")

    print("\nCB = Clickbait, NCB = Non-Clickbait")
    print("="*100)

    return ranking

In [None]:
# Load the extracted features. NOTE: this rebinds `df`, which until here held
# the raw article dataset, to the feature table read from DATA_PATH.
df = load_data(DATA_PATH)

# Validate columns: analyze only the features that are present in the data
missing_features = [f for f in FEATURES if f not in df.columns]
if missing_features:
    print(f"\nWarning: Fitur berikut tidak ditemukan di data: {missing_features}")
features_to_use = [f for f in FEATURES if f in df.columns]

# A bare `return` is a SyntaxError at cell level; raise instead so execution
# (including "Run All") stops here when the label column is missing.
if 'label' not in df.columns:
    raise ValueError("Error: Kolom 'label' tidak ditemukan!")

print(f"\nMenganalisis {len(features_to_use)} fitur")
print(f"Total data: {len(df)} artikel")
print(f"Clickbait: {len(df[df['label']=='clickbait'])}")
print(f"Non-Clickbait: {len(df[df['label']=='non-clickbait'])}")

In [None]:
# Boxplot grid of all selected features; save through the returned figure.
fig1 = create_boxplot_grid(df, features_to_use)
fig1.savefig('boxplot_grid_all_features.png', dpi=300, bbox_inches='tight')
print("Saved: boxplot_grid_all_features.png")

In [None]:
# The chart needs importance_df, which the notebook otherwise first assigns in
# a later cell; compute it here so this cell also works on a fresh kernel run
# from top to bottom.
importance_df = calculate_feature_importance(df, features_to_use)

fig2 = create_significance_chart(importance_df)
plt.savefig('effect_size_significance.png', dpi=300, bbox_inches='tight')
print("Saved: effect_size_significance.png")

In [None]:
# Compute the per-feature statistics (p-value, effect size, group means and
# medians) for the selected features
importance_df = calculate_feature_importance(df, features_to_use)

# Print the ranking report; the returned frame is sorted by composite score
# and carries the composite_score and recommended_weight columns
importance_df_final = print_feature_ranking(importance_df)

# Save ranking to CSV
importance_df_final.to_csv('feature_ranking.csv', index=False)
print("\nRanking saved to: feature_ranking.csv")

In [None]:
# Display the figures that are currently open.
plt.show()

In [None]:
def run_main_entity_analysis():
    """Run the statistical analysis for the main-entity features.

    Loads the feature table from ``DATA_PATH``, keeps the ``MAIN_FEATURES``
    that are present, computes their importance metrics, saves the boxplot
    grid and the effect-size chart as PNG files, prints the ranking report,
    saves the ranking as CSV and finally shows the figures.

    Returns early (after printing an error) when the data has no ``'label'``
    column. All intermediate variables stay local to this function.
    """
    print("\n" + "="*100)
    print("ANALISIS MAIN ENTITY FEATURES - CLICKBAIT DETECTION")
    print("="*100 + "\n")

    # Load data
    df = load_data(DATA_PATH)

    # Validate columns
    missing_features = [f for f in MAIN_FEATURES if f not in df.columns]
    if missing_features:
        print(f"\n⚠ Warning: Fitur berikut tidak ditemukan di data: {missing_features}")
        features_to_use = [f for f in MAIN_FEATURES if f in df.columns]
    else:
        features_to_use = MAIN_FEATURES

    if 'label' not in df.columns:
        print("Error: Kolom 'label' tidak ditemukan!")
        return

    print(f"\n✓ Menganalisis {len(features_to_use)} main entity features")
    print(f"✓ Total data: {len(df)} artikel")
    print(f"  - Clickbait: {len(df[df['label']=='clickbait'])}")
    print(f"  - Non-Clickbait: {len(df[df['label']=='non-clickbait'])}")

    # Calculate importance
    print("\nMenghitung statistical significance dan effect size...")
    importance_df = calculate_feature_importance(df, features_to_use)

    # Visualization 1: boxplot grid
    print("Membuat visualisasi boxplot grid...")
    fig1 = create_boxplot_grid(df, features_to_use)
    plt.savefig('boxplot_grid_main_entity_features.png', dpi=300, bbox_inches='tight')
    print("✓ Saved: boxplot_grid_main_entity_features.png")

    # Visualization 2: effect size chart
    print("Membuat visualisasi effect size dan significance...")
    fig2 = create_significance_chart(importance_df)
    plt.savefig('effect_size_significance_main_entity.png', dpi=300, bbox_inches='tight')
    print("✓ Saved: effect_size_significance_main_entity.png")

    # Print ranking
    importance_df_final = print_feature_ranking(importance_df)

    # Save ranking to CSV
    importance_df_final.to_csv('main_entity_feature_ranking.csv', index=False)
    print("\n✓ Ranking saved to: main_entity_feature_ranking.csv")

    print("\n" + "="*100)
    print("ANALISIS SELESAI!")
    print("="*100)
    print("\nFile yang dihasilkan:")
    print("1. boxplot_grid_main_entity_features.png - Boxplot semua main entity features dalam 1 frame")
    print("2. effect_size_significance_main_entity.png - Chart effect size dan significance")
    print("3. main_entity_feature_ranking.csv - Tabel ranking lengkap")

    plt.show()


run_main_entity_analysis()

In [None]:
def create_boxplot_stripplot(df, score_col='entity_consistency_score',
                              label_col='label', figsize=(12, 8),
                              sample_size=500, output_file=None,
                              ylim=(-0.05, 1.05)):
    """
    Compare one numeric column between clickbait and non-clickbait rows using
    a notched box plot overlaid with a jittered strip plot.

    Besides drawing, the function prints the group sizes, descriptive
    statistics, the p-values of an independent t-test and a two-sided
    Mann-Whitney U test, and a short textual interpretation.

    Parameters:
    -----------
    df : pandas.DataFrame
        Data containing `score_col` and `label_col`.
    score_col : str
        Numeric column to compare. Null values are dropped per group.
    label_col : str
        Column holding the class label; only rows equal to 'clickbait' or
        'non-clickbait' are used.
    figsize : tuple
        Figure size (width, height).
    sample_size : int
        Maximum number of points per group drawn in the strip plot
        (limits overplotting).
    output_file : str, optional
        If given, the figure is saved to this path (300 dpi, tight bounding
        box) before returning. Label/title changes the caller applies to the
        returned axes afterwards are NOT part of the saved file.
    ylim : tuple or None
        (bottom, top) limits of the y-axis. The default suits scores in
        [0, 1]; pass None to let matplotlib autoscale (e.g. for counts).

    Returns:
    --------
    fig, ax : matplotlib figure and axes objects

    Raises:
    -------
    ValueError
        If either label group has no non-null value in `score_col`.
    """

    # Prepare data
    print("Memproses data...")
    clickbait = df.loc[df[label_col] == 'clickbait', score_col].dropna()
    non_clickbait = df.loc[df[label_col] == 'non-clickbait', score_col].dropna()

    print(f"  ✓ Clickbait: {len(clickbait)} samples")
    print(f"  ✓ Non-Clickbait: {len(non_clickbait)} samples")

    # A two-group comparison is meaningless (and the tests below misbehave)
    # when one group is empty, so fail early with an explicit message.
    if clickbait.empty or non_clickbait.empty:
        raise ValueError(
            f"Both 'clickbait' and 'non-clickbait' rows need at least one "
            f"non-null '{score_col}' value in label column '{label_col}' "
            f"(got {len(clickbait)} and {len(non_clickbait)})."
        )

    # Calculate statistics
    cb_mean = clickbait.mean()
    cb_median = clickbait.median()
    cb_std = clickbait.std()

    ncb_mean = non_clickbait.mean()
    ncb_median = non_clickbait.median()
    ncb_std = non_clickbait.std()

    # Statistical tests
    t_stat, p_value = stats.ttest_ind(clickbait, non_clickbait)
    u_stat, p_value_mw = stats.mannwhitneyu(clickbait, non_clickbait, alternative='two-sided')
    # Single source of truth for the significance verdict (t-test, alpha = 0.05).
    # A NaN p-value compares False and is therefore reported as not significant.
    is_significant = p_value < 0.05

    print("\nStatistik:")
    print(f"  Clickbait     - Mean: {cb_mean:.4f}, Median: {cb_median:.4f}, Std: {cb_std:.4f}")
    print(f"  Non-Clickbait - Mean: {ncb_mean:.4f}, Median: {ncb_median:.4f}, Std: {ncb_std:.4f}")
    print(f"\n  T-test p-value: {p_value:.4e}")
    print(f"  Mann-Whitney U p-value: {p_value_mw:.4e}")
    print(f"  → Difference is {'SIGNIFICANT' if is_significant else 'NOT significant'} (α=0.05)")

    # Create figure
    print("\nMembuat visualisasi...")

    # The style must be set BEFORE the axes exist; otherwise the first figure
    # of a session is drawn without it while later ones pick it up.
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=figsize)

    # Colors
    color_clickbait = '#FF6B6B'  # Red-ish
    color_non_clickbait = '#4ECDC4'  # Teal

    # ==========================================
    # 1. BOX PLOT
    # ==========================================
    box_parts = ax.boxplot(
        [clickbait, non_clickbait],
        positions=[1, 2],
        tick_labels=['Clickbait', 'Non-Clickbait'],
        patch_artist=True,
        notch=True,  # Notches visualise the confidence interval of the median
        widths=0.5,
        showmeans=True,  # Show the mean in addition to the median
        meanprops=dict(marker='D', markerfacecolor='yellow',
                      markeredgecolor='black', markersize=8),
        medianprops=dict(color='black', linewidth=2),
        boxprops=dict(linewidth=1.5),
        whiskerprops=dict(linewidth=1.5),
        capprops=dict(linewidth=1.5)
    )

    # Color boxes
    for box, face_color in zip(box_parts['boxes'],
                               (color_clickbait, color_non_clickbait)):
        box.set_facecolor(face_color)
        box.set_alpha(0.7)

    # ==========================================
    # 2. STRIP PLOT (overlay individual points)
    # ==========================================
    actual_sample_cb = min(sample_size, len(clickbait))
    actual_sample_ncb = min(sample_size, len(non_clickbait))

    # Horizontal jitter so points do not sit on top of each other. A private
    # RandomState yields the same values as np.random.seed(42) would, without
    # resetting the global NumPy RNG for the rest of the notebook.
    jitter_rng = np.random.RandomState(42)
    x1_jitter = jitter_rng.normal(1, 0.04, actual_sample_cb)
    x2_jitter = jitter_rng.normal(2, 0.04, actual_sample_ncb)

    # Sample data
    cb_sample = clickbait.sample(actual_sample_cb, random_state=42)
    ncb_sample = non_clickbait.sample(actual_sample_ncb, random_state=42)

    # Plot points
    ax.scatter(x1_jitter, cb_sample, alpha=0.3, s=20,
              color=color_clickbait, edgecolors='none', label='_nolegend_')
    ax.scatter(x2_jitter, ncb_sample, alpha=0.3, s=20,
              color=color_non_clickbait, edgecolors='none', label='_nolegend_')

    # ==========================================
    # 3. FORMATTING
    # ==========================================
    ax.set_ylabel('Entity Consistency Score', fontsize=14, fontweight='bold')
    ax.set_xlabel('Label', fontsize=14, fontweight='bold')
    ax.set_title('Perbandingan Entity Consistency Score:\nClickbait vs Non-Clickbait',
                fontsize=16, fontweight='bold', pad=20)

    # Grid
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_axisbelow(True)

    # Y-axis limits (None keeps matplotlib's autoscaled range)
    if ylim is not None:
        ax.set_ylim(*ylim)

    # X-axis styling
    ax.set_xlim(0.5, 2.5)
    ax.tick_params(axis='both', labelsize=12)

    # ==========================================
    # 4. ANNOTATIONS - Statistics Box
    # ==========================================
    stats_text = "STATISTIK:\n"
    stats_text += "─" * 35 + "\n"
    stats_text += f"Clickbait (n={len(clickbait)}):\n"
    stats_text += f"  • Mean:   {cb_mean:.4f}\n"
    stats_text += f"  • Median: {cb_median:.4f}\n"
    stats_text += f"  • Std:    {cb_std:.4f}\n"
    stats_text += f"  • Q1-Q3:  {clickbait.quantile(0.25):.4f} - {clickbait.quantile(0.75):.4f}\n\n"

    stats_text += f"Non-Clickbait (n={len(non_clickbait)}):\n"
    stats_text += f"  • Mean:   {ncb_mean:.4f}\n"
    stats_text += f"  • Median: {ncb_median:.4f}\n"
    stats_text += f"  • Std:    {ncb_std:.4f}\n"
    stats_text += f"  • Q1-Q3:  {non_clickbait.quantile(0.25):.4f} - {non_clickbait.quantile(0.75):.4f}\n\n"

    stats_text += "─" * 35 + "\n"
    stats_text += f"Difference of Means: {abs(cb_mean - ncb_mean):.4f}\n"
    stats_text += f"T-test p-value: {p_value:.4e}\n"
    stats_text += f"{'✓ SIGNIFICANT' if is_significant else '✗ NOT SIGNIFICANT'} (α=0.05)"

    # Position stats box
    ax.text(0.98, 0.97, stats_text, transform=ax.transAxes,
           fontsize=9, verticalalignment='top', horizontalalignment='right',
           bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.85, pad=0.8),
           family='monospace')

    legend_elements = [
        plt.Line2D([0], [0], marker='o', color='w',
                  markerfacecolor=color_clickbait, markersize=10,
                  label='Clickbait', alpha=0.7),
        plt.Line2D([0], [0], marker='o', color='w',
                  markerfacecolor=color_non_clickbait, markersize=10,
                  label='Non-Clickbait', alpha=0.7),
        plt.Line2D([0], [0], marker='D', color='w',
                  markerfacecolor='yellow', markeredgecolor='black',
                  markersize=8, label='Mean'),
        plt.Line2D([0], [0], color='black', linewidth=2,
                  label='Median')
    ]

    ax.legend(handles=legend_elements, loc='upper left',
             fontsize=10, framealpha=0.9)

    # ==========================================
    # 5. INTERPRETATION
    # ==========================================
    # The verdict follows the significance test first and the direction of
    # the difference second; previously only the order of the means was
    # checked, so a significantly HIGHER clickbait mean was reported as
    # "no significant difference" and a non-significant lower one as a finding.
    if not is_significant:
        interpretation = "INTERPRETASI:\n"
        interpretation += "Tidak ada perbedaan signifikan\n"
        interpretation += "dalam entity consistency score."
    elif cb_mean < ncb_mean:
        diff_pct = ((ncb_mean - cb_mean) / ncb_mean) * 100
        interpretation = "INTERPRETASI:\n"
        interpretation += "Artikel clickbait memiliki entity\n"
        interpretation += f"consistency score {diff_pct:.1f}% lebih rendah\n"
        interpretation += "dari non-clickbait.\n\n"
        interpretation += "→ Ini mengindikasikan clickbait\n"
        interpretation += "  lebih sering melakukan entity\n"
        interpretation += "  manipulation (menyebut entitas\n"
        interpretation += "  di judul yang tidak muncul di isi)."
    else:
        # ncb_mean may be exactly 0 here; then a relative gap is undefined
        # and only the direction is reported.
        if ncb_mean != 0:
            diff_pct = ((cb_mean - ncb_mean) / ncb_mean) * 100
            gap_text = f"{diff_pct:.1f}% lebih tinggi"
        else:
            gap_text = "lebih tinggi"
        interpretation = "INTERPRETASI:\n"
        interpretation += "Artikel clickbait memiliki entity\n"
        interpretation += f"consistency score {gap_text}\n"
        interpretation += "dari non-clickbait."

    print(interpretation)

    fig.tight_layout()

    # Honour the documented `output_file` argument (it used to be ignored).
    if output_file:
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
        print(f"✓ Saved: {output_file}")

    return fig, ax

In [None]:
# df = pd.read_csv('named_entity_matcher_result.csv')

# fig, ax = create_boxplot_stripplot(
#     df,
#     score_col='entity_consistency_score',
#     label_col='label'
# )

# plt.show()

In [None]:
# df_all = pd.read_csv('df_all.csv')
# fig, ax = create_boxplot_stripplot(
#     df_all,
#     score_col='match_ratio',
#     label_col='label'
# )

# ax.set_ylabel('Match Ratio Score', fontsize=14, fontweight='bold')
# ax.set_title('Perbandingan Match Ratio Score:\nClickbait vs Non-Clickbait',
#             fontsize=16, fontweight='bold', pad=20)

# plt.show()

In [None]:
# fig, ax = create_boxplot_stripplot(
#     df_all,
#     score_col='num_content_entities',
#     label_col='label'
# )

# ax.set_ylabel('Main Entity Consistency Score', fontsize=14, fontweight='bold')
# ax.set_title('Perbandingan Main Entity Consistency Score:\nClickbait vs Non-Clickbait',
#             fontsize=16, fontweight='bold', pad=20)

# plt.show()

In [None]:
# fig, ax = create_boxplot_stripplot(
#     df_all,
#     score_col='num_matched_entities',
#     label_col='label'
# )

# ax.set_ylabel('Jumlah Matched Entity', fontsize=14, fontweight='bold')
# ax.set_title('Perbandingan Jumlah Matched Entity:\nClickbait vs Non-Clickbait',
#             fontsize=16, fontweight='bold', pad=20)

# plt.show()

In [None]:
# fig, ax = create_boxplot_stripplot(
#     df_all,
#     score_col='num_missing_entities',
#     label_col='label'
# )

# ax.set_ylabel('Jumlah Missing Entity', fontsize=14, fontweight='bold')
# ax.set_title('Perbandingan Jumlah Missing Entity:\nClickbait vs Non-Clickbait',
#             fontsize=16, fontweight='bold', pad=20)

# plt.show()

In [None]:
# fig, ax = create_boxplot_stripplot(
#     df_all,
#     score_col='type_match_ratio',
#     label_col='label'
# )

# ax.set_ylabel('Rasio Type Match', fontsize=14, fontweight='bold')
# ax.set_title('Perbandingan Rasio Type Match:\nClickbait vs Non-Clickbait',
#             fontsize=16, fontweight='bold', pad=20)

# plt.show()

In [None]:
# fig, ax = create_boxplot_stripplot(
#     df_all,
#     score_col='avg_similarity',
#     label_col='label'
# )

# ax.set_ylabel('Rata-Rata Similarity Entitas', fontsize=14, fontweight='bold')
# ax.set_title('Perbandingan Rata-Rata Similarity Entitas:\nClickbait vs Non-Clickbait',
#             fontsize=16, fontweight='bold', pad=20)

# plt.show()

In [None]:
# fig, ax = create_boxplot_stripplot(
#     df_all,
#     score_col='max_similarity',
#     label_col='label'
# )

# ax.set_ylabel('Jumlah Kesamaan Entitas Tertinggi', fontsize=14, fontweight='bold')
# ax.set_title('Perbandingan Jumlah Kesamaan Entitas Tertinggi:\nClickbait vs Non-Clickbait',
#             fontsize=16, fontweight='bold', pad=20)

# plt.show()

In [None]:
# fig, ax = create_boxplot_stripplot(
#     df_all,
#     score_col='min_similarity',
#     label_col='label'
# )

# ax.set_ylabel('Jumlah Kesamaan Entitas Terendah', fontsize=14, fontweight='bold')
# ax.set_title('Perbandingan Jumlah Kesamaan Entitas Terendah:\nClickbait vs Non-Clickbait',
#             fontsize=16, fontweight='bold', pad=20)

# plt.show()

In [None]:
# fig, ax = create_boxplot_stripplot(
#     df_all,
#     score_col='avg_mention_freq',
#     label_col='label'
# )

# ax.set_ylabel('Rata-Rata Frekuensi Kemunculan', fontsize=14, fontweight='bold')
# ax.set_title('Perbandingan Rata-Rata Frekuensi Kemunculan:\nClickbait vs Non-Clickbait',
#             fontsize=16, fontweight='bold', pad=20)

# plt.show()

In [None]:
# fig, ax = create_boxplot_stripplot(
#     df_all,
#     score_col='max_mention_freq',
#     label_col='label',
#     sample_size=500,
# )

# ax.set_ylabel('Frekuensi Kemunculan Tertinggi', fontsize=14, fontweight='bold')
# ax.set_title('Perbandingan Frekuensi Kemunculan Tertinggi:\nClickbait vs Non-Clickbait',
#             fontsize=16, fontweight='bold', pad=20)

# plt.show()

In [None]:
# fig, ax = create_boxplot_stripplot(
#     df_all,
#     score_col='entities_in_lead',
#     label_col='label',
#     sample_size=500,
# )

# ax.set_ylabel('Jumlah Entitas Judul di Lead Konten', fontsize=14, fontweight='bold')
# ax.set_title('Perbandingan Jumlah Entitas Judul di Lead Konten:\nClickbait vs Non-Clickbait',
#             fontsize=16, fontweight='bold', pad=20)

# plt.show()

In [None]:
# Box + strip plot comparing main_entity_coverage between the two labels.
fig, ax = create_boxplot_stripplot(df, score_col='main_entity_coverage',
                                   label_col='label', sample_size=500)

# Replace the helper's generic title / y-label with feature-specific text.
ax.set_title(
    'Perbandingan Main Entity Coverage:\n'
    'Clickbait vs Non-Clickbait',
    fontsize=16, fontweight='bold', pad=20,
)
ax.set_ylabel('Main Entity Coverage', fontsize=14, fontweight='bold')

plt.show()

In [None]:
# Box + strip plot comparing main_entity_frequency between the two labels.
# Uses `df` like the other main_entity_* plots in this section; the previous
# `df_all` has no live assignment visible in this part of the notebook, which
# breaks Restart & Run All.
# NOTE(review): confirm `df` carries 'main_entity_frequency'. If the column is
# a raw count rather than a 0-1 ratio, the helper's fixed (-0.05, 1.05)
# y-range will clip it - verify the rendered figure.
fig, ax = create_boxplot_stripplot(
    df,
    score_col='main_entity_frequency',
    label_col='label',
    sample_size=500,
)

ax.set_ylabel('Main Entity Frequency', fontsize=14, fontweight='bold')
ax.set_title('Perbandingan Main Entity Frequency:\nClickbait vs Non-Clickbait',
            fontsize=16, fontweight='bold', pad=20)

plt.show()

In [None]:
# Box + strip plot comparing main_entity_avg_position between the two labels.
fig, ax = create_boxplot_stripplot(df, score_col='main_entity_avg_position',
                                   label_col='label', sample_size=500)

# Feature-specific title (with the scale hint on its own line) and y-label.
ax.set_title(
    'Perbandingan Main Entity Average Position:\n'
    'Clickbait vs Non-Clickbait\n'
    '(0=early, 1=late)',
    fontsize=16, fontweight='bold', pad=20,
)
ax.set_ylabel('Main Entity Average Position', fontsize=14, fontweight='bold')

plt.show()

In [None]:
# Box + strip plot comparing main_entity_consistency_score between the two labels.
fig, ax = create_boxplot_stripplot(df, score_col='main_entity_consistency_score',
                                   label_col='label', sample_size=500)

# Replace the helper's generic title / y-label with feature-specific text.
ax.set_title(
    'Perbandingan Main Entity Consistency Score:\n'
    'Clickbait vs Non-Clickbait',
    fontsize=16, fontweight='bold', pad=20,
)
ax.set_ylabel('Main Entity Consistency Score', fontsize=14, fontweight='bold')

plt.show()

In [None]:
# Box + strip plot for main_entity_consistency_score.
# NOTE(review): this cell plots the same column from the same frame with the
# same labels as the cell before it - presumably a copy-paste leftover or a
# placeholder for another feature; confirm and remove or retarget it.
fig, ax = create_boxplot_stripplot(
    df, score_col='main_entity_consistency_score', label_col='label', sample_size=500
)

ax.set_title('Perbandingan Main Entity Consistency Score:\n' 'Clickbait vs Non-Clickbait',
             fontsize=16, fontweight='bold', pad=20)
ax.set_ylabel('Main Entity Consistency Score', fontweight='bold', fontsize=14)

plt.show()