## 📦 Step 1: Install Required Libraries

In [None]:
!pip install keybert
!pip install yake
!pip install rake-nltk
!pip install transformers
!pip install sentence-transformers
!pip install datasets
!pip install nltk
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install scikit-learn
!pip install pandas numpy

## 📚 Step 2: Import Libraries

In [None]:
import json
import logging
import os
import pickle
import warnings
import zipfile
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from collections import Counter

# Keyword Extraction Libraries
from keybert import KeyBERT
import yake
from rake_nltk import Rake

# NLP Libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Transformer Models
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

# Dataset
from datasets import load_dataset

print("‚úÖ All libraries imported successfully!")

## 📰 Step 3: Load News Datasets for Journalism/Research Domain

In [None]:
# Fetch the first 5,000 training rows of AG News from the Hugging Face hub.
print("Loading AG News Dataset...")
ag_news = load_dataset("ag_news", split="train[:5000]")

# Integer label -> human-readable category name.
label_map = {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}

# Tabular view of the split, with the category name added next to the label.
df_ag = pd.DataFrame({column: ag_news[column] for column in ('text', 'label')})
df_ag['category'] = df_ag['label'].map(label_map)

print("\nüìä Dataset Statistics:")
print(f"Total articles: {len(df_ag)}")
print("\nCategory Distribution:")
print(df_ag['category'].value_counts())

# Preview the first three articles, truncated to 300 characters each.
print("\nüìù Sample Articles:")
for article_number in (1, 2, 3):
    preview_row = df_ag.iloc[article_number - 1]
    print(f"\n--- Article {article_number} ({preview_row['category']}) ---")
    print(preview_row['text'][:300] + "...")

## 🧠 Step 4: Initialize Keyword Extraction Models

In [None]:
print("üîÑ Initializing KeyBERT Model (Best for Semantic Keywords)...")
# Using all-MiniLM-L6-v2 for balance between speed and quality
kw_model = KeyBERT(model='all-MiniLM-L6-v2')
print("‚úÖ KeyBERT initialized!")

print("\nüîÑ Initializing YAKE Model...")
# YAKE parameters optimized for news articles
yake_extractor = yake.KeywordExtractor(
    lan="en",
    n=3,  # max ngram size
    dedupLim=0.7,
    dedupFunc='seqm',
    windowsSize=1,
    top=10,
    features=None
)
print("‚úÖ YAKE initialized!")

print("\nüîÑ Initializing RAKE Model...")
rake_extractor = Rake(
    stopwords=stopwords.words('english'),
    punctuations=None,
    language='english',
    max_length=3,
    min_length=1
)
print("‚úÖ RAKE initialized!")

print("\nüéâ All models ready!")

## 🔍 Step 5: Define Keyword Extraction Functions

In [None]:
def extract_keywords_keybert(text, top_n=10, keyphrase_ngram_range=(1, 3),
                              use_mmr=True, diversity=0.5):
    """
    Extract keywords using KeyBERT (Semantic-based)
    Best for: Understanding context and meaning

    Args:
        text: Document to extract keywords from.
        top_n: Maximum number of keywords to return.
        keyphrase_ngram_range: (min, max) n-gram length of candidate phrases.
        use_mmr: Forwarded to KeyBERT's ``use_mmr`` option.
        diversity: Forwarded to KeyBERT's ``diversity`` option.

    Returns:
        List of (keyword, score) tuples with scores rounded to 4 decimals.
        Extraction is best-effort: if anything fails, the failure is logged
        as a warning and an empty list is returned.
    """
    try:
        keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=keyphrase_ngram_range,
            stop_words='english',
            use_mmr=use_mmr,
            diversity=diversity,
            top_n=top_n
        )
        return [(kw, round(score, 4)) for kw, score in keywords]
    except Exception:
        # Keep the best-effort contract, but leave a trace: a silent [] is
        # indistinguishable from "no keywords found".
        logging.getLogger(__name__).warning(
            "KeyBERT keyword extraction failed", exc_info=True
        )
        return []

def extract_keywords_yake(text, top_n=10):
    """
    Extract keywords using YAKE (Statistical-based)
    Best for: Fast extraction, no training needed

    Args:
        text: Document to extract keywords from.
        top_n: Maximum number of keywords to return (also limited by the
            ``top`` value the shared YAKE extractor was built with).

    Returns:
        List of (keyword, score) tuples. YAKE's own score is "lower is
        better", so it is flipped to ``1 - score`` (rounded to 4 decimals)
        to be "higher is better" like the other extractors. Extraction is
        best-effort: failures are logged as a warning and yield [].
    """
    try:
        keywords = yake_extractor.extract_keywords(text)
        # YAKE scores are not capped at 1, so clamp at 0: a weak keyword must
        # not get a negative score that would subtract from an ensemble total.
        return [
            (kw, round(max(0.0, 1 - score), 4))
            for kw, score in keywords[:top_n]
        ]
    except Exception:
        logging.getLogger(__name__).warning(
            "YAKE keyword extraction failed", exc_info=True
        )
        return []

def extract_keywords_rake(text, top_n=10):
    """
    Extract keywords using RAKE (Graph-based)
    Best for: Multi-word keyphrases

    Args:
        text: Document to extract keywords from.
        top_n: Maximum number of phrases to return.

    Returns:
        List of (phrase, score) tuples. Scores are divided by the highest
        score among the returned phrases, so the best phrase scores 1.0;
        values are rounded to 4 decimals. Extraction is best-effort:
        failures are logged as a warning and yield [].
    """
    try:
        rake_extractor.extract_keywords_from_text(text)
        # RAKE yields (score, phrase) pairs, the reverse of the other extractors.
        ranked = rake_extractor.get_ranked_phrases_with_scores()[:top_n]
        if not ranked:
            return ranked
        max_score = max(score for score, _ in ranked)
        return [(kw, round(score / max_score, 4)) for score, kw in ranked]
    except Exception:
        # E.g. missing NLTK tokenizer data: keep returning [], but say why.
        logging.getLogger(__name__).warning(
            "RAKE keyword extraction failed", exc_info=True
        )
        return []

def extract_keywords_ensemble(text, top_n=10):
    """
    Ensemble method combining all three extractors
    Uses weighted voting for best results

    Each extractor contributes ``score * weight`` for every keyword it
    returns (KeyBERT 0.5, YAKE 0.3, RAKE 0.2). Keywords are merged
    case-insensitively and their contributions are summed.

    Args:
        text: Document to extract keywords from.
        top_n: Maximum number of keywords to return.

    Returns:
        List of (lower-cased keyword, combined score) tuples sorted by
        descending score, with scores rounded to 4 decimals.
    """
    # Ask every extractor for at least 15 candidates, and for more when the
    # caller wants more than 15 results; a fixed pool of 15 starved larger
    # top_n requests.
    candidate_n = max(15, top_n)

    # (keywords, weight) per extractor; KeyBERT weighs most for semantic quality.
    weighted_sources = (
        (extract_keywords_keybert(text, top_n=candidate_n), 0.5),
        (extract_keywords_yake(text, top_n=candidate_n), 0.3),
        (extract_keywords_rake(text, top_n=candidate_n), 0.2),
    )

    keyword_scores = {}
    for keywords, weight in weighted_sources:
        for kw, score in keywords:
            kw_lower = kw.lower()
            keyword_scores[kw_lower] = keyword_scores.get(kw_lower, 0) + score * weight

    # Sort and return top keywords
    sorted_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)
    return [(kw, round(score, 4)) for kw, score in sorted_keywords[:top_n]]

print("‚úÖ All extraction functions defined!")

## 🧪 Step 6: Test Keyword Extraction on Sample Articles

In [None]:
# Test on one sample article per category.
# Take the first article of each category that occurs in the data, ordered by
# label id. Nothing guarantees that fixed row offsets (0, 1250, 2500, 3750)
# land on the World/Sports/Business/Sci-Tech categories.
sample_rows = df_ag.drop_duplicates(subset='category').sort_values('label')
sample_texts = sample_rows['text'].tolist()
sample_categories = sample_rows['category'].tolist()

print("=" * 80)
print("üîç KEYWORD EXTRACTION COMPARISON")
print("=" * 80)

for i, (text, category) in enumerate(zip(sample_texts, sample_categories)):
    print(f"\nüì∞ Article {i+1} - Category: {category}")
    print(f"Text: {text[:200]}...")
    print("-" * 40)

    # Show the five best keywords from each extractor for the same article.
    print("\nüîπ KeyBERT Keywords:")
    keybert_results = extract_keywords_keybert(text)
    for kw, score in keybert_results[:5]:
        print(f"   ‚Ä¢ {kw}: {score}")

    print("\nüîπ YAKE Keywords:")
    yake_results = extract_keywords_yake(text)
    for kw, score in yake_results[:5]:
        print(f"   ‚Ä¢ {kw}: {score}")

    print("\nüîπ RAKE Keywords:")
    rake_results = extract_keywords_rake(text)
    for kw, score in rake_results[:5]:
        print(f"   ‚Ä¢ {kw}: {score}")

    print("\nüîπ ENSEMBLE Keywords (Best):")
    ensemble_results = extract_keywords_ensemble(text)
    for kw, score in ensemble_results[:5]:
        print(f"   ‚Ä¢ {kw}: {score}")

    print("\n" + "=" * 80)

## 📊 Step 7: Benchmark and Evaluate Models

In [None]:
import time

# Benchmark on 100 articles
test_articles = df_ag['text'].head(100).tolist()

def benchmark_model(extractor_func, name, articles):
    """
    Time one keyword extractor over a collection of articles.

    Args:
        extractor_func: Callable taking one article and returning a sized
            collection of keywords.
        name: Label stored in the result under 'name'.
        articles: Iterable of article texts; may be empty.

    Returns:
        Dict with keys 'name', 'time' (total seconds), 'avg_keywords'
        (mean keywords per article) and 'time_per_article' (milliseconds),
        all numbers rounded to 2 decimals. With no articles the two
        averages are 0.0 instead of raising ZeroDivisionError.
    """
    # perf_counter is monotonic, so clock adjustments cannot skew the timing.
    start_time = time.perf_counter()
    keyword_counts = [len(extractor_func(article)) for article in articles]
    elapsed_time = time.perf_counter() - start_time

    n_articles = len(keyword_counts)
    if n_articles:
        avg_keywords = float(np.mean(keyword_counts))
        ms_per_article = elapsed_time / n_articles * 1000
    else:
        avg_keywords = 0.0
        ms_per_article = 0.0

    return {
        'name': name,
        'time': round(elapsed_time, 2),
        'avg_keywords': round(avg_keywords, 2),
        'time_per_article': round(ms_per_article, 2)
    }

print("‚è±Ô∏è Benchmarking models on 100 articles...\n")

# (display name, extractor) pairs, benchmarked in this order.
benchmark_targets = [
    ('KeyBERT', extract_keywords_keybert),
    ('YAKE', extract_keywords_yake),
    ('RAKE', extract_keywords_rake),
    ('Ensemble', extract_keywords_ensemble),
]

results = []
for model_label, extractor in benchmark_targets:
    results.append(benchmark_model(extractor, model_label, test_articles))
    print(f"‚úÖ {model_label}: {results[-1]['time']}s")

# Summary table; the headers follow the key order of the benchmark dicts.
divider = "=" * 60
print("\n" + divider)
print("üìä BENCHMARK RESULTS")
print(divider)
benchmark_df = pd.DataFrame(results)
benchmark_df.columns = ['Model', 'Total Time (s)', 'Avg Keywords', 'Time/Article (ms)']
print(benchmark_df.to_string(index=False))

## 💾 Step 8: Create Model Configuration and Save

In [None]:
# Output directory for everything that ships with the model.
MODEL_DIR = 'keyword_extraction_model'
os.makedirs(MODEL_DIR, exist_ok=True)

# Per-extractor settings, assembled into the full configuration below.
keybert_settings = {
    'model_name': 'all-MiniLM-L6-v2',
    'description': 'BERT-based semantic keyword extraction',
    'best_for': 'Semantic understanding, context-aware keywords',
}
yake_settings = {
    'language': 'en',
    'max_ngram': 3,
    'dedup_threshold': 0.7,
    'description': 'Statistical keyword extraction',
    'best_for': 'Fast extraction, no model loading',
}
rake_settings = {
    'language': 'english',
    'max_length': 3,
    'min_length': 1,
    'description': 'Graph-based keyword extraction',
    'best_for': 'Multi-word keyphrases',
}
ensemble_settings = {
    'weights': {'keybert': 0.5, 'yake': 0.3, 'rake': 0.2},
    'description': 'Combined extraction using weighted voting',
    'best_for': 'Best overall accuracy',
}

# Full configuration written to config.json.
model_config = {
    'name': 'Keyword Extraction System',
    'version': '1.0.0',
    'domain': 'Journalism/Research',
    'models': {
        'keybert': keybert_settings,
        'yake': yake_settings,
        'rake': rake_settings,
        'ensemble': ensemble_settings,
    },
    'default_params': {
        'top_n': 10,
        'diversity': 0.5,
        'use_mmr': True,
    },
    'stopwords': list(stopwords.words('english')),
}

# Persist the configuration as indented JSON.
config_path = os.path.join(MODEL_DIR, 'config.json')
with open(config_path, 'w') as config_file:
    json.dump(model_config, config_file, indent=2)

print("‚úÖ Model configuration saved!")

## 🔧 Step 9: Create Extraction Module for Deployment

In [None]:
# Create the main extraction module.
# The text below is the source of the standalone extractor.py shipped with the
# model. It sits inside a triple-single-quoted string, so the embedded code
# must not contain triple single quotes or backslash escapes.
extraction_module = '''
"""
Keyword Extraction Module
Domain: Journalism/Research
Models: KeyBERT, YAKE, RAKE, Ensemble
"""

import inspect
import json
import logging
import os
from keybert import KeyBERT
import yake
from rake_nltk import Rake
from nltk.corpus import stopwords
import nltk

logger = logging.getLogger(__name__)

# Download required NLTK data
nltk.download('punkt', quiet=True)
# Recent NLTK releases load the sentence tokenizer from 'punkt_tab'.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

class KeywordExtractor:
    def __init__(self, config_path=None):
        """
        Initialize the Keyword Extractor with all models

        Args:
            config_path: Optional path to a JSON configuration file. When it
                is missing or does not exist, built-in defaults are used.
        """
        # Load configuration
        if config_path and os.path.exists(config_path):
            with open(config_path, 'r', encoding='utf-8') as f:
                self.config = json.load(f)
        else:
            self.config = self._default_config()

        # Initialize models
        self._init_models()

    def _default_config(self):
        """Return the configuration used when no config file is given."""
        return {
            'models': {
                'keybert': {'model_name': 'all-MiniLM-L6-v2'},
                'yake': {'language': 'en', 'max_ngram': 3, 'dedup_threshold': 0.7},
                'rake': {'language': 'english', 'max_length': 3, 'min_length': 1},
                'ensemble': {'weights': {'keybert': 0.5, 'yake': 0.3, 'rake': 0.2}}
            },
            'default_params': {'top_n': 10, 'diversity': 0.5, 'use_mmr': True}
        }

    def _init_models(self):
        """Initialize all keyword extraction models"""
        # KeyBERT
        model_name = self.config['models']['keybert']['model_name']
        self.keybert_model = KeyBERT(model=model_name)

        # YAKE
        yake_config = self.config['models']['yake']
        self.yake_model = yake.KeywordExtractor(
            lan=yake_config['language'],
            n=yake_config['max_ngram'],
            dedupLim=yake_config['dedup_threshold'],
            dedupFunc='seqm',
            windowsSize=1,
            top=20
        )

        # RAKE
        rake_config = self.config['models']['rake']
        self.rake_model = Rake(
            stopwords=stopwords.words('english'),
            language=rake_config['language'],
            max_length=rake_config['max_length'],
            min_length=rake_config['min_length']
        )

    def extract_keybert(self, text, top_n=10, diversity=0.5):
        """Extract keywords using KeyBERT; returns [] (and logs) on failure"""
        try:
            keywords = self.keybert_model.extract_keywords(
                text,
                keyphrase_ngram_range=(1, 3),
                stop_words='english',
                use_mmr=True,
                diversity=diversity,
                top_n=top_n
            )
            return [(kw, round(score, 4)) for kw, score in keywords]
        except Exception as exc:
            logger.warning("KeyBERT keyword extraction failed: %s", exc)
            return []

    def extract_yake(self, text, top_n=10):
        """Extract keywords using YAKE; returns [] (and logs) on failure"""
        try:
            keywords = self.yake_model.extract_keywords(text)
            # YAKE scores are "lower is better" and not capped at 1: flip
            # them and clamp at 0 so weak keywords never score negative.
            return [
                (kw, round(max(0.0, 1 - score), 4))
                for kw, score in keywords[:top_n]
            ]
        except Exception as exc:
            logger.warning("YAKE keyword extraction failed: %s", exc)
            return []

    def extract_rake(self, text, top_n=10):
        """Extract keywords using RAKE; returns [] (and logs) on failure"""
        try:
            self.rake_model.extract_keywords_from_text(text)
            keywords = self.rake_model.get_ranked_phrases_with_scores()[:top_n]
            if keywords:
                max_score = max(score for score, _ in keywords)
                return [(kw, round(score/max_score, 4)) for score, kw in keywords]
            return []
        except Exception as exc:
            logger.warning("RAKE keyword extraction failed: %s", exc)
            return []

    def extract_ensemble(self, text, top_n=10):
        """Extract keywords using ensemble method"""
        weights = self.config['models']['ensemble']['weights']

        # Ask each extractor for at least 15 candidates, more when the
        # caller requests more than 15 results.
        candidate_n = max(15, top_n)
        weighted_sources = (
            (self.extract_keybert(text, top_n=candidate_n), weights['keybert']),
            (self.extract_yake(text, top_n=candidate_n), weights['yake']),
            (self.extract_rake(text, top_n=candidate_n), weights['rake']),
        )

        keyword_scores = {}
        for keywords, weight in weighted_sources:
            for kw, score in keywords:
                kw_lower = kw.lower()
                keyword_scores[kw_lower] = keyword_scores.get(kw_lower, 0) + score * weight

        sorted_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)
        return [(kw, round(score, 4)) for kw, score in sorted_keywords[:top_n]]

    def extract(self, text, method='ensemble', top_n=10, **kwargs):
        """
        Main extraction method

        Args:
            text: Input text to extract keywords from
            method: 'keybert', 'yake', 'rake', or 'ensemble' (default);
                unknown names fall back to 'ensemble'
            top_n: Number of keywords to return
            **kwargs: Extra options (e.g. diversity); only those the chosen
                extractor accepts are forwarded, the rest are ignored

        Returns:
            List of (keyword, score) tuples
        """
        methods = {
            'keybert': self.extract_keybert,
            'yake': self.extract_yake,
            'rake': self.extract_rake,
            'ensemble': self.extract_ensemble
        }

        extractor = methods.get(method.lower(), self.extract_ensemble)
        # Forward only the options this extractor takes, so an option meant
        # for one method (diversity for KeyBERT) cannot crash another.
        accepted = inspect.signature(extractor).parameters
        options = {key: value for key, value in kwargs.items() if key in accepted}
        return extractor(text, top_n=top_n, **options)

    def extract_all(self, text, top_n=10):
        """
        Extract keywords using all methods

        Returns:
            Dictionary with results from all methods
        """
        return {
            'keybert': self.extract_keybert(text, top_n),
            'yake': self.extract_yake(text, top_n),
            'rake': self.extract_rake(text, top_n),
            'ensemble': self.extract_ensemble(text, top_n)
        }
'''

# Save the extraction module
with open(os.path.join(MODEL_DIR, 'extractor.py'), 'w', encoding='utf-8') as f:
    f.write(extraction_module)

print("‚úÖ Extraction module saved!")

## 📦 Step 10: Create Requirements File

In [None]:
# One entry per line of the generated requirements.txt.
requirement_lines = [
    '# Keyword Extraction Requirements',
    'keybert>=0.7.0',
    'yake>=0.4.8',
    'rake-nltk>=1.0.6',
    'sentence-transformers>=2.2.0',
    'transformers>=4.25.0',
    'torch>=1.13.0',
    'nltk>=3.8.0',
    'scikit-learn>=1.2.0',
    'numpy>=1.23.0',
    'pandas>=1.5.0',
    'flask>=2.2.0',
    'flask-cors>=3.0.10',
    'gunicorn>=20.1.0',
]

# Surrounding newlines are part of the value; they are stripped when written.
requirements = '\n' + '\n'.join(requirement_lines) + '\n'

requirements_path = os.path.join(MODEL_DIR, 'requirements.txt')
with open(requirements_path, 'w') as requirements_file:
    requirements_file.write(requirements.strip())

print("‚úÖ Requirements file saved!")

## 🗜️ Step 11: Create ZIP File for Download

In [None]:
# Bundle every file under MODEL_DIR into one compressed archive.
ZIP_NAME = 'keyword_extraction_model.zip'

with zipfile.ZipFile(ZIP_NAME, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for folder, _subfolders, filenames in os.walk(MODEL_DIR):
        for filename in filenames:
            source_path = os.path.join(folder, filename)
            # Paths are stored relative to MODEL_DIR, so the files sit at the
            # archive root rather than inside a top-level folder.
            archive.write(source_path, os.path.relpath(source_path, MODEL_DIR))

print(f"‚úÖ ZIP file created: {ZIP_NAME}")
print(f"üì¶ File size: {os.path.getsize(ZIP_NAME) / 1024:.2f} KB")

# Re-open the archive read-only and list what was stored.
print("\nüìã ZIP Contents:")
with zipfile.ZipFile(ZIP_NAME, 'r') as archive:
    for member_name in archive.namelist():
        print(f"   ‚Ä¢ {member_name}")

## ⬇️ Step 12: Download the Model

In [None]:
# Download the ZIP file.
# google.colab exists only inside Colab; elsewhere (local Jupyter, CI) the
# import fails, so report where the archive is instead of crashing the cell.
try:
    from google.colab import files
except ImportError:
    in_colab = False
else:
    in_colab = True

if in_colab:
    print("üì• Downloading keyword_extraction_model.zip...")
    files.download(ZIP_NAME)
    print("\n‚úÖ Download complete!")
else:
    print(f"Not running in Google Colab - the archive is at: {os.path.abspath(ZIP_NAME)}")

print("\nüìù Next Steps:")
print("1. Extract the ZIP file")
# The archive stores its files at the root; there is no 'model' folder in it.
print("2. Place the extracted files (config.json, extractor.py, requirements.txt) in your Flask backend directory")
print("3. Run the Flask server")
print("4. Use the frontend to extract keywords!")

## 🧪 Step 13: Final Test - Complete Pipeline

In [None]:
# End-to-end check: run every extractor on one hand-written news paragraph.
sample_journalism_text = """
The Federal Reserve announced a significant interest rate hike on Wednesday, 
marking the fourth consecutive increase this year as the central bank continues 
its aggressive campaign to combat inflation. Fed Chair Jerome Powell stated that 
the decision was necessary to bring inflation back to the 2% target, despite 
concerns about potential economic slowdown. Wall Street reacted with mixed 
sentiments, with the Dow Jones Industrial Average initially dropping 300 points 
before recovering some losses. Economists predict that the housing market and 
consumer spending will be most affected by the rate changes. The unemployment 
rate remains historically low at 3.5%, providing some cushion against recession fears.
"""

heavy_rule = "=" * 70
light_rule = "-" * 70

print(heavy_rule)
print("üì∞ FINAL PIPELINE TEST - JOURNALISM ARTICLE")
print(heavy_rule)
print(f"\nüìù Sample Text:\n{sample_journalism_text}")
print("\n" + light_rule)

# Extract keywords using all methods
print("\nüîë EXTRACTED KEYWORDS:")
print(light_rule)

# (section heading, extractor, bullet glyph) for each method, in display order.
report_sections = [
    ("\n1Ô∏è‚É£ KeyBERT (Semantic-based):", extract_keywords_keybert, "‚ñ™"),
    ("\n2Ô∏è‚É£ YAKE (Statistical-based):", extract_keywords_yake, "‚ñ™"),
    ("\n3Ô∏è‚É£ RAKE (Graph-based):", extract_keywords_rake, "‚ñ™"),
    ("\n4Ô∏è‚É£ ENSEMBLE (Best Combined):", extract_keywords_ensemble, "‚òÖ"),
]

for heading, extractor, bullet in report_sections:
    print(heading)
    for kw, score in extractor(sample_journalism_text):
        print(f"   {bullet} {kw} ({score})")

print("\n" + heavy_rule)
print("‚úÖ PIPELINE TEST COMPLETE!")
print(heavy_rule)

## 📖 Documentation & Usage

### Model Summary

| Model | Type | Best For | Speed |
|-------|------|----------|-------|
| **KeyBERT** | Semantic (BERT) | Context-aware extraction | Medium |
| **YAKE** | Statistical | Fast extraction | Fast |
| **RAKE** | Graph-based | Multi-word phrases | Fast |
| **Ensemble** | Combined | Best accuracy | Slower |

### Dataset Reference
- **AG News**: 120K news articles from 4 categories (World, Sports, Business, Sci/Tech)
- **Source**: https://huggingface.co/datasets/ag_news

### Usage Example
```python
from extractor import KeywordExtractor

# Initialize
extractor = KeywordExtractor(config_path='config.json')

# Extract keywords
text = "Your news article text here..."
keywords = extractor.extract(text, method='ensemble', top_n=10)

for keyword, score in keywords:
    print(f"{keyword}: {score}")
```