In [None]:
# import nltk
# import os

# custom_nltk_data_path = "C:\\nltk_data"
# if not os.path.exists(custom_nltk_data_path):
#     try:
#         os.makedirs(custom_nltk_data_path)
#     except Exception as e:
#         print(f"Could not create folder {custom_nltk_data_path}: {e}")

# if custom_nltk_data_path not in nltk.data.path:
#     nltk.data.path.append(custom_nltk_data_path)

# print(f"Attempting to download 'punkt' to {custom_nltk_data_path}...")
# nltk.download('punkt', download_dir=custom_nltk_data_path, quiet=False, force=True)


# print(f"Attempting to download 'punkt_tab' to {custom_nltk_data_path}...")
# nltk.download('punkt_tab', download_dir=custom_nltk_data_path, quiet=False, force=True)


# print(f"Attempting to download 'stopwords' to {custom_nltk_data_path}...")
# nltk.download('stopwords', download_dir=custom_nltk_data_path, quiet=False, force=True)

# print("Download attempts finished.")

# # Quick test
# try:
#     from nltk.tokenize import word_tokenize
#     print("Successfully imported word_tokenize.")
#     test_tokens = word_tokenize("This is a test sentence for tokenization.")
#     print("word_tokenize test successful:", test_tokens)
# except Exception as e:
#     print("Error during quick test of word_tokenize:", e)

Attempting to download 'punkt' to C:\nltk_data...


[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Attempting to download 'punkt_tab' to C:\nltk_data...


[nltk_data] Downloading package punkt_tab to C:\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Attempting to download 'stopwords' to C:\nltk_data...


[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Download attempts finished.
Successfully imported word_tokenize.
word_tokenize test successful: ['This', 'is', 'a', 'test', 'sentence', 'for', 'tokenization', '.']


In [None]:
# --- Imports and Article Fetching/Preprocessing Setup ---

import nltk
import os 
from newspaper import Article 
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# --- Point NLTK to a specific data directory ---
# NOTE(review): this is a hard-coded, Windows-style absolute path; adjust it
# when running the notebook on another machine or operating system.
custom_nltk_data_path = "C:\\nltk_data"

if not os.path.exists(custom_nltk_data_path):
    try:
        os.makedirs(custom_nltk_data_path)
        print(f"Created NLTK data directory: {custom_nltk_data_path}")
    except Exception as e:
        print(f"Could not create NLTK data directory {custom_nltk_data_path}. Please create it manually. Error: {e}")

# Register the directory so nltk.data.find() and the corpus loaders search it.
if custom_nltk_data_path not in nltk.data.path:
    nltk.data.path.append(custom_nltk_data_path)
    print(f"Added {custom_nltk_data_path} to NLTK data path.")
else:
    print(f"{custom_nltk_data_path} is already in NLTK data path.")

# Every NLTK resource this notebook relies on, as
# (lookup path for nltk.data.find, package id for nltk.download, label for messages).
# 'punkt_tab' is listed next to 'punkt' because recent NLTK releases load the
# word_tokenize() model from 'punkt_tab'; checking only 'punkt' lets this setup
# succeed while word_tokenize() later raises LookupError on a fresh machine.
required_nltk_resources = (
    ('corpora/stopwords', 'stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt', 'punkt tokenizer'),
    ('tokenizers/punkt_tab', 'punkt_tab', 'punkt_tab tokenizer data'),
)

for resource_path, package_id, resource_label in required_nltk_resources:
    try:
        # Succeeds if the resource exists in ANY directory on nltk.data.path.
        nltk.data.find(resource_path)
        print(f"NLTK {resource_label} found.")
    except LookupError:
        print(f"Downloading NLTK {resource_label} to {custom_nltk_data_path}...")
        # quiet=False keeps the downloader's progress visible in the cell output.
        downloaded = nltk.download(package_id, download_dir=custom_nltk_data_path, quiet=False)
        if not downloaded:
            # The downloader returned a falsy result instead of raising; surface it
            # here so a later failure in this notebook is easier to trace.
            print(f"Warning: could not download NLTK package '{package_id}'.")


# English stopword set used by the preprocessing function to filter tokens.
stop_words_nltk = set(stopwords.words('english'))

def fetch_and_preprocess_article_nltk(url):
    """Download one article and clean its body text with NLTK.

    Args:
        url: Address of the article to download and parse.

    Returns:
        A ``(raw_text, title, preprocessed_text)`` tuple.

        * ``(None, None, None)`` if downloading or parsing raised an exception.
        * ``(None, title, None)`` if parsing worked but produced no body text.
        * Otherwise the untouched body text, the title (or the placeholder
          ``"Title not found"``), and the cleaned text: lower-cased, non-word
          characters replaced by spaces, tokenized, then stripped of
          stopwords, tokens of two or fewer characters, and tokens that are
          not purely alphabetic.
    """
    print(f"Attempting to fetch article from: {url}")
    try:
        page = Article(url)
        page.download()
        page.parse()
        raw_text, title = page.text, page.title

        if title:
            print(f"Successfully fetched: {title}")
        else:
            # An empty/None title is replaced by a placeholder so callers
            # always receive a printable string.
            title = "Title not found"
            print("Warning: Article title not found by newspaper3k.")

        # Only a warning: a short body is still processed further down.
        looks_truncated = (not raw_text) or len(raw_text) < 100
        if looks_truncated:
            print(f"Warning: Fetched raw text is very short or None for {url}. Content might be missing or paywalled/JS-rendered.")
    except Exception as e:
        print(f"Error fetching/parsing article from {url}: {e}")
        return None, None, None

    if not raw_text:
        print(f"No content found for article (raw_text is None): {url}")
        return None, title, None

    # Normalise: lower-case, turn every non-word character into a space,
    # then collapse whitespace runs and trim the ends.
    normalized = re.sub(r'\s+', ' ', re.sub(r'\W', ' ', raw_text.lower())).strip()

    # Keep tokens that are not stopwords, are longer than two characters,
    # and consist only of letters.
    kept_tokens = []
    for token in word_tokenize(normalized):
        if token in stop_words_nltk:
            continue
        if len(token) <= 2:
            continue
        if not token.isalpha():
            continue
        kept_tokens.append(token)

    return raw_text, title, " ".join(kept_tokens)

# --- Test the function ---
# Fetch a single article and print previews of its raw and cleaned text.
test_article_url = 'https://www.bbc.com/news/articles/c87j5v4xjxqo'

print("--- Testing Article Fetching and NLTK Preprocessing ---")
raw_content, article_title, processed_content_nltk = fetch_and_preprocess_article_nltk(test_article_url)

# Document lists for the later BERTopic step; they stay empty unless the
# fetch returned text.
documents_for_bertopic = []
processed_documents_for_bertopic = []

if not raw_content:
    print(f"\nCould not get raw content for article: {test_article_url}")
else:
    print(f"\nTitle: {article_title}")
    print("\n--- Raw Content (first 500 chars) ---")
    print(raw_content[:500] + "...")

    documents_for_bertopic = [raw_content]

    if processed_content_nltk:
        print("\n--- NLTK Processed Content (first 1000 chars) ---")
        print(processed_content_nltk[:1000] + "...")
        processed_documents_for_bertopic = [processed_content_nltk]
    else:
        print("\n--- NLTK Processed Content: Not generated (likely due to issues with raw_content or tokenization).")

C:\nltk_data is already in NLTK data path.
NLTK stopwords found.
NLTK punkt tokenizer found.
--- Testing Article Fetching and NLTK Preprocessing ---
Attempting to fetch article from: https://www.bbc.com/news/articles/c87j5v4xjxqo
Successfully fetched: Sidhu Moose Wala: Gangster tells BBC why India's biggest hip-hop star was murdered

Title: Sidhu Moose Wala: Gangster tells BBC why India's biggest hip-hop star was murdered

--- Raw Content (first 500 chars) ---
Gangster tells BBC why India's biggest hip-hop star was murdered

11 June 2025 Share Save Soutik Biswas & Ishleen Kaur BBC Eye Investigations Share Save

BBC Sidhu Moose Wala was shot dead in a hail of bullets in 2022

It was a killing that shocked India: Punjabi hip-hop star Sidhu Moose Wala shot dead through the windscreen of his car by hired gunmen. Within hours, a Punjabi gangster named Goldy Brar had used Facebook to claim responsibility for ordering the hit. But three years after the murde...

--- NLTK Processed Content (fi