In [1]:
import spacy
import os
import json
import pandas as pd
import time

In [None]:
# Path of the CSV file that receives the extracted entities.
CSV = "extracted_entities.csv"
# save_to_csv() looks up the module-level name OUTPUT_CSV, so define it in this
# configuration cell too; otherwise it only exists once a later cell has run.
OUTPUT_CSV = CSV

In [3]:
# Pick the compute device first: spaCy documents that prefer_gpu() must be
# called before any pipeline is loaded, otherwise the pipeline is allocated on
# the CPU and the call has no effect on it.
using_gpu = spacy.prefer_gpu()

# Load spaCy's multilingual NER model (supports Ukrainian)
print("[INFO] Loading spaCy model...")
nlp = spacy.load("xx_ent_wiki_sm")  # Small multilingual model, not Ukrainian-specific

nlp.enable_pipe("ner")  # Ensure NER is enabled

if using_gpu:
    print("[INFO] Using GPU acceleration for spaCy.")
else:
    print("[INFO] Running on CPU.")

[INFO] Loading spaCy model...
[INFO] Using GPU acceleration for spaCy.


In [4]:
# Number of processed articles that process_articles() buffers in memory
# before passing them to save_to_csv() and clearing the buffer.
BATCH_SIZE = 100  # Save progress every 100 articles

In [5]:
# Named-entity extraction helper built on the module-level spaCy pipeline
def extract_entities_from_text(text):
    """Run the global ``nlp`` pipeline on ``text`` and list its entities.

    Args:
        text: Raw text to analyse.

    Returns:
        A list with one ``{'text': ..., 'label': ...}`` dict per entity found
        in the document, in document order. If the pipeline raises, the error
        is printed and an empty list is returned instead.
    """
    found = []
    try:
        for span in nlp(text).ents:
            found.append({'text': span.text, 'label': span.label_})
    except Exception as err:
        # Best effort: one bad document must not stop the surrounding run.
        print(f"[ERROR] NER processing failed: {err}")
        return []
    return found

# Process and save entities in chunks
def process_articles(directory):
    """Extract entities from every JSON-lines file under ``directory``.

    Every file found by ``os.walk`` is read line by line. Each non-blank line
    is parsed as a JSON object with optional ``title`` and ``text`` keys.
    Articles with non-empty text are passed to ``extract_entities_from_text``
    and buffered; the buffer is handed to ``save_to_csv`` each time another
    ``BATCH_SIZE`` articles have been processed, and once more at the end for
    any remainder.

    Lines that are not valid JSON, or whose JSON value is not an object, are
    reported and skipped. Articles without text are reported and skipped.

    Args:
        directory: Root directory to walk.

    Returns:
        None. Progress is printed and results are written by ``save_to_csv``.
    """
    article_count = 0
    extracted_data = []
    start_time = time.time()

    print(f"[INFO] Starting entity extraction from {directory}...")

    # Walk through all files in the directory
    for root, _, files in os.walk(directory):
        for filename in files:
            file_path = os.path.join(root, filename)
            print(f"[INFO] Processing file: {file_path}")

            with open(file_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue

                    try:
                        article = json.loads(line)
                    except json.JSONDecodeError:
                        print(f"[WARNING] Invalid JSON in {file_path}. Skipping.")
                        continue

                    # Valid JSON that is not an object (list, number, ...) has
                    # no .get(); skip it instead of aborting the whole run.
                    if not isinstance(article, dict):
                        print(f"[WARNING] Non-object JSON in {file_path}. Skipping.")
                        continue

                    title = article.get('title', 'Unknown Title')
                    text = article.get('text', '')

                    if not text:
                        print(f"[WARNING] Skipping empty article: {title}")
                        continue

                    entities = extract_entities_from_text(text)
                    extracted_data.append({'title': title, 'text': text, 'entities': entities})
                    article_count += 1

                    # Flush only right after an article was added. Checking on
                    # every line would re-trigger the save (with an empty
                    # buffer) for each skipped line that follows a full batch.
                    if article_count % BATCH_SIZE == 0:
                        print(f"[INFO] Processed {article_count} articles. Saving to CSV...")
                        save_to_csv(extracted_data)
                        extracted_data = []  # Clear memory

    # Save any remaining data
    if extracted_data:
        print(f"[INFO] Final batch of {len(extracted_data)} articles. Saving to CSV.")
        save_to_csv(extracted_data)

    elapsed_time = time.time() - start_time
    print(f"[INFO] Finished processing {article_count} articles in {elapsed_time:.2f} seconds.")

# Save extracted entities to CSV
def save_to_csv(data, output_path=None):
    """Append one CSV row per extracted entity.

    Args:
        data: Iterable of dicts with keys ``title``, ``text`` and ``entities``,
            where ``entities`` is a list of ``{'text': ..., 'label': ...}``
            dicts. Articles whose ``entities`` list is empty contribute no
            rows.
        output_path: Destination file. When omitted, the module-level
            ``OUTPUT_CSV`` is used, so that name must be defined before the
            call.

    Returns:
        None. The columns written are ``title``, ``text``, ``entity`` and
        ``label``. The header is written only when the destination does not
        exist yet or is empty. Any failure is printed rather than raised, so a
        bad batch does not stop the caller.
    """
    try:
        destination = OUTPUT_CSV if output_path is None else output_path

        rows = [
            {'title': item['title'], 'text': item['text'], 'entity': ent['text'], 'label': ent['label']}
            for item in data for ent in item['entities']
        ]

        # A batch without entities would produce a column-less frame; writing
        # it to a new file emits a junk header line, and every later append
        # then skips the real header because the file already exists.
        if not rows:
            print(f"[INFO] No entities found in {len(data)} articles. Nothing written.")
            return

        df = pd.DataFrame(rows, columns=['title', 'text', 'entity', 'label'])

        needs_header = not os.path.exists(destination) or os.path.getsize(destination) == 0
        df.to_csv(destination, mode='a', index=False, encoding='utf-8', header=needs_header)
        print(f"[INFO] Saved {len(data)} articles to {destination}")

    except Exception as e:
        print(f"[ERROR] Failed to save to CSV: {e}")

In [10]:
# Stream the entity CSV in fixed-size chunks, clean each chunk, and write the
# result to a second CSV so the whole file never has to fit in memory.
PREPROCESSED_CSV = "processed_data.csv"
OUTPUT_CSV = "extracted_entities.csv"
chunk_size = 10000
first_chunk = True

for chunk in pd.read_csv(OUTPUT_CSV, chunksize=chunk_size):
    # Duplicates are removed within the current chunk only; identical rows
    # that land in different chunks are both kept.
    chunk = chunk.drop_duplicates().dropna(subset=["text"])

    # The first chunk creates the file with a header; later chunks append.
    if first_chunk:
        mode, header = "w", True
    else:
        mode, header = "a", False

    chunk.to_csv(PREPROCESSED_CSV, index=False, mode=mode, header=header)
    first_chunk = False
    print(f"[INFO] Saved {len(chunk)} rows")

[INFO] Saved 8025 rows
[INFO] Saved 8463 rows
[INFO] Saved 8022 rows
[INFO] Saved 7840 rows
[INFO] Saved 7861 rows
[INFO] Saved 8176 rows
[INFO] Saved 8074 rows
[INFO] Saved 8188 rows
[INFO] Saved 7934 rows
[INFO] Saved 8164 rows
[INFO] Saved 8128 rows
[INFO] Saved 8104 rows
[INFO] Saved 8371 rows
[INFO] Saved 8153 rows
[INFO] Saved 8259 rows
[INFO] Saved 8076 rows
[INFO] Saved 8154 rows
[INFO] Saved 8143 rows
[INFO] Saved 8181 rows
[INFO] Saved 8075 rows
[INFO] Saved 8168 rows
[INFO] Saved 8240 rows
[INFO] Saved 8277 rows
[INFO] Saved 8261 rows
[INFO] Saved 8296 rows
[INFO] Saved 7913 rows
[INFO] Saved 8186 rows
[INFO] Saved 8347 rows
[INFO] Saved 8087 rows
[INFO] Saved 8069 rows
[INFO] Saved 8187 rows
[INFO] Saved 8286 rows
[INFO] Saved 8218 rows
[INFO] Saved 7963 rows
[INFO] Saved 7737 rows
[INFO] Saved 8279 rows
[INFO] Saved 8359 rows
[INFO] Saved 8111 rows
[INFO] Saved 8241 rows
[INFO] Saved 8165 rows
[INFO] Saved 8376 rows
[INFO] Saved 8206 rows
[INFO] Saved 8016 rows
[INFO] Save

KeyboardInterrupt: 