<a href="https://colab.research.google.com/github/hsandaver/essays/blob/main/entity_extractor_wikipediav1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
import sys
import subprocess

def install(package):
    """Install *package* with pip, using the interpreter running this code.

    Runs ``<sys.executable> -m pip install --quiet <package>`` as a child
    process. ``subprocess.check_call`` raises ``CalledProcessError`` if that
    process exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", "--quiet", package]
    subprocess.check_call(pip_command)

# Install the third-party packages this notebook depends on, one at a time
# and in a fixed order.
for _package in ("pymupdf", "spacy", "SPARQLWrapper", "pandas"):
    install(_package)

# Load the spaCy "en_core_web_sm" pipeline into the module-level name `nlp`,
# downloading the model first if loading fails.
import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Loading raised OSError (treated here as "model not installed"): fetch
    # the model through the spaCy CLI, then retry the load once.
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")

import fitz  # PyMuPDF
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from google.colab import files
from IPython.display import display  # For displaying DataFrame

# Function to upload files in Colab
def upload_pdf():
    """Prompt for a file upload in Colab and return the uploaded entry's key.

    Returns:
        The first key of the object returned by ``files.upload()``
        (presumably the uploaded file's name; it is later passed to
        ``fitz.open``).

    Calls ``sys.exit()`` when nothing was uploaded.
    """
    print("Please upload your PDF file.")
    uploaded_files = files.upload()
    if uploaded_files:
        # Only the first uploaded entry is used; any further ones are ignored.
        return next(iter(uploaded_files))
    print("No file uploaded. Exiting.")
    sys.exit()

# Function to extract text from PDF
def extract_text_from_pdf(doc):
    """Return the text of every page of *doc*, concatenated in page order.

    Args:
        doc: An open document supporting ``len(doc)`` and
            ``doc.load_page(index)``, where each page offers ``get_text()``
            (the caller passes the object returned by ``fitz.open``).

    Returns:
        A single string with the pages' text joined without any separator;
        the empty string when the document has no pages.
    """
    # Build the result with one join instead of growing a string with `+=`
    # per page, which can degrade to quadratic time on long documents.
    return "".join(
        doc.load_page(page_num).get_text() for page_num in range(len(doc))
    )

# Function to perform entity recognition
def extract_person_entities(text, nlp_model):
    """Return the unique PERSON entity names found in *text*.

    Args:
        text: The text to analyse.
        nlp_model: A callable (e.g. a loaded spaCy pipeline) that takes the
            text and returns an object whose ``ents`` attribute is an
            iterable of entities exposing ``text`` and ``label_``.

    Returns:
        A list of distinct names in order of first appearance. Runs of
        whitespace inside a name (including line breaks carried over from
        the PDF layout) are collapsed to single spaces, and names of one
        character or less are dropped as likely false positives.
    """
    doc_nlp = nlp_model(text)
    # Collapse internal whitespace so "Alan\nTuring" and "Alan Turing" count
    # as the same person and the name stays usable in a one-line query.
    names = (
        " ".join(ent.text.split())
        for ent in doc_nlp.ents
        if ent.label_ == "PERSON"
    )
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is reproducible between runs (a set's order is not).
    return list(dict.fromkeys(name for name in names if len(name) > 1))

# Function to query Wikidata using SPARQL
def query_wikidata(entity):
    """Look up *entity* on Wikidata by exact English label.

    Sends a SPARQL query to ``https://query.wikidata.org/sparql`` asking for
    at most one item whose ``rdfs:label`` equals *entity* (language ``en``).

    Args:
        entity: The name to search for.

    Returns:
        A tuple ``(item_id, label, description)`` for the first match, where
        ``item_id`` is the last path segment of the item URI and ``label`` /
        ``description`` fall back to ``"N/A"`` when absent from the result.
        ``(None, None, None)`` when nothing matches or when the query fails
        (the error is printed, not raised).
    """
    # Escape the characters that may not appear raw inside a double-quoted
    # SPARQL string literal; otherwise a name containing a quote, backslash
    # or line break would break (or alter) the query.
    escaped_entity = entity.translate(str.maketrans({
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
    }))

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    query = f"""
    SELECT ?item ?itemLabel ?itemDescription WHERE {{
      ?item rdfs:label "{escaped_entity}"@en.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }} LIMIT 1
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    try:
        results = sparql.query().convert()
        bindings = results["results"]["bindings"]
        if not bindings:
            return None, None, None
        first = bindings[0]
        # The item value is a URI; its final path segment is the entity ID.
        item_id = first["item"]["value"].split('/')[-1]
        label = first.get("itemLabel", {}).get("value", "N/A")
        description = first.get("itemDescription", {}).get("value", "N/A")
        return item_id, label, description
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller carry on
        # with the remaining entities.
        print(f"Error querying Wikidata for entity '{entity}': {e}")
        return None, None, None

# Main processing function
def process_pdf():
    """Run the whole pipeline: upload a PDF, find people, look them up, export.

    Steps:
      1. Ask the user to upload a file (``upload_pdf``) and open it with
         ``fitz.open``.
      2. Extract the text and the unique PERSON entities in it, using the
         module-level ``nlp`` pipeline.
      3. Query Wikidata for each entity (``query_wikidata``); entities with
         no match get ``"N/A"`` in every Wikidata column.
      4. Display the results as a DataFrame, write them to
         ``person_entities.csv`` and trigger a Colab download of that file.

    Calls ``sys.exit()`` when the PDF cannot be opened or when no person
    entities are found. Errors while writing or downloading the CSV are
    printed rather than raised. Returns ``None``.
    """
    pdf_path = upload_pdf()
    print(f"Processing PDF: {pdf_path}")
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        sys.exit()

    # Release the document as soon as its text has been read, even if
    # extraction raises.
    try:
        pdf_text = extract_text_from_pdf(doc)
    finally:
        doc.close()
    print("Extracted text from PDF.")

    person_entities = extract_person_entities(pdf_text, nlp)
    print(f"Found {len(person_entities)} unique person entities.")

    if not person_entities:
        print("No person entities found in the PDF.")
        sys.exit()

    # Query Wikidata for each person entity and collect one row per entity.
    entity_data = []
    total = len(person_entities)
    for idx, entity in enumerate(person_entities, 1):
        print(f"Querying Wikidata for entity {idx}/{total}: {entity}")
        entity_id, label, description = query_wikidata(entity)
        if not entity_id:
            # No match (or the lookup failed): fill every Wikidata column
            # with the same placeholder.
            entity_id = label = description = "N/A"
        entity_data.append({
            "Name": entity,
            "Wikidata ID": entity_id,
            "Label": label,
            "Description": description,
        })

    # Convert the results to a pandas DataFrame for display.
    df = pd.DataFrame(entity_data)

    print("\n=== Person Entities Extracted ===")
    display(df)

    # Offer the results as a CSV download.
    try:
        # Let pandas write the file directly with an explicit encoding, so
        # non-ASCII names do not depend on the platform's default encoding.
        df.to_csv("person_entities.csv", index=False, encoding="utf-8")
        print("\nDownloading 'person_entities.csv'...")
        files.download('person_entities.csv')
    except Exception as e:
        print(f"Error downloading CSV: {e}")

# Run the pipeline only when this file is executed as the main module,
# not when it is imported by other code.
if __name__ == "__main__":
    process_pdf()