In [None]:
# Jupyter/Colab code follows:

In [2]:
!python -m spacy download en_core_web_sm 

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.6/12.8 MB 8.4 MB/s eta 0:00:02
     ------------ --------------------------- 3.9/12.8 MB 10.7 MB/s eta 0:00:01
     ------------------ --------------------- 6.0/12.8 MB 10.3 MB/s eta 0:00:01
     ---------------------- ----------------- 7.3/12.8 MB 9.3 MB/s eta 0:00:01
     ----------------------------- ---------- 9.4/12.8 MB 9.3 MB/s eta 0:00:01
     ------------------------------------ --- 11.5/12.8 MB 9.5 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 9.1 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import spacy
import pandas as pd

In [4]:
nlp = spacy.load("en_core_web_sm") # end of Jupyter/Colab specific

In [None]:
""" # script/github code follows:
import spacy
import pandas as pd

try:
     # If model is already downloaded, execute.
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # If the model isn't found, download and load.
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
# end of script/github specific """

In [None]:
# Code to run regardless of execution medium:

In [5]:
# Ensure that the tsv file is available.
df = pd.read_csv("haunted_places.tsv", sep="\t")

In [6]:
# Original Code prior to 3 APR
"""def extract_entities(text):
    doc = nlp(str(text)) # Ensure text is in string format
    return [(ent.label_, ent.text) for ent in doc.ents]

# apply NER to the 'description' column. This code takes about 10 minutes.
df['entities'] = df['description'].apply(extract_entities) """

In [6]:
# Start 3APR addition
def extract_entity_dict(text):
    doc = nlp(str(text))
    entity_dict = {}
    for ent in doc.ents:
        entity_dict.setdefault(ent.label_, []).append(ent.text)
    return entity_dict

In [7]:
# 3APR addition
# Apply function to description column. This code took 8 minutes to run.
df['entity_dict'] = df['description'].apply(extract_entity_dict)

In [8]:
# 3APR addition
# Get all unique entity labels from the dataset
all_labels = set()
df['entity_dict'].apply(lambda d: all_labels.update(d.keys()))

0        None
1        None
2        None
3        None
4        None
         ... 
10987    None
10988    None
10989    None
10990    None
10991    None
Name: entity_dict, Length: 10992, dtype: object

In [9]:
# Create separate columns for each label
for label in all_labels:
    df[label] = df['entity_dict'].apply(lambda d: d.get(label, []))


In [10]:
# Drop the intermediate dictionary column if not needed
df.drop(columns=['entity_dict'], inplace=True)

In [11]:
# Save to a new file.
ner_results = "ner_results_v2.tsv"

df.to_csv(ner_results, sep="\t", index=False)

print(f"NER complete. Results saved to {ner_results}.") 

NER complete. Results saved to ner_results_v2.tsv.
