# In [None]:
import csv
import random
import spacy
import en_core_web_lg

def load_data(file_path, num_samples=100):
    """Return a random sample of values from the second column of a CSV file.

    Every row with at least two fields contributes its second field
    (index 1); shorter rows are skipped. No header detection is done, so
    a header row, if present, is treated like any other row.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.
        num_samples: Maximum number of values to return. If the file holds
            fewer usable rows, all of them are returned.

    Returns:
        A list of at most ``num_samples`` strings in random order.
    """
    second_column_data = []
    # newline='' is required by the csv module: without it, universal-newline
    # translation rewrites "\r\n" inside quoted fields before the reader
    # sees them.
    with open(file_path, encoding='utf8', newline='') as f:
        for row in csv.reader(f):
            if len(row) >= 2:
                second_column_data.append(row[1])
    # Shuffle in place, then keep the leading slice as the sample.
    random.shuffle(second_column_data)
    return second_column_data[:num_samples]

def assemble_entity_dict(entity_dict, label, doc_list):
    """Store the text of every entity tagged ``label`` under ``entity_dict[label]``.

    Args:
        entity_dict: Dictionary to update in place; any existing value for
            ``label`` is overwritten.
        label: Entity label to match against each entity's ``label_``.
        doc_list: Iterable of objects exposing an ``ents`` iterable whose
            items have ``label_`` and ``text`` attributes.

    Returns:
        The same ``entity_dict`` object that was passed in.
    """
    # Document order, then entity order within each document, is preserved.
    entity_dict[label] = [
        span.text
        for document in doc_list
        for span in document.ents
        if span.label_ == label
    ]
    return entity_dict

# Load the spaCy pipeline from the en_core_web_lg package
nlp = en_core_web_lg.load()

# Load a random sample of headlines (second CSV column)
data = load_data("abcnews-date-text.csv", num_samples=100000)

# Process headlines with spaCy. nlp.pipe streams the texts through the
# pipeline in batches, which is much faster for a large sample than calling
# nlp() once per headline; list() materializes the docs because they are
# iterated once per label below.
doc_list = list(nlp.pipe(data))

# Create an empty dictionary to store named entities, keyed by label
entity_dict = {}

# List of named entity labels to collect
ner_labels = [
    'PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART',
    'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'
]

# Assemble the dictionary: one list of entity texts per label
for label in ner_labels:
    entity_dict = assemble_entity_dict(entity_dict, label, doc_list)

# Print out a few of the dictionary entries
print("Locations (LOC) found in the headlines:")
print(entity_dict['LOC'])

print("People (PERSON) found in the headlines:")
print(entity_dict['PERSON'])

print("Art  (WORK_OF_ART) found in the headlines:")
print(entity_dict['WORK_OF_ART'])

# Optional: Print out the number of entities for each label
print("\nNumber of entities found for each label:")
for label, entities in entity_dict.items():
    print(f"{label}: {len(entities)}")