# In [None]:
import csv
import random
import spacy
import en_core_web_sm
import en_core_web_lg
import time
from collections import Counter

def load_data(file_path, num_samples=100):
    """Return a random sample of second-column values from a CSV file.

    Every row with at least two fields contributes its second field; rows
    with fewer fields (including blank lines) are skipped. No header
    detection is performed, so a header row's second field is a candidate
    like any other value.

    Args:
        file_path: Path to a UTF-8 encoded CSV file.
        num_samples: Maximum number of values to return.

    Returns:
        A list of at most ``num_samples`` strings in random order (fewer if
        the file has fewer qualifying rows).
    """
    # newline="" hands line-ending handling to the csv module, so line
    # breaks embedded inside quoted fields are preserved verbatim instead
    # of being rewritten by universal-newline translation.
    with open(file_path, encoding='utf8', newline='') as f:
        second_column_data = [row[1] for row in csv.reader(f) if len(row) >= 2]
    random.shuffle(second_column_data)
    return second_column_data[:num_samples]

def process_model(nlp, data):
    """Run a pipeline over the space-joined texts and time the call.

    Args:
        nlp: Callable that takes a single string and returns an object
            exposing ``ents``, an iterable of items with ``text`` and
            ``label_`` attributes.
        data: Iterable of strings; they are joined with single spaces into
            one input text.

    Returns:
        A ``(entities, elapsed)`` tuple where ``entities`` is a list of
        ``(text, label)`` pairs and ``elapsed`` is the duration in seconds.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump when the system clock is
    # adjusted and is coarse on some platforms.
    start_time = time.perf_counter()
    raw_text = " ".join(data)
    doc = nlp(raw_text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    elapsed = time.perf_counter() - start_time
    return entities, elapsed

def run_benchmark(model_name, model, data, num_runs=5):
    """Time a model over several runs and count the entities it finds.

    Args:
        model_name: Human-readable label for the model. It is accepted for
            call-site readability and is not used in the computation.
        model: Pipeline object passed through to ``process_model``.
        data: Texts passed through to ``process_model``.
        num_runs: Number of timed repetitions; must be at least 1.

    Returns:
        A ``(avg_time, entity_counts)`` tuple: the mean run time in seconds
        and a ``Counter`` keyed by ``(text, label)`` pairs.

    Raises:
        ValueError: If ``num_runs`` is less than 1.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    total_time = 0.0
    entities = []
    for _ in range(num_runs):
        # Repetitions exist only to average the timing. Entities are taken
        # from a single run (the last one) so that frequencies reflect the
        # data rather than being multiplied by the number of repetitions.
        entities, run_time = process_model(model, data)
        total_time += run_time

    avg_time = total_time / num_runs
    entity_counts = Counter(entities)
    return avg_time, entity_counts

# Draw the random sample of texts that both models will process
data = load_data("abcnews-date-text.csv", num_samples=100)

# Load both spaCy pipelines before any benchmarking starts
sm_model = en_core_web_sm.load()
lg_model = en_core_web_lg.load()

# Benchmark each pipeline on the same sample
sm_time, sm_entities = run_benchmark("Small Model", sm_model, data)
lg_time, lg_entities = run_benchmark("Large Model", lg_model, data)

# Report average timings
print(f"Small Model Average Time: {sm_time:.4f} seconds")
print(f"Large Model Average Time: {lg_time:.4f} seconds")

# Report the ten most frequent (text, label) pairs for each pipeline
for heading, counter in (
    ("\nTop 10 Entities Found by Small Model:", sm_entities),
    ("\nTop 10 Entities Found by Large Model:", lg_entities),
):
    print(heading)
    for entity, count in counter.most_common(10):
        print(f"{entity}: {count}")

# Compare the sets of distinct entities found by each pipeline
sm_unique = set(sm_entities)
lg_unique = set(lg_entities)
only_in_lg = lg_unique - sm_unique
only_in_sm = sm_unique - lg_unique

print(f"\nUnique entities in Small Model: {len(sm_unique)}")
print(f"Unique entities in Large Model: {len(lg_unique)}")
print(f"Entities found by Large Model but not Small Model: {len(only_in_lg)}")
print(f"Entities found by Small Model but not Large Model: {len(only_in_sm)}")