In [2]:
import nltk, csv
from nltk import pos_tag, ne_chunk

# A function to extract place names
def extract_place_names(input_file, encoding="utf-8"):
    """Return the GPE-labelled entity names NLTK finds in a text file.

    The whole file is read and split on whitespace. The resulting tokens
    are tagged with ``pos_tag`` and chunked with ``ne_chunk``, and every
    top-level chunk labelled ``GPE`` is collected as a string, with the
    words of a multi-word chunk joined by single spaces.

    Progress messages are printed to stdout at each stage.

    Args:
        input_file: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform's locale
            encoding.

    Returns:
        A list of place-name strings in the order they were found;
        duplicates are kept.
    """
    print(f"Opening and reading file: {input_file}")
    # An explicit encoding avoids locale-dependent decoding of non-ASCII names.
    with open(input_file, 'r', encoding=encoding) as f:
        tokens = f.read().split()

    print("Tagging parts of speech...")
    tagged_tokens = pos_tag(tokens)

    print("Performing named entity recognition...")
    chunk_tree = ne_chunk(tagged_tokens)

    print("Extracting place names...")
    # Only Tree children are considered; anything else in the chunk result
    # is skipped by the isinstance check.
    return [
        ' '.join(word for word, _tag in entity.leaves())
        for entity in chunk_tree
        if isinstance(entity, nltk.tree.Tree) and entity.label() == 'GPE'
    ]

# Define a function to write output to a CSV file
def write_to_csv(filename, data, encoding="utf-8"):
    """Write each item of *data* as its own single-column CSV row.

    The target file is created or truncated. An empty *data* produces an
    empty file.

    Args:
        filename: Path of the CSV file to write.
        data: Iterable of values; each becomes one row with one field.
        encoding: Text encoding of the output file. Defaults to UTF-8 so
            non-ASCII values are written the same way on every platform
            instead of depending on the locale encoding.
    """
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding=encoding) as f:
        # Wrap every item in a list so it is written as one field rather
        # than being iterated character by character.
        csv.writer(f).writerows([item] for item in data)

# Test run: extract place names from the input file below, print the
# resulting list, and write it one name per row to the *_test_output.csv
# file. Change the input path to whichever source you want to test against.
print("Starting place name extraction test...")
place_names = extract_place_names('../output/tokenized_merged_titles.txt')
print(place_names)
write_to_csv('../output/tokenized_merged_titles_test_output.csv', place_names)
# NOTE: the message below shows the output path without the leading '../'
# that the write_to_csv call above actually uses.
print("Place name extraction test complete. See results in output/tokenized_merged_titles_test_output.csv\n")

# Full job: extract place names from the input file below, print the
# resulting list, and write it one name per row to the
# extracted_entities_*.csv file. Change the input path to your desired source.
# NOTE(review): the input path here is the same one used by the test run,
# so the extraction is repeated on the same file unless a path is changed.
print("Starting place name extraction job...")
place_names = extract_place_names('../output/tokenized_merged_titles.txt')
print(place_names)
write_to_csv('../output/extracted_entities_tokenized_merged_titles.csv', place_names)
# NOTE: the message below shows the output path without the leading '../'
# that the write_to_csv call above actually uses.
print("Place name extraction job complete. See results in output/extracted_entities_tokenized_merged_titles.csv\n")




Starting place name extraction test...
Opening and reading file: ../output/tokenized_merged_titles.txt
Tagging parts of speech...
Performing named entity recognition...
Extracting place names...
['Munich', 'England', 'English', 'Leicestershire', 'Munich', 'Germany', 'Germans', 'English', 'English', 'Heines', 'Munich', 'Munich', 'German', 'Munich', 'Heines', 'England', 'Munich', 'Munich', 'English', 'England', 'English', 'Munich', 'English', 'German', 'English', 'English', 'German', 'English', 'Munich', 'New', 'Isa', 'New', 'England', 'English', 'English', 'England', 'Isa', 'Isa', 'London', 'Cheery', 'Isa', 'Niece', 'Bavaria', 'Bavaria', 'English', 'Berkshire', 'Rome', 'Rome', 'English', 'British', 'Rome', 'English', 'Americans', 'Italian', 'American', 'Rome', 'Rome', 'Mackinnon', 'Mackinnon', 'English', 'Ireland', 'Mackinnon', 'Rome', 'Cherub', 'Campidoglio', 'Pollux', 'Castor', 'Mackinnon', 'American', 'English', 'Mackinnon', 'Campidoglio', 'French', 'Mackinnon', 'Rome', 'Praetorian',