Note: This script needs to be run on a properly configured Dask cluster. It looks for files on a mounted Samba share at /picluster.

In [None]:
import csv
import glob

import nltk
from nltk import pos_tag, ne_chunk

# Wildcard pattern (note the '*') describing the source text files to read
input_files = '../sources/trollope/*.txt'
# Path of the CSV file the extracted results are written to
output_file = 'output/trollope_entities.csv'

# Pull the geo-political entity (GPE) names out of a single text file
def extract_place_names(input_file):
    """Return the GPE-labelled named entities NLTK finds in a text file.

    The file is read in full and split on whitespace. The resulting tokens
    are tagged with ``pos_tag`` and chunked with ``ne_chunk``. Every
    top-level chunk labelled ``GPE`` contributes one string made of its
    words joined by single spaces. A progress message is printed before
    each stage.

    Args:
        input_file: Path of the text file to read.

    Returns:
        A list of place-name strings, in the order the chunks occur in the
        chunked output. Repeated names are kept.
    """
    print(f"Opening and reading file: {input_file}")
    with open(input_file, 'r') as source:
        tokens = source.read().split()

    print("Tagging parts of speech...")
    tagged_tokens = pos_tag(tokens)

    print("Performing named entity recognition...")
    chunk_tree = ne_chunk(tagged_tokens)

    # Only subtrees carry an entity label; plain (word, tag) tuples are
    # tokens that were not recognised as part of any named entity.
    print("Extracting place names...")
    return [
        ' '.join(word for word, _tag in chunk.leaves())
        for chunk in chunk_tree
        if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'GPE'
    ]

# Save a collection of values as a one-column CSV file
def write_to_csv(filename, data):
    """Write each item of ``data`` to ``filename`` as a single-field row.

    The file is opened in ``'w'`` mode, so any existing content is replaced.

    Args:
        filename: Path of the CSV file to write.
        data: Iterable of values; each one becomes a row with one column.
    """
    # newline='' leaves line-ending handling to the csv module
    with open(filename, 'w', newline='') as out:
        csv.writer(out).writerows([value] for value in data)

# Run the extraction against every source file matching the input pattern
print("Starting place name extraction job...")

# `input_files` is a wildcard pattern rather than a single path, so it has to
# be expanded before anything can be opened. Sorting keeps the processing
# order (and therefore the CSV row order) reproducible between runs.
source_paths = sorted(glob.glob(input_files))
if not source_paths:
    print(f"No input files matched pattern: {input_files}")

# Collect the place names from all matched files into one flat list
place_names = []
for input_file in source_paths:
    place_names.extend(extract_place_names(input_file))

print(place_names)
write_to_csv(output_file, place_names)
print(f"Place name extraction complete. See results in {output_file}.\n")


