Takes in a CSV file of data and the identifier of the column containing the title,
and attempts to extract recipient and sender information from each title.

Writes the new dataframe, with added recipient and sender columns, to a CSV file; each title and its detected entities are printed along the way.

In [132]:
import pandas as pd


In [133]:
# Configuration: edit the two values below before running the notebook.
# `path` is the CSV file that gets loaded; `title_col` identifies the column
# holding the titles (it is used to index both the dataframe and each row).
path = "./extracted_data/compiled/data_clean.csv"
title_col = 0


In [135]:
# Load the data; the first CSV column becomes the dataframe index.
df = pd.read_csv(path, index_col=0)

In [136]:
import spacy
import en_core_web_sm
from pprint import pprint

# This cell can be run individually to test the output of this script on any specific title in the index

# Entity texts to skip: they describe the document itself (e.g. "Ciphered
# Telegram") rather than a sender or recipient.
_IGNORED_ENTITIES = ("Ciphered Telegram", "Cde", "Incoming Cable", "Telegram", "Urgent", "Flash", "No.76.064")
# Entity labels that are never treated as a sender or recipient.
_IGNORED_LABELS = ("DATE", "CARDINAL")

# Lazily-loaded spaCy pipeline, shared by every call.
_NLP_MODEL = None


def _get_nlp():
    """Return the shared spaCy pipeline, loading the model on first use only."""
    global _NLP_MODEL
    if _NLP_MODEL is None:
        _NLP_MODEL = en_core_web_sm.load()
    return _NLP_MODEL


def extract_from_title(title, nlp=None, verbose=True):
    """Guess the recipient and sender named in a document title.

    The title is run through a spaCy pipeline. If it contains the standalone
    word "to", the first usable entity before it is taken as the sender and
    the first usable entity after it as the recipient; when no entity follows
    "to", the token immediately after it is used as the recipient instead.
    Without a "to", the first usable entity is taken as the sender and the
    recipient is left empty.

    Args:
        title: The title text. Anything that is not a non-blank string
            (e.g. a NaN from a missing CSV cell) yields ``("", "")``.
        nlp: Optional callable mapping a string to a spaCy ``Doc``. Defaults
            to the ``en_core_web_sm`` model, which is loaded once and reused.
        verbose: When true (the default), print the title and the list of
            ``(text, label)`` pairs for every detected entity.

    Returns:
        A ``(recipient, sender)`` tuple of strings; a missing value is ``""``.
    """
    if not isinstance(title, str) or not title.strip():
        return "", ""

    if nlp is None:
        nlp = _get_nlp()
    doc = nlp(title)

    if verbose:
        print(title)
        print([(X.text, X.label_) for X in doc.ents])

    # Entities that may stand for a sender or recipient, in document order.
    candidates = [
        entity for entity in doc.ents
        if entity.text not in _IGNORED_ENTITIES and entity.label_ not in _IGNORED_LABELS
    ]

    # Look for "to" as a token of its own, so that words merely containing
    # the letters (e.g. "Molotov") are not mistaken for it.
    to_token = next((token for token in doc if token.text == "to"), None)

    # No "to": assume the title names only a sender.
    if to_token is None:
        sender = candidates[0].text if candidates else ""
        return "", sender

    # Compare against each entity's own character offset rather than the first
    # place its text occurs, so names appearing twice are placed correctly.
    to_offset = to_token.idx
    sender = next((entity.text for entity in candidates if entity.start_char < to_offset), "")
    recipient = next((entity.text for entity in candidates if entity.start_char > to_offset), "")

    # If no entity follows "to", fall back to the token right after it
    # (when there is one), returned as plain text.
    if recipient == "":
        next_index = to_token.i + 1
        if next_index < len(doc):
            recipient = doc[next_index].text

    return recipient, sender

# Spot-check the extractor on a single title: the entry keyed 371 in the
# title column. Change the key to inspect a different title.
recipient, sender = extract_from_title(df[title_col][371])
print("recipient:", recipient)
print("sender:", sender)


Hungarian Embassy in Indonesia, Ciphered Telegram, 9 February 1983. Subject: The visit of a DPRK deputy foreign minister in Indonesia.
[('Hungarian Embassy', 'PERSON'), ('Indonesia', 'GPE'), ('Ciphered Telegram', 'PERSON'), ('9 February 1983', 'DATE'), ('DPRK', 'ORG'), ('Indonesia', 'GPE')]
recipient: 
sender: Hungarian Embassy


In [137]:
# Run the extractor on every row; `result_type='expand'` spreads each returned
# (recipient, sender) tuple over two columns labelled 0 and 1.
applied_df = df.apply(lambda row: extract_from_title(row[title_col]), axis='columns', result_type='expand')
# Name the new columns *before* joining them to `df`: renaming labels 0 and 1
# on the merged frame would also rename any original column carrying the same
# label (`title_col` is itself 0). `rename` returns a copy, so `applied_df`
# keeps its original column labels.
merged_df = pd.concat(
    [df, applied_df.rename(columns={0: "recipient", 1: "sender"})],
    axis='columns',
)
merged_df.to_csv("data_clean_with_receiver_sender.csv")

Telegram from Aleksandr Vasilevsky to Stalin
[('Aleksandr Vasilevsky', 'ORG'), ('Stalin', 'PERSON')]
Cable from Aleksandr Vasilevsky to Stalin
[('Aleksandr Vasilevsky', 'ORG'), ('Stalin', 'PERSON')]
Cable No. 121973, Meretskov and Shytkov to Cde. Stalin
[('121973', 'DATE'), ('Meretskov', 'PERSON'), ('Shytkov', 'PERSON'), ('Cde', 'ORG'), ('Stalin', 'PERSON')]
Telephone Message via VCh, I. Stalin to Kim Il Sung
[('I. Stalin', 'PERSON'), ('Kim Il Sung', 'PERSON')]
Telegram, Shtemenko and Vasilevsky to Stalin
[('Telegram', 'ORG'), ('Shtemenko', 'PERSON'), ('Vasilevsky', 'PERSON'), ('Stalin', 'PERSON')]
Ciphered Telegram from Shtykov
[('Ciphered Telegram', 'PERSON'), ('Shtykov', 'PERSON')]
Telegram, Shtykov to Stalin
[('Telegram', 'ORG'), ('Shtykov', 'PERSON'), ('Stalin', 'PERSON')]
Cable Telegram no. 59363 from Kovalev to Stalin, containing a message from Mao Zedong
[('59363', 'CARDINAL'), ('Kovalev', 'ORG'), ('Stalin', 'PERSON'), ('Mao Zedong', 'PERSON')]
Draft Reply to Mao Zedong's Teleg

In [138]:
# Also save only the extracted columns to a separate file.
applied_df.to_csv("test.csv")