Takes a CSV of documents and the label of the column that holds each document's title,
and attempts to extract the recipient and the sender from that title.

Prints each title with its detected entities, then writes a new CSV containing the original data plus `recipient` and `sender` columns.

In [1]:
import pandas as pd

# Configuration: edit the values below to run this notebook on another file.
# Alternative input kept for reference:
# path = "./extracted_data/compiled/data_clean.csv"
path = "z2_processed_data.csv"  # input CSV
title_col = "2"  # label of the column that holds the title

# The first CSV column is used as the DataFrame index.
df = pd.read_csv(path, index_col=0)

In [2]:
# Preprocessing to remove _ values
def clean_title(title):
    """Normalise separators and whitespace in a title string.

    Underscores and hyphens are replaced by spaces, then every run of
    whitespace is collapsed to a single space and leading/trailing
    whitespace is dropped.

    Args:
        title: The raw title. Non-string values (for example the NaN that
            pandas uses for an empty CSV cell) are returned unchanged
            instead of raising AttributeError.

    Returns:
        The cleaned string, or ``title`` itself when it is not a string.
    """
    if not isinstance(title, str):
        return title
    # Map both separator characters to a space in a single pass.
    spaced = title.translate(str.maketrans("_-", "  "))
    # split() with no argument collapses any run of whitespace.
    return " ".join(spaced.split())

# Clean the configured title column (title_col) rather than a hard-coded "2",
# so that changing the configuration also changes which column is cleaned.
df[title_col] = df[title_col].apply(clean_title)

In [62]:
import spacy
import en_core_web_sm
from pprint import pprint

# This cell can be run individually to test the output of this script on any specific title in the index

_NLP = None


def _get_nlp():
    """Load the spaCy pipeline on first use and reuse it for every later call."""
    global _NLP
    if _NLP is None:
        _NLP = en_core_web_sm.load()
    return _NLP


def extract_from_title(title, verbose=True):
    """Guess the recipient and the sender named in a document title.

    The title is run through spaCy's named-entity recogniser. When the title
    contains the word "to", the first accepted entity that starts before it is
    taken as the sender and the first accepted entity that starts after it is
    taken as the recipient. If no entity follows "to", the token right after
    "to" is used as the recipient. When the title has no "to", the first
    accepted entity is taken as the sender and the recipient is left empty.

    An entity is accepted when its text is not in ``ignored_entities`` and its
    label is not in ``ignored_labels``.

    Args:
        title: The title text to analyse.
        verbose: When True (the default), print the title and the list of
            (text, label) pairs for the detected entities.

    Returns:
        A ``(recipient, sender)`` tuple of strings; either element is ``""``
        when nothing was found.
    """
    doc = _get_nlp()(title)
    if verbose:
        print(title)
        print([(X.text, X.label_) for X in doc.ents])

    # a list for ignored entities, like ciphered telegram
    ignored_entities = ["Ciphered Telegram", "Cde", "Incoming Cable", "Telegram", "Urgent", "Flash", "No.76.064"]
    ignored_labels = ["DATE", "CARDINAL"]

    def is_candidate(entity):
        return entity.text not in ignored_entities and entity.label_ not in ignored_labels

    recipient = ""
    sender = ""

    # Look for "to" as a whole token. A plain substring search would also hit
    # the "to" inside words such as "Washington".
    to_token = next((token for token in doc if token.text == "to"), None)

    if to_token is not None:
        to_offset = to_token.idx
        for entity in doc.ents:
            if not is_candidate(entity):
                continue

            # Use the entity's own character offset: searching the title for
            # the entity text would return the first textual occurrence, which
            # may be a different mention.
            if sender == "" and entity.start_char < to_offset:
                sender = entity.text

            if recipient == "" and entity.start_char > to_offset:
                recipient = entity.text

            if recipient != "" and sender != "":
                break

        # If the recipient is still not found, fall back to the token right
        # after "to" (when there is one), returned as plain text.
        if recipient == "" and to_token.i + 1 < len(doc):
            recipient = doc[to_token.i + 1].text

    # When no "to" is found, set the sender as the first accepted entity
    # (assumes no receiver in title).
    else:
        sender = next((entity.text for entity in doc.ents if is_candidate(entity)), "")

    return recipient, sender

# Try the extractor on a single title (entry 100 of the title column) and show the result.
recipient, sender = extract_from_title(df[title_col][100])
print("recipient:", recipient)
print("sender:", sender)


Telegram from the Hungarian Embassy in Beijing China on the KWP’s 6th Congress 0
[('the Hungarian Embassy', 'ORG'), ('Beijing', 'GPE'), ('China', 'GPE'), ('KWP', 'ORG'), ('6th', 'ORDINAL'), ('Congress', 'ORG'), ('0', 'PRODUCT')]
recipient: 
sender: the Hungarian Embassy


In [63]:
# Run this cell to apply the above function to the entire specified csv, and output a new csv.

# Extract (recipient, sender) for every row; result_type='expand' spreads the
# returned pair over two columns labelled 0 and 1.
applied_df = df.apply(
    lambda row: extract_from_title(row[title_col]),
    axis='columns',
    result_type='expand',
)

# Append the two new columns to the original frame under readable names, then save.
merged_df = pd.concat([df, applied_df], axis='columns').rename(
    columns={0: "recipient", 1: "sender"}
)
merged_df.to_csv("z2_processed_data_with_receiver_sender.csv")

Alexei Adzhubei s Account of His Visit to Washington to the Central Committee of the Communist Party of the Soviet Union 0
[('Alexei Adzhubei s Account', 'PERSON'), ('Washington', 'GPE'), ('the Central Committee', 'ORG'), ('the Communist Party', 'ORG'), ('the Soviet Union', 'GPE')]
Notes on a Bulletin of the Korean News Agency 0
[('the Korean News Agency', 'ORG'), ('0', 'CARDINAL')]
TELEGRAM 075 205 from the Romanian Embassy in Tehran to the Romanian Ministry of Foreign Affairs 0
[('TELEGRAM', 'ORG'), ('075 205', 'CARDINAL'), ('the Romanian Embassy', 'ORG'), ('Tehran', 'GPE'), ('the Romanian Ministry of Foreign Affairs', 'ORG')]
Telegram from the Embassy in Beijing Smuggling at North Korean Embassy in Oslo 0
[('Embassy', 'LOC'), ('North Korean Embassy', 'ORG')]
Telegram from Norwegian Ambassador to China to Ministry of Foreign Affairs North Korea s Embassy in Oslo 0
[('Norwegian', 'NORP'), ('China', 'GPE'), ('Ministry of Foreign Affairs', 'ORG'), ('North Korea', 'GPE')]
Ciphered Telegr

In [138]:
# Write only the extracted columns (without the original data) to test.csv.
applied_df.to_csv("test.csv")

This next part matches each sender and recipient with a political party.
It takes as input a CSV for the knowledge base, a CSV for our corpus, and the names of the receiver and sender columns.
It adds `receiver_org` and `sender_org` columns to the corpus dataframe and writes the result to a new CSV.

In [7]:
import pandas as pd

# Configuration: edit the values below to run this part on other files.
# Alternative inputs kept for reference:
# data_path = "./extracted_data/compiled/telegrams_with_extracted_receiver_sender.csv"
#data_path = "data_clean_with_receiver_sender.csv"
data_path  = "z2_processed_data_with_receiver_sender.csv"
# data_path_2 = "./extracted_data/compiled/data_clean_with_receiver_sender.csv"
# NOTE(review): data_path and data_path_2 currently point at the same file.
data_path_2 = "z2_processed_data_with_receiver_sender.csv"
kb_path = "./BERT-NER-dev/merged_political_military.csv"  # knowledge-base CSV
receiver_col = "recipient"  # label of the column holding the extracted recipient
sender_col = "sender"  # label of the column holding the extracted sender

# The corpus files use their first column as the index; the knowledge base
# is read with the default index.
data = pd.read_csv(data_path, index_col=0)
data_2 = pd.read_csv(data_path_2, index_col=0)
kb = pd.read_csv(kb_path)

In [13]:
# Raise the display limit to 1000 rows, then display the knowledge base.
pd.set_option('display.max_rows', 1000)
kb

Unnamed: 0,item,itemLabel,politicalPartyLabel,positionHeldLabel,positionEnd,positionStart,militaryRankLabel,militaryRankStart,ismilitaryRankPIT,militaryEnd
0,http://www.wikidata.org/entity/Q16988,Liu Shaoqi,Communist Party of China,President of the People's Republic of China,1968-10-31T00:00:00Z,1959-04-27T00:00:00Z,,,,
1,http://www.wikidata.org/entity/Q16988,Liu Shaoqi,Communist Party of China,Chairman of the Standing Committee of the Nati...,1959-04-18T00:00:00Z,1954-09-27T00:00:00Z,,,,
2,http://www.wikidata.org/entity/Q17132,Soong Ching-ling,Communist Party of China,Vice President of the People's Republic of China,1975-01-17T00:00:00Z,1959-04-27T00:00:00Z,,,,
3,http://www.wikidata.org/entity/Q22513,Zhu De,Communist Party of China,Vice President of the People's Republic of China,1959-04-27T00:00:00Z,1954-09-27T00:00:00Z,,,,
4,http://www.wikidata.org/entity/Q22513,Zhu De,Communist Party of China,Vice Chairman of the Communist Party of China,1966-08-01T00:00:00Z,1956-09-28T00:00:00Z,,,,
5,http://www.wikidata.org/entity/Q22513,Zhu De,Communist Party of China,Secretary of the Central Commission for Discip...,1955-03-01T00:00:00Z,1949-11-01T00:00:00Z,,,,
6,http://www.wikidata.org/entity/Q22513,Zhu De,Communist Party of China,Chairman of the Standing Committee of the Nati...,1976-07-06T00:00:00Z,1959-04-28T00:00:00Z,,,,
7,http://www.wikidata.org/entity/Q17410,Zhou Enlai,Communist Party of China,Premier of the State Council of the People's R...,1976-01-08T00:00:00Z,1954-09-27T00:00:00Z,,,,
8,http://www.wikidata.org/entity/Q17410,Zhou Enlai,Communist Party of China,member of the Politburo Standing Committee of ...,1976-01-08T00:00:00Z,1956-09-28T00:00:00Z,,,,
9,http://www.wikidata.org/entity/Q17410,Zhou Enlai,Communist Party of China,member of the National People's Congress,1976-01-08T00:00:00Z,1954-09-15T00:00:00Z,,,,


In [9]:
# Person name -> political party. When a name occurs on several rows the last
# row's party is the one kept.
kb_real_dict = kb.set_index('itemLabel')['politicalPartyLabel'].to_dict()
# Same mapping, nested under its column label.
kb_dict = {'politicalPartyLabel': kb_real_dict}

In [10]:
def flatten_dict(dict):
    """Index a name -> value mapping by each individual word of the name.

    Every key is lower-cased and split on whitespace; each resulting word
    becomes a key of the output, mapped to the original value. When the same
    word comes from several keys, the value of the last one iterated wins.

    Args:
        dict: Mapping whose keys are names. (The parameter name shadows the
            builtin; it is kept so existing callers are unaffected.)
            Non-string keys, such as a NaN label, are skipped.

    Returns:
        A new dict mapping each lower-cased word to a value.
    """
    out = {}
    for key, value in dict.items():
        if not isinstance(key, str):
            # Nothing to split, and .lower() would raise on e.g. a float NaN.
            continue
        # split() with no argument ignores repeated, leading and trailing
        # whitespace, so no empty-string key is ever produced.
        for name in key.lower().split():
            out[name] = value
    return out

# Word-level lookup table built from the name -> party mapping.
kb_flat = flatten_dict(kb_real_dict)

In [11]:
# Display the word-level lookup table.
kb_flat

{'liu': 'Communist Party of China',
 'shaoqi': 'Communist Party of China',
 'soong': 'Communist Party of China',
 'ching-ling': 'Communist Party of China',
 'zhu': 'Communist Party of China',
 'de': 'Communist Party of China',
 'zhou': 'Communist Party of China',
 'enlai': 'Communist Party of China',
 'mao': 'Communist Party of China',
 'zedong': 'Communist Party of China',
 'chen': 'Communist Party of China',
 'yi': 'Communist Party of China',
 'xie': 'Communist Party of China',
 'fuzhi': 'Communist Party of China',
 'kim': "Workers' Party of Korea",
 'il-sung': "Workers' Party of Korea",
 'jiang': 'Communist Party of China',
 'qing': 'Communist Party of China',
 'lin': 'Communist Party of China',
 'biao': 'Communist Party of China',
 'peng': 'Communist Party of China',
 'dehuai': 'Communist Party of China',
 'sanzo': 'Communist Party of China',
 'nosaka': 'Communist Party of China',
 'wang': 'Communist Party of China',
 'guangmei': 'Communist Party of China',
 'dun': 'Communist Party

In [12]:
# Define an additional knowledge base, for more general entity names.
# map_entity_to_state tests each key as a substring of the lower-cased entity,
# in the order listed here, so keys must be lower-case and earlier entries win.

additional = {
    'chinese': 'Communist Party of China', 
    'ccp': 'Communist Party of China',
    'beijing': 'Communist Party of China',
    'china': 'Communist Party of China',
    'soviet': 'Communist Party of the Soviet Union',
    'ussr': 'Communist Party of the Soviet Union',
    'filippov': 'Communist Party of the Soviet Union',
    'moscow': 'Communist Party of the Soviet Union',
    'north': 'Workers\' Party of Korea'  # NOTE(review): matches any entity containing "north" — confirm this is intended
    }

In [76]:
# Categorizes a person by entity based on substrings of the name.
# The first match in the name is returned.
def map_entity_to_state(entity, kb, additional_kb):
    """Look up the value (here, a political party) associated with an entity.

    The entity is converted to a lower-case string and split into words. The
    first word found as a key of ``kb`` decides the result. If no word
    matches, the keys of ``additional_kb`` are tried in order as substrings of
    the whole lower-cased entity, and the first hit decides the result.

    Args:
        entity: The entity name. Any object is accepted and converted with
            ``str``; ``None`` and float NaN are treated as "no entity".
        kb: Mapping from a single lower-case word to a value.
        additional_kb: Mapping from a lower-case substring to a value.

    Returns:
        The matched value, or ``None`` when nothing matches or the entity is
        missing.
    """
    # Missing values: None, or the float NaN pandas uses for empty CSV cells
    # (NaN is the only float that is not equal to itself). Without this guard
    # NaN would be turned into the string "nan" and could match a
    # knowledge-base entry for a name part spelled "nan".
    if entity is None or (isinstance(entity, float) and entity != entity):
        return None

    lower = str(entity).lower()

    # Apply recognition with the knowledge base. split() with no argument
    # never yields empty words, so an empty key in kb cannot match by accident.
    for name_part in lower.split():
        if name_part in kb:
            return kb[name_part]

    # Apply recognition for the additional keys
    for key, value in additional_kb.items():
        if key in lower:
            return value

    return None

In [77]:
# Clean column '0' (presumably the document author — confirm) the same way
# the titles were cleaned.
authors_clean = data_2['0'].apply(clean_title)

# Map the cleaned column '0', the extracted sender and the extracted recipient
# to a party. The sender/recipient column labels come from the configuration
# cell (sender_col, receiver_col) instead of being hard-coded here.
sender_org_1 = authors_clean.apply(lambda value: map_entity_to_state(value, kb_flat, additional))
sender_org_2 = data_2[sender_col].apply(lambda value: map_entity_to_state(value, kb_flat, additional))
receiver_org = data_2[receiver_col].apply(lambda value: map_entity_to_state(value, kb_flat, additional))

In [78]:
def combiner(d1, d2):
    """Return the first usable value of the pair, preferring ``d1``.

    An empty string counts as missing, exactly like ``None``. The result is
    ``None`` when both values are missing.
    """
    first = None if d1 == "" else d1
    second = None if d2 == "" else d2
    if first is not None:
        return first
    return second

# Element-wise merge: keep the party derived from column '0' when there is
# one, otherwise fall back to the party derived from the extracted sender.
sender_org = sender_org_1.combine(sender_org_2, combiner)

In [82]:
# Attach the derived party columns to the corpus frame and write it to disk.
data_2['sender_org'] = sender_org
data_2['receiver_org'] = receiver_org
data_2.to_csv('z2_processed_data_with_orgs.csv')