# Data Preparation

In [1]:
import pandas as pd
import gzip
import gc

Load the MapAffil 2018 authorship dataset (duplicate affiliation texts already dropped and rows sorted by frequency)

In [2]:
# Columns to load from the CSV; everything else in the file is skipped.
columns_to_read = ["affiliation", "city", "state", "country"]

# Every selected column is read as text.
data_types = {column_name: str for column_name in columns_to_read}

unique_sorted_mapaffil = pd.read_csv(
    "data/mapaffil2018-unique_sorted.csv.gz",
    usecols=columns_to_read,
    dtype=data_types,
)

Load in lists of spaCy outputs

In [3]:
def read_lines(file_path):
    """Lazily yield the lines of a gzip-compressed text file.

    Each yielded line has its leading and trailing whitespace (including the
    line terminator) stripped. The file is opened when iteration starts and
    closed when the generator is exhausted or closed.
    """
    handle = gzip.open(file_path, mode='rt')
    with handle:
        yield from map(str.strip, handle)

# Materialise the spaCy outputs as lists (one entry per line of each file)
# so they can be edited in place by the following cells.
all_orgs = [*read_lines("data/mapaffil2018-unique_sorted-affiliation-orgs.txt.gz")]
all_gpes = [*read_lines("data/mapaffil2018-unique_sorted-affiliation-gpes.txt.gz")]

Locate all affiliations containing newlines and remove the extra spaCy outputs generated by those newlines, so that the output lists line up one-to-one with the dataframe rows

In [4]:
# Working assumption (same as the original cell): an affiliation containing k
# newlines produced k + 1 consecutive entries in each spaCy output list instead
# of one, so k surplus entries must be removed per such affiliation.
# NOTE(review): this also assumes the dataframe still has the default 0..n-1
# index produced by read_csv — confirm if the loading cell ever changes.
newline_counts = unique_sorted_mapaffil["affiliation"].str.count("\n")
newline_counts = newline_counts[newline_counts > 0].astype(int).sort_index()

intervals = []
surplus_before = 0  # surplus list entries contributed by earlier multi-line affiliations

for index, newline_count in newline_counts.items():
    # The list position of a row is its dataframe index shifted by every
    # surplus entry that precedes it. Using the raw dataframe index here would
    # delete a neighbouring row's entry and silently misalign the rows located
    # just before each later multi-line affiliation.
    start = int(index) + surplus_before
    intervals.append((start, start + int(newline_count)))
    surplus_before += int(newline_count)

# Delete from the back so the positions of the remaining intervals stay valid.
for start, end in reversed(intervals):
    del all_orgs[start:end]
    del all_gpes[start:end]

# Fail loudly here rather than on the later column assignment.
if not (len(all_orgs) == len(all_gpes) == len(unique_sorted_mapaffil)):
    raise ValueError(
        "spaCy outputs do not line up with the dataframe: "
        f"{len(all_orgs)} ORG entries, {len(all_gpes)} GPE entries, "
        f"{len(unique_sorted_mapaffil)} rows"
    )

Remove departments/divisions from ORGs

In [5]:
def remove_department_and_division(orgs):
    """Drop department/division entries from a ", "-separated list of ORG names.

    Parameters
    ----------
    orgs : str
        ORG names joined by ", ". May be empty.

    Returns
    -------
    str
        The remaining ORG names joined by ", ". An empty string is returned
        when the input is empty (or None) or when every entry was removed.
        The function always returns a string, never None.
    """
    if not orgs:
        # Previously this path fell through and returned None.
        return ""

    kept_orgs = [
        org
        for org in orgs.split(", ")
        if "Department" not in org and "Division" not in org
    ]
    return ", ".join(kept_orgs)

# Filter every ORG string, writing the results back into the same list object.
all_orgs[:] = [remove_department_and_division(entry) for entry in all_orgs]

Add the spaCy outputs for each affiliation text in the complete dataframe

In [6]:
# Attach the spaCy outputs as new columns; each list must have one entry per row.
unique_sorted_mapaffil["org"], unique_sorted_mapaffil["gpe"] = all_orgs, all_gpes

# Null ORG values become empty strings so the ORG column is consistent with the GPE column.
unique_sorted_mapaffil["org"] = unique_sorted_mapaffil["org"].fillna('')

Remove affiliations that have neither spaCy-detected ORGs nor GPEs, have no assigned country, contain a "FROM..." prefix (FROMPMC, FROMNIH, or FROMPAT), contain newlines, exceed 200 characters, or contain semicolons

In [7]:
affiliation_text = unique_sorted_mapaffil["affiliation"]

# Rows with neither an ORG nor a GPE detected by spaCy.
mask = unique_sorted_mapaffil["org"].eq("") & unique_sorted_mapaffil["gpe"].eq("")

# Rows whose country is the "-" placeholder.
mask = mask | unique_sorted_mapaffil["country"].eq("-")

# Rows whose affiliation text contains a "FROM..." marker or a newline.
for unwanted_text in ("FROMPMC", "FROMNIH", "FROMPAT", "\n"):
    mask = mask | affiliation_text.str.contains(unwanted_text)

# Rows whose affiliation text is longer than 200 characters or contains a semicolon.
mask = mask | (affiliation_text.str.len() > 200)
mask = mask | affiliation_text.str.contains(";")

# Release the column reference before dropping so it does not keep the old data alive.
del affiliation_text
unique_sorted_mapaffil.drop(index=unique_sorted_mapaffil.index[mask], inplace=True)

# Free up memory
del mask, all_orgs, all_gpes
gc.collect()

23

Create validation set of ambiguous affiliations that MapAffil was unable to assign a city to

In [8]:
# "state, country" text, used to spot rows whose city field holds no actual city.
unique_sorted_mapaffil["combined"] = (
    unique_sorted_mapaffil["state"] + ", " + unique_sorted_mapaffil["country"]
)

# A row is ambiguous when its city equals either "state, country" or the country alone.
(
    unique_sorted_mapaffil
    .loc[lambda frame: frame["city"].eq(frame["combined"]) | frame["city"].eq(frame["country"])]
    .to_parquet(
        "data/ambiguous_mapaffil_validation.parquet",
        compression="gzip",
        engine="fastparquet",
    )
)

gc.collect()

140

Create validation set of PubMed affiliations from papers published after December 2018

In [7]:
authorships = pd.read_csv("data/authorships.csv.gz")

# Keep authorships with a publication year above 2018, then write one row per
# distinct affiliation text.
post_2018_authorships = authorships.loc[authorships["PublicationYear"].gt(2018)]
(
    post_2018_authorships
    .drop_duplicates(subset="Affiliation")
    .to_parquet(
        "data/post_2018_validation.parquet",
        compression="gzip",
        engine="fastparquet",
    )
)

# Free up memory
del authorships, post_2018_authorships
gc.collect()

  authorships = pd.read_csv("/Users/brian/Documents/Geocode NLP/data/authorships.csv.gz")


79

Finalize training dataset by removing the ambiguous MapAffil affiliations

In [10]:
# Training data: every row whose city is neither "state, country" nor the country alone.
(
    unique_sorted_mapaffil
    .loc[lambda frame: frame["city"].ne(frame["combined"]) & frame["city"].ne(frame["country"])]
    .to_parquet(
        "data/clean_spacy_mapaffil.parquet",
        compression="gzip",
        engine="fastparquet",
    )
)