In [None]:
import pandas as pd
import re
import spacy

In [8]:
# Load the previously extracted citation data; the first CSV column becomes the index.
citations_csv_path = "../data/citations_data/thinktank_citations_tdm.csv"
citations_df = pd.read_csv(citations_csv_path, index_col=0)

# Data Cleaning
Renaming think tanks that operate under a larger think tank, or that are different departments of the same think tank, so that each organization appears under a single name.

In [9]:
# Consolidate think tanks that operate under (or are alternate names for) a larger
# organization, so each organization appears under one canonical name.
#
# Every alias maps straight to its FINAL name. '38 North' belongs to the Stimson
# Center, which is itself labelled 'The Stimson Center', so both entries point
# there directly and the result does not depend on the order of the renames.
thinktank_aliases = {
    # 38 North is owned by the Stimson Center
    '38 North': 'The Stimson Center',
    # Henry L. Stimson Center = The Stimson Center
    'Henry L. Stimson Center': 'The Stimson Center',
    # Progressive Policy Institute is another name for Third Way
    'Progressive Policy Institute': 'Third Way',
    # Global Trade Watch is part of Public Citizen
    'Global Trade Watch': 'Public Citizen',
    # Catholic Family and Human Rights Institute = Center for Family and Human Rights
    'Catholic Family and Human Rights Institute': 'Center for Family and Human Rights',
}

# Read from and write back to citations_df (the frame loaded from the citations CSV).
# Names that are not aliases, and missing values, are left untouched.
citations_df['thinktank'] = citations_df['thinktank'].replace(thinktank_aliases)

In [4]:
def remove_html_tags(text):
    """Strip HTML tags from an extracted sentence.

    Every span running from a '<' to the nearest following '>' on the same line
    is deleted, then leading and trailing whitespace is trimmed.

    Parameters
    ----------
    text : str
        Raw sentence that may contain HTML markup.

    Returns
    -------
    str
        The sentence with tag spans removed and outer whitespace stripped.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text).strip()

In [5]:
# Store a tag-free copy of every extracted sentence next to the original column.
citations_df['sentence_clean'] = citations_df['sentence'].map(remove_html_tags)

In [6]:
# Adding year of citation, based on article publication date.
# The vectorized .dt accessor replaces a per-row Python lambda; rows whose date
# is missing come out as NaN rather than raising.
citations_df['year'] = pd.to_datetime(citations_df['date']).dt.year

# Topic Tagging w/ Keywords

## Tagging International Country Mentions
If the extracted citation sentence mentions a country other than the United States, that citation is classified as 'international' and tagged with the standardized country name.

In [34]:
import pycountry

In [35]:
# Load the spaCy English pipeline "en_core_web_sm"; get_international runs it on each
# sentence and reads the named entities labelled GPE (geo-political entity).
nlp = spacy.load("en_core_web_sm")

In [36]:
def get_international(txt):
    """Return a standardized name for the first non-U.S. country mentioned in `txt`.

    The sentence is run through the spaCy pipeline `nlp`; each named entity
    labelled GPE (geo-political entity) is fuzzy-matched against pycountry, and
    the first match that is not the United States is returned.

    Parameters
    ----------
    txt : str
        Cleaned citation sentence.

    Returns
    -------
    str or None
        The pycountry country name of the first matching entity, 'Saudi Arabia'
        when the sentence contains "saudi" (case-insensitive), or None when no
        non-U.S. country is found.
    """
    # Special case checked once, up front: any mention of "saudi" counts as
    # Saudi Arabia. This test does not depend on the entities, so it no longer
    # lives inside the entity loop, where it was skipped whenever spaCy found
    # no entities at all in the sentence.
    if 'saudi' in txt.lower():
        return 'Saudi Arabia'

    for ent in nlp(txt).ents:
        if ent.label_ != 'GPE':
            continue
        try:
            match = pycountry.countries.search_fuzzy(ent.text)[0].name
        except LookupError:
            # search_fuzzy raises LookupError when nothing resembles the entity
            # text (for example a city or state name); try the next entity.
            continue
        # Domestic mentions are not 'international'. The U.S. Virgin Islands are
        # skipped as well, presumably because fuzzy matching of "U.S."-style
        # text can resolve to them - TODO confirm.
        if match in ('United States', 'Virgin Islands, U.S.'):
            continue
        return match
    return None

In [37]:
# Tag each cleaned sentence with the country it mentions (None when there is none).
citations_df['international_mention'] = citations_df['sentence_clean'].map(get_international)

In [38]:
# Save the citations with the cleaned sentence, year, and country-mention columns added.
citations_df.to_csv("../data/citations_data/thinktank_citations_tagged.csv")

## Tagging Economics, Health, Education, and Political Topics

In [None]:
# Reload the country-tagged citations. The path must match the location the tagged
# file is written to (../data/citations_data/), not the parent ../data/ directory.
citations_df = pd.read_csv("../data/citations_data/thinktank_citations_tagged.csv", index_col=0)

In [None]:
# Topic-specific keywords to look for in citation sentences, one list per topic.

# Economics: monetary policy, labor, taxation, banking and finance
econ_keywords = [
    'monetary', 'inflation', 'Fed', 'Federal Reserve',
    'labor market', 'labor union', 'labor',
    'tax', 'taxes', 'taxation',
    'banking', 'banks', 'finance', 'financial',
]

# Education: institutions, teachers and students
education_keywords = [
    'school', 'schools', 'education',
    'college', 'university', 'universities',
    'teacher', 'teachers', 'kindergarten',
    'students', 'student',
]

# Health: insurance, care providers and medicine
health_keywords = [
    'health insurance', 'Medicare',
    'hospital', 'hospitals',
    'medicine', 'health care',
    'doctor', 'doctors',
]

# Politics: parties, chambers and legislation
politics_keywords = [
    'Democrat', 'Republican',
    'Senate', 'The House', 'caucus', 'Congress',
    'bill', 'legislation',
]

In [None]:
def tag_keywords(txt, keyword_list, whole_word=False):
    """Return 1 if any keyword occurs in the citation sentence, 0 otherwise.

    Matching is case-insensitive. By default a keyword matches anywhere in the
    text, including inside a longer word: 'Fed' also hits "federal" and 'bill'
    also hits "billion". Pass ``whole_word=True`` to require word boundaries
    around each keyword instead.

    Parameters
    ----------
    txt : str
        Citation sentence. Any non-string value (for example the NaN that
        pandas produces when an empty sentence is read back from CSV) is
        treated as containing no keywords.
    keyword_list : iterable of str
        Keywords or phrases to look for.
    whole_word : bool, default False
        When True, a keyword only counts if it is not embedded in a longer word.

    Returns
    -------
    int
        1 when at least one keyword is found, else 0.
    """
    # Missing sentences cannot mention anything; avoid calling .lower() on NaN.
    if not isinstance(txt, str):
        return 0

    haystack = txt.lower()
    if whole_word:
        found = any(
            re.search(r'\b' + re.escape(keyword.lower()) + r'\b', haystack)
            for keyword in keyword_list
        )
    else:
        found = any(keyword.lower() in haystack for keyword in keyword_list)
    return int(found)

In [None]:
# Applying tag_keywords for economics, education, health, and politics.
# Each output column is paired with the keyword list that drives it, so adding a
# topic means adding one entry here instead of copying another line.
topic_keyword_lists = {
    'econ_keyword': econ_keywords,          # economy topics
    'edu_keyword': education_keywords,      # education topics
    'health_keyword': health_keywords,      # health topics
    'pol_keyword': politics_keywords,       # political topics
}

# Sentences are read from citations_df, the frame the tags are written back to.
for column_name, keywords in topic_keyword_lists.items():
    citations_df[column_name] = citations_df['sentence_clean'].apply(tag_keywords, args=(keywords,))

In [None]:
# saving topic-tagged citations dataframe
# (same path as the file saved after country tagging, so that file is overwritten
# with the version that also carries the topic keyword columns)
citations_df.to_csv("../data/citations_data/thinktank_citations_tagged.csv")