In [7]:
import pandas as pd
import re
import spacy

In [8]:
thinktank_df = pd.read_csv("./data/citations_data/thinktank_citations_tdm.csv", index_col=0)

# Data Cleaning

In [9]:
# 38 North is owned by the Stimson Center
thinktank_df['thinktank'] = thinktank_df['thinktank'].apply(lambda x: 'Henry L. Stimson Center' if x == '38 North' else x)

# Progressive Policy Institute is another name for Third Way
thinktank_df['thinktank'] = thinktank_df['thinktank'].apply(lambda x: 'Third Way' if x == 'Progressive Policy Institute' else x)

# Global Trade Watch is part of Public Citizen
thinktank_df['thinktank'] = thinktank_df['thinktank'].apply(lambda x: 'Public Citizen' if x == 'Global Trade Watch' else x)

# Catholic Family and Human Rights Institute = Center for Family and Human Rights
thinktank_df['thinktank'] = thinktank_df['thinktank'].apply(lambda x: 'Center for Family and Human Rights' if x == 'Catholic Family and Human Rights Institute' else x)

# Henry L. Stimson Center = The Stimson Center
thinktank_df['thinktank'] = thinktank_df['thinktank'].apply(lambda x: 'The Stimson Center' if x == 'Henry L. Stimson Center' else x)

In [4]:
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    no_tags = re.sub(clean, '', text)
    return no_tags.strip()

In [5]:
thinktank_df['sentence_clean'] = thinktank_df['sentence'].apply(remove_html_tags)

In [6]:
thinktank_df['year']=pd.to_datetime(thinktank_df['date']).apply(lambda x: x.year)

# Topic Tagging w/ Keywords

## International Country Mentions

In [34]:
import pycountry

In [35]:
nlp = spacy.load("en_core_web_sm")

In [36]:
def get_international(txt):
    doc = nlp(txt)
    for ent in doc.ents:
        if 'saudi' in txt.lower():
            return 'Saudi Arabia'
        if ent.label_ == 'GPE':
            try:
                match = pycountry.countries.search_fuzzy(ent.text)[0].name
                if match == 'United States' or match == 'Virgin Islands, U.S.':
                    continue
                return match
            except:
                continue

In [37]:
thinktank_df['international_mention'] = thinktank_df['sentence_clean'].apply(get_international)

In [38]:
thinktank_df.to_csv("./data/citations_data/thinktank_citations_tagged.csv")

## Tagging Other Topics

In [39]:
thinktank_df = pd.read_csv("./data/thinktank_citations_tagged.csv", index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: './data/thinktank_citations_tagged.csv'

In [None]:
econ_keywords = ['monetary', 'inflation', 'Fed', 'Federal Reserve', 'labor market', 'labor union',
                 'labor', 'tax', 'taxes', 'taxation', 'banking', 'banks', 'finance', 'financial']

education_keywords = ['school', 'schools', 'education', 'college', 'university', 'universities', 'teacher',
                     'teachers', 'kindergarten', 'students', 'student']

health_keywords = ['health insurance', 'Medicare', 'hospital', 'hospitals', 'medicine', 'health care', 
                   'doctor', 'doctors']

politics_keywords = ['Democrat', 'Republican', 'Senate', 'The House', 'caucus', 'Congress', 'bill', 'legislation']

In [None]:
def tag_keywords(txt, keyword_list):
    txt = txt.lower()
    for keyword in keyword_list:
        if keyword.lower() in txt:
            return 1
    return 0

In [None]:
# economy topics
thinktank_df['econ_keyword'] = thinktank_df['sentence_clean'].apply(lambda x: tag_keywords(x, econ_keywords))
# education topics
thinktank_df['edu_keyword'] = thinktank_df['sentence_clean'].apply(lambda x: tag_keywords(x, education_keywords))
# health topics
thinktank_df['health_keyword'] = thinktank_df['sentence_clean'].apply(lambda x: tag_keywords(x, health_keywords))
# political topics
thinktank_df['pol_keyword'] = thinktank_df['sentence_clean'].apply(lambda x: tag_keywords(x, politics_keywords))

In [None]:
thinktank_df[thinktank_df.econ_keyword == 1].groupby(['pub', 'year', 'thinktank']) \
    ['title'].count().reset_index().shape

In [None]:
thinktank_df[thinktank_df.edu_keyword == 1].groupby(['pub', 'year', 'thinktank']) \
    ['title'].count().reset_index().shape

In [None]:
thinktank_df[thinktank_df.health_keyword == 1].groupby(['pub', 'year', 'thinktank']) \
    ['title'].count().reset_index().shape

In [None]:
thinktank_df[thinktank_df.pol_keyword == 1].groupby(['pub', 'year', 'thinktank']) \
    ['title'].count().reset_index().shape

In [None]:
thinktank_df.to_csv("./data/citations_data/thinktank_citations_tagged.csv")