In [30]:
import pandas as pd
import os
import re
from bs4 import BeautifulSoup
import nltk.data
import json
import numpy as np

In [15]:
# Reference ("saying") keywords. A sentence that mentions a think tank is only
# treated as a citation when it also contains one of these words; they are
# matched as substrings of the sentence, so e.g. 'report' also hits 'reported'.
saying_list = [
    'said', 'say',
    'explained', 'explains',
    'according',
    'report',
    'replied', 'replies',
    'study',
    'told', 'tells',
]

In [16]:
# Load NLTK's pre-trained English Punkt sentence tokenizer. The resource is
# resolved from the local NLTK data directories, so the punkt data must already
# be installed (presumably via nltk.download('punkt') — confirm for the NLTK
# version in use).
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

In [17]:
# Import the dictionary of think tank names and aliases. Its keys are the
# think tank names; the value stored under each key is what get_sentence scans
# for that think tank's aliases.
# The context manager closes the file as soon as the JSON has been parsed
# (a bare open() would leave the handle open until garbage collection).
with open("../data/thinktank_data/tt_names.json") as tt_names_file:
    think_tank_dic = json.load(tt_names_file)

In [18]:
def get_sentence(text, tt):
    """Return the first sentence of ``text`` that cites think tank ``tt``.

    The text is split into sentences with the module-level ``tokenizer``. A
    sentence counts as a citation when it contains

    * one of the string aliases stored under ``think_tank_dic[tt]``
      (case-sensitive substring match), and
    * at least one word from ``saying_list`` (substring match, compared
      case-insensitively so that sentence-initial forms such as
      "According to ..." are recognised).

    Parameters
    ----------
    text : str
        Body of the article. Falsy or non-string values (e.g. ``None``, NaN)
        yield ``None``.
    tt : str
        Think tank name; must be a key of ``think_tank_dic``.

    Returns
    -------
    str or None
        The first matching sentence, or ``None`` when no sentence matches.

    Raises
    ------
    KeyError
        If ``tt`` is not a key of ``think_tank_dic``.
    """
    if not text or not isinstance(text, str):
        return None

    # The aliases do not depend on the sentence, so build the list once instead
    # of once per sentence. Non-string entries in the alias data are skipped.
    known_names = [name for name in think_tank_dic[tt] if isinstance(name, str)]

    # Cheap pre-check: Punkt returns sentences as slices of `text`, so if no
    # alias occurs anywhere in the text no sentence can match and the
    # (comparatively expensive) tokenization can be skipped entirely.
    if not any(name in text for name in known_names):
        return None

    saying_words = [word.lower() for word in saying_list]

    for sentence in tokenizer.tokenize(text):
        if not any(name in sentence for name in known_names):
            continue
        # Aliases stay case-sensitive (acronyms / proper nouns), but the
        # reporting verbs are compared in lower case.
        lowered = sentence.lower()
        if any(word in lowered for word in saying_words):
            return sentence
    return None

In [19]:
# Directory containing the article .xml files (only available on TDM Studio).
# NOTE(review): this path is relative to "./" while the other data paths in
# this notebook start with "../" — confirm which working directory is intended.
path = "./data/articles/"

In [22]:
def get_article_info(path):
    """Read one article stored as an XML file and return its key fields.

    Parameters
    ----------
    path : str
        Path to the article's .xml file.

    Returns
    -------
    tuple
        ``(date, title, text, pub)`` where ``date``, ``title`` and ``text`` are
        the text contents of the first ``startdate``, ``title`` and ``text``
        tags, and ``pub`` is derived from the ``copyright`` tag: ``'WSJ'`` if
        it mentions "Dow Jones", ``'NYT'`` if it mentions "New York", and
        ``'WP'`` for everything else.

    Raises
    ------
    AttributeError
        If one of the expected tags is missing (``find`` returns ``None``).
    OSError
        If the file cannot be opened.
    """
    # Parse inside a context manager so the file handle is closed right away;
    # this function is called once per article, so leaked handles add up.
    with open(path) as xml_file:
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever one
        # is installed; the lowercase tag lookups below rely on an HTML-style
        # parser — confirm before pinning one explicitly.
        soup = BeautifulSoup(xml_file)

    date = soup.find('startdate').text
    title = soup.find('title').text
    text = soup.find('text').text
    copyright_text = soup.find('copyright').text

    # The publication is inferred from the copyright notice; anything that is
    # neither Dow Jones nor New York falls through to 'WP'.
    if 'Dow Jones' in copyright_text:
        pub = 'WSJ'
    elif 'New York' in copyright_text:
        pub = 'NYT'
    else:
        pub = 'WP'

    return (date, title, text, pub)
    

In [23]:
# Parallel result lists, one entry per citation found (an article contributes
# one entry for every think tank it cites): date, title, cited sentence, and
# name of publication
date_arr, title_arr, sentence_arr, pub_arr = [], [], [], []
# think tank name and article file name for each citation, plus the file names
# of articles that raised an error while being parsed or searched
tt_arr, path_arr, error_list = [], [], []
# file name -> repr of the exception, so failures can be diagnosed afterwards
error_details = {}

# reading in list of article file names; sorted because os.listdir returns
# entries in arbitrary order, which would make the row order irreproducible
file_array = sorted(os.listdir(path))

# iterating over articles
for p in file_array:
    try:
        date, title, text, pub = get_article_info(os.path.join(path, p))
        # iterating over every think tank
        for tt in think_tank_dic:
            # checking for citations to think tank in article
            match = get_sentence(text, tt)
            if match:
                path_arr.append(p)
                title_arr.append(title)
                date_arr.append(date)
                pub_arr.append(pub)
                tt_arr.append(tt)
                sentence_arr.append(match)
    except Exception as err:
        # Best effort: one malformed article must not abort the whole run.
        # Citations already recorded for this article are kept.
        error_list.append(p)
        error_details[p] = repr(err)

In [24]:
# Assemble the citations table from the parallel result lists: one row per
# citation with the article file, title, date, the sentence containing the
# citation, the publication, and the think tank name
df = pd.DataFrame(dict(
    file_path=path_arr,
    title=title_arr,
    date=date_arr,
    sentence=sentence_arr,
    pub=pub_arr,
    thinktank=tt_arr,
))

In [26]:
# Persist the citations dataset as CSV. All other options are left at the
# pandas defaults, so the row index is written as the first column.
df.to_csv(path_or_buf='../data/citations/thinktank_citations_tdm.csv')