In [228]:
# General
from tqdm import tqdm_notebook as tqdm
import json
import re
import pandas as pd
import pickle

In [230]:
# Characters stripped from the text. The class is spelled out explicitly: the
# earlier pattern contained "/-@", which the regex engine reads as the RANGE
# from "/" to "@" (i.e. "/", the digits, and ": ; < = > ? @") and which does
# NOT match a literal hyphen. The same set of characters is removed here.
_STRIP_CHARS_RE = re.compile(r'[►*/0-9:;<=>?@.!&~,"#$\'()+|]')

# Two or more consecutive plain spaces (newlines, tabs and non-breaking
# spaces are deliberately left untouched).
_SPACE_RUN_RE = re.compile(r' {2,}')


def process_txt(x):
    '''
    Text preprocessing pipeline that returns
    a processed string.

    Steps, in order:
      1. remove the punctuation, symbols and digits matched by
         ``_STRIP_CHARS_RE``;
      2. replace every hyphen with a single space;
      3. collapse each run of two or more spaces into one space.

    Hyphens are replaced BEFORE spaces are collapsed so that input such as
    "a - b" does not leave a run of spaces behind, and the collapse handles
    runs of any length rather than a single pass over pairs.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing whitespace is not stripped.
    '''
    x = _STRIP_CHARS_RE.sub('', x)  # Remove Characters
    x = x.replace('-', ' ')  # Replace Hyphens
    x = _SPACE_RUN_RE.sub(' ', x)  # Collapse Runs Of Spaces
    return x

def process_and_load_thinktank_json(filename, max_chars=4000):
    '''
    Load the think-tank articles JSON file and prepare the text that is
    sent to the API.

    The file must contain a top-level ``articles_data`` key holding the
    article records; each record needs ``title``, ``text`` and ``url``
    fields. ``title`` and ``text`` are cleaned with ``process_txt`` and
    concatenated (no separator) into a new ``titleText`` column; rows whose
    ``titleText`` is empty are removed; ``titleText`` is truncated to
    ``max_chars`` characters; finally the ``url``, ``text`` and ``title``
    columns are dropped.

    Parameters
    ----------
    filename : str
        Path to the JSON file.
    max_chars : int, default 4000
        Maximum number of characters kept in ``titleText``. The default
        stays below the 5000 limit noted for AWS Comprehend.

    Returns
    -------
    pandas.DataFrame
        A new frame (original index labels are kept, not reset) with the
        remaining source columns plus ``titleText``.
    '''
    # Load Data. The encoding is given explicitly so the result does not
    # depend on the platform's default text encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        articles_df = pd.DataFrame(json.load(f)['articles_data'])

    # Clean Text For Use in API
    title_text = (articles_df['title'].apply(process_txt)
                  + articles_df['text'].apply(process_txt))
    articles_df = articles_df.assign(titleText=title_text)

    # Remove Empty Strings. The explicit copy makes the filtered frame
    # independent, so the assignment below does not write into a slice.
    has_text = articles_df['titleText'].apply(len) != 0
    articles_df = articles_df[has_text].copy()

    # Max Character Length for AWS Comprehend = 5000 Characters
    articles_df['titleText'] = articles_df['titleText'].apply(
        lambda text: text[:max_chars])

    # Drop Unused Columns (returns a new frame instead of mutating in place)
    return articles_df.drop(['url', 'text', 'title'], axis=1)


In [231]:
def batchProcessComprehend(input_list, batch_size=25):
    '''
    Run AWS Comprehend entity, key-phrase and sentiment detection over a
    list of documents, sending them to the API in batches.

    Relies on ``boto3`` and ``tqdm`` being available in the notebook
    namespace, and on an AWS profile named ``damocles``.

    Parameters
    ----------
    input_list : list of str
        Documents to analyse.
    batch_size : int, default 25
        Number of documents per API call. Must be between 1 and 25
        (the documented maximum for Comprehend batch operations).

    Returns
    -------
    tuple of four lists
        ``(entity_li, keyphrase_li, sentiment_li, sentiment_score_li)``.
        Each list has exactly ``len(input_list)`` items, in input order.
        A document the API could not process (reported in the response's
        ``ErrorList`` rather than ``ResultList``) gets ``None`` in all
        four lists, so the outputs always stay aligned with the inputs.

    Raises
    ------
    ValueError
        If ``batch_size`` is outside the range 1..25.
    '''
    if not 1 <= batch_size <= 25:
        raise ValueError(
            'batch_size must be between 1 and 25, got %r' % (batch_size,))

    # Connect To Session
    session = boto3.Session(profile_name='damocles')
    comprehend = session.client('comprehend', region_name='eu-west-2')

    # The helpers use the client created above; they previously referenced
    # an undefined global called `client`.
    def callEntityApi(text_list):
        return comprehend.batch_detect_entities(
            TextList=text_list, LanguageCode='en')

    def callKeyphraseApi(text_list):
        return comprehend.batch_detect_key_phrases(
            TextList=text_list, LanguageCode='en')

    def callSentimentAPI(text_list):
        return comprehend.batch_detect_sentiment(
            TextList=text_list, LanguageCode='en')

    def collectByIndex(response, size, key):
        # Place each result at the position given by its 'Index' (the
        # document's zero-based position within the submitted batch).
        # Positions with no result stay None.
        values = [None] * size
        for result in response['ResultList']:
            values[result['Index']] = result[key]
        return values

    # Get Batches
    batches = [input_list[i:i + batch_size] for
               i in range(0, len(input_list), batch_size)]

    # Init
    entity_li = []
    keyphrase_li = []
    sentiment_li = []
    sentiment_score_li = []

    for batch in tqdm(batches):
        size = len(batch)

        # Extract Entities
        entity_response = callEntityApi(batch)
        entity_li += collectByIndex(entity_response, size, 'Entities')

        # Extract Keyphrase
        keyphrase_response = callKeyphraseApi(batch)
        keyphrase_li += collectByIndex(keyphrase_response, size, 'KeyPhrases')

        # Extract Sentiment (label and score come from the same response)
        sentiment_response = callSentimentAPI(batch)
        sentiment_li += collectByIndex(sentiment_response, size, 'Sentiment')
        sentiment_score_li += collectByIndex(
            sentiment_response, size, 'SentimentScore')

    return entity_li, keyphrase_li, sentiment_li, sentiment_score_li

In [232]:
# Load Data
filename = 'data/think_tanks_thinktank_articles.json'
articles_df = process_and_load_thinktank_json(filename)

# Attach Entities, KeyPhrases, Sentiment: the four lists returned by
# batchProcessComprehend become four new columns, in this order.
comprehend_columns = ('entities', 'keyphrases', 'sentiment', 'sentiment_score')
comprehend_results = batchProcessComprehend(articles_df['titleText'].tolist())
for column_name, column_values in zip(comprehend_columns, comprehend_results):
    articles_df[column_name] = column_values

# Save To Pickle so the API results can be reloaded without calling the API again
with open('ent_sent_keyphrase.pickle', 'wb') as handle:
    pickle.dump(articles_df, handle)

HBox(children=(IntProgress(value=0, max=116), HTML(value='')))

In [270]:
def extract_node(x):
    '''
    Reduce one article row to the values used for a node.

    Parameters
    ----------
    x : row-like object
        Must expose ``date``, ``keyphrases``, ``entities`` and ``topics``
        attributes; ``keyphrases`` and ``entities`` are iterables of dicts
        that each have a ``'Text'`` key.

    Returns
    -------
    tuple
        ``(date as str, key-phrase texts, entity texts, topics)``.
    '''
    def texts_of(items):
        # Keep only the matched text of each item.
        return [item['Text'] for item in items]

    return (str(x.date), texts_of(x.keyphrases), texts_of(x.entities), x.topics)

# Inspect the extracted node tuple of the second article (position 1)
articles_df.apply(extract_node, axis=1).iloc[1]

In [272]:
# Inspect the extracted node tuple of the second article (position 1)
articles_df.apply(extract_node, axis=1).iloc[1]

('May\xa024,\xa02019',
 ['implementation',
  'awareness',
  'the UN Sustainable Development Goals',
  'Sri Lanka\n\n[',
  'This article',
  'Southern Voice',
  'July',
  'The article',
  'a component',
  'the success',
  'the Sustainable Goals SDGs',
  'Sri Lanka',
  'a nation',
  'wide education and awareness',
  'campaign',
  'the goals',
  'The author',
  'a high level authority',
  'the responsibility]',
  'Three years',
  'Sri Lanka',
  'the\xa0 Sustainable Development Goals SDGs',
  'all but one member state',
  'These goals',
  'the Agenda',
  'the overarching aim',
  'poverty',
  'social economic and environmental issues',
  'nature',
  'the Agenda Sri Lanka',
  'several steps',
  'its realization',
  'This July Sri Lanka',
  'the\xa0first voluntary national review',
  'the High Level Political Forum',
  'Its purpose',
  'the extent',
  'the nation',
  'progress',
  'the SDGs Several Sri Lankan stakeholders',
  'proactive steps',
  'the SDGs',
  'the National Planning Departmen

In [269]:
# Preview only the first rows instead of rendering the whole frame
articles_df.head(10)

Unnamed: 0,category,date,topics,titleText,entities,keyphrases,sentiment,sentiment_score
0,Case study,"May 24, 2019","[Collaboration, Communications and impact, pol...",Health concerns in India in the SDGs era\n\nTh...,"[{'Score': 0.9796985387802124, 'Type': 'LOCATI...","[{'Score': 0.9506336450576782, 'Text': 'Health...",NEUTRAL,"{'Positive': 0.0032495984341949224, 'Negative'..."
1,Case study,"May 24, 2019","[Collaboration, Communications and impact, mon...",From planning to implementation raising awaren...,"[{'Score': 0.48153752088546753, 'Type': 'ORGAN...","[{'Score': 0.9670957922935486, 'Text': 'implem...",NEUTRAL,"{'Positive': 0.0028960187919437885, 'Negative'..."
2,Case study,"May 24, 2019","[Africa, Communications and impact, Functions,...",Think tanks contributing towards the health re...,"[{'Score': 0.9832321405410767, 'Type': 'DATE',...","[{'Score': 0.7393423914909363, 'Text': 'Think ...",NEUTRAL,"{'Positive': 0.01400300394743681, 'Negative': ..."
3,Case study,"May 24, 2019","[Collaboration, Communications and impact, Inf...",Improved health and nutrition contributing to ...,"[{'Score': 0.8823003172874451, 'Type': 'LOCATI...","[{'Score': 0.9519321918487549, 'Text': 'Improv...",NEUTRAL,"{'Positive': 0.04854888841509819, 'Negative': ..."
4,Case study,"May 24, 2019","[Collaboration, Communications and impact, Pol...",Achieving the Sustainable Development Goals in...,"[{'Score': 0.9901058077812195, 'Type': 'LOCATI...","[{'Score': 0.9935138821601868, 'Text': 'the Su...",NEUTRAL,"{'Positive': 0.0364609956741333, 'Negative': 0..."
5,Case study,"May 24, 2019","[Collaboration, Communications and impact, Pol...",Process matters when it comes to implementing ...,"[{'Score': 0.8568176031112671, 'Type': 'TITLE'...","[{'Score': 0.4616624712944031, 'Text': 'Proces...",NEUTRAL,"{'Positive': 0.01574675552546978, 'Negative': ..."
6,Case study,"May 24, 2019","[Capacity Development, Collaboration, Communic...",Is a focus on happiness enough to help a count...,"[{'Score': 0.5432640910148621, 'Type': 'LOCATI...","[{'Score': 0.9733020067214966, 'Text': 'a focu...",NEUTRAL,"{'Positive': 0.042726851999759674, 'Negative':..."
7,Opinion,"May 15, 2019","[Communication, Communications and impact, Pol...",How deeply should think tanks engage with publ...,"[{'Score': 0.8117936849594116, 'Type': 'ORGANI...","[{'Score': 0.9939494132995605, 'Text': 'tanks'...",NEUTRAL,"{'Positive': 0.06777610629796982, 'Negative': ..."
8,Research,"May 13, 2019","[Education, Research, The RSA, Think tanks and...",The great dilemmas that universities face in t...,"[{'Score': 0.7044923901557922, 'Type': 'DATE',...","[{'Score': 0.9855058789253235, 'Text': 'The gr...",NEUTRAL,"{'Positive': 0.30317506194114685, 'Negative': ..."
9,Opinion,"April 25, 2019","[ACBF, policy design, policy implementation, U...",Bridging the policy design and implementation ...,"[{'Score': 0.6703212857246399, 'Type': 'LOCATI...","[{'Score': 0.8619583249092102, 'Text': 'the po...",NEUTRAL,"{'Positive': 0.00322878360748291, 'Negative': ..."


In [261]:
# Inspect the entity list attached to the first article (position 0)
articles_df['entities'].iloc[0]

[{'Score': 0.9796985387802124,
  'Type': 'LOCATION',
  'Text': 'India',
  'BeginOffset': 19,
  'EndOffset': 24},
 {'Score': 0.9825558662414551,
  'Type': 'LOCATION',
  'Text': 'India',
  'BeginOffset': 474,
  'EndOffset': 479},
 {'Score': 0.8713580965995789,
  'Type': 'ORGANIZATION',
  'Text': 'National Institution for Transforming India NITI Aayog',
  'BeginOffset': 604,
  'EndOffset': 658},
 {'Score': 0.9263405799865723,
  'Type': 'LOCATION',
  'Text': 'India',
  'BeginOffset': 695,
  'EndOffset': 700},
 {'Score': 0.7133698463439941,
  'Type': 'ORGANIZATION',
  'Text': 'NITI Ayog',
  'BeginOffset': 852,
  'EndOffset': 861},
 {'Score': 0.87736576795578,
  'Type': 'LOCATION',
  'Text': 'India',
  'BeginOffset': 899,
  'EndOffset': 904},
 {'Score': 0.6385401487350464,
  'Type': 'QUANTITY',
  'Text': 'each SDG',
  'BeginOffset': 949,
  'EndOffset': 957},
 {'Score': 0.9949374198913574,
  'Type': 'DATE',
  'Text': 'July',
  'BeginOffset': 1954,
  'EndOffset': 1958},
 {'Score': 0.3965561687

In [262]:
# articles_df.apply(extract_node, axis=1)

In [226]:
import pickle
# Reload the annotated articles written by the "Save To Pickle" step.
# NOTE: pickle.load can execute arbitrary code contained in the file, so
# only load a pickle that this notebook produced itself.
with open('ent_sent_keyphrase.pickle', mode='rb') as handle:
    articles_df = pickle.load(handle)