In [58]:
# Import statements.
# You will need the following packages installed to run this notebook.
import difflib
import random
import requests
import json
import datetime 
import dateutil.relativedelta
from ipywidgets import interact, widgets
from IPython.display import display
import pandas as pd
from unidecode import unidecode
import operator


from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1 import Features, EmotionOptions, SentimentOptions



In [40]:
# Using the IBM Watson Natural Language Understanding API.

# Please visit https://www.ibm.com/cloud/watson-natural-language-understanding to register and obtain your API Key
# and follow instructions.
# Please visit https://anaconda.org/conda-forge/ibm-watson for conda installation.

# Build the IAM authenticator from your IBM Watson API key.
# Replace the placeholder text with your own key and keep it quoted as a Python string.
authenticator = IAMAuthenticator('paste-your-ibm-watson-api-key')
# Create the Natural Language Understanding client, pinned to API version '2019-07-12'.
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2019-07-12',
    authenticator=authenticator
)
# Point the client at your service instance; replace the placeholder with the service URL (quoted string).
natural_language_understanding.set_service_url('paste_your_service_url_here')

# Request to obtain all english newspaper sources.
# Please visit https://newsapi.org/register to register and obtain your API Key.

# Use https so the API key in the query string is not sent in clear text.
# Replace the placeholder with your own NewsAPI key (keep it inside the quoted string).
source_url = ('https://newsapi.org/v2/sources?'
       'language=en&'
       'apiKey=paste-your-news-api-key-here')
# A timeout keeps the notebook from hanging forever if the service does not answer.
source_response = requests.get(source_url, timeout=30)
# Parse the body once and reuse it for both the status check and the source list.
source_payload = source_response.json()
if source_payload.get('status') == 'ok':
    sources = {source['name']: source['id'] for source in source_payload['sources']}
else:
    # Always define 'sources' so the cells below do not fail with a NameError,
    # and show the service's answer so the cause of the failure is visible.
    sources = {}
    print("Bad request")
    print(source_payload)
    
# The 'sources' variable holds the dictionary where the keys are names of the news sources and values are the ids
# of the news sources.
# We need the source ids to make a request for articles.

# Code referenced and used from 'https://gist.github.com/pbugnion/5bb7878ff212a0116f0f1fbc9f431a5c'

def multi_checkbox_widget(descriptions):
    """Build a scrollable column of unchecked checkboxes, one per description.

    Returns an outer VBox whose single child is the inner VBox holding the
    checkboxes, so the checkboxes are reachable as ``result.children[0].children``.
    """
    checkbox_by_label = {}
    for label in descriptions:
        checkbox_by_label[label] = widgets.Checkbox(description=label, value=False)

    # Keep the checkboxes in the same order as the incoming descriptions.
    checkboxes = [checkbox_by_label[label] for label in descriptions]
    scroll_box = widgets.VBox(checkboxes, layout={'overflow': 'scroll'})
    return widgets.VBox([scroll_box])

# Creating a list of keys to create a checkbox list to select from
# Iterating a dict yields its keys, i.e. the news source names.
descriptions = list(sources)

# Build the checkbox list and leave it as the cell's last expression so it is rendered.
widget = multi_checkbox_widget(descriptions)
widget

VBox(children=(VBox(children=(Checkbox(value=False, description='ABC News'), Checkbox(value=False, description…

In [41]:
# Selected options are recorded here in selected_options variable
# We use this list to map to the source ids using the sources dictionary/ hash map
# Creating a list of ids (variable name: 'ids')

selected_options = [box.description for box in widget.children[0].children if box.value]

# Map every ticked source name to its NewsAPI id.
# The free (developer) plan of newsapi.org accepts at most 20 sources per request,
# so only the first 20 selected sources are kept.
ids = [sources[name] for name in selected_options][:20]

# Comma-separated list of source ids, in the form the API call expects.
source_string = ','.join(ids)
source_string

'the-times-of-india,the-verge,the-wall-street-journal,the-washington-post,the-washington-times,time,usa-today,vice-news,wired'

In [42]:
# The developer version only gives us the results which are no more than a month old.
# To prevent the user from making a bad request we only use the previous month's date 
# to get the results from that date to present date.
# The string in previous_month is used to make the api call.

now = datetime.datetime.now()
# Step back one calendar month and format the result as YYYY-MM-DD for the API call.
previous_month = (now - dateutil.relativedelta.relativedelta(months=1)).strftime('%Y-%m-%d')
previous_month

'2020-07-18'

In [43]:
# Text box for the keywords that every returned article must contain.
# Each submitted keyword is stored with a leading '+', the "must appear" operator of the query.

print("Type keywords that must appear in article")
layout = widgets.Layout(width='auto', height='40px')
must_have_text = widgets.Text(
    layout=layout,
    placeholder='Type one keyword and Press Enter, then type another',
    description='String:',
)
display(must_have_text)

must_have_keywords_list = []


def callback(wdgt):
    """Record the submitted keyword as a required ('+') search term.

    Empty or whitespace-only submissions are ignored; otherwise a bare '+'
    operator with no keyword would end up in the query.
    """
    keyword = wdgt.value.strip()
    if not keyword:
        return
    # Echo the accepted keyword so the user can see what was recorded.
    display(keyword)
    must_have_keywords_list.append('+' + keyword)


must_have_text.on_submit(callback)

Type keywords that must appear in article


Text(value='', description='String:', layout=Layout(height='40px', width='auto'), placeholder='Type one keywor…

In [44]:
# Separate the '+keyword' terms with a space; joining them with '' would fuse
# them into a single token such as '+foo+bar' instead of two required terms.
must_have_keywords = ' '.join(must_have_keywords_list)
must_have_keywords

''

In [45]:
# Text box for the keywords that a returned article must not contain.
# Each submitted keyword is stored with a leading '-', the "must not appear" operator of the query.

print("Type keywords that must not appear in article")
must_not_have_text = widgets.Text(
    layout=layout,
    placeholder='Type one keyword and Press Enter, then type another',
    description='String:',
)
display(must_not_have_text)

must_not_have_keywords_list = []


def callback(wdgt):
    """Record the submitted keyword as an excluded ('-') search term.

    Empty or whitespace-only submissions are ignored; otherwise a bare '-'
    operator with no keyword would end up in the query.
    """
    keyword = wdgt.value.strip()
    if not keyword:
        return
    # Echo the accepted keyword so the user can see what was recorded.
    display(keyword)
    must_not_have_keywords_list.append('-' + keyword)


must_not_have_text.on_submit(callback)

Type keywords that must not appear in article


Text(value='', description='String:', layout=Layout(height='40px', width='auto'), placeholder='Type one keywor…

In [46]:
# Separate the '-keyword' terms with a space; joining them with '' would turn
# two exclusions into one hyphenated token such as '-foo-bar'.
must_not_have_keywords = ' '.join(must_not_have_keywords_list)
must_not_have_keywords

''

In [47]:
# After creating variables for keywords we join them to create the query.
# This query is used to search the article's body and find matches.
# This variable is used to make the api call too.

# Keep the required and excluded parts apart with a space. Plain concatenation
# would glue the last '+term' to the first '-term' (e.g. '+foo-bar').
# Empty parts are skipped so the query stays '' when nothing was entered.
query = ' '.join(part for part in (must_have_keywords, must_not_have_keywords) if part)
query

''

In [48]:
# Text box for keywords that are searched for in the article title only.
# This is optional. Using it may narrow down the number of results.
# Each submitted keyword is stored with a leading '+'.

print('Type keywords to search for in the article title only')
title_search_text = widgets.Text(
    layout=layout,
    placeholder='Type one keyword and Press Enter, then type another',
    description='String:',
)
display(title_search_text)

title_query_list = []


def callback(wdgt):
    """Record the submitted keyword as a required ('+') title search term.

    Empty or whitespace-only submissions are ignored; otherwise a bare '+'
    operator with no keyword would end up in the title query.
    """
    keyword = wdgt.value.strip()
    if not keyword:
        return
    # Echo the accepted keyword so the user can see what was recorded.
    display(keyword)
    title_query_list.append('+' + keyword)


title_search_text.on_submit(callback)

Type keywords to search for in the article title only


Text(value='', description='String:', layout=Layout(height='40px', width='auto'), placeholder='Type one keywor…

In [49]:
# This variable is used to make the api call too.

# Separate the '+keyword' terms with a space; joining them with '' would fuse
# them into a single token such as '+foo+bar'.
title_query = ' '.join(title_query_list)
title_query

''

In [61]:
# Creating a function to make the api call and return the DataFrame containing the following columns and their content
# source-id
# source-name
# author
# title
# description
# content
# published date
# sentiment associated with the article
# emotion associated with the article

def call_api(base_url='http://newsapi.org/v2/everything?', from_date=previous_month, query='', title_query='',
             source_string=source_string, api_key='paste-your-news-api-key-here', timeout=30):
    """Fetch articles from newsapi.org and tag each one with Watson sentiment and emotion.

    Parameters
    ----------
    base_url : str
        URL of the NewsAPI "everything" endpoint (a trailing '?' is accepted).
        NOTE(review): the default uses plain http, so the API key is sent
        unencrypted; pass an https URL if possible.
    from_date : str
        Oldest publication date to request, formatted as YYYY-MM-DD.
    query : str
        Keywords searched for in the article body (value of the 'q' parameter).
    title_query : str
        Keywords searched for in the article title only ('qInTitle' parameter).
    source_string : str
        Comma-separated NewsAPI source ids.
    api_key : str
        NewsAPI key; replace the placeholder default or pass your own.
    timeout : float
        Seconds to wait for newsapi.org before requests gives up.

    The defaults of from_date and source_string are captured when this ``def``
    statement runs. Re-run this cell after changing the date or the selected
    sources, or pass the values explicitly.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns 'source-id', 'source-name',
        'author', 'title', 'description', 'content', 'sentiment-of-content',
        'emotion-of-content' and 'published_date'. The frame is empty when the
        response carries no 'articles' entry; in that case the raw response is
        printed.

    Errors raised by requests or by the Watson client are not caught here.
    """
    # The developer plan of newsapi.org returns at most 100 results, and one page
    # holds up to 100 results, so only page 1 is requested. Asking for page 2
    # fails with the error code 'maximumResultsReached'.
    #
    # The parameters go through `params=` so requests URL-encodes them. Written
    # raw into the URL, a '+' is decoded as a space by the server and the
    # "must appear" operator is lost.
    request_params = {
        'q': query,
        'qInTitle': title_query,
        'from': from_date,
        'sources': source_string,
        'sortBy': 'publishedAt',
        'page': 1,
        'pageSize': 100,
        'apiKey': api_key,
    }
    response = requests.get(base_url, params=request_params, timeout=timeout)
    json_obj = response.json()

    raw_articles = json_obj.get('articles')
    if raw_articles is None:
        print("No articles obtained. Please try again.")
        print("Here is the response from newapi.org: ")
        print(json_obj)
        return pd.DataFrame([])

    # A list of dictionaries converts directly into a pandas DataFrame.
    list_of_articles = []
    for article in raw_articles:
        row = {
            'source-id': article['source']['id'],
            'source-name': article['source']['name'],
            # `or` also replaces JSON null values; dict.get's default only
            # covers keys that are missing altogether.
            'author': article.get('author') or 'Author Name(s) Missing',
            'title': article.get('title') or 'Title Missing',
            'description': article.get('description') or 'Description Missing',
        }

        raw_content = article.get('content')
        if not raw_content:
            # Nothing to analyse (missing, null or empty content).
            row['content'] = None
            row['sentiment-of-content'] = None
            row['emotion-of-content'] = None
        else:
            # Transliterate to plain ASCII to prevent errors from the IBM Watson analyzer.
            # Some decoded content may not be readable.
            decoded_content = unidecode(raw_content)
            row['content'] = decoded_content

            ibm_response = natural_language_understanding.analyze(
                language='en',
                text=decoded_content,
                features=Features(emotion=EmotionOptions(), sentiment=SentimentOptions())).get_result()
            row['sentiment-of-content'] = ibm_response['sentiment']['document']['label']

            # Keep the emotion with the highest score.
            # Emotions returned are sadness, anger, fear, disgust, joy.
            emotion_scores = ibm_response['emotion']['document']['emotion']
            row['emotion-of-content'] = max(emotion_scores, key=emotion_scores.get)

        row['published_date'] = article.get('publishedAt') or 'Published Date Missing'
        list_of_articles.append(row)

    return pd.DataFrame(list_of_articles)



In [62]:
# Run the NewsAPI request and the Watson analysis with call_api's default arguments.
# The returned DataFrame is kept in x for the export cell below.
x = call_api()

In [64]:
# Write the DataFrame returned by call_api to Sentiment-Analysis-on-News-Articles.csv
# in the current working directory. No index argument is passed, so pandas' default applies.

x.to_csv('Sentiment-Analysis-on-News-Articles.csv')