In [29]:
from newsapi import NewsApiClient
import datetime as dt
import pandas as pd
from datetime import datetime
import numpy as np

from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta, date

import requests
import dotenv
import os
import json
from pathlib import Path
import dateutil.parser
import math

# TensorFlow import; the name `tf` is not referenced in the visible cells of
# this notebook.
import tensorflow as tf

# Seed NumPy's global random number generator.
from numpy.random import seed
seed(1)

# Seed TensorFlow's global random number generator.
# NOTE: this import binds the name `random` to `tensorflow.random`, not to the
# standard-library `random` module.
from tensorflow import random
random.set_seed(2)

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

%matplotlib inline

In [30]:
# Endpoint used for every article search issued by this notebook.
news_request_url = "https://newsapi.org/v2/everything"

# Load variables from a local .env file into the environment, then read the
# API key from it. The key is None when NEWSAPI_KEY is not set.
dotenv.load_dotenv()
news_api_key = os.environ.get("NEWSAPI_KEY")

# Search expression sent as the `q` request parameter.
kw_list = "oil OR crude NOT (castor OR coconut OR cotton OR groundnut OR lin OR palm OR rape OR soy OR sun OR veg OR vegetable)"

# Candidate source ids and domains.
# NOTE(review): neither string is passed to any request in the visible cells.
news_sources = 'australian-financial-review, usa-today, business insider, reuters, cnn, the-wall-street-journal, financial-post, bloomberg, fox-news, fortune, the-washington-post, the-washington-times, financial-times, oil-price, fox-business'
news_domains = 'afr.com, usatoday.com/news, businessinsider.com,reuters.com, us.cnn.com, wsj.com, business.financialpost.com, bloomberg.com, foxnews.com, fortune.com, washingtonpost.com, washingtontimes.com, ft.com, oilprice.com, foxbusiness.com'

In [31]:
def get_day_news(date_string, page_size=100, max_pages=3):
    """Download one day's articles matching ``kw_list`` and return their text.

    Requests are sent to ``news_request_url`` with ``news_api_key`` and the
    ``kw_list`` query, restricted to English articles, sorted by relevancy,
    with both ``from`` and ``to`` set to ``date_string``.

    Parameters
    ----------
    date_string : str or datetime.date
        Day to fetch; sent unchanged as the ``from`` and ``to`` parameters.
    page_size : int, default 100
        Number of articles requested per page.
    max_pages : int, default 3
        Upper bound on the number of pages downloaded for the day.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``DayID`` (publication day formatted as ``YYYY-MM-DD``)
        and ``text`` (the first 300 characters of the article content).
        Duplicate ``(DayID, text)`` rows and rows with missing values are
        removed. An empty frame with these two columns is returned when the
        day has no articles.

    Raises
    ------
    RuntimeError
        If a response does not contain ``totalResults`` and ``articles``
        (for example an API error payload).
    """

    def _fetch_page(page_number):
        # Every page is requested with the same page size, so the page count
        # derived from totalResults matches what is actually downloaded.
        params = {'apiKey': news_api_key,
                  'q': kw_list,
                  'sortBy': 'relevancy',
                  'language': 'en',
                  'pageSize': page_size,
                  'page': page_number,
                  'from': date_string,
                  'to': date_string}
        # The timeout keeps one stalled connection from hanging a multi-year run.
        payload = requests.get(news_request_url, params=params, timeout=30).json()
        if 'articles' not in payload or 'totalResults' not in payload:
            raise RuntimeError(
                "News API request for {} (page {}) failed: {}".format(
                    date_string, page_number, payload.get('message', payload)))
        return payload

    # Page 1 supplies both the result count and the first batch of articles,
    # so no separate probing request is needed.
    first_page = _fetch_page(1)
    total_results = first_page['totalResults']
    total_pages = min(math.ceil(total_results / page_size), max_pages)

    print("Total_Results:", total_results, " Total_Pages:", total_pages)

    articles = list(first_page['articles'])
    for page_number in range(2, total_pages + 1):
        articles.extend(_fetch_page(page_number)['articles'])

    if not articles:
        # Nothing published that day: return an empty frame of the usual shape
        # so callers can concatenate it without special-casing.
        return pd.DataFrame(columns=['DayID', 'text'])

    # Only the publication timestamp and the content are kept from each article.
    published = pd.to_datetime(
        pd.Series([article.get('publishedAt') for article in articles]))
    content = pd.Series(
        [article.get('content') for article in articles], dtype=object)

    day_news = pd.DataFrame({
        'DayID': published.dt.strftime('%Y-%m-%d'),
        # Keep only the first 300 characters of the content.
        'text': content.str[:300],
    })

    day_news = day_news.drop_duplicates(subset=['DayID', 'text'], keep='first')
    return day_news.dropna()

In [32]:
# Frame that accumulates the articles downloaded for each day.
news_df = pd.DataFrame()

# Date window to download. The day-by-day iteration defined in this cell stops
# one day before end_date, so the last day fetched is 2021-01-18.
start_date, end_date = date(2018, 1, 20), date(2021, 1, 19)

def daterange(start_date, end_date):
    """Yield consecutive days from start_date up to, but excluding, end_date.

    Nothing is yielded when end_date is not after start_date.
    """
    span_days = (end_date - start_date).days
    current_day = start_date
    for _ in range(span_days):
        yield current_day
        current_day = current_day + timedelta(days=1)


# Download each day in the window and collect the per-day frames in a list.
# The frames are combined once at the end: growing a DataFrame inside the loop
# copies all accumulated rows on every iteration, and DataFrame.append no
# longer exists in pandas 2.x.
daily_frames = []
for news_date in daterange(start_date, end_date):
    day_label = news_date.strftime("%Y-%m-%d")
    print(day_label)
    day_news = get_day_news(day_label)
    daily_frames.append(day_news)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# date window contains no days.
news_df = pd.concat(daily_frames, ignore_index=True) if daily_frames else pd.DataFrame()

2018-01-20
Total_Results: 643  Total_Pages: 3
2018-01-21
Total_Results: 606  Total_Pages: 3
2018-01-22
Total_Results: 1661  Total_Pages: 3
2018-01-23
Total_Results: 1685  Total_Pages: 3
2018-01-24
Total_Results: 1731  Total_Pages: 3
2018-01-25
Total_Results: 1695  Total_Pages: 3
2018-01-26
Total_Results: 1409  Total_Pages: 3
2018-01-27
Total_Results: 635  Total_Pages: 3
2018-01-28
Total_Results: 571  Total_Pages: 3
2018-01-29
Total_Results: 1657  Total_Pages: 3
2018-01-30
Total_Results: 1713  Total_Pages: 3
2018-01-31
Total_Results: 1684  Total_Pages: 3
2018-02-01
Total_Results: 1781  Total_Pages: 3
2018-02-02
Total_Results: 1581  Total_Pages: 3
2018-02-03
Total_Results: 668  Total_Pages: 3
2018-02-04
Total_Results: 610  Total_Pages: 3
2018-02-05
Total_Results: 1595  Total_Pages: 3
2018-02-06
Total_Results: 1807  Total_Pages: 3
2018-02-07
Total_Results: 1895  Total_Pages: 3
2018-02-08
Total_Results: 1954  Total_Pages: 3
2018-02-09
Total_Results: 1642  Total_Pages: 3
2018-02-10
Total_Re

## Scoring Sentiment Using VADER

In this section, you will use the VADER sentiment analyzer from the `nltk` library to score the sentiment of each news article's text. Later, you will assess model performance using metrics such as accuracy, precision, and recall, among others.

In [33]:
# Extract the article text column as an array (X); these are the texts scored
# with VADER in the cells below. No target vector is built in this cell.
X = news_df["text"].values

In [34]:
# Download the VADER lexicon resource through nltk. The captured output below
# shows the package was already up to date on the recorded run.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/admin/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [35]:
# Initialize the VADER sentiment analyzer
# (presumably relies on the 'vader_lexicon' resource downloaded earlier in
# this notebook; confirm before running on a fresh machine).
analyzer = SentimentIntensityAnalyzer()

In [36]:
# Three accumulators for the VADER results, filled with one entry per scored
# text: the "pos" score, the binary label, and the "compound" score.
y_vader_prob, y_vader_pred, y_vader_compound = [], [], []

In [37]:
# Score the sentiment of every article text in X using VADER.
# The accumulators are (re)created here so that re-running this cell does not
# append a second copy of the scores to lists left over from an earlier run,
# which would make them longer than the DataFrame they are attached to.
y_vader_prob = []
y_vader_pred = []
y_vader_compound = []

for text in X:
    # Run the analyzer once per text and reuse the result for both fields.
    scores = analyzer.polarity_scores(text)
    sentiment_score = scores["compound"]

    y_vader_prob.append(scores["pos"])
    y_vader_compound.append(sentiment_score)
    # Binary label: 1 when the compound score is at least 0.1, otherwise 0.
    y_vader_pred.append(1 if sentiment_score >= 0.1 else 0)

In [38]:
# Attach the per-article binary label and compound score as new columns,
# then display the resulting frame.
news_df = news_df.assign(sentiment=y_vader_pred, compound=y_vader_compound)
news_df

Unnamed: 0,DayID,text,sentiment,compound
0,2018-01-20,The oil business fuels our everyday lives. If ...,1,0.2732
1,2018-01-20,"In 1967, following the Six-Day War, oil-produc...",0,-0.8689
2,2018-01-20,MUMBAI: Union transport minister Nitin Gadkari...,1,0.3612
3,2018-01-20,The base of this winter chopped salad is a new...,1,0.6796
4,2018-01-20,WASHINGTON (AP) — The Trump administration's p...,1,0.1027
...,...,...,...,...
298943,2021-01-18,By William Clowes (Bloomberg) —\r\nThe worlds ...,0,-0.4601
298944,2021-01-18,"Speaking on Monday, the head of the Atomic Ene...",1,0.2732
298945,2021-01-18,"Joe Biden, President-elect, United States of A...",1,0.4215
298946,2021-01-18,"Some companies, which had earlier been cleared...",1,0.1027


In [39]:
# Average the binary label and the compound score over all articles of each
# day; named aggregation yields the final flat column names directly.
daily_sentiment = news_df.groupby('DayID').agg(
    mean_sentiment=('sentiment', 'mean'),
    mean_compound=('compound', 'mean'),
)
daily_sentiment

Unnamed: 0_level_0,mean_sentiment,mean_compound
DayID,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-20,0.511278,0.102247
2018-01-21,0.620690,0.211092
2018-01-22,0.533784,0.091852
2018-01-23,0.554770,0.133828
2018-01-24,0.510870,0.062822
...,...,...
2021-01-15,0.574007,0.175040
2021-01-16,0.574661,0.141493
2021-01-17,0.570248,0.143250
2021-01-18,0.441176,0.085961


In [40]:
# Write the per-day averages to a CSV file; the relative path is resolved
# against the notebook's current working directory.
daily_sentiment.to_csv('daily_sentiment.csv')

In [41]:
# Option 1: Normalizing data using MinMaxScaler from sklearn
#from sklearn.preprocessing import MinMaxScaler

#scaler = MinMaxScaler()
#scaler.fit(np.array(y_vader_prob).reshape(-1,1))
#y_vader_prob_norm = scaler.transform(np.array(y_vader_prob).reshape(-1,1))
#y_vader_prob[:5]
#y_vader_prob_norm[:5]

In [42]:
# Option 2: Using a list comprehension
#normalized = [(x - min(y_vader_prob)) / (max(y_vader_prob) - min(y_vader_prob)) for x in y_vader_prob]
#normalized[:5]