<a href="https://colab.research.google.com/github/kconstable/crypto-ensemble-model-predictions/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis

## Import Libraries

In [None]:
!pip install pygooglenews

In [None]:
# NLP
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import pandas as pd
import numpy as np

import requests
from pygooglenews import GoogleNews
from bs4 import BeautifulSoup as bs
import requests as req

import time
from datetime import date, timedelta,datetime

import plotly.graph_objects as go
from plotly.subplots import make_subplots
# import plotly.express as px

import pickle
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')


## Web Scraping

### References
+ https://github.com/kotartemiy/pygooglenews#documentation


In [None]:
def scrape_forbes_page(url, timeout=30):
  """
  Download a web page and return the concatenated text of its <p> tags.

  The function is best-effort: request failures, non-200 responses and
  parsing failures are printed and an empty string is returned, so a single
  bad link does not abort a long scraping run.

  Params:
    url    : address of the article to download
    timeout: seconds to wait for the server before giving up
  Returns:
    The text of every <p> element joined together, or "" on any failure
  """
  # make request; without a timeout a stalled server would block forever
  try:
    res = req.get(url, timeout=timeout)
  except req.RequestException as err:
    print(f'\t=>Request Error:{url} ({err})')
    return ""

  # print the response error
  if res.status_code != 200:
    print(f'\t=>Response Error:{res.status_code}')
    return ""

  # try to extract the text data
  try:
    # create soup object
    soup = bs(res.content)

    # extract the text from each paragraph
    return "".join(p.get_text() for p in soup.find_all('p'))

  # Exception (not BaseException) so Ctrl-C / kernel interrupt still works
  except Exception as err:
    print(f'\t=>Parsing Error:{url} ({err})')
    return ""

In [None]:
def get_date_ranges(from_,to_,interval='mth'):
  """
  Split the period from_..to_ into a list of search windows.

  Params:
    from_   : first date of the period (anything pd.date_range accepts)
    to_     : last date of the period
    interval: window size, one of day, week, mth, qtr
              day : one window per day, ending on the following day
              week: one window per Sunday in the period, 7 days long
              mth : one window per month-end in the period, from the 1st
                    of that month to the month-end
              qtr : one window per quarter-end (Mar/Jun/Sep/Dec) in the
                    period, from the 1st of the quarter to the quarter-end
  Returns:
    A list of {'start': 'YYYY-MM-DD', 'end': 'YYYY-MM-DD'} dicts. The list
    is empty when interval is not one of the supported values.
  """
  date_ranges = []

  # Offset objects are used instead of the 'D'/'W'/'M'/'Q' string aliases
  # because the 'M' and 'Q' aliases are deprecated in recent pandas releases.
  if interval == 'day':
    for dt in pd.date_range(from_, to_, freq=pd.offsets.Day()):
      start = dt.date()
      end = start + timedelta(days=1)
      date_ranges.append({'start': str(start), 'end': str(end)})

  elif interval == 'week':
    for dt in pd.date_range(from_, to_, freq=pd.offsets.Week(weekday=6)):
      # work on plain dates so 'end' is formatted like 'start'
      # (a Timestamp would stringify as 'YYYY-MM-DD 00:00:00')
      start = dt.date()
      end = start + timedelta(weeks=1)
      date_ranges.append({'start': str(start), 'end': str(end)})

  elif interval == 'mth':
    for dt in pd.date_range(from_, to_, freq=pd.offsets.MonthEnd()):
      end = dt.date()
      start = end.replace(day=1)
      date_ranges.append({'start': str(start), 'end': str(end)})

  elif interval == 'qtr':
    for dt in pd.date_range(from_, to_, freq=pd.offsets.QuarterEnd(startingMonth=12)):
      end = dt.date()
      # quarter-end months are 3, 6, 9, 12, so month-2 is the quarter's first month
      start = end.replace(month=end.month - 2, day=1)
      date_ranges.append({'start': str(start), 'end': str(end)})

  return date_ranges

In [None]:
def get_google_news(q,from_,to_,interval='mth',get_article = False,news=None,throttle=3):
  """
  Search the Google News RSS feed window by window and collect the stories.

  Params:
    q          : query string passed to the Google News search
    from_      : first date of the period to cover
    to_        : last date of the period to cover
    interval   : size of each search window (day, week, mth, qtr)
    get_article: if True, also download the article text from each link;
                 otherwise only the title and link are stored
    news       : existing list of stories to extend; a new list is
                 created when None
    throttle   : seconds to sleep before each article download
  Returns:
    The list of story dicts with keys date, title, link and text
  """
  # one search is issued per window
  windows = get_date_ranges(from_,to_,interval)

  # reuse the caller's list when given so results accumulate across calls
  stories = [] if news is None else news

  # english-language Google News client
  client = GoogleNews(lang='en')

  for window in windows:
    print(f"Scaping News From:{window['start']} To:{window['end']}")
    print("="*60)
    results = client.search(q, from_=window['start'],to_=window['end'])

    for position,entry in enumerate(results['entries'], start=1):
      print(f"{position})  {entry.title}")

      # the page scraper is only known to work with forbes articles
      article_text = ''
      if get_article:
        time.sleep(throttle)
        article_text = scrape_forbes_page(entry.link)

      # keep only the calendar date of the publication timestamp
      published = entry.published_parsed
      stories.append({
          'date':datetime(published.tm_year,published.tm_mon,published.tm_mday),
          'title':entry.title,
          'link':entry.link,
          'text':article_text
      })
    print("\n")
  return stories

In [None]:
def get_google_news_by_country(q,from_,to_,country_list,interval='mth',get_article = False,news=None,throttle=3):
  """
  Run the same Google News search once per country and collect the stories.

  Params:
    q           : query string passed to the Google News search
    from_       : first date of the period to cover
    to_         : last date of the period to cover
    country_list: country codes; one GoogleNews client is created per code
    interval    : size of each search window (day, week, mth, qtr)
    get_article : if True, also download the article text from each link;
                  otherwise only the title and link are stored
    news        : existing list of stories to extend; a new list is
                  created when None
    throttle    : seconds to sleep before each article download
  Returns:
    The list of story dicts with keys date, title, link and text
  """
  # the same windows are searched for every country
  windows = get_date_ranges(from_,to_,interval)

  # reuse the caller's list when given so results accumulate across calls
  stories = [] if news is None else news

  for country in country_list:
    print(f'News from Country:{country}')
    print('='*60)
    # english-language client scoped to this country
    client = GoogleNews(lang='en',country=country)

    for window in windows:
      print(f"Scaping News From:{window['start']} To:{window['end']}")
      print("="*60)
      results = client.search(q, from_=window['start'],to_=window['end'])

      for position,entry in enumerate(results['entries'], start=1):
        print(f"{position})  {entry.title}")

        # the page scraper is only known to work with forbes articles
        article_text = ''
        if get_article:
          time.sleep(throttle)
          article_text = scrape_forbes_page(entry.link)

        # keep only the calendar date of the publication timestamp
        published = entry.published_parsed
        stories.append({
            'date':datetime(published.tm_year,published.tm_mon,published.tm_mday),
            'title':entry.title,
            'link':entry.link,
            'text':article_text
        })
      print("\n")
  return stories

In [None]:
# Query: Scrape full article text from forbes
stock = 'bitcoin'
ticker = 'BTC'
site = 'www.forbes.com'

# forbes query for full text extraction
# bitcoin must be in the article
q = f"allintext:{stock} site:{site}"
gn = GoogleNews(lang='en')

# `news` is passed in so that re-running this cell keeps appending to the
# stories already collected. On a fresh kernel the name does not exist yet,
# so start from an empty list instead of failing with a NameError.
try:
  news
except NameError:
  news = []

# scrape google news RSS for stories
news = get_google_news(q,'2021-01-01','2021-10-31','week',True,news,3)


# save to drive as dataframe
# remove duplicates (the same link can be returned by more than one search)
df_news = pd.DataFrame(news)
df_news.drop_duplicates(subset=['link'],inplace=True)
df_news.set_index('date',inplace=True)
df_news.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/capstone2/data/{ticker}_news.pickle')

# number of stories collected, counted before duplicates were removed
print("News:",len(news))

In [None]:
# Query: Get Total News Articles (titles and links only)
# exceeds 100 stories per day starting Aug 24 2021
stock = 'bitcoin'
ticker= 'BTC'

# for news count query
# bitcoin must be in the title of the article
q =f"intitle:{stock}"
gn = GoogleNews(lang='en')

# scrape google news RSS for stories
# news_volume = get_google_news(q,'2016-01-01','2016-06-30','day',False)
# news_volume = get_google_news(q,'2021-09-30','2021-10-31','day',False,news_volume)

# `news_volume` accumulates across runs of this cell (see the examples above).
# On a fresh kernel the name does not exist yet, so start from an empty list
# instead of failing with a NameError.
try:
  news_volume
except NameError:
  news_volume = []

# get news volume by world + individual country (when there are >=100 results per day)
country_list = ['us','uk','jp','in','cn','ru','ng','fr','de','hk','il','it','pk','ph']
news_volume = get_google_news_by_country(q,'2021-08-24','2021-10-31',country_list,'day',False,news_volume)

# number of stories collected, counted before duplicates are removed
print("News:",len(news_volume))

# save to drive as dataframe
# remove duplicates (the same link can be returned for several countries)
df_news_volume = pd.DataFrame(news_volume)
df_news_volume.drop_duplicates(subset=['link'],inplace=True)
df_news_volume.set_index('date',inplace=True)
df_news_volume.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/capstone2/data/{ticker}_news_volume.pickle')

## Sentiment Analysis

In [222]:
# Reload the pickled scraping results from Drive:
#   df_news     - stories saved under BTC_news.pickle
#   df_news_vol - stories saved under BTC_news_volume.pickle
df_news = pd.read_pickle(
    '/content/drive/MyDrive/Colab Notebooks/capstone2/data/BTC_news.pickle'
)
df_news_vol = pd.read_pickle(
    '/content/drive/MyDrive/Colab Notebooks/capstone2/data/BTC_news_volume.pickle'
)

In [141]:
def calc_sentiment(df_news, terms=('Bitcoin', 'bitcoin')):
  """
  Add term-count and sentiment columns to df_news. The dataframe is
  modified in place and nothing is returned.

  Params:
    df_news: dataframe with string columns 'text' and 'title'
    terms  : exact (case-sensitive) strings to count in the article text;
             the counts of all terms are added together
  Adds columns:
    term_count     : total occurrences of the terms in 'text'
    sentiment_text : compound polarity score of 'text'
    sentiment_title: compound polarity score of 'title'
  """
  def count_terms(text):
    return sum(text.count(term) for term in terms)

  # count the occurrences of the search terms in the article text
  df_news['term_count'] = df_news['text'].apply(count_terms)

  # calculate the sentiment score; one analyzer is shared by both columns
  sid = SentimentIntensityAnalyzer()

  def compound_score(text):
    return sid.polarity_scores(text)['compound']

  df_news['sentiment_text'] = df_news['text'].apply(compound_score)
  df_news['sentiment_title'] = df_news['title'].apply(compound_score)


In [5]:
def plot_distribution(df,features):
  """
  Plot histograms and box plots of two features side by side.
  Input:
    df:       a dataframe containing the features
    features: a sequence whose first two items are the column names to plot
  Output:
    Shows one figure with both histograms in the left panel and both
    box plots in the right panel
  """
  first, second = features[0], features[1]

  # each feature keeps the same colour in both panels
  coloured = ((first, '#a8b8d0'), (second, 'orange'))

  # create subplots
  fig = make_subplots(rows=1, cols=2)

  # left panel: histograms
  for column, colour in coloured:
    histogram = go.Histogram(
        x=df[column],
        name=column + " histogram",
        marker=dict(color=colour),
    )
    fig.add_trace(histogram, row=1, col=1)

  # right panel: box plots
  for column, colour in coloured:
    box = go.Box(
        y=df[column],
        name=column,
        fillcolor=colour,
        line_color=colour,
    )
    fig.add_trace(box, row=1, col=2)

  layout = dict(
      height=400,
      width=600,
      title_text="Distribution:Full Text Sentiment vs Title Sentiment",
      template='plotly_white',
      xaxis_title='',
      showlegend=False,
  )
  fig.update_layout(**layout)
  fig.show()


In [15]:
def calc_weighted_sentiment(df,ma_days=(12,20,40)):
  """
  Calculates the daily sentiment, weighting each article by its share of
  the day's term count (bitcoin mentions in the article), and adds moving
  averages of that daily value.
  Params:
    df: df of news articles indexed by date, with columns 'term_count'
        and 'sentiment_text'. It is not modified.
    ma_days: iterable of window sizes (in rows, i.e. days that have news)
             for the moving averages
  Returns:
    A dataframe with one row per index value, a 'weighted_sentiment'
    column and one 'ma_sentiment_<n>' column per entry of ma_days
  """
  # total term count of the day, repeated on every article of that day
  daily_totals = df.groupby(level=0)['term_count'].transform('sum')

  # each article contributes its sentiment scaled by its share of the
  # day's mentions
  contribution = df['term_count'] / daily_totals * df['sentiment_text']

  # sum the contributions by day
  df_weighted = contribution.groupby(level=0).sum().to_frame('weighted_sentiment')

  # calc the moving average weighted sentiment by days
  for ma in ma_days:
    df_weighted[f'ma_sentiment_{ma}'] = df_weighted['weighted_sentiment'].rolling(window=ma).mean()

  return df_weighted


In [225]:
def plot_sentiment(df,from_=None,to_=None,ma_days=(12,20,40)):
  """
  Plots the daily weighted average sentiment and moving average sentiment
  Params:
    df: a dataframe of news stories with sentiment
    from_: filter from this date, if provided
    to_  : filter to this date, if provided
           (either bound may be given on its own)
    ma_days: the number of days in each moving average line
  """
  dff = calc_weighted_sentiment(df,ma_days)

  # filter by date if provided; a None bound leaves that side open
  if from_ is not None or to_ is not None:
    dff = dff.loc[from_:to_,:]

  fig = go.Figure()

  fig.add_trace(
      go.Scatter(x=dff.index,
                 y = dff.weighted_sentiment,
                 name='Daily Sentiment',
                 line_color = '#a8b8d0',
                 opacity=0.5,
          )
  )
  # add the moving averages
  colors = ['orange','crimson','lightcoral']
  for i,ma in enumerate(ma_days):
    fig.add_trace(
        go.Scatter(x=dff.index,
                  y=dff[f'ma_sentiment_{ma}'],
                  name=f'{ma} Day Moving Average Sentiment',
                  # cycle the palette so more than three averages still plot
                  line_color = colors[i % len(colors)])
    )

  fig.update_layout(template = 'plotly_white',
                    height = 500,
                    width = 600,
                    title = 'News Sentiment (Forbes-Full Article Text)',
                    legend=dict(
                      yanchor="bottom",
                      y=0.1,
                      xanchor="left",
                      x=0.2))
  fig.show()

In [217]:
def calc_news_count(df_vol,ma_days_volume=20,ma_days_sentiment=5):
  """
  Aggregate news stories into a daily count and a daily mean title sentiment.

  Params:
    df_vol           : dataframe of stories indexed by date with a 'title'
                       column; a 'sentiment_title' column is added to it
                       in place
    ma_days_volume   : window size of the news count moving average
    ma_days_sentiment: window size of the title sentiment moving average
  Returns:
    A dataframe with one row per index value and the columns news_count,
    sentiment_title, ma_news_count and ma_sentiment_title
  """
  analyzer = SentimentIntensityAnalyzer()

  def title_score(title):
    return analyzer.polarity_scores(title)['compound']

  # score every title with the compound polarity
  df_vol['sentiment_title'] = df_vol['title'].apply(title_score)

  # per day: number of titles and their average sentiment
  per_day = df_vol.groupby(by=df_vol.index)
  daily = per_day.agg({'title':'count','sentiment_title':'mean'})
  daily.rename(columns={"title":"news_count"},inplace=True)

  # moving averages of both daily series
  volume_window = daily['news_count'].rolling(window=ma_days_volume)
  sentiment_window = daily['sentiment_title'].rolling(window=ma_days_sentiment)
  daily['ma_news_count'] = volume_window.mean()
  daily['ma_sentiment_title'] = sentiment_window.mean()

  return daily
  

In [236]:
def plot_news_count(df_vol,ma_days_volume=20,ma_days_sentiment=5,df_news_count=None):
  """
  Plot the daily news count, its moving average and the moving average
  title sentiment on a chart with two y axes.

  Params:
    df_vol           : dataframe of stories; only used to compute the daily
                       counts when df_news_count is None
    ma_days_volume   : news count moving average window (also used in the
                       legend label)
    ma_days_sentiment: title sentiment moving average window (also used in
                       the legend label)
    df_news_count    : precomputed daily counts; a copy is plotted when given
  """
  if df_news_count is not None:
    daily = df_news_count.copy()
  else:
    daily = calc_news_count(df_vol,ma_days_volume,ma_days_sentiment)

  # filled area of the raw daily volume
  volume_trace = go.Scatter(
      name = 'Daily News Volume',
      x = daily.index,
      y = daily.news_count,
      fill = 'tozeroy',
      opacity=0.5,
      marker_color = '#a8b8d0',
      marker_line_color = '#a8b8d0'
  )

  # smoothed daily volume
  volume_ma_trace = go.Scatter(
      name = f'{ma_days_volume} Day Moving Average News Volume',
      x = daily.index,
      y = daily.ma_news_count,
      line_color = 'gold'
  )

  # smoothed sentiment of the news titles
  sentiment_ma_trace = go.Scatter(
      name = f'{ma_days_sentiment} Day Moving Average Sentiment (article titles)',
      x = daily.index,
      y = daily.ma_sentiment_title,
      line_color = 'orange'
  )

  # counts go on the primary axis, sentiment on the secondary axis
  fig = make_subplots(specs=[[{"secondary_y": True}]])
  fig.add_trace(volume_trace, secondary_y = False)
  fig.add_trace(volume_ma_trace, secondary_y = False)
  fig.add_trace(sentiment_ma_trace, secondary_y = True)

  legend_position = dict(yanchor="top", y=1, xanchor="left", x=0.01)
  fig.update_layout(title = 'News Count & Sentiment (multiple sources)',
                    template = 'plotly_white',
                    width = 700,
                    height = 500,
                    legend = legend_position)
  fig.update_yaxes(title_text="News Count", secondary_y=False)
  # the sentiment axis is pinned to -1..1
  fig.update_yaxes(title_text="News Sentiment (article titles)", secondary_y=True,range=[-1,1])
  fig.show()

In [238]:
ticker = 'BTC'

# Step 1: add term_count / sentiment columns to the forbes stories (in place)
calc_sentiment(df_news)

# Step 2: compare the full-text and title sentiment distributions
plot_distribution(df=df_news, features=['sentiment_text','sentiment_title'])

# Step 3: daily sentiment weighted by term count, with 10/20/40 moving averages
df_weighted = calc_weighted_sentiment(df_news, ma_days=[10,20,40])

# Step 4: chart the daily sentiment with a single 20 day moving average
plot_sentiment(df_news, ma_days=[20])

# Step 5: daily story counts and mean title sentiment
df_news_count = calc_news_count(df_news_vol, ma_days_volume=20, ma_days_sentiment=5)

# Step 6: chart the counts, reusing the frame computed in step 5
plot_news_count(df_news_vol, ma_days_volume=20, ma_days_sentiment=5, df_news_count=df_news_count)

# Step 7: persist both frames for feature consolidation
df_weighted.to_pickle(
    f'/content/drive/MyDrive/Colab Notebooks/capstone2/data/{ticker}_sentiment.pickle'
)
df_news_count.to_pickle(
    f'/content/drive/MyDrive/Colab Notebooks/capstone2/data/{ticker}_news_counts.pickle'
)