## Import Libraries


In [None]:
# Install the Google News and Google Trends client packages that are imported in the next cell.
!pip install pygooglenews
!pip install pytrends

In [64]:
import pandas as pd
import numpy as np
import requests
import time
import pickle
import re
from datetime import date, timedelta,datetime
from tabulate import tabulate
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from google.colab import files
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from google.colab import drive
drive.mount('/content/drive')

from pygooglenews import GoogleNews
from pytrends.request import TrendReq
from bs4 import BeautifulSoup as bs
import requests as req


# Alpha Vantage API key: read from a text file on the mounted Drive instead of
# being hard-coded in the notebook; strip() removes surrounding whitespace.
with open('/content/drive/MyDrive/Colab Notebooks/capstone2/data/av_key.txt') as f:
    key = f.read().strip()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Structured Features

### Fundamentals

In [None]:
# Ticker symbol interpolated into the Alpha Vantage request URLs in the cells below
stock = 'VMW'

In [None]:
# current ratios
# Request the Alpha Vantage OVERVIEW function for `stock` and parse the JSON body.
url = f'https://www.alphavantage.co/query?function=OVERVIEW&symbol={stock}&apikey={key}'
r = requests.get(url)
data = r.json()

# Last expression of the cell: displays the whole parsed response
data

In [None]:
# Request the Alpha Vantage EARNINGS function for `stock` and parse the JSON body.
url = f'https://www.alphavantage.co/query?function=EARNINGS&symbol={stock}&apikey={key}'
r = requests.get(url)
data = r.json()

# Earnings Per Share
# One tab-separated row per entry of the 'quarterlyEarnings' list.
print('date\t Reported EPS\t Estimated EPS\t Surprise %')
for earnings in data['quarterlyEarnings']:
  print(f"{earnings['reportedDate']}\t {earnings['reportedEPS']}\t {earnings['estimatedEPS']}\t\t {earnings['surprisePercentage']}")

  

In [None]:
# income
# Request the Alpha Vantage INCOME_STATEMENT function for the ticker and print
# one tab-separated row per entry of the 'quarterlyReports' list.
stock= 'VMW'
url = f'https://www.alphavantage.co/query?function=INCOME_STATEMENT&symbol={stock}&apikey={key}'
r = requests.get(url)
data = r.json()

# The first value column is 'comprehensiveIncomeNetOfTax', so the header is
# labelled accordingly (it previously read "Net Income", a different line item).
print('date\t\t Comprehensive Income\t Gross Profit')
for income in data['quarterlyReports']:
  print(f"{income['fiscalDateEnding']}\t {income['comprehensiveIncomeNetOfTax']} \t{income['grossProfit']}")



In [None]:
# cashflow
# Request the Alpha Vantage CASH_FLOW function for `stock` and parse the JSON body.
url = f'https://www.alphavantage.co/query?function=CASH_FLOW&symbol={stock}&apikey={key}'
r = requests.get(url)
data = r.json()

# One tab-separated row per entry of the 'quarterlyReports' list.
print('date\t\t ProfitLoss\t Cashflow')
for cf in data['quarterlyReports']:
  print(f"{cf['fiscalDateEnding']}\t {cf['profitLoss']}\t {cf['operatingCashflow']}")

### Economic

In [None]:
# FX Rates
# Request the Alpha Vantage FX_DAILY series for the currency pair and print
# the closing rate for every date in the response.
fx_from = 'CAD'
fx_to = 'USD'
size = 'compact'

url = f'https://www.alphavantage.co/query?function=FX_DAILY&from_symbol={fx_from}&to_symbol={fx_to}&apikey={key}&outputsize={size}'
r = requests.get(url)
data = r.json()

print('date\t\t rate')
# The loop variable must not be called `date`: at cell level that would rebind
# the `date` class imported from datetime and break later code that calls it.
for fx_date,rate in data['Time Series FX (Daily)'].items():
  print(f"{fx_date}\t {rate['4. close']}")

In [None]:
# GDP
# Request the Alpha Vantage REAL_GDP function at the chosen interval and parse the JSON body.
interval = 'quarterly'
url = f'https://www.alphavantage.co/query?function=REAL_GDP&interval={interval}&apikey={key}'
r = requests.get(url)
data = r.json()

# One tab-separated row per entry of the 'data' list.
print('date\t\t GDP')
for gdp in data['data']:
  print(f"{gdp['date']}\t {gdp['value']}")

### Crypto-Bitcoin

#### Fear and Greed Index for Bitcoin

In [None]:
def get_fear_greed_index(index_date = None):
  """
  Download the Fear and Greed Index from api.alternative.me as a DataFrame.

  index_date: optional date (or partial date string such as '2021-10') to
              keep. When None, everything available is requested (limit=0).
              Otherwise only the three latest data points are requested, so
              an older index_date produces an empty frame.

  Returns a DataFrame indexed by 'date' with the columns 'value' and
  'value_classification' as delivered by the API. When the HTTP status is
  not 200 an error line is printed and '' is returned (kept for existing
  callers).
  """
  if index_date is None:
    # get all data available
    url ="https://api.alternative.me/fng/?limit=0&date_format=cn"
  else:
    # get the latest three data-points
    url = "https://api.alternative.me/fng/?limit=3&date_format=cn"

  # make request
  r = requests.get(url)

  # check the status of the request
  if r.status_code !=200:
    print("\t=> Greed-Fear Index API Error")
    return ''

  # convert data to a dataframe
  data = r.json()
  df = pd.DataFrame(data['data'])
  df['date'] = pd.to_datetime(df['timestamp'])
  df.set_index('date',inplace=True)
  df = df[['value','value_classification']]

  if index_date is not None:
    # filter for the index date
    # df[index_date] would be treated as a column lookup and raise KeyError;
    # a label slice on the sorted index selects rows and always yields a
    # DataFrame (empty when the date is not among the downloaded points).
    df = df.sort_index().loc[index_date:index_date]

  return df

#### Bitcoin Futures
+ https://www.cnbc.com/quotes/@BTC.1

#### Google Trends

In [None]:
def get_google_trends(search_terms_list,timeframe ='today 5-y' ):
  """
  Fetch Google Trends interest-over-time for the given search terms.

  search_terms_list: list of search terms passed to pytrends.build_payload
  timeframe        : 'today 5-y', '2016-12-14 2017-01-25'

  Returns a DataFrame without the 'isPartial' flag column:
    - one search term  -> a single column named 'value'
    - several terms    -> one column per term, named as returned by pytrends
    - no data returned -> an empty DataFrame with a 'value' column

  Reference: https://pypi.org/project/pytrends/
  """
  # connect to google trends
  pytrends = TrendReq(hl='en-US', tz=360)

  # search for the terms
  pytrends.build_payload(search_terms_list,timeframe=timeframe)

  # get data over time
  df = pytrends.interest_over_time()

  # nothing came back: renaming columns on an empty frame would raise
  if df.empty:
    return pd.DataFrame(columns=['value'])

  # drop the partial-period flag, keep only the interest columns
  df = df.drop(columns=['isPartial'], errors='ignore')

  # keep the historical single-term contract: the column is called 'value'
  if df.shape[1] == 1:
    df.columns = ['value']

  return df

## Sentiment

### References
+ https://github.com/kotartemiy/pygooglenews#documentation
+ https://pypi.org/project/pytrends/

In [65]:
def scrape_forbes_page(url, timeout=30):
  """
  Download a page and return the concatenated text of all its <p> elements.

  url    : address of the article to scrape
  timeout: seconds to wait for the server before giving up

  Best effort: a failed request, a non-200 status or a parsing problem is
  reported with a printed message and whatever text was collected so far
  (possibly '') is returned, so one bad link does not abort a scraping loop.
  """
  # make request; network failures are reported instead of propagating
  try:
    res = req.get(url, timeout=timeout)
  except req.RequestException as err:
    print(f'\t=>Request Error:{url} ({err})')
    return ""

  # print the response error
  if res.status_code != 200:
    print(f'\t=>Response Error:{res.status_code}')
    return ""

  paragraphs = []
  # try to extract the text data
  try:
    # name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed
    soup = bs(res.content, 'html.parser')

    # extract the text from each paragraph of the page
    for p in soup.find_all('p'):
      paragraphs.append(p.get_text())

  # Exception (not BaseException) so Ctrl-C still interrupts a long scrape
  except Exception as err:
    print(f'\t=>Parsing Error:{url}')

  return "".join(paragraphs)


In [66]:
def get_date_ranges(from_,to_,interval='mth'):
  """
  Split the period from_..to_ into a list of {'start','end'} dicts whose
  values are 'YYYY-MM-DD' strings.

  from_   : first date of the period (anything pd.date_range accepts)
  to_     : last date of the period
  interval: day,week,mth,qtr
            day  - one range per calendar day; end is the following day
            week - one range per weekly anchor date produced by pandas' 'W'
                   frequency inside the period; end is seven days later
            mth  - one range per month-end inside the period, running from
                   the first to the last day of that month
            qtr  - one range per calendar quarter-end inside the period,
                   running from the first to the last day of that quarter

  Raises ValueError for any other interval.
  """
  date_ranges =[]

  # dt.date() is used throughout instead of the imported `date` class so the
  # function keeps working even if a notebook cell rebinds the name `date`.
  if interval=='day':
    for dt in pd.date_range(from_,to_,freq='D'):
      start = dt.date()
      end = start + timedelta(days=1)
      date_ranges.append({'start':str(start),'end':str(end)})

  elif interval=='week':
    for dt in pd.date_range(from_,to_,freq='W'):
      start = dt.date()
      # add the week to the plain date, not the Timestamp, so the string has
      # no ' 00:00:00' suffix and matches the other intervals
      end = start + timedelta(weeks=1)
      date_ranges.append({'start':str(start),'end':str(end)})

  elif interval=='mth':
    # offset objects behave the same on every pandas version, unlike the
    # 'M'/'Q' string aliases that newer releases deprecate
    for dt in pd.date_range(from_,to_,freq=pd.offsets.MonthEnd()):
      end = dt.date()
      start = end.replace(day=1)
      date_ranges.append({'start':str(start),'end':str(end)})

  elif interval =='qtr':
    for dt in pd.date_range(from_,to_,freq=pd.offsets.QuarterEnd(startingMonth=12)):
      end = dt.date()
      # a quarter-end month is always >= 3, so month-2 stays within the year
      start = end.replace(month=end.month-2,day=1)
      date_ranges.append({'start':str(start),'end':str(end)})

  else:
    raise ValueError(f"interval must be one of day, week, mth, qtr (got {interval!r})")

  return date_ranges

In [None]:
# Probe how many results a one-day search returns for each country edition
# and show the first and last headline of each result list.
country_list = ['us','uk','jp','in','cn','ru','ng','fr','de','hk','il','it','pk','ph']

# NOTE(review): `q` used to come from a cell further down, so this cell failed
# on a fresh kernel. The title query of the news-volume cell is assumed here —
# confirm this is the query the probe was meant for.
q = "intitle:bitcoin"

for country in country_list:
  gn = GoogleNews(lang='en',country=country)
  tmp = gn.search(q,from_ = '2021-08-24', to_='2021-08-25')
  entries = tmp['entries']

  print(country,":",len(entries))
  # a country may return fewer than 100 stories (or none), so index the
  # actual first/last entry instead of the fixed positions 0 and 99
  if entries:
    print(entries[0]['title'])
    print(entries[-1]['title'],'\n')
  else:
    print('\n')



In [67]:
def get_goggle_news(q,from_,to_,interval='mth',get_article = False,news=None,throttle=3):
  """
  Collect Google News search results for a query, one date window at a time.

  q          : query string passed to the Google News search
  from_      : start date of the overall period
  to_        : end date of the overall period
  interval   : size of the windows the period is split into: day,week,mth,qtr
  get_article: boolean. When true, each story link is also scraped for its
               text; otherwise the 'text' field is ''
  news       : list of stories to append to. When None a new list is created
  throttle   : number of seconds to sleep before each article scrape

  Returns the list of story dicts with the keys 'date','title','link','text'.
  """
  # split the period into the windows that are searched separately
  windows = get_date_ranges(from_,to_,interval)

  # continue the caller's list when given, otherwise start an empty one
  stories = [] if news is None else news

  # english-language client without a country setting
  client = GoogleNews(lang='en')

  for window in windows:
    print(f"Scaping News From:{window['start']} To:{window['end']}")
    print("="*60)

    # search restricted to the current window
    results = client.search(q, from_=window['start'],to_=window['end'])

    for position,entry in enumerate(results['entries'], start=1):
      print(f"{position})  {entry.title}")

      # optionally fetch the article body, pausing first to throttle requests
      text = ''
      if get_article:
        time.sleep(throttle)
        text = scrape_forbes_page(entry.link)

      # keep only the calendar date of the parsed publication time
      published = entry.published_parsed
      stories.append(dict(
          date=datetime(published.tm_year,published.tm_mon,published.tm_mday),
          title=entry.title,
          link=entry.link,
          text=text,
      ))
    print("\n")
  return stories

In [123]:
def get_goggle_news_by_country(q,from_,to_,country_list,interval='mth',get_article = False,news=None,throttle=3):
  """
  Collect Google News search results for a query, repeated for every country
  in country_list and for every date window of the period.

  q           : query string passed to the Google News search
  from_       : start date of the overall period
  to_         : end date of the overall period
  country_list: country codes; one GoogleNews client is created per code
  interval    : size of the windows the period is split into: day,week,mth,qtr
  get_article : boolean. When true, each story link is also scraped for its
                text; otherwise the 'text' field is ''
  news        : list of stories to append to. When None a new list is created
  throttle    : number of seconds to sleep before each article scrape

  Returns the list of story dicts with the keys 'date','title','link','text'.
  """
  # split the period into the windows that are searched separately
  windows = get_date_ranges(from_,to_,interval)

  # continue the caller's list when given, otherwise start an empty one
  stories = news if news is not None else []

  for country in country_list:
    print(f'News from Country:{country}')
    print('='*60)

    # english-language client bound to the current country
    client = GoogleNews(lang='en',country=country)

    for window in windows:
      print(f"Scaping News From:{window['start']} To:{window['end']}")
      print("="*60)

      # search restricted to the current window
      results = client.search(q, from_=window['start'],to_=window['end'])

      for position,entry in enumerate(results['entries'], start=1):
        print(f"{position})  {entry.title}")

        # optionally fetch the article body, pausing first to throttle requests
        text = ''
        if get_article:
          time.sleep(throttle)
          text = scrape_forbes_page(entry.link)

        # keep only the calendar date of the parsed publication time
        published = entry.published_parsed
        stories.append(dict(
            date=datetime(published.tm_year,published.tm_mon,published.tm_mday),
            title=entry.title,
            link=entry.link,
            text=text,
        ))
      print("\n")
  return stories

In [62]:
# Query
stock = 'bitcoin'
ticker = 'BTC'
site = 'www.forbes.com'

# forbes query for full text extraction
# bitcoin must be in the article
q = f"allintext:{stock} site:{site}"
gn = GoogleNews(lang='en')

# scrape google news RSS for stories
# Stories collected by an earlier run of this cell are extended when `news`
# already exists; on a fresh kernel the name is undefined, so None is passed
# and get_goggle_news starts a new list instead of raising NameError.
news = get_goggle_news(q,'2021-01-01','2021-10-31','week',True,globals().get('news'),3)


# save to drive as dataframe
# remove duplicates
df_news = pd.DataFrame(news)
df_news.drop_duplicates(subset=['link'],inplace=True)
df_news.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/capstone2/data/{ticker}_news.pickle')

# count before de-duplication (the saved frame may hold fewer rows)
print("News:",len(news))


Scaping News From:2021-01-03 To:2021-01-10 00:00:00
1)  Bitcoin Continues Its Climb, Soars Above $40,000 For The First Time - Forbes
2)  Bitcoin Jumps To $34,000, But Here’s Why Warren Buffett Will Never Own It - Forbes
3)  As Bitcoin Smashes Through $40,000, Data Reveals What’s Behind The Huge 2021 Bitcoin Price Boom - Forbes
4)  Happy Birthday, Bitcoin! 2021 Is Your Year - Forbes
5)  Why Bitcoin Believers Could Be Proved Right In 2021 As The Bitcoin Price Continues To Climb - Forbes
6)  Bitcoin Surges Past $40,000 | Crypto Exceeds $1 Trillion Market Cap - Forbes
7)  Bitcoin Trades Near All-Time High After Latest Gains - Forbes
8)  Bitcoin Breaks $40k, Here’s What’s Different To 2017 - Forbes
9)  What The Lindy Effect Teaches Us About Bitcoin’s Rally - Forbes
10)  Elon Musk, The World’s Richest Person, Wants To Be Paid In Bitcoin - Forbes
11)  Crypto Surges To $1 Trillion As Bitcoin, Ethereum, Ripple’s XRP, Cardano And Stellar Price Soars - Forbes
12)  Bitcoin Rallies Nearly 15% After

In [126]:
# Query: Get Total News Articles (titles and links only)
# exceeds 100 stories per day starting Aug 24 2021
stock = 'bitcoin'
ticker= 'BTC'

# for news count query
# bitcoin must be in the title of the article
q =f"intitle:{stock}"
gn = GoogleNews(lang='en')

# scrape google news RSS for stories
# news_volume = get_goggle_news(q,'2016-01-01','2016-06-30','day',False)
# news_volume = get_goggle_news(q,'2021-09-30','2021-10-31','day',False,news_volume)

# get news volume by world + individual country (when there are >=100 results per day)
country_list = ['us','uk','jp','in','cn','ru','ng','fr','de','hk','il','it','pk','ph']
# Extend `news_volume` from earlier runs when it exists; on a fresh kernel the
# name is undefined, so None is passed and a new list is started. The return
# value is assigned because a newly created list is only reachable through it.
news_volume = get_goggle_news_by_country(q,'2021-08-24','2021-10-31',country_list,'day',False,globals().get('news_volume'))

# count before de-duplication (the saved frame may hold fewer rows)
print("News:",len(news_volume))

# save to drive as dataframe
# remove duplicates
df_news_volume = pd.DataFrame(news_volume)
df_news_volume.drop_duplicates(subset=['link'],inplace=True)
df_news_volume.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/capstone2/data/{ticker}_news_volume.pickle')




[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Scaping News From:2021-10-31 To:2021-11-01


News from Country:ph
Scaping News From:2021-08-24 To:2021-08-25
1)  Bitcoin 'whales' jump back into market during cryptocurrency's rebound to $50,000 - CNBC
2)  Bitcoin’s Biggest Corporate Investor Buys Another $177 Million—Boosting Holdings To $5.4 Billion Amid Massive Crypto Comeback - Forbes
3)  Cryptocurrency prices today: Bitcoin, Ether dip after heavy selloff - India Today
4)  Bitcoin price holds near $50,000 as hash rate improves - Yahoo Philippines News
5)  Cryptocurrency prices today fall. Check latest rates of Bitcoin, ether, dogecoin, other cryptos - Mint
6)  Bitcoin's revival shows cryptocurrency 'is not a fad' - Yahoo Finance
7)  As Bitcoin soars to near $50,000, Elon Musk’s profit jumps by 250% - Fortune
8)  Bitcoin mining 'golden age' shows higher profits and demand for more infrastructure - Yahoo Finance
9)  From bitcoin dreamer to fugitive, fleeing the Taliban