In [1]:
import requests
import json
import os
import re
import datetime
import time
import string
import pandas as pd
from bs4 import BeautifulSoup as bs
from joblib import Parallel, delayed

# Google Scraping

In [21]:
def get_useragent():
    return random.choice(_useragent_list)

_useragent_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
]

request_timeout = 10
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Cache-Control': 'private, max-age=0',
}
html_pattern = "<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>"
TODAY = datetime.date.today()

#search variables
from_year = 2023
to_year = 2024
total_news_per_year = 1000

In [22]:
"""googlesearch is a Python library for searching Google, easily."""
from time import sleep
from bs4 import BeautifulSoup
from requests import get
import urllib
import random

def get_headers(): 
    headers = {
        'User-Agent': get_useragent(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Cache-Control': 'private, max-age=0',
    }

    return headers

def _req(term, results, lang, start, proxies, timeout, from_year, to_year):
    resp = get(
        url="https://www.google.com/search",
        headers={
            "User-Agent": get_useragent()
        },
        params={
            "q": term,
            "num": results,  # Prevents multiple requests
            "hl": lang,
            "start": start,
            "lr": "lang_en",
            "tbm": 'nws',
            "tbs": "qdr:d"
            #"tbs": f"sbd:1,lr:lang_en,cdr:1,cd_min:1/1/{from_year},cd_max:{to_year}"
        },
        proxies=proxies,
        timeout=timeout,
    )
    resp.raise_for_status()
    return resp

def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, from_year=2020, to_year=2020):
    """Search the Google search engine"""

    escaped_term = urllib.parse.quote_plus(term) # make 'site:xxx.xxx.xxx ' works.

    # Proxy
    proxies = None
    if proxy:
        if proxy.startswith("https"):
            proxies = {"https": proxy}
        else:
            proxies = {"http": proxy}

    # Fetch
    start = 0
    res = []
    while start < num_results:
        # Send request
        resp = _req(escaped_term, num_results - start,
                    lang, start, proxies, timeout, from_year, to_year)
        # Parse
        soup = BeautifulSoup(resp.text, "html.parser")
        result_block = soup.find_all("div", attrs={"class": "SoaBEf"})

        if len(result_block)==0:
            break

        for result in result_block:
            # Find url, title, description
            url = result.find("a", href=True)
            title = result.find("div", attrs={"class": "n0jPhd ynAwRc MBeuO nDgy9d"})
            date = result.find("div", attrs={"class": "OSrXXb rbYSKb LfVVr"})
            obj = {}
            obj["title"] = None
            obj["date"] = None
            obj["url"] = None
            start += 1

            if title:
                obj["title"] = title.text
            if date:
                obj["date"] = date.text
            if url:
                obj["url"] = url["href"]
                res.append(obj)
                        
        sleep(sleep_interval)
    return res

In [23]:
def get_google_news_urls(q, total, from_year, to_year, sleep_interval=90):
    urls = []
    urls = search(q, num_results=total, from_year=from_year, to_year=to_year, sleep_interval=sleep_interval)
    return urls

In [24]:
def save_news_urls(q, total, from_year, to_year):
    dir_name = str(TODAY)
    filename = f'{q}.json'
    
    if dir_name not in os.listdir():
        cwd = os.getcwd()
        os.mkdir(f'{cwd}/{dir_name}')
        
    if filename in os.listdir(f'./{dir_name}'):
        print(f'{q}.json exists.')
        return None
    
    urls = {}
    for year in range(from_year, to_year + 1):
        urls[str(year)] = get_google_news_urls(q, total, year, year)
        print(f'Year {year} News urls collection - Done')
        #delay request to prevent 429 Too many Request
    print()
    
    f = open(f'./{TODAY}/{filename}', 'w')
    f.write(json.dumps(urls, indent = 4))
    f.close()
    return urls

In [25]:
def get_news_data(url):
    text = ""
    try:
        article = requests.get(
            url,
            headers=headers,
            timeout=request_timeout
        )
        status_code = article.status_code
        if status_code >= 400:
            f = open(f'./{TODAY}/ERROR_LOG.txt', 'a')
            f.write(f'[{time.asctime(time.localtime())}] Code {status_code}: {url}\n')
            f.close()
            return None
        soup = bs(article.content, "html.parser")
        article_body = soup.find("div", {"class": "caas-body"})
        paragraphs = article_body.find_all("p")
        sleep(1)
        if paragraphs is not None:
            for p in paragraphs:
                text += re.sub(html_pattern, '', p.text).strip() + ' '  
        return text
    except requests.exceptions.RequestException as e:
        msg = e

In [26]:
def save_dataset(q, urls, year):
    total_count = 0
    for url in urls:
        try:
            data = url
            data['text'] = get_news_data(data['url'])
            if data is None:
                continue
            if q not in os.listdir(f'./{TODAY}'):
                cwd = os.getcwd()
                os.mkdir(f'{cwd}/{TODAY}/{q}')
            total_count += 1
            file_error_symbols = []
            filename = f'{year}-{total_count}'
            f = open(f'./{TODAY}/{q}/{filename}.json', "w")
            f.write(json.dumps(data, indent = 4))
            f.close()
        except requests.exceptions.RequestException as e:
            print(e)
    print(f'Saved {total_count} news data in year {year}')
    #print(os.system("npx prettier -w ./dataset/*.json"))

In [27]:
def get_urls_from_file(filename):
    f = open(filename, "r")
    data = json.load(f)
    f.close()
    return data

In [28]:
def get_keywords_from_file(filename):
    keywords = []
    f = open(filename, "r")
    for keyword in f:
        keywords.append(keyword)
    f.close()
    return list(map(lambda keyword: keyword.strip(), keywords))

In [29]:
def parallel_save_dataset(q, urls):
    start = time.time()
    Parallel(n_jobs = -1)(delayed(save_dataset)(q, urls[year], year) for year in urls)
    end = time.time()
    print('{:.2f} seconds used'.format(end - start))

In [30]:
#run all functions to save news data
def scrap_news_data(total, from_year, to_year, keywords_filename="keywords.txt"):
    #count used time
    start = time.time()
    #step 1 - load keywords file
    keywords = get_keywords_from_file(keywords_filename)
    for query in keywords:
        print(f'Scrapping {query} ...')
        #step 2 - save urls to json file
        save_news_urls(query, total, from_year, to_year)
        #step 3 - get urls object from previous saved json file
        urls = get_urls_from_file(f'./{TODAY}/{query}.json')
        #step 4 - web scrapping news data from specific year of urls & save into directory seperately
        parallel_save_dataset(query, urls)
        
    end = time.time()
    print('[DONE] {:.2f} seconds used'.format(end - start))

In [31]:
def json_to_csv(q, json_obj, index):
    text = None
    title = None
    date = None
    url = json_obj['url']
    if 'date' in json_obj.keys():
        date = json_obj['date']
    if json_obj['text'] is not None:
        text = re.sub(r'[,.\'"|]', '', json_obj['text'])
    if 'title' in json_obj.keys() and json_obj['title'] != None:
        title = re.sub(r'[,.\'"|]', '', json_obj['title'])
    if text != '' and text != None:
        text = ' '.join(text.encode('utf-8').decode().split())
    if title != '' and title != None:
        title = ' '.join(title.split()).strip() 
    f = open(f'./{TODAY}/{q}.csv', 'a', encoding='utf-8')
    f.write(f'{index},\"{title}\",\"{date}\",\"{url}\",\"{text}\"\n')
    f.close()

In [32]:
def all_json_to_csv():
    cols = "title,date,url,text\n"
    keywords = get_keywords_from_file('keywords.txt')
    for keyword in keywords:
        f = open(f'./{TODAY}/{keyword}.csv', 'w')
        f.write(cols)
        f.close()
        for index, filename in enumerate(os.listdir(f'./{TODAY}/{keyword}')):
            json_file = open(f'./{TODAY}/{keyword}/{filename}', 'r')
            json_obj = json.load(json_file)
            json_file.close()
            json_to_csv(keyword, json_obj, index)
    print('Done')

In [33]:
def combine_csv():
    keywords = get_keywords_from_file('keywords.txt')
    dataframes = []
    for keyword in keywords:
        df = pd.read_csv(f'./{TODAY}/{keyword}.csv', on_bad_lines='skip')
        dataframes.append(df)
    dataframe = pd.concat(dataframes, ignore_index=True)
    # drop duplicate url
    dataframe = dataframe.drop_duplicates(subset=['url'])
    dataframe.to_csv(f'./{TODAY}/dataset.csv')
    print('Done')

In [34]:
scrap_news_data(total_news_per_year, from_year, to_year)

Scrapping AAPL ...
Year 2023 News urls collection - Done
Year 2024 News urls collection - Done



AttributeError: 'NoneType' object has no attribute 'find_all'

# Yahoo Finance

In [16]:
requests.get("https://finance.yahoo.com/quote/NVDA/news?p=NVDA")

<Response [404]>

In [89]:
import yfinance as yf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from datetime import datetime
from transformers import pipeline

nvda = yf.Ticker("NVDA")

In [25]:
news = nvda.news

In [33]:
for i in news:
    title = i['title']
    date = datetime.fromtimestamp(i['providerPublishTime'])
    print(date, title)

2024-01-09 19:00:00 Best AI Stock 2024: Nvidia Stock vs. IBM Stock
2024-01-09 18:53:00 These Stocks Are Moving the Most Today: Boeing, Juniper Networks, HPE, Nvidia, Unity Software, Match, and More
2024-01-09 18:30:00 Nvidia Stock Is Over $500 -- Here Is What Investors Should Know About Recent Updates
2024-01-09 18:05:00 Better Artificial Intelligence (AI) Stock: Nvidia vs. Alphabet
2024-01-09 16:24:00 Heard on the Street Recap: Maxing Out
2024-01-09 10:40:44 Dow Jones Futures Fall: Magnificent Seven Stocks Rally; Nvidia Breaks Out Past Buy Point
2024-01-09 09:55:50 CES 2024: How to watch live as Sony, Samsung and more reveal hardware, AI updates
2024-01-09 07:57:00 Markets Party Like It's 2023!


In [55]:
article = get_news_data(url=news[6]['link'])

In [58]:
stop_words = set(stopwords.words("english"))
special_characters = "!@#$%^&*()-+?_=,<>\""
lemmatizer = WordNetLemmatizer()

In [63]:
tokenized_words = word_tokenize(article.lower())
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokenized_words]

In [65]:
preprocessed_words = [word for word in lemmatized_words if word not in stop_words]

In [70]:
preprocessed_words = [word for word in lemmatized_words if word not in special_characters]

In [82]:
out = " ".join(preprocessed_words)

In [113]:
model_id = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

sentiment_pipe = pipeline("sentiment-analysis", model=model_id)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [115]:
sentiment_pipe(out)

Token indices sequence length is longer than the specified maximum sequence length for this model (2285 > 512). Running this sequence through the model will result in indexing errors


InvalidArgumentError: Exception encountered when calling layer "embeddings" "                 f"(type TFRobertaEmbeddings).

{{function_node __wrapped__ResourceGather_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[0,512] = 514 is not in [0, 514) [Op:ResourceGather]

Call arguments received by layer "embeddings" "                 f"(type TFRobertaEmbeddings):
  • input_ids=tf.Tensor(shape=(1, 2285), dtype=int32)
  • position_ids=None
  • token_type_ids=tf.Tensor(shape=(1, 2285), dtype=int32)
  • inputs_embeds=None
  • past_key_values_length=0
  • training=False

In [118]:
len(out.split())

1426

In [119]:
out.split()

['ce',
 '2024',
 'is',
 'finally',
 'upon',
 'u',
 'taking',
 'over',
 'la',
 'vega',
 'with',
 'throng',
 'of',
 'crowd',
 'booth',
 'full',
 'of',
 'product',
 'and',
 'a',
 'lot',
 'of',
 'company',
 'making',
 'claim',
 'about',
 'how',
 'ai',
 'is',
 'improving',
 'their',
 'offering',
 '.',
 'a',
 'noted',
 'in',
 'our',
 'ce',
 'preview',
 'though',
 'the',
 'conference',
 'ha',
 'had',
 'it',
 'ups',
 'and',
 'down',
 'of',
 'late',
 'it',
 '’',
 's',
 'increasingly',
 'become',
 'an',
 'opportunity',
 'for',
 'startup',
 'to',
 'capture',
 'attention',
 'while',
 'all',
 'eye',
 'are',
 'drawn',
 'to',
 'the',
 'bigger',
 'budget',
 'announcement',
 'from',
 'the',
 'like',
 'of',
 'samsung',
 'sony',
 'and',
 'nvidia',
 '.',
 'techcrunch',
 'is',
 'on',
 'the',
 'ground',
 'at',
 'ce',
 '2024',
 'throughout',
 'the',
 'event',
 'next',
 'week',
 'with',
 'a',
 'particular',
 'focus',
 'on',
 'those',
 'startup',
 'that',
 'might',
 'be',
 'headlining',
 'a',
 'big',
 'livestr