<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Capstone Project:  Sentiment Analysis on Financial News

# Libraries

In [1]:
import time
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import transformers
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Scraping

##### Function to scrape the latest 100 headlines per ticker from Finviz.

In [2]:
# Column layout of the frame returned by get_finviz_headlines.
_FINVIZ_COLUMNS = ('headlines', 'links', 'date', 'time', 'ticker')


def _parse_finviz_rows(raw_rows, ticker):
    """Convert raw ``(timestamp_text, headline_text, link)`` tuples into a tidy DataFrame.

    The timestamp text is either ``"<date> <time>"`` or just ``"<time>"``; a
    time-only entry inherits the date of the closest preceding dated entry.
    """
    records = []
    current_date = None  # date carried forward to the time-only rows that follow
    for stamp_text, headline_text, link in raw_rows:
        tokens = stamp_text.split()
        if not tokens:
            # Row carries no timestamp at all; nothing usable to record.
            continue
        if len(tokens) >= 2:
            date_token = tokens[0]
            # Defensive: a literal "Today" label (assumed possible on the site,
            # TODO confirm) cannot be parsed with the date format below.
            if date_token.lower() == 'today':
                current_date = pd.Timestamp.today().normalize()
            else:
                current_date = pd.to_datetime(date_token, format='%b-%d-%y')
        records.append({
            # Keep only the first line of the cell text (drops the source name), lower-cased.
            'headlines': headline_text.split('\n')[0].lower(),
            'links': link,
            'date': current_date,
            'time': pd.to_datetime(tokens[-1], format='%I:%M%p').time(),
            'ticker': ticker,
        })

    parsed = pd.DataFrame(records, columns=list(_FINVIZ_COLUMNS))
    # Leading rows without any date end up as NaT instead of raising.
    parsed['date'] = pd.to_datetime(parsed['date'])
    return parsed


def get_finviz_headlines(ticker, start_date=None, end_date=None,
                         driver_path=r"C:\ChromeDriver\chromedriver.exe"):
    """Scrape the news table shown on Finviz for each requested ticker.

    Parameters
    ----------
    ticker : iterable of str, or str
        Ticker symbols to look up. A single string is treated as one ticker
        (not as a sequence of characters).
    start_date, end_date : optional
        Inclusive bounds applied to the ``date`` column; anything accepted by
        ``pd.to_datetime`` works. ``None`` disables the bound.
    driver_path : str, optional
        Location of the ChromeDriver executable.

    Returns
    -------
    pandas.DataFrame
        Columns ``headlines``, ``links``, ``date``, ``time``, ``ticker``.
        Empty (but with those columns) when nothing could be scraped.

    Notes
    -----
    A timeout while waiting for the page title, or a title that does not
    contain the ticker, is reported with ``print`` and that ticker is skipped.
    Other Selenium errors propagate after the browser has been closed.
    """
    tickers = [ticker] if isinstance(ticker, str) else list(ticker)
    frames = []

    for t in tickers:
        driver = webdriver.Chrome(service=Service(driver_path))
        # Everything that touches the browser lives inside try/finally so the
        # Chrome process is always shut down, even if navigation fails.
        try:
            driver.get('https://finviz.com')

            search = driver.find_element(By.XPATH, "//input[@placeholder='Search ticker, company or profile']")
            search.send_keys(t)
            search.send_keys(Keys.RETURN)

            # Wait up to 10 seconds for the title element to be present
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//title")))

            # The page title is expected to mention the ticker we searched for
            if t not in driver.title:
                print(f"Title does not contain {t}")
                continue

            table = driver.find_element(By.ID, "news-table")
            raw_rows = []
            for row in table.find_elements(By.TAG_NAME, "tr"):
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) < 2:
                    # Not a (timestamp, headline) row; skip instead of raising IndexError.
                    continue
                anchors = cells[1].find_elements(By.TAG_NAME, "a")
                if not anchors:
                    continue
                raw_rows.append((cells[0].text, cells[1].text, anchors[0].get_attribute("href")))

            if raw_rows:
                frames.append(_parse_finviz_rows(raw_rows, t))

        except TimeoutException:
            print("Timed out waiting for title element")

        finally:
            driver.quit()

    # Nothing scraped: return an empty frame that still has the expected
    # columns, so the date filters below never see a frame without 'date'.
    if not frames:
        return pd.DataFrame(columns=list(_FINVIZ_COLUMNS))

    # Concatenate once instead of growing a frame inside the loop
    df_concat = pd.concat(frames, ignore_index=True)

    # Filter the dataframe based on the start and end dates (both inclusive)
    if start_date is not None:
        df_concat = df_concat[df_concat['date'] >= pd.to_datetime(start_date)]
    if end_date is not None:
        df_concat = df_concat[df_concat['date'] <= pd.to_datetime(end_date)]

    return df_concat

##### Function to scrape the latest 100 rows of historical stock data from Yahoo Finance.

In [3]:
def get_stock_info(ticker, driver_path=r"C:\ChromeDriver\chromedriver.exe"):
    """Scrape the "Historical data" table on Yahoo Finance for each requested ticker.

    Parameters
    ----------
    ticker : iterable of str, or str
        Ticker symbols to look up. A single string is treated as one ticker
        (not as a sequence of characters).
    driver_path : str, optional
        Location of the ChromeDriver executable.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (datetime), ``ticker``, ``open``, ``high``, ``low``,
        ``close``, ``adj_close`` (float) and ``volume`` (int). Empty (but with
        those columns) when nothing could be scraped.

    Notes
    -----
    A timeout while waiting for the page is reported with ``print`` and that
    ticker is skipped. Other Selenium errors propagate after the browser has
    been closed.
    """
    columns = ['date', 'ticker', 'open', 'high', 'low', 'close', 'adj_close', 'volume']
    price_columns = ['open', 'high', 'low', 'close', 'adj_close']

    tickers = [ticker] if isinstance(ticker, str) else list(ticker)
    frames = []

    for t in tickers:
        driver = webdriver.Chrome(service=Service(driver_path))
        # Everything that touches the browser lives inside try/finally so the
        # Chrome process is always shut down, even if navigation fails.
        try:
            driver.get('https://sg.finance.yahoo.com/')

            # Search engine
            search = driver.find_element(By.ID, 'yfin-usr-qry')
            search.send_keys(t)
            search.send_keys(Keys.RETURN)

            # Wait (up to 10 seconds each) for the historical-data link, then for the table
            wait = WebDriverWait(driver, 10)
            historical_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//span[text()="Historical data"]')))
            historical_button.click()
            table = wait.until(EC.presence_of_element_located((By.XPATH, '//table[@class="W(100%) M(0)"]')))

            records = []
            for row in table.find_elements(By.TAG_NAME, 'tr'):
                cells = row.find_elements(By.TAG_NAME, 'td')
                # Only rows with exactly seven <td> cells map onto the seven
                # scraped fields; any other row shape is skipped.
                if len(cells) == 7:
                    records.append([cells[0].text, t] + [cell.text for cell in cells[1:]])

            if not records:
                print(f"No historical rows found for {t}")
                continue

            df = pd.DataFrame(records, columns=columns)
            # convert to date format
            df['date'] = pd.to_datetime(df['date'], format='%d %b %Y')
            # convert prices to float; strip thousands separators first so
            # values such as "1,234.56" do not break the conversion
            for col in price_columns:
                df[col] = df[col].str.replace(',', '', regex=False).astype(float)
            # remove commas from the 'volume' column and convert it to integer type
            df['volume'] = df['volume'].str.replace(',', '', regex=False).astype(int)
            frames.append(df)

        except TimeoutException:
            print("Timed out")
        finally:
            driver.quit()

    if not frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once instead of growing a frame inside the loop
    return pd.concat(frames, ignore_index=True)

# Function for Sentiment Prediction (Sentence)

In [4]:
def sentiment_prediction_text(text):
    """Predict the sentiment label of a single piece of text.

    The text is tokenized with the 'yiyanghkust/finbert-tone' tokenizer
    (padded/truncated to 80 tokens), fed as a batch of one to the Keras model
    stored at 'model_checkpoint/finbert1.1.h5', and the arg-max class is
    translated to 'negative' / 'neutral' / 'positive'.

    Both the tokenizer and the model are loaded on every call.

    Returns a NumPy array holding one label per input row (a single label
    here, since exactly one text is encoded). This assumes ``model.predict``
    yields one row of class scores per input -- TODO confirm against the
    saved model.
    """
    label_names = {0: 'negative', 1: 'neutral', 2: 'positive'}

    # Encode the lone sentence as a one-element batch
    encoded = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone').batch_encode_plus(
        [text],
        padding='max_length',
        max_length=80,               # same length as the finbert 1.1 model
        add_special_tokens=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf',
    )

    # Feature names as fed to the saved model; note the plural "attention_masks" key
    features = {
        "input_ids": encoded['input_ids'],
        "attention_masks": encoded['attention_mask'],
        "token_type_ids": encoded['token_type_ids'],
    }
    single_batch = tf.data.Dataset.from_tensor_slices(features).batch(1)

    # Restore the fine-tuned classifier, registering the Hugging Face class it was saved with
    classifier = tf.keras.models.load_model(
        'model_checkpoint/finbert1.1.h5',
        custom_objects={"TFBertForSequenceClassification": transformers.TFBertForSequenceClassification},
    )

    # Highest-scoring class per row, then class index -> sentiment name
    class_scores = classifier.predict(single_batch)
    winning_classes = np.argmax(class_scores, axis=1)
    to_label = np.vectorize(label_names.get)

    return to_label(winning_classes)

# Function for Sentiment Prediction (Dataframe)

In [5]:
def sentiment_prediction_df(df):
    """Add a predicted sentiment label and a numeric score for every headline in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``headlines`` column of strings (not required when the
        frame has no rows).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place, with two added
        columns: ``predicted_sentiment`` ('negative' / 'neutral' / 'positive')
        and ``score`` (-1 / 0 / 1 respectively).

    Notes
    -----
    The tokenizer and the Keras model are loaded on every call. An empty
    frame is returned immediately with the two (empty) columns added, without
    loading either of them.
    """
    class_to_sentiment = {0: 'negative', 1: 'neutral', 2: 'positive'}
    sentiment_to_score = {'negative': -1, 'neutral': 0, 'positive': 1}

    # Nothing to classify (e.g. the scraper returned no rows): skip the
    # tokenizer/model entirely instead of failing on an empty batch.
    if len(df) == 0:
        df['predicted_sentiment'] = pd.Series(index=df.index, dtype='object')
        df['score'] = pd.Series(index=df.index, dtype='int64')
        return df

    # list of text
    headline_list = df['headlines'].tolist()

    # tokenization
    tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
    token = tokenizer.batch_encode_plus(headline_list,
                                        padding='max_length',
                                        max_length=80,              # following best model finbert1.1
                                        add_special_tokens=True,
                                        truncation=True,
                                        return_attention_mask=True,
                                        return_tensors='tf')

    # create dataset from new input data; note the plural "attention_masks" key
    new_dataset = tf.data.Dataset.from_tensor_slices({
        "input_ids": token['input_ids'],
        "attention_masks": token['attention_mask'],
        "token_type_ids": token['token_type_ids'],
    }).batch(32)

    # Load the BERT model with the custom object scope
    model = tf.keras.models.load_model('model_checkpoint/finbert1.1.h5',
                                       custom_objects={"TFBertForSequenceClassification": transformers.TFBertForSequenceClassification})

    # make predictions on the new data
    predictions = model.predict(new_dataset)

    # One row of class scores per headline is assumed (TODO confirm against
    # the saved model); argmax picks the highest-scoring class.
    predicted_classes = np.argmax(predictions, axis=1)

    # Assigned positionally, so the frame's index labels do not matter
    df['predicted_sentiment'] = [class_to_sentiment[int(c)] for c in predicted_classes]

    # numeric sentiment score: negative -> -1, neutral -> 0, positive -> 1
    df['score'] = df['predicted_sentiment'].map(sentiment_to_score)

    # Return the modified DataFrame
    return df

# Example

In [6]:
# Scrape the Finviz headlines for the five FAANG tickers.
faang = get_finviz_headlines(ticker=['AAPL', 'META', 'AMZN', 'NFLX', 'GOOG'])

In [7]:
# Score every scraped headline; the result is bound back to the same name.
faang = sentiment_prediction_df(df=faang)



In [8]:
# Display the scored headlines (the rendered table elides the middle rows).
faang

Unnamed: 0,headlines,links,date,time,ticker,predicted_sentiment,score
0,britain's warren buffett recently bought stock...,https://finance.yahoo.com/m/e167c08c-1cf2-3d6a...,2023-04-26,08:00:00,AAPL,neutral,0
1,"apple mixc shenzhen opens friday, april 28, in...",https://finance.yahoo.com/news/apple-mixc-shen...,2023-04-25,23:30:00,AAPL,neutral,0
2,apple developing ai-powered health coaching se...,https://finance.yahoo.com/news/apple-developin...,2023-04-25,17:39:00,AAPL,neutral,0
3,"tiktok, twitter, meta face countdown to comply...",https://finance.yahoo.com/m/98869e1e-b8b1-39bd...,2023-04-25,13:56:00,AAPL,neutral,0
4,apple wins another court ruling. its app store...,https://finance.yahoo.com/m/c34d5d8c-ccc1-3754...,2023-04-25,12:59:00,AAPL,neutral,0
...,...,...,...,...,...,...,...
495,"elon musk, other ai bigwigs call for pause in ...",https://finance.yahoo.com/m/5951e4eb-0696-3054...,2023-03-29,08:33:00,GOOG,neutral,0
496,youtube looking into gandhis claim political v...,https://finance.yahoo.com/m/13dc1b3d-cbbd-3f8a...,2023-03-29,06:58:00,GOOG,negative,-1
497,three of the biggest trends in retail today,https://finance.yahoo.com/video/three-biggest-...,2023-03-28,21:31:00,GOOG,neutral,0
498,"nfl, redbird team up to distribute sunday tick...",https://finance.yahoo.com/m/e764fdf6-f52b-3b99...,2023-03-28,18:24:00,GOOG,neutral,0


In [14]:
# Full text of the headline stored under index label 499.
faang.loc[499, 'headlines']

'alibaba stock continues to climb after announcement of six-way business split'

# END

In [None]:
# Please ignore the remaining cells.

In [None]:
# faang_stock_info = get_stock_info(['AAPL', 'META', 'AMZN', 'NFLX', 'GOOG'])

In [None]:
# faang_stock_info

In [None]:
# apple = faang[faang['ticker'] == 'AAPL']
# apple = apple.groupby('date')[['score']].mean()

In [None]:
# apple

In [None]:
# pd.merge(f, apple, on='date', how='right')

In [None]:
# f = faang_stock_info[faang_stock_info['ticker'] == 'AAPL'][['date', 'close', 'volume']].set_index('date')
# f = f.pct_change().dropna(axis=0)
# f

In [None]:
# faang_stock_info.head()

In [None]:
# plt.rcParams['figure.figsize'] = [15, 5]
# mean = faang.groupby(['ticker', 'date'])['score'].mean().unstack().transpose()
# mean.plot(kind='bar')
# plt.grid(alpha=0.5)
# mean