<a href="https://colab.research.google.com/github/ishaanpaul98/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports #

In [163]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import re

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import datetime
import yfinance as yf
import time
import requests
import io

In [164]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [165]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [166]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report
from sklearn.metrics import plot_confusion_matrix

# Stock Data Helper Functions #

In [167]:
def getStockDataDaily(symbol, day):
    print("Getting stock data for stock $"+symbol)
    df = yf.download(symbol, start=day, period = "1d")
    return df

getStockDataDaily('AAPL', "2022-12-21")

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-21,132.979996,136.809998,132.75,135.449997,135.449997,85928000
2022-12-22,134.350006,134.559998,130.300003,132.229996,132.229996,77852100
2022-12-23,130.919998,132.419998,129.639999,131.860001,131.860001,63771000
2022-12-27,131.380005,131.408005,128.720703,130.029999,130.029999,66849930


In [168]:
def arrayToString(arr):
    print("Starting array to list")
    listToStr = ' '.join([str(elem) for elem in arr])
    return listToStr


In [169]:
def getMultiStockDataDaily(symbols, day):
    if len(symbols) > 1:
        print("Length of symbols array is more than 1. STARTING ARRAYTOSTRING")
        symbols = arrayToString(symbols)
    print("Getting stock data for stock $"+symbols)
    df = yf.download(symbols, start=day, period = "1d", group_by='ticker')
    return df

getMultiStockDataDaily(['AAPL', 'TSLA'], "2022-12-21")

Length of symbols array is more than 1. STARTING ARRAYTOSTRING
Starting array to list
Getting stock data for stock $AAPL TSLA
[*********************100%***********************]  2 of 2 completed


Unnamed: 0_level_0,TSLA,TSLA,TSLA,TSLA,TSLA,TSLA,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2022-12-21,139.339996,141.259995,135.889999,137.570007,137.570007,145417400,132.979996,136.809998,132.75,135.449997,135.449997,85928000
2022-12-22,136.0,136.630005,122.260002,125.349998,125.349998,210090300,134.350006,134.559998,130.300003,132.229996,132.229996,77852100
2022-12-23,126.370003,128.619995,121.019997,123.150002,123.150002,166396100,130.919998,132.419998,129.639999,131.860001,131.860001,63771000
2022-12-27,117.495003,119.669998,108.760002,109.099998,109.099998,201111195,131.380005,131.408005,128.720703,130.029999,130.029999,66849930


In [170]:
def getStockClose(symbol):
    print("Getting stock close for stock $"+symbol)
    ticker = yf.Ticker(symbol)
    data = ticker.history()
    #print(data)
    last_price = data['Close'].iloc[-1]
    print(symbol, last_price)
    return last_price

getStockClose("AAPL")

Getting stock close for stock $AAPL
AAPL 130.02999877929688


130.02999877929688

In [171]:
def getStockOpen(symbol):
    print("Getting stock open for stock $"+symbol)
    ticker = yf.Ticker(symbol)
    data = ticker.history()
    #print(data)
    last_price = data['Open'].iloc[-1]
    print(symbol, last_price)
    return last_price

getStockOpen("AAPL")

Getting stock open for stock $AAPL
AAPL 131.3800048828125


131.3800048828125

In [172]:
def getStockHigh(symbol):
    print("Getting stock high for stock $"+symbol)
    ticker = yf.Ticker(symbol)
    data = ticker.history()
    #print(data)
    last_price = data['High'].iloc[-1]
    print(symbol, last_price)
    return last_price

getStockHigh("AAPL")

Getting stock high for stock $AAPL
AAPL 131.4080047607422


131.4080047607422

In [173]:
def getStockLow(symbol):
    print("Getting stock low for stock $"+symbol)
    ticker = yf.Ticker(symbol)
    data = ticker.history()
    #print(data)
    last_price = data['Low'].iloc[-1]
    print(symbol, last_price)
    return last_price

getStockLow("AAPL")

Getting stock low for stock $AAPL
AAPL 128.720703125


128.720703125

# Gathering FinViz Data (Today's Sentiment) #

In [174]:
# Parameters 
n = 3 #the # of article headlines displayed per ticker
tickers = ['AAPL', 'TSLA', 'AMZN']

In [185]:
from finvizfinance.news import News
fnews = News()
all_news = fnews.get_news()
all_news

{'news':        Date                                              Title  \
 0   12:07AM  South Korea Dec exports to fall for third mont...   
 1    Dec-27  Fosun Lets China Users Register for Covid Shot...   
 2    Dec-27  Singapore Sentences Stock Manipulator to 36 Ye...   
 3    Dec-27  Here are the winning numbers for Tuesday night...   
 4    Dec-27  Japan Set to Import First Crude Shipment From ...   
 ..      ...                                                ...   
 85   Dec-27  Retirees Are One Reason the Fed Has Given Up o...   
 86   Dec-27  ‘Most Pro-Union President’ Runs Into Doubts in...   
 87   Dec-27  European Gas Futures Hold Decline on Warm Weat...   
 88   Dec-27  Analysis: Wood’s ARK slammed by higher interes...   
 89   Dec-27  Top Japan Bankers See Negative Rates to Stay a...   
 
                Source                                               Link  
 0     www.reuters.com  https://www.reuters.com/markets/asia/south-kor...  
 1   www.bloomberg.com  https://ww

In [175]:
# Get Data
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

for ticker in tickers:
    url = finwiz_url + ticker
    print("current url is: " +url)
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36."}
    req = Request(url=url,headers=header) 
    resp = urlopen(req)    
    html = BeautifulSoup(resp, features="lxml")
    news_table = html.find(id='news-table')
    news_tables[ticker] = news_table

try:
    for ticker in tickers:
        df = news_tables[ticker]
        df_tr = df.findAll('tr')
    
        print ('\n')
        print ('Recent News Headlines for {}: '.format(ticker))
        
        for i, table_row in enumerate(df_tr):
            a_text = table_row.a.text
            td_text = table_row.td.text
            td_text = td_text.strip()
            print(a_text,'(',td_text,')')
            if i == n-1:
                break
except KeyError:
    pass

current url is: https://finviz.com/quote.ashx?t=AAPL
current url is: https://finviz.com/quote.ashx?t=TSLA
current url is: https://finviz.com/quote.ashx?t=AMZN


Recent News Headlines for AAPL: 
Why Apple Investors Suffered a Tough Time on Tuesday ( Dec-27-22 06:42PM )
5 Companies With Huge Free Cash Flow ( 05:46PM )
Apple (AAPL) Dips More Than Broader Markets: What You Should Know ( 05:45PM )


Recent News Headlines for TSLA: 
Dow Jones Rises, But Tesla, Moderna Lead Growth Sell-Off; 5 Stocks Near Buy Points ( Dec-27-22 08:00PM )
5 things you really should not buy in 2023 ( 06:16PM )
Tesla (TSLA) Now -40% in a Month; Case-Shiller In Depth ( 05:42PM )


Recent News Headlines for AMZN: 
11 Best Amazon Shopping Hacks ( Dec-27-22 06:00PM )
Why Chewy Stock Was Down on Tuesday ( 03:15PM )
Washington Housing Conservancy acquires NoMa's Loree Grand to preserve affordable units ( 01:33PM )


In [176]:
# Iterate through the news
parsed_news = []
for file_name, news_table in news_tables.items():
    for x in news_table.findAll('tr'):
        print(x.get_text())
        text = x.get_text() 
        date_scrape = x.td.text.split()

        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        ticker = file_name.split('_')[0]
        
        parsed_news.append([ticker, date, time, text ])

Dec-27-22 06:42PMWhy Apple Investors Suffered a Tough Time on Tuesday Motley Fool
05:46PM5 Companies With Huge Free Cash Flow Investopedia
05:45PMApple (AAPL) Dips More Than Broader Markets: What You Should Know Zacks
04:56PMThe Best Charging Stations of 2022 TheStreet.com
04:15PMApple Shares Hit Lowest Since June 2021 on iPhone Supply Concerns Bloomberg

04:02PM
Loading…

04:02PMApple iPhone 14 Pro Supply Bounces Back After Shortages Investor's Business Daily
02:03PMApple stock slumps toward lowest close in 18 months, worst yearly performance since 2008 MarketWatch
01:51PMApple Sued by Ex-Patent Attorney Over Discrimination, Harassment Bloomberg
01:24PMMeta and Alphabet Are Losing Their Advertising Throne Benzinga
01:21PMChina stocks tell 2 different stories about the economy Yahoo Finance
12:13PM3 Top Dividend Growth Stocks I'm Buying to Close Out 2022 Motley Fool
12:01PMTarget, Amazon and 4 More Retailers That Will Reward You for Turning in Your Old Stuff GOBankingRates
10:43AMApple

# Getting historic financial news data #

In [194]:
from os import environ as env
from dotenv import load_dotenv
load_dotenv()
key = env['key']

In [195]:
class Init():
    """Class that initializes global value for the module. It also use general method to initialize value.
     """

    def __init__(self):
        """Built-in method to inialize the global values for the module

        Attributes
        -------------------
        `self.start.date` : str
            start date of the training period. Must be within the last year for the free version of FinHub. Format
            must be "YYYY-mm-dd"
        `self.end_date` : str
            end date of the training period. Format must be "YYYY-mm-dd"
        `self.ticker` : list
            tickers on which we want to perform the test. Can be one ticker in form of a list as well as a list
            of tickers like the s&p 500.
        `self.db_name` : str
            name of the sqlite3 database
        `self.dir_path` : str
            directory where the data are saved. It takes into account the `self.start_date` and `self.end_date`
        `self.start_date_` : datetime object
            same thing as `start_date` but as a datetime object
        `self.end_date_` : datetime object
            same thing as `start_date` but as a datetime object
        """

        #initialize value here
        self.start_date = "2020-02-10"
        self.end_date = "2020-03-20"
        self.ticker = 'AAPL'

        self.db_name = 'financial_data'
        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/output/' + self.start_date + '_' + \
                        self.end_date + '/'
        Path(self.dir_path).mkdir(parents=True, exist_ok=True) #create new path if it doesn't exist
        self.start_date_ = datetime.strptime(self.start_date, "%Y-%m-%d")  #datetime object
        self.end_date_ = datetime.strptime(self.end_date, "%Y-%m-%d")    #datetime object
        self.delta_date = abs((self.end_date_ - self.start_date_).days) #number of days between 2 dates

        try:
            self.start_date_ > self.end_date_
        except:
            print("'start_date' is after 'end_date'")

        try :
            datetime.strptime(self.start_date, "%Y-%m-%d") <= (datetime.now()- relativedelta(years=1))
        except:
            print("'start_date' is older than 1 year. It doesn't work with the free version of FinHub")

In [196]:
class FinnHub():
    """Class to make API calls to FinnHub"""

    def __init__(self,start_date,end_date,start_date_,end_date_,ticker,dir_path,db_name):
        """ Class constructor

        Parameters
        ----------
        `start_date` : str
            Start date of the request. Must be within 1 year from now for must request
            with the free version of FinHub
        `end_date` : str
            End date of the request.
        `start_date_` : datetime object
            Same thing as `start_date` but as a datetime object
        `end_date_` : datetime object
             Same thing as `start_date` but as a datetime object
        `ticker` : str
            Ticker symbol
        `db_name` : str
            Name of the sqlite database
        `dir_path` : str
            Directory  where our data will be stored

        Attributes
        ----------
        `self.max_call` : int
            maximum api calls per minute for the finhub API
        `self.time_sleep` : int
            seconds to sleep before making a new API call. Default is 60 seconds as the maximum number of API calls is
            per minute
        `self.nb_request` : int
            nb of request made so far. Set to 0 in constructor `__init__` as we may loop through ticker
            and want to avoid the variable to reset to 0 when exiting the wrapper `iterate_day()` (which could generate
            an error)
        `self.finhub_key` : str
            finhub unique API key. Get yours here : https://finnhub.io/
        `self.db_name : str
            default file name for the sql database
        """

        #Initialize attributes values here
        self.max_call = 60
        self.time_sleep = 60
        self.nb_request = 0
        self.finhub_key = config('FINHUB_KEY')
        self.news_header = ['category', 'datetime','headline','id','image','related','source','summary','url']
        self.start_date = start_date
        self.end_date = end_date
        self.ticker = ticker
        self.ticker_request = ticker #different value because ticker like 'ALL' (All State) can generate error in SQLite
                                    #database
        self.dir_path = dir_path
        self.db_name = db_name
        self.js_data = []

        self.start_date_ = start_date_ #datetime object
        self.end_date_ = end_date_ #datetime object

        #call the methods to access historical financial headlines
        tickers = get_tickers()
        for ticker_ in tickers:
            self.ticker = ticker_ + '_'
            self.ticker_request = ticker_
            self.req_new()
            self.create_table()
            self.clean_table()
            self.lang_review()

In [198]:
def get_tickers():
    """Method that gets the stock symbols from companies listed in the S&P 500

    Return
    ------
    `tickers` : list
        S&P 500 company symbols
    """
    resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find_all('table')[0]  # Grab the first table

    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text.strip('\n')
        tickers.append(ticker)

    return tickers

In [199]:
def iterate_day(func):
    """ Decorator that makes the API call on FinHub each days between the `self.start_date`
    and `self.end_date` """

    def wrapper_(self):
        delta_date_ = delta_date(self.start_date,self.end_date)
        date_ = self.start_date
        date_obj = self.start_date_

        for item in range(delta_date_ + 1):
            self.nb_request +=1
            func(self,date_)
            date_obj = date_obj + relativedelta(days=1)
            date_  = date_obj.strftime("%Y-%m-%d")
            if self.nb_request == (self.max_call-1):
                time.sleep(self.time_sleep)
                self.nb_request=0
    return wrapper_

@iterate_day
def req_new(self,date_):
    """ Method that makes news request(s) to the Finnhub API"""

    request_ = requests.get('https://finnhub.io/api/v1/company-news?symbol=' + self.ticker_request + '&from=' +
                            date_ + '&to=' + date_ + '&token=' + self.finhub_key)
    self.js_data += request_.json()

# Sentiment Analysis of FinViz data #

In [177]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [178]:
# Sentiment Analysis
analyzer = SentimentIntensityAnalyzer()

columns = ['Ticker', 'Date', 'Time', 'Headline']
news = pd.DataFrame(parsed_news, columns=columns)
scores = news['Headline'].apply(analyzer.polarity_scores).tolist()

df_scores = pd.DataFrame(scores)
news = news.join(df_scores, rsuffix='_right')

In [179]:
# View Data 
#news = pd.DataFrame()
news['Date'] = pd.to_datetime(news.Date).dt.date

unique_ticker = news['Ticker'].unique().tolist()
news_dict = {name: news.loc[news['Ticker'] == name] for name in unique_ticker}

values = []
for ticker in tickers: 
    dataframe = news_dict[ticker]
    dataframe = dataframe.set_index('Ticker')
    dataframe = dataframe.drop(columns = ['Headline'])
    print ('\n')
    print (dataframe.head())
    
    mean = round(dataframe['compound'].mean(), 2)
    values.append(mean)
    
df = pd.DataFrame(list(zip(tickers, values)), columns =['Ticker', 'Mean Sentiment']) 
df = df.set_index('Ticker')
df = df.sort_values('Mean Sentiment', ascending=False)
print ('\n')
print (df)



              Date     Time    neg    neu    pos  compound
Ticker                                                    
AAPL    2022-12-27  06:42PM  0.487  0.513  0.000   -0.7650
AAPL    2022-12-27  05:46PM  0.000  0.517  0.483    0.6808
AAPL    2022-12-27  05:45PM  0.000  1.000  0.000    0.0000
AAPL    2022-12-27  04:56PM  0.000  0.588  0.412    0.6369
AAPL    2022-12-27  04:15PM  0.176  0.676  0.149   -0.1027


              Date     Time  neg    neu    pos  compound
Ticker                                                  
TSLA    2022-12-27  08:00PM  0.0  0.833  0.167    0.5267
TSLA    2022-12-27  06:16PM  0.0  1.000  0.000    0.0000
TSLA    2022-12-27  05:42PM  0.0  1.000  0.000    0.0000
TSLA    2022-12-27  05:34PM  0.0  1.000  0.000    0.0000
TSLA    2022-12-27  05:15PM  0.0  1.000  0.000    0.0000


              Date     Time    neg    neu    pos  compound
Ticker                                                    
AMZN    2022-12-27  06:00PM  0.000  0.459  0.541    0.7096
AMZN 

In [180]:
#Testing helper functions from stock_helper_functions.ipynb
aapl_open = getStockOpen("AAPL")

Getting stock open for stock $AAPL
AAPL 131.3800048828125


# Creating Dataset #

In [181]:
num = df.loc['AMZN']
num

Mean Sentiment   -0.04
Name: AMZN, dtype: float64

In [183]:
#Pulling stock data
for i in tickers:
    dataset = getStockDataDaily(i, day='2022-12-27')
    dataset['Sentiment'] = float(df.loc[i])
    #print(dataset)
    

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2022-12-27  131.380005  131.408005  128.720703  130.029999  130.029999   

              Volume  Sentiment  
Date                             
2022-12-27  66849930      -0.06  
Getting stock data for stock $TSLA
[*********************100%***********************]  1 of 1 completed
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2022-12-27  117.495003  119.669998  108.760002  109.099998  109.099998   

               Volume  Sentiment  
Date                              
2022-12-27  201111195      -0.11  
Getting stock data for stock $AMZN
[*********************100%***********************]  1 of 1 completed
                 Open       High   