<a href="https://colab.research.google.com/github/ishaanpaul98/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports #

In [127]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import re

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import datetime
import yfinance as yf
import time
import requests
import io

In [128]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [129]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [130]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report
from sklearn.metrics import plot_confusion_matrix

# Stock Data Helper Functions #

In [131]:
def getStockDataDaily(symbol, day):
    print("Getting stock data for stock $"+symbol)
    df = yf.download(symbol, start=day, period = "1d")
    return df

getStockDataDaily('AAPL', "2022-12-21")

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-21,132.979996,136.809998,132.75,135.449997,135.449997,85928000
2022-12-22,134.350006,134.559998,130.300003,132.229996,132.229996,77852100
2022-12-23,130.919998,132.419998,129.639999,131.860001,131.860001,63814900
2022-12-27,131.380005,131.410004,128.720001,130.029999,130.029999,69007800
2022-12-28,129.669998,131.029999,125.870003,126.040001,126.040001,85438400
2022-12-29,127.989998,130.479996,127.730003,129.610001,129.610001,75703700
2022-12-30,128.410004,129.949997,127.43,129.929993,129.929993,76960600
2023-01-03,130.279999,130.899994,124.169998,125.07,125.07,112117500
2023-01-04,126.889999,128.660004,125.080002,126.360001,126.360001,89113600
2023-01-05,127.129997,127.769997,124.760002,125.019997,125.019997,80962700


In [132]:
def arrayToString(arr):
    print("Starting array to list")
    listToStr = ' '.join([str(elem) for elem in arr])
    return listToStr


In [133]:
def getMultiStockDataDaily(symbols, day):
    if len(symbols) > 1:
        print("Length of symbols array is more than 1. STARTING ARRAYTOSTRING")
        symbols = arrayToString(symbols)
    print("Getting stock data for stock $"+symbols)
    df = yf.download(symbols, start=day, period = "1d", group_by='ticker')
    return df

getMultiStockDataDaily(['AAPL', 'TSLA'], "2022-12-21")

Length of symbols array is more than 1. STARTING ARRAYTOSTRING
Starting array to list
Getting stock data for stock $AAPL TSLA
[*********************100%***********************]  2 of 2 completed


Unnamed: 0_level_0,TSLA,TSLA,TSLA,TSLA,TSLA,TSLA,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2022-12-21,139.339996,141.259995,135.889999,137.570007,137.570007,145417400,132.979996,136.809998,132.75,135.449997,135.449997,85928000
2022-12-22,136.0,136.630005,122.260002,125.349998,125.349998,210090300,134.350006,134.559998,130.300003,132.229996,132.229996,77852100
2022-12-23,126.370003,128.619995,121.019997,123.150002,123.150002,166989700,130.919998,132.419998,129.639999,131.860001,131.860001,63814900
2022-12-27,117.5,119.669998,108.760002,109.099998,109.099998,208643400,131.380005,131.410004,128.720001,130.029999,130.029999,69007800
2022-12-28,110.349998,116.269997,108.239998,112.709999,112.709999,221070500,129.669998,131.029999,125.870003,126.040001,126.040001,85438400
2022-12-29,120.389999,123.57,117.5,121.82,121.82,221923300,127.989998,130.479996,127.730003,129.610001,129.610001,75703700
2022-12-30,119.949997,124.480003,119.75,123.18,123.18,157304500,128.410004,129.949997,127.43,129.929993,129.929993,76960600
2023-01-03,118.470001,118.800003,104.639999,108.099998,108.099998,231402800,130.279999,130.899994,124.169998,125.07,125.07,112117500
2023-01-04,109.110001,114.589996,107.519997,113.639999,113.639999,180389000,126.889999,128.660004,125.080002,126.360001,126.360001,89113600
2023-01-05,110.510002,111.75,107.160004,110.339996,110.339996,157986300,127.129997,127.769997,124.760002,125.019997,125.019997,80962700


In [134]:
def getStockClose(symbol):
    print("Getting stock close for stock $"+symbol)
    ticker = yf.Ticker(symbol)
    data = ticker.history()
    #print(data)
    last_price = data['Close'].iloc[-1]
    print(symbol, last_price)
    return last_price

getStockClose("AAPL")

Getting stock close for stock $AAPL
AAPL 135.94000244140625


135.94000244140625

In [135]:
def getStockOpen(symbol):
    print("Getting stock open for stock $"+symbol)
    ticker = yf.Ticker(symbol)
    data = ticker.history()
    #print(data)
    last_price = data['Open'].iloc[-1]
    print(symbol, last_price)
    return last_price

getStockOpen("AAPL")

Getting stock open for stock $AAPL
AAPL 134.8300018310547


134.8300018310547

In [136]:
def getStockHigh(symbol):
    print("Getting stock high for stock $"+symbol)
    ticker = yf.Ticker(symbol)
    data = ticker.history()
    #print(data)
    last_price = data['High'].iloc[-1]
    print(symbol, last_price)
    return last_price

getStockHigh("AAPL")

Getting stock high for stock $AAPL
AAPL 137.2899932861328


137.2899932861328

In [137]:
def getStockLow(symbol):
    print("Getting stock low for stock $"+symbol)
    ticker = yf.Ticker(symbol)
    data = ticker.history()
    #print(data)
    last_price = data['Low'].iloc[-1]
    print(symbol, last_price)
    return last_price

getStockLow("AAPL")

Getting stock low for stock $AAPL
AAPL 134.14999389648438


134.14999389648438

# Gathering FinViz Data (Today's Sentiment) #

In [138]:
# Parameters 
n = 3 #the # of article headlines displayed per ticker
tickers = ['AAPL', 'TSLA', 'AMZN']

In [139]:
from finvizfinance.news import News
fnews = News()
all_news = fnews.get_news()
all_news

{'news':        Date                                              Title  \
 0   07:58PM  Looming debt ceiling showdown risks triggering...   
 1   07:52PM  Goldman Sachs Lifts China’s Economic Growth Fo...   
 2   07:39PM  Stocks Mixed, Yen Weakens Ahead of BOJ Meeting...   
 3   07:38PM  Cowboys Micah Parsons' weighs in on inflation,...   
 4   07:17PM  Mexican economy grew around 3% last year, fuel...   
 ..      ...                                                ...   
 85  08:46AM  Royal Mail accused of prioritising parcels ove...   
 86  08:45AM  Goldman Sachs Profit Plunges, Lagging Morgan S...   
 87  08:37AM  Morgan Stanley profit beats on strength in tra...   
 88  08:26AM  Putin’s Cash Flood From Exports Slows After Cu...   
 89  08:21AM  Morgan Stanley Beats as Wealth Management Hits...   
 
                Source                                               Link  
 0     foxbusiness.com  https://foxbusiness.com/economy/looming-debt-c...  
 1   www.bloomberg.com  https://ww

In [140]:
# Get Data
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

for ticker in tickers:
    url = finwiz_url + ticker
    print("current url is: " +url)
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36."}
    req = Request(url=url,headers=header) 
    resp = urlopen(req)    
    html = BeautifulSoup(resp, features="lxml")
    news_table = html.find(id='news-table')
    news_tables[ticker] = news_table

try:
    for ticker in tickers:
        df = news_tables[ticker]
        df_tr = df.findAll('tr')
    
        print ('\n')
        print ('Recent News Headlines for {}: '.format(ticker))
        
        for i, table_row in enumerate(df_tr):
            a_text = table_row.a.text
            td_text = table_row.td.text
            td_text = td_text.strip()
            print(a_text,'(',td_text,')')
            if i == n-1:
                break
except KeyError:
    pass

current url is: https://finviz.com/quote.ashx?t=AAPL
current url is: https://finviz.com/quote.ashx?t=TSLA
current url is: https://finviz.com/quote.ashx?t=AMZN


Recent News Headlines for AAPL: 
Apple Introduces Faster MacBook Pros and Mac Minis ( Jan-17-23 06:16PM )
Apple (AAPL) Gains As Market Dips: What You Should Know ( 05:45PM )
The ongoing big tech antitrust cases to watch in 2023 ( 04:57PM )


Recent News Headlines for TSLA: 
Market Rally Pauses, Tesla Jumps, Moderna Pops Late; What Investors Need Now ( Jan-17-23 07:59PM )
Jury selected for Elon Musk trial about 2018 Tesla buyout tweets ( 07:40PM )
Musks Tesla funding secured trial to begin after jury selected ( 07:31PM )


Recent News Headlines for AMZN: 
Americas ports have a pollution problem. All-electric short-haul trucking is one fix. ( Jan-17-23 07:06PM )
The 5 tech earnings to watch as holiday-season results start to flood in ( 06:51PM )
More than 25,000 global tech workers laid off in the first weeks of 2023, says layoff

In [141]:
# Iterate through the news
parsed_news = []
for file_name, news_table in news_tables.items():
    for x in news_table.findAll('tr'):
        print(x.get_text())
        text = x.get_text() 
        date_scrape = x.td.text.split()

        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        ticker = file_name.split('_')[0]
        
        parsed_news.append([ticker, date, time, text ])

Jan-17-23 06:16PMApple Introduces Faster MacBook Pros and Mac Minis Bloomberg
05:45PMApple (AAPL) Gains As Market Dips: What You Should Know Zacks
04:57PMThe ongoing big tech antitrust cases to watch in 2023 Quartz
04:39PMSupreme Court Asks U.S. Government to Weigh In on Apple Patent Dispute The Wall Street Journal
04:02PMChina sees record population declines amid GDP growth Yahoo Finance Video

03:42PM
Loading…

03:42PMApple unveils M2 chips in new Mac Mini, MacBook Pro Yahoo Finance Video
02:41PMApple Rolls Out Latest MacBook Pro With New M2 Chips The Wall Street Journal
02:07PMThree big questions facing tech stocks this earnings season Yahoo Finance
12:48PMU.S. Supreme Court asks for gov't views on blockbuster Apple/Caltech patent dispute Reuters
12:03PMApple AirTags and Bluetooth Trackers Are Officially a Billion-Dollar Industry  Here's What To Know, Trends, and the Best Ways To Invest Benzinga
10:52AMApple debuts MacBook Pro and Mac Mini with new high-powered M2 Pro and M2 Max chi

# Gathering Data From AlphaAdvantage for Historical News #

In [142]:
from decouple import config
import requests
import urllib.parse
import json
AAapikey = config('AAKey')

In [143]:
#Function to convert user provided date to date required by AlphaAdvantage
def toAADate(oldDate):
    newDate = str(oldDate) + 'T0001'
    return newDate

In [144]:
# Get data from AlphaAdvantage for one ticker for a particular day
def getHistoricNewsData(ticker, startDate = None, endDate = None):
    url = 'https://www.alphavantage.co/query?'
    if startDate and endDate:
        startDate = toAADate(startDate)
        endDate = toAADate(endDate)
        Myparams = {'function': 'NEWS_SENTIMENT', 'tickers': ticker, 'time_from': startDate, 'time_to': endDate, 'sort': 'LATEST','limit': 100, 'apikey': AAapikey}
    else:
        Myparams = {'function': 'NEWS_SENTIMENT', 'tickers': ticker, 'sort': 'LATEST','limit': 100, 'apikey': AAapikey}
    r = requests.get(url, params = Myparams)
    data = r.json()
    #return data
    historic_news = pd.DataFrame(columns=['Date', 'Headline', 'Ticker'])
    for i in data.get("feed"):
        test_date = i.get("time_published")
        test_date = test_date[:8]
        row = [test_date, i.get("title"), ticker]
        new_df = pd.DataFrame([row],columns=['Date', 'Headline', 'Ticker'])
        historic_news = pd.concat([historic_news, new_df], axis=0, ignore_index=True)
    return historic_news

historic_news = getHistoricNewsData('AAPL', '20230108', '20230111')
print(historic_news)

        Date                                           Headline Ticker
0   20230110  Apple  ( AAPL )  Gains But Lags Market: What Y...   AAPL
1   20230110  Market Reclaims Key Levels Ahead Of CPI Inflat...   AAPL
2   20230110  Mega Millions Tops $1.1 Billion: Here's How Mu...   AAPL
3   20230110  IBM just broke a winning streak that lasted ne...   AAPL
4   20230110  Everyone's Worst Fears About the Roomba Have C...   AAPL
5   20230110  Could Apple WiFi chips just be a ploy to get a...   AAPL
6   20230110  Battery Maker Enovix Hopes To Recharge With Ne...   AAPL
7   20230110  Apple Chip Risk to Broadcom Isn't Material, Sa...   AAPL
8   20230110  1 Green Flag and 1 Red Flag for Apple Stock in...   AAPL
9   20230110  Lots Of News, Flattish Markets - Bitcoin  ( BT...   AAPL
10  20230110  Apple, Microsoft And Other Information Technol...   AAPL
11  20230110  This Top Chip Stock Is Expanding Into EVs -- I...   AAPL
12  20230110                 6 Best Music Stocks to Buy in 2023   AAPL
13  20

# Sentiment Analysis of AlphaAdvantage Historic Data #

In [164]:
nltk.download('vader_lexicon')
# Sentiment Analysis
analyzer = SentimentIntensityAnalyzer()

scores = historic_news['Headline'].apply(analyzer.polarity_scores).tolist()

df_scores = pd.DataFrame(scores)
historic_news = historic_news.join(df_scores, rsuffix='_right')
#print(scores)
#print(df_scores.head())

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [165]:
# View Data 
historic_news['Date'] = pd.to_datetime(historic_news.Date).dt.date

unique_ticker = historic_news['Ticker'].unique().tolist()
news_dict = {name: historic_news.loc[historic_news['Ticker'] == name] for name in unique_ticker}

values = []
for ticker in unique_ticker: 
    dataframe = news_dict[ticker]
    dataframe = dataframe.set_index('Ticker')
    print("_----------------------_")
    print(dataframe.head())
    dataframe = dataframe.drop(columns = ['Headline'])
    print ('\n')
    print (dataframe.head())
    
    mean = round(dataframe['compound'].mean(), 2)
    values.append(mean)
    
df = pd.DataFrame(list(zip(unique_ticker, values)), columns =['Ticker', 'Mean Sentiment']) 
df = df.set_index('Ticker')
df = df.sort_values('Mean Sentiment', ascending=False)
print ('\n')
print (df)

_----------------------_
              Date                                           Headline    neg  \
Ticker                                                                         
AAPL    2023-01-10  Apple  ( AAPL )  Gains But Lags Market: What Y...  0.251   
AAPL    2023-01-10  Market Reclaims Key Levels Ahead Of CPI Inflat...  0.000   
AAPL    2023-01-10  Mega Millions Tops $1.1 Billion: Here's How Mu...  0.000   
AAPL    2023-01-10  IBM just broke a winning streak that lasted ne...  0.197   
AAPL    2023-01-10  Everyone's Worst Fears About the Roomba Have C...  0.439   

          neu    pos  compound  
Ticker                          
AAPL    0.618  0.131   -0.3716  
AAPL    1.000  0.000    0.0000  
AAPL    0.665  0.335    0.8910  
AAPL    0.563  0.239    0.1531  
AAPL    0.382  0.178   -0.6249  


              Date    neg    neu    pos  compound
Ticker                                           
AAPL    2023-01-10  0.251  0.618  0.131   -0.3716
AAPL    2023-01-10  0.000  1.00

# Sentiment Analysis of FinViz data #

In [148]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [149]:
# Sentiment Analysis
analyzer = SentimentIntensityAnalyzer()

columns = ['Ticker', 'Date', 'Time', 'Headline']
news = pd.DataFrame(parsed_news, columns=columns)
scores = news['Headline'].apply(analyzer.polarity_scores).tolist()

df_scores = pd.DataFrame(scores)
news = news.join(df_scores, rsuffix='_right')

In [163]:
# View Data 
#news = pd.DataFrame()
news['Date'] = pd.to_datetime(news.Date).dt.date

unique_ticker = news['Ticker'].unique().tolist()
news_dict = {name: news.loc[news['Ticker'] == name] for name in unique_ticker}

values = []
for ticker in tickers: 
    dataframe = news_dict[ticker]
    dataframe = dataframe.set_index('Ticker')
    dataframe = dataframe.drop(columns = ['Headline'])
    print ('\n')
    print (dataframe.head())
    
    mean = round(dataframe['compound'].mean(), 2)
    values.append(mean)
    
df = pd.DataFrame(list(zip(tickers, values)), columns =['Ticker', 'Mean Sentiment']) 
df = df.set_index('Ticker')
df = df.sort_values('Mean Sentiment', ascending=False)
print ('\n')
print (df)



              Date     Time    neg    neu    pos  compound
Ticker                                                    
AAPL    2023-01-17  06:16PM  0.000  1.000  0.000    0.0000
AAPL    2023-01-17  05:45PM  0.000  0.806  0.194    0.3400
AAPL    2023-01-17  04:57PM  0.000  1.000  0.000    0.0000
AAPL    2023-01-17  04:39PM  0.153  0.847  0.000   -0.4019
AAPL    2023-01-17  04:02PM  0.000  0.794  0.206    0.3818


              Date     Time    neg    neu    pos  compound
Ticker                                                    
TSLA    2023-01-17  07:59PM  0.000  1.000  0.000    0.0000
TSLA    2023-01-17  07:40PM  0.000  1.000  0.000    0.0000
TSLA    2023-01-17  07:31PM  0.000  0.803  0.197    0.4019
TSLA    2023-01-17  05:42PM  0.000  0.787  0.213    0.4019
TSLA    2023-01-17  05:25PM  0.192  0.808  0.000   -0.2263


              Date     Time    neg    neu    pos  compound
Ticker                                                    
AMZN    2023-01-17  07:06PM  0.184  0.816  0.000  

In [None]:
#Testing helper functions from stock_helper_functions.ipynb
aapl_open = getStockOpen("AAPL")

Getting stock open for stock $AAPL
AAPL 134.8300018310547


# Creating Dataset #

In [None]:
num = df.loc['AMZN']
num

Mean Sentiment   -0.07
Name: AMZN, dtype: float64

In [None]:
#Pulling stock data
for i in tickers:
    dataset = getStockDataDaily(i, day='2022-12-27')
    dataset['Sentiment'] = float(df.loc[i])
    print(dataset)
    

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2022-12-27  131.380005  131.410004  128.720001  130.029999  130.029999   
2022-12-28  129.669998  131.029999  125.870003  126.040001  126.040001   
2022-12-29  127.989998  130.479996  127.730003  129.610001  129.610001   
2022-12-30  128.410004  129.949997  127.430000  129.929993  129.929993   
2023-01-03  130.279999  130.899994  124.169998  125.070000  125.070000   
2023-01-04  126.889999  128.660004  125.080002  126.360001  126.360001   
2023-01-05  127.129997  127.769997  124.760002  125.019997  125.019997   
2023-01-06  126.010002  130.289993  124.889999  129.619995  129.619995   
2023-01-09  130.470001  133.410004  129.889999  130.149994  130.149994   
2023-01-10  130.259995  131.259995  128.119995  130.729996  130.729996   
2023-01-