<a href="https://colab.research.google.com/github/ishaanpaul98/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports #

In [145]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import re

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import datetime
import yfinance as yf
import time
import requests
import io

In [146]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [147]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [148]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report
from sklearn.metrics import plot_confusion_matrix

# Stock Data Helper Functions #

In [149]:
def getStockDataDaily(symbol, day):
    """Download daily price bars for a single ticker, starting at `day`.

    Args:
        symbol: Ticker symbol as a string (e.g. 'AAPL'); it is concatenated
            into the progress message, so it must be a str.
        day: First date to fetch, forwarded to yfinance as `start`
            (this notebook passes 'YYYY-MM-DD' strings).

    Returns:
        The DataFrame produced by `yf.download`.
    """
    print("Getting stock data for stock $"+symbol)
    # `interval="1d"` asks for one bar per day. The previous `period="1d"`
    # conflicted with `start`: depending on the yfinance version it is either
    # ignored or used to cut the range to a single day. The recorded output
    # (every trading day from `day` onward) matches the interval meaning.
    df = yf.download(symbol, start=day, interval="1d")
    return df

getStockDataDaily('AAPL', "2022-12-21")

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-21,132.979996,136.809998,132.75,135.449997,135.449997,85928000
2022-12-22,134.350006,134.559998,130.300003,132.229996,132.229996,77852100
2022-12-23,130.919998,132.419998,129.639999,131.860001,131.860001,63814900
2022-12-27,131.380005,131.410004,128.720001,130.029999,130.029999,69007800
2022-12-28,129.669998,131.029999,125.870003,126.040001,126.040001,85438400
2022-12-29,127.989998,130.479996,127.730003,129.610001,129.610001,75703700
2022-12-30,128.410004,129.949997,127.43,129.929993,129.929993,76960600
2023-01-03,130.279999,130.899994,124.169998,125.07,125.07,112117500
2023-01-04,126.889999,128.660004,125.080002,126.360001,126.360001,89113600
2023-01-05,127.129997,127.769997,124.760002,125.019997,125.019997,80962700


In [150]:
def arrayToString(arr):
    """Join the elements of `arr` into one space-separated string.

    Every element is converted with `str()` first, so non-string items are
    accepted. A progress message is printed before the conversion.
    """
    print("Starting array to list")
    return ' '.join(map(str, arr))


In [151]:
def getMultiStockDataDaily(symbols, day):
    """Download daily price bars for several tickers, grouped by ticker.

    Args:
        symbols: Either a list/tuple of ticker symbols or a single
            space-separated string of symbols.
        day: First date to fetch, forwarded to yfinance as `start`.

    Returns:
        The DataFrame produced by `yf.download(..., group_by='ticker')`.
    """
    # A plain string is already in the form yfinance expects. Anything else is
    # joined with spaces. Previously a one-element list skipped the join and
    # failed on the string concatenation below, while a bare string such as
    # 'AAPL' (len > 1) was split into single characters.
    if not isinstance(symbols, str):
        if len(symbols) > 1:
            print("Length of symbols array is more than 1. STARTING ARRAYTOSTRING")
        symbols = arrayToString(symbols)
    print("Getting stock data for stock $"+symbols)
    # `interval="1d"` requests daily bars; the earlier `period="1d"` conflicted
    # with `start` and its effect depends on the yfinance version.
    df = yf.download(symbols, start=day, interval="1d", group_by='ticker')
    return df

getMultiStockDataDaily(['AAPL', 'TSLA'], "2022-12-21")

Length of symbols array is more than 1. STARTING ARRAYTOSTRING
Starting array to list
Getting stock data for stock $AAPL TSLA
[*********************100%***********************]  2 of 2 completed


Unnamed: 0_level_0,TSLA,TSLA,TSLA,TSLA,TSLA,TSLA,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2022-12-21,139.339996,141.259995,135.889999,137.570007,137.570007,145417400,132.979996,136.809998,132.75,135.449997,135.449997,85928000
2022-12-22,136.0,136.630005,122.260002,125.349998,125.349998,210090300,134.350006,134.559998,130.300003,132.229996,132.229996,77852100
2022-12-23,126.370003,128.619995,121.019997,123.150002,123.150002,166989700,130.919998,132.419998,129.639999,131.860001,131.860001,63814900
2022-12-27,117.5,119.669998,108.760002,109.099998,109.099998,208643400,131.380005,131.410004,128.720001,130.029999,130.029999,69007800
2022-12-28,110.349998,116.269997,108.239998,112.709999,112.709999,221070500,129.669998,131.029999,125.870003,126.040001,126.040001,85438400
2022-12-29,120.389999,123.57,117.5,121.82,121.82,221923300,127.989998,130.479996,127.730003,129.610001,129.610001,75703700
2022-12-30,119.949997,124.480003,119.75,123.18,123.18,157304500,128.410004,129.949997,127.43,129.929993,129.929993,76960600
2023-01-03,118.470001,118.800003,104.639999,108.099998,108.099998,231402800,130.279999,130.899994,124.169998,125.07,125.07,112117500
2023-01-04,109.110001,114.589996,107.519997,113.639999,113.639999,180389000,126.889999,128.660004,125.080002,126.360001,126.360001,89113600
2023-01-05,110.510002,111.75,107.160004,110.339996,110.339996,157986300,127.129997,127.769997,124.760002,125.019997,125.019997,80962700


In [152]:
def getStockClose(symbol):
    """Return the 'Close' value from the last row of the ticker's history.

    The history frame comes from `yf.Ticker(symbol).history()` called with its
    default arguments. The symbol and the value are printed before returning.
    """
    print("Getting stock close for stock $" + symbol)
    history = yf.Ticker(symbol).history()
    latest_close = history['Close'].iloc[-1]
    print(symbol, latest_close)
    return latest_close

getStockClose("AAPL")

Getting stock close for stock $AAPL
AAPL 130.72999572753906


130.72999572753906

In [153]:
def getStockOpen(symbol):
    """Return the 'Open' value from the last row of the ticker's history.

    The history frame comes from `yf.Ticker(symbol).history()` called with its
    default arguments. The symbol and the value are printed before returning.
    """
    print("Getting stock open for stock $" + symbol)
    history = yf.Ticker(symbol).history()
    latest_open = history['Open'].iloc[-1]
    print(symbol, latest_open)
    return latest_open

getStockOpen("AAPL")

Getting stock open for stock $AAPL
AAPL 130.25999450683594


130.25999450683594

In [154]:
def getStockHigh(symbol):
    """Return the 'High' value from the last row of the ticker's history.

    The history frame comes from `yf.Ticker(symbol).history()` called with its
    default arguments. The symbol and the value are printed before returning.
    """
    print("Getting stock high for stock $" + symbol)
    history = yf.Ticker(symbol).history()
    latest_high = history['High'].iloc[-1]
    print(symbol, latest_high)
    return latest_high

getStockHigh("AAPL")

Getting stock high for stock $AAPL
AAPL 131.25999450683594


131.25999450683594

In [155]:
def getStockLow(symbol):
    """Return the 'Low' value from the last row of the ticker's history.

    The history frame comes from `yf.Ticker(symbol).history()` called with its
    default arguments. The symbol and the value are printed before returning.
    """
    print("Getting stock low for stock $" + symbol)
    history = yf.Ticker(symbol).history()
    latest_low = history['Low'].iloc[-1]
    print(symbol, latest_low)
    return latest_low

getStockLow("AAPL")

Getting stock low for stock $AAPL
AAPL 128.1199951171875


128.1199951171875

# Gathering FinViz Data (Today's Sentiment) #

In [156]:
# Notebook parameters: edit these to change what gets scraped and previewed.
tickers = ['AAPL', 'TSLA', 'AMZN']  # symbols whose FinViz pages are fetched
n = 3  # number of article headlines displayed per ticker

In [157]:
# Fetch the general news feed through the finvizfinance client.
from finvizfinance.news import News
fnews = News()
# Per the recorded output, the result is a dict whose 'news' entry is a table
# with Date, Title, Source and Link columns.
all_news = fnews.get_news()
# Bare expression so the notebook renders the fetched result.
all_news

{'news':        Date                                              Title  \
 0   12:50AM                                  Risk on, risk off   
 1   12:12AM  Best places to work in 2023, according to Glas...   
 2   12:00AM  Europe Still Winning on LNG Imports Even as Pr...   
 3    Jan-10  Cost of living: Retailer Uniqlo to raise pay i...   
 4    Jan-10  U.S. dollar on the verge of first ‘death cross...   
 ..      ...                                                ...   
 85   Jan-10  Review of Nickel Blowup Calls for Changes at L...   
 86   Jan-10  Investors Should Keep an Eye on Japan's Kinky ...   
 87   Jan-10  Brazil's 2022 inflation slows sharply, misses ...   
 88   Jan-10  BOE Warns Banks of Tougher Scrutiny on Credit ...   
 89   Jan-10  Boeing’s Flurry of Deliveries Spurs Best Month...   
 
                  Source                                               Link  
 0       www.reuters.com  https://www.reuters.com/markets/global-markets...  
 1       foxbusiness.com  http

In [158]:
# Get Data
# Fetch each ticker's FinViz quote page and keep its news table, then preview
# the first `n` headlines per ticker.
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

# Browser-like User-Agent sent with every request; built once, not per ticker.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36."}

for ticker in tickers:
    url = finwiz_url + ticker
    print("current url is: " +url)
    req = Request(url=url,headers=header)
    # The context manager closes the HTTP response once the page is parsed.
    with urlopen(req) as resp:
        html = BeautifulSoup(resp, features="lxml")
    # `find` yields None when the page has no element with id 'news-table'.
    news_table = html.find(id='news-table')
    news_tables[ticker] = news_table

for ticker in tickers:
    news_table = news_tables.get(ticker)
    if news_table is None:
        # Report the gap explicitly instead of swallowing it or crashing.
        print('No news table found for {}'.format(ticker))
        continue
    table_rows = news_table.find_all('tr')

    print ('\n')
    print ('Recent News Headlines for {}: '.format(ticker))

    # Counting printed rows (rather than breaking at i == n-1) shows nothing
    # when n <= 0; the old check never fired then and printed every row.
    shown = 0
    for table_row in table_rows:
        if shown >= n:
            break
        # Rows without a link or a timestamp cell are skipped, not fatal.
        if table_row.a is None or table_row.td is None:
            continue
        a_text = table_row.a.text
        td_text = table_row.td.text.strip()
        print(a_text,'(',td_text,')')
        shown += 1

current url is: https://finviz.com/quote.ashx?t=AAPL
current url is: https://finviz.com/quote.ashx?t=TSLA
current url is: https://finviz.com/quote.ashx?t=AMZN


Recent News Headlines for AAPL: 
Exclusive-Apple supplier BOE plans new factories in Vietnam -sources ( Jan-10-23 10:13PM )
Apple to Begin Making In-House Screens in 2024 in Shift Away From Samsung ( 10:04PM )
UPDATE 1-Apple to start using in-house screens from 2024 - Bloomberg News ( 08:39PM )


Recent News Headlines for TSLA: 
Market Rally Reclaims Key Levels Ahead Of CPI Inflation Report; Medpace, First Solar Flash Buy Signals ( Jan-11-23 12:09AM )
Tesla plans $717 million expansion at its Austin gigafactory ( Jan-10-23 09:21PM )
Tesla Considers $775 Million Texas Factory Expansion ( 09:15PM )


Recent News Headlines for AMZN: 
NFL Regular-Season Ratings Fell Amid Thursday Night Football Move to Amazon ( Jan-10-23 05:51PM )
What Is ChatGPT and Why Is Microsoft Interested in Buying It? ( 04:58PM )
Amazons Audible Says Audiobo

In [159]:
# Iterate through the news
# Flatten every scraped news table into rows of [ticker, date, time, text].
parsed_news = []
for file_name, news_table in news_tables.items():
    if news_table is None:
        # No news table was found for this ticker; nothing to parse.
        continue

    ticker = file_name.split('_')[0]
    # Reset per table so a leading time-only row cannot inherit the previous
    # ticker's date or hit an unbound name.
    date = None

    for x in news_table.find_all('tr'):
        # Skip rows with no timestamp cell or an empty one.
        if x.td is None:
            continue
        date_scrape = x.td.text.split()
        if not date_scrape:
            continue

        text = x.get_text()
        print(text)

        # The time goes in `news_time`, not `time`, so the imported `time`
        # module is no longer overwritten by a string.
        if len(date_scrape) == 1:
            # Time-only row: the date carries over from the last dated row.
            news_time = date_scrape[0]
        else:
            date = date_scrape[0]
            news_time = date_scrape[1]

        parsed_news.append([ticker, date, news_time, text])

Jan-10-23 10:13PMExclusive-Apple supplier BOE plans new factories in Vietnam -sources Reuters
10:04PMApple to Begin Making In-House Screens in 2024 in Shift Away From Samsung Bloomberg
08:39PMUPDATE 1-Apple to start using in-house screens from 2024 - Bloomberg News Reuters
07:56PMApple to start using in-house screens from 2024 - Bloomberg News Reuters
05:45PMApple (AAPL) Gains But Lags Market: What You Should Know Zacks

05:15PM
Loading…

05:15PMWork From Home Under Fire As Disney CEO Sets Office Mandate Investopedia
02:15PMApple says its paid out $320 billion to developers since 2008 as App Store fight heats up Yahoo Finance
02:14PMSEC alleges South Florida electronics retailer ran pump-and-dump scheme American City Business Journals
02:10PMNetflix stock: 'Glass Onion,' 'Wednesday,' and 'Troll' drive resurgence Yahoo Finance
01:45PMApple Is Chipping Away at Broadcom Now The Wall Street Journal
11:34AMApple to replace Broadcom chips with in-house design by 2025: Report Yahoo Finance Vi

# Gathering Data From Alpha Vantage for Historical News #

In [166]:
# Dependencies for the news-API section of the notebook.
from decouple import config
import requests
import urllib.parse
import json
# The API key is looked up by name through decouple's `config`, so the key
# itself never appears in the notebook.
AAapikey = config('AAKey')

In [167]:
# Convert a user-provided date to the timestamp format sent as time_from/time_to
def toAADate(oldDate):
    """Return `oldDate` as a 'YYYYMMDDT0001' string.

    Args:
        oldDate: A compact 'YYYYMMDD' string or int, a dashed 'YYYY-MM-DD'
            string, or a `datetime.date` / `datetime.datetime` object. Only the
            calendar date of a datetime is used.

    Returns:
        The date digits followed by the fixed suffix 'T0001'.
    """
    # Local import keeps the function self-contained.
    import datetime as _dt

    if isinstance(oldDate, _dt.date):
        # Covers datetime.datetime too, since it subclasses datetime.date.
        day = oldDate.strftime('%Y%m%d')
    else:
        # Dashed ISO dates are normalised; compact values pass through as-is.
        day = str(oldDate).strip().replace('-', '')
    newDate = day + 'T0001'
    return newDate

In [168]:
# Get news-sentiment data from Alpha Vantage for one ticker, optionally bounded by dates
def getHistoricNewsData(ticker, startDate = None, endDate = None):
    """Query the NEWS_SENTIMENT endpoint and return the decoded JSON.

    Args:
        ticker: Value sent as the `tickers` query parameter.
        startDate: Optional lower bound; converted with `toAADate` and sent as
            `time_from`.
        endDate: Optional upper bound; converted with `toAADate` and sent as
            `time_to`.

    Returns:
        The response body parsed by `Response.json()`.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = 'https://www.alphavantage.co/query?'
    Myparams = {'function': 'NEWS_SENTIMENT', 'tickers': ticker, 'sort': 'LATEST','limit': 100, 'apikey': AAapikey}
    # Each bound is applied on its own; previously supplying only one of them
    # silently dropped it.
    # NOTE(review): confirm the API accepts time_to without time_from.
    if startDate:
        Myparams['time_from'] = toAADate(startDate)
    if endDate:
        Myparams['time_to'] = toAADate(endDate)
    # A timeout stops a stalled connection from hanging the notebook.
    r = requests.get(url, params = Myparams, timeout = 30)
    # Surface HTTP failures here instead of as a confusing JSON decode error.
    r.raise_for_status()
    data = r.json()
    return data

newsJson = getHistoricNewsData('AAPL', '20230108', '20230109')

In [169]:
# Inspect the top-level keys of the parsed news response.
newsJson.keys()

dict_keys(['items', 'sentiment_score_definition', 'relevance_score_definition', 'feed'])

In [170]:
# Inspect the fields carried by a single feed entry (the first one).
newsJson.get("feed")[0].keys()

dict_keys(['title', 'url', 'time_published', 'authors', 'summary', 'banner_image', 'source', 'category_within_source', 'source_domain', 'topics', 'overall_sentiment_score', 'overall_sentiment_label', 'ticker_sentiment'])

In [171]:
# Print the overall sentiment score of every entry in the feed.
# A response without a "feed" key yields no output instead of a TypeError from
# iterating over None.
for i in newsJson.get("feed") or []:
    print(i.get("overall_sentiment_score"))

0.043248
0.103426
0.212346
0.20584
0.358923
-0.041978


# Sentiment Analysis of FinViz data #

In [172]:
# Fetch the VADER lexicon through NLTK's downloader; the recorded output shows
# it reports "already up-to-date" when the package is present.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [173]:
# Sentiment Analysis
# Score every scraped headline with VADER and attach the scores as new columns.
analyzer = SentimentIntensityAnalyzer()

columns = ['Ticker', 'Date', 'Time', 'Headline']
news = pd.DataFrame(parsed_news, columns=columns)

# One polarity dict per headline, in row order.
scores = [analyzer.polarity_scores(headline) for headline in news['Headline']]

# Both frames share the default row index, so the join lines up row for row.
df_scores = pd.DataFrame(scores)
news = news.join(df_scores, rsuffix='_right')

In [174]:
# View Data
# Preview each ticker's scored headlines and build a table of mean compound
# sentiment per ticker, sorted from most positive to most negative.
news['Date'] = pd.to_datetime(news.Date).dt.date

unique_ticker = news['Ticker'].unique().tolist()
news_dict = {name: news.loc[news['Ticker'] == name] for name in unique_ticker}

values = []
for ticker in tickers:
    dataframe = news_dict.get(ticker)
    if dataframe is None:
        # A configured ticker with no scraped rows used to raise KeyError.
        # Record NaN so `values` stays aligned with `tickers` for the zip below.
        print ('\n')
        print ('No headlines found for {}'.format(ticker))
        values.append(float('nan'))
        continue

    dataframe = dataframe.set_index('Ticker')
    dataframe = dataframe.drop(columns = ['Headline'])
    print ('\n')
    print (dataframe.head())

    mean = round(dataframe['compound'].mean(), 2)
    values.append(mean)

df = pd.DataFrame(list(zip(tickers, values)), columns =['Ticker', 'Mean Sentiment'])
df = df.set_index('Ticker')
df = df.sort_values('Mean Sentiment', ascending=False)
print ('\n')
print (df)



              Date     Time    neg    neu    pos  compound
Ticker                                                    
AAPL    2023-01-10  10:13PM  0.000  1.000  0.000    0.0000
AAPL    2023-01-10  10:04PM  0.000  1.000  0.000    0.0000
AAPL    2023-01-10  08:39PM  0.000  1.000  0.000    0.0000
AAPL    2023-01-10  07:56PM  0.000  1.000  0.000    0.0000
AAPL    2023-01-10  05:45PM  0.233  0.645  0.122   -0.3716


              Date     Time  neg    neu    pos  compound
Ticker                                                  
TSLA    2023-01-11  12:09AM  0.0  1.000  0.000      0.00
TSLA    2023-01-10  09:21PM  0.0  1.000  0.000      0.00
TSLA    2023-01-10  09:15PM  0.0  1.000  0.000      0.00
TSLA    2023-01-10  05:45PM  0.0  0.821  0.179      0.34
TSLA    2023-01-10  05:10PM  0.0  1.000  0.000      0.00


              Date     Time    neg    neu    pos  compound
Ticker                                                    
AMZN    2023-01-10  05:51PM  0.000  0.898  0.102    0.1779
AMZN 

In [175]:
# Smoke-test the getStockOpen helper and keep AAPL's latest open price.
# (The original note attributes these helpers to stock_helper_functions.ipynb;
# getStockOpen is also defined earlier in this notebook.)
aapl_open = getStockOpen("AAPL")

Getting stock open for stock $AAPL
AAPL 130.25999450683594


# Creating Dataset #

In [176]:
# Look up AMZN's row in the per-ticker mean-sentiment table `df`.
num = df.loc['AMZN']
# Bare expression so the notebook renders the selected row.
num

Mean Sentiment   -0.1
Name: AMZN, dtype: float64

In [178]:
#Pulling stock data
# Download daily prices for each ticker and tag every row with that ticker's
# mean sentiment. Frames are also collected in `datasets` (keyed by symbol),
# because `dataset` alone only holds the last ticker after the loop.
datasets = {}
for i in tickers:
    dataset = getStockDataDaily(i, day='2022-12-27')
    # Select the scalar explicitly; float() on a one-row Series is deprecated
    # in recent pandas releases.
    dataset['Sentiment'] = float(df.loc[i, 'Mean Sentiment'])
    datasets[i] = dataset
    print(dataset)
    

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2022-12-27  131.380005  131.410004  128.720001  130.029999  130.029999   
2022-12-28  129.669998  131.029999  125.870003  126.040001  126.040001   
2022-12-29  127.989998  130.479996  127.730003  129.610001  129.610001   
2022-12-30  128.410004  129.949997  127.430000  129.929993  129.929993   
2023-01-03  130.279999  130.899994  124.169998  125.070000  125.070000   
2023-01-04  126.889999  128.660004  125.080002  126.360001  126.360001   
2023-01-05  127.129997  127.769997  124.760002  125.019997  125.019997   
2023-01-06  126.010002  130.289993  124.889999  129.619995  129.619995   
2023-01-09  130.470001  133.410004  129.889999  130.149994  130.149994   
2023-01-10  130.259995  131.259995  128.119995  130.729996  130.729996   

       