<a href="https://colab.research.google.com/github/ishaanpaul98/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports #

In [267]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import re

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import datetime
import yfinance as yf
import time
import requests
import io

In [268]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [269]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [270]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report
from sklearn.metrics import plot_confusion_matrix

# Stock Data Helper Functions #

In [271]:
def getStockDataDaily(symbol, day):
    print("Getting stock data for stock $"+symbol)
    df = yf.download(symbol, start=day, period = "1d")
    return df

getStockDataDaily('AAPL', "2022-12-21")

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-21,132.979996,136.809998,132.75,135.449997,135.449997,85928000
2022-12-22,134.350006,134.559998,130.300003,132.229996,132.229996,77852100
2022-12-23,130.919998,132.419998,129.639999,131.860001,131.860001,63814900
2022-12-27,131.380005,131.410004,128.720001,130.029999,130.029999,69007800
2022-12-28,129.669998,131.029999,125.870003,126.040001,126.040001,85438400
2022-12-29,127.989998,130.479996,127.730003,129.610001,129.610001,75703700
2022-12-30,128.410004,129.949997,127.43,129.929993,129.929993,76960600
2023-01-03,130.279999,130.899994,124.169998,125.07,125.07,112117500
2023-01-04,126.889999,128.660004,125.080002,126.360001,126.360001,89113600
2023-01-05,127.129997,127.769997,124.760002,125.019997,125.019997,80962700


In [272]:
def arrayToString(arr):
    print("Starting array to list")
    listToStr = ' '.join([str(elem) for elem in arr])
    return listToStr


In [273]:
def getMultiStockDataDaily(symbols, day):
    if len(symbols) > 1:
        print("Length of symbols array is more than 1. STARTING ARRAYTOSTRING")
        symbols = arrayToString(symbols)
    print("Getting stock data for stock $"+symbols)
    df = yf.download(symbols, start=day, period = "1d", group_by='ticker')
    return df

getMultiStockDataDaily(['AAPL', 'TSLA'], "2022-12-21")

Length of symbols array is more than 1. STARTING ARRAYTOSTRING
Starting array to list
Getting stock data for stock $AAPL TSLA
[*********************100%***********************]  2 of 2 completed


Unnamed: 0_level_0,TSLA,TSLA,TSLA,TSLA,TSLA,TSLA,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2022-12-21,139.339996,141.259995,135.889999,137.570007,137.570007,145417400,132.979996,136.809998,132.75,135.449997,135.449997,85928000
2022-12-22,136.0,136.630005,122.260002,125.349998,125.349998,210090300,134.350006,134.559998,130.300003,132.229996,132.229996,77852100
2022-12-23,126.370003,128.619995,121.019997,123.150002,123.150002,166989700,130.919998,132.419998,129.639999,131.860001,131.860001,63814900
2022-12-27,117.5,119.669998,108.760002,109.099998,109.099998,208643400,131.380005,131.410004,128.720001,130.029999,130.029999,69007800
2022-12-28,110.349998,116.269997,108.239998,112.709999,112.709999,221070500,129.669998,131.029999,125.870003,126.040001,126.040001,85438400
2022-12-29,120.389999,123.57,117.5,121.82,121.82,221923300,127.989998,130.479996,127.730003,129.610001,129.610001,75703700
2022-12-30,119.949997,124.480003,119.75,123.18,123.18,157304500,128.410004,129.949997,127.43,129.929993,129.929993,76960600
2023-01-03,118.470001,118.800003,104.639999,108.099998,108.099998,231402800,130.279999,130.899994,124.169998,125.07,125.07,112117500
2023-01-04,109.110001,114.589996,107.519997,113.639999,113.639999,180389000,126.889999,128.660004,125.080002,126.360001,126.360001,89113600
2023-01-05,110.510002,111.75,107.160004,110.339996,110.339996,157986300,127.129997,127.769997,124.760002,125.019997,125.019997,80962700


In [274]:
def getMonthlyStockData(symbol, day, interval):
    print("Getting stock data for stock $"+symbol)
    df = yf.download(symbol, start=day, period = interval, group_by='ticker')
    return df

getMonthlyStockData('AAPL', "2022-11-21", '1mo')

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-11-21,150.160004,150.369995,147.720001,148.009995,148.009995,58724100
2022-11-22,148.130005,150.419998,146.929993,150.179993,150.179993,51804100
2022-11-23,149.449997,151.830002,149.339996,151.070007,151.070007,58301400
2022-11-25,148.309998,148.880005,147.119995,148.110001,148.110001,35195900
2022-11-28,145.139999,146.639999,143.380005,144.220001,144.220001,69246000
2022-11-29,144.289993,144.809998,140.350006,141.169998,141.169998,83763800
2022-11-30,141.399994,148.720001,140.550003,148.029999,148.029999,111380900
2022-12-01,148.210007,149.130005,146.610001,148.309998,148.309998,71250400
2022-12-02,145.960007,148.0,145.649994,147.809998,147.809998,65447400
2022-12-05,147.770004,150.919998,145.770004,146.630005,146.630005,68826400


# Gathering FinViz Data (Today's News) #

In [275]:
# Parameters 
n = 3 #the # of article headlines displayed per ticker
tickers = ['AAPL']

In [276]:
from finvizfinance.news import News
fnews = News()
all_news = fnews.get_news()
all_news

{'news':        Date                                              Title  \
 0   06:15PM  Biden would veto House GOP bill restricting pr...   
 1   06:15PM  The first ETF launched 30 years ago, revolutio...   
 2   06:08PM  Asia Stocks Set to Rise in Wake of US Tech Ral...   
 3   06:03PM         Tech Shares Lead Rally as Nasdaq Climbs 2%   
 4   05:47PM  Asia Stocks Set to Rise in Wake of US Tech Ral...   
 ..      ...                                                ...   
 85  06:14AM  Pakistan Raises Key Rate by 100 Basis Points a...   
 86  06:12AM  Ghana Eurobond Holders Fret Over Better Terms ...   
 87  06:08AM                Morning bid: Who let the hawks out?   
 88  06:00AM  Ritchie Bros. Sweetens Deal for IAA, With Star...   
 89  05:53AM  BOE Says Insurers Are Too ‘Optimistic’ About A...   
 
                Source                                               Link  
 0     foxbusiness.com  https://foxbusiness.com/politics/biden-would-v...  
 1     foxbusiness.com  https://fo

In [277]:
# Get Data
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

for ticker in tickers:
    url = finwiz_url + ticker
    print("current url is: " +url)
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36."}
    req = Request(url=url,headers=header) 
    resp = urlopen(req)    
    html = BeautifulSoup(resp, features="lxml")
    news_table = html.find(id='news-table')
    news_tables[ticker] = news_table

try:
    for ticker in tickers:
        df = news_tables[ticker]
        df_tr = df.findAll('tr')
    
        print ('\n')
        print ('Recent News Headlines for {}: '.format(ticker))
        
        for i, table_row in enumerate(df_tr):
            a_text = table_row.a.text
            td_text = table_row.td.text
            td_text = td_text.strip()
            print(a_text,'(',td_text,')')
            if i == n-1:
                break
except KeyError:
    pass

current url is: https://finviz.com/quote.ashx?t=AAPL


Recent News Headlines for AAPL: 
25 Largest Privately Held Companies in America ( Jan-23-23 05:18PM )
Dow Jones Rallies 250 Points; What To Do Now; 10 Best Stocks To Buy And Watch ( 04:50PM )
Apple reportedly to debut new line of VR headsets ( 04:09PM )


In [278]:
# Iterate through the news
parsed_news = []
for file_name, news_table in news_tables.items():
    for x in news_table.findAll('tr'):
        print(x.get_text())
        text = x.get_text() 
        date_scrape = x.td.text.split()

        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        ticker = file_name.split('_')[0]
        
        parsed_news.append([ticker, date, time, text ])

Jan-23-23 05:18PM25 Largest Privately Held Companies in America Insider Monkey
04:50PMDow Jones Rallies 250 Points; What To Do Now; 10 Best Stocks To Buy And Watch Investor's Business Daily
04:09PMApple reportedly to debut new line of VR headsets Yahoo Finance Video
03:10PMWhy Apple Stock Was Climbing Today Motley Fool
02:48PMApple in talks with Disney, others on VR content for new headset - Bloomberg News Reuters

01:50PM
Loading…

01:50PMThe big banks want to take on PayPal in e-commerce, but thats harder than it seems MarketWatch
01:27PMHow Apples Upcoming Mixed-Reality Headset Will Work Bloomberg
12:29PMApple, Disney, Salesforce: Why are the worlds best companies failing to innovate on the future of work? Fortune
12:15PMBig Banks Are Coming After PayPal and Apple With Digital Wallet: WSJ Barrons.com
12:10PMWhy Apple (AAPL) is Poised to Beat Earnings Estimates Again Zacks
12:00PMStock Market Recovery: These 4 Stocks Have Been on the Rise in 2023 Motley Fool
11:01AMWhy Buying This FA

# Gathering Data From AlphaAdvantage for Historical News #

In [323]:
from decouple import config
import requests
import urllib.parse
import json
import datetime
AAapikey = config('AAKey')

In [347]:
#Function to convert user provided date to date required by AlphaAdvantage
def toAADate(oldDate):
    newDate = oldDate.strftime("%Y%m%dT001")
    return str(newDate)

In [348]:
# Get data from AlphaAdvantage for one ticker for a particular day
def getHistoricNewsData(ticker, endDate = datetime.date.today() - datetime.timedelta(days = 1) , interval = '1mo'):
    url = 'https://www.alphavantage.co/query?'
    print("endDate is: " +str(endDate))
    if interval == '1mo':
        days_to_add = 30
    else:
        days_to_add = 60
    delta_days = datetime.timedelta(days = days_to_add)
    startDate = endDate - delta_days
    print("startDate is: " +str(startDate))
    if startDate and endDate:
        startDate = toAADate(startDate)
        endDate = toAADate(endDate)
        Myparams = {'function': 'NEWS_SENTIMENT', 'tickers': ticker, 'time_from': startDate, 'time_to': endDate, 'sort': 'LATEST','limit': 100, 'apikey': AAapikey}
    else:
        Myparams = {'function': 'NEWS_SENTIMENT', 'tickers': ticker, 'sort': 'LATEST','limit': 100, 'apikey': AAapikey}
    r = requests.get(url, params = Myparams)
    data = r.json()
    #return data
    historic_news = pd.DataFrame(columns=['Date', 'Headline', 'Ticker'])
    for i in data.get("feed"):
        test_date = i.get("time_published")
        test_date = test_date[:8]
        row = [test_date, i.get("title"), ticker]
        new_df = pd.DataFrame([row],columns=['Date', 'Headline', 'Ticker'])
        historic_news = pd.concat([historic_news, new_df], axis=0, ignore_index=True)
    return historic_news

#historic_news = getHistoricNewsData('AAPL', '2022-10-10', '2mo')
historic_news = getHistoricNewsData('AAPL')
print(historic_news.head())

endDate is: 2023-01-22
startDate is: 2022-12-23
       Date                                           Headline Ticker
0  20230124  Tesla Bull Says Time For Tim Cook To Step Down...   AAPL
1  20230123  'It is an employer's market': Tech layoffs may...   AAPL
2  20230123  Bitcoin Prices Retain Bulk Of Gains After Clim...   AAPL
3  20230123  Futures: Stocks Extend Gains; 10 Stocks To Buy...   AAPL
4  20230123  Amazon Earnings Set to Disappoint: Time to Buy...   AAPL


# Sentiment Analysis of News data #

In [282]:
def SentimentAnalysisNewsData(parsedNews):
    nltk.download('vader_lexicon')
    # Sentiment Analysis
    analyzer = SentimentIntensityAnalyzer()

    columns = ['Ticker', 'Date', 'Time', 'Headline']
    news = pd.DataFrame(parsedNews, columns=columns)
    scores = news['Headline'].apply(analyzer.polarity_scores).tolist()

    df_scores = pd.DataFrame(scores)
    news = news.join(df_scores, rsuffix='_right')
    news['Date'] = pd.to_datetime(news.Date).dt.date

    unique_ticker = news['Ticker'].unique().tolist()
    news_dict = {name: news.loc[news['Ticker'] == name] for name in unique_ticker}

    values = []
    for ticker in tickers: 
        dataframe = news_dict[ticker]
        dataframe = dataframe.set_index('Ticker')
        dataframe = dataframe.drop(columns = ['Headline'])
        print ('\n')
        print (dataframe.head())
        
        mean = round(dataframe['compound'].mean(), 2)
        values.append(mean)
        
    df = pd.DataFrame(list(zip(tickers, values)), columns =['Ticker', 'Mean Sentiment']) 
    df = df.set_index('Ticker')
    df = df.sort_values('Mean Sentiment', ascending=False)
    print ('\n')
    print (df)
    return df

In [283]:
HistoricSentiment = SentimentAnalysisNewsData(historic_news)
TodaysSentiment = SentimentAnalysisNewsData(parsed_news)



              Date  Time    neg    neu    pos  compound
Ticker                                                 
AAPL    2023-01-23   NaN  0.090  0.574  0.336    0.7906
AAPL    2023-01-23   NaN  0.000  0.806  0.194    0.3400
AAPL    2023-01-23   NaN  0.000  0.661  0.339    0.4767
AAPL    2023-01-23   NaN  0.175  0.714  0.110   -0.2500
AAPL    2023-01-23   NaN  0.000  1.000  0.000    0.0000


        Mean Sentiment
Ticker                
AAPL              0.14


              Date     Time    neg    neu    pos  compound
Ticker                                                    
AAPL    2023-01-23  05:18PM  0.000  1.000  0.000    0.0000
AAPL    2023-01-23  04:50PM  0.000  0.811  0.189    0.6369
AAPL    2023-01-23  04:09PM  0.000  1.000  0.000    0.0000
AAPL    2023-01-23  03:10PM  0.293  0.707  0.000   -0.4404
AAPL    2023-01-23  02:48PM  0.000  1.000  0.000    0.0000


        Mean Sentiment
Ticker                
AAPL             -0.09


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Creating Dataset #

In [297]:
def createDataset(date_from, interval):
    for i in tickers:
        print("Creating dataset for $" +i)
        #Get historic stock data
        historic_stock = getMonthlyStockData(i, date_from, interval)
        print(historic_stock.head())
        #Get historic news data and sentiment
        historic_news = getHistoricNewsData(i, '2023012', '20230117')
        #Merge as training set
        #Get today's stock data
        #Get today's news data and sentiment


createDataset('2022-10-10', '2mo')

Creating dataset for $AAPL
Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2022-10-10  140.419998  141.889999  138.570007  140.419998  140.187439   
2022-10-11  139.899994  141.350006  138.220001  138.979996  138.749832   
2022-10-12  139.130005  140.360001  138.160004  138.339996  138.110886   
2022-10-13  134.990005  143.589996  134.369995  142.990005  142.753204   
2022-10-14  144.309998  144.520004  138.190002  138.380005  138.150833   

               Volume  
Date                   
2022-10-10   74899000  
2022-10-11   77033700  
2022-10-12   70433700  
2022-10-13  113224000  
2022-10-14   88598000  
