<a href="https://colab.research.google.com/github/ishaanpaul98/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports #

In [539]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import re

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import datetime
import yfinance as yf
import time
import requests
import io

In [540]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [541]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [542]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report
from sklearn.metrics import plot_confusion_matrix

# Stock Data Helper Functions #

In [543]:
def getStockDataDaily(symbol, day):
    print("Getting stock data for stock $"+symbol)
    df = yf.download(symbol, start=day, period = "1d")
    return df

getStockDataDaily('AAPL', "2022-12-21")

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-21,132.979996,136.809998,132.75,135.449997,135.2435,85928000
2022-12-22,134.350006,134.559998,130.300003,132.229996,132.028412,77852100
2022-12-23,130.919998,132.419998,129.639999,131.860001,131.658981,63814900
2022-12-27,131.380005,131.410004,128.720001,130.029999,129.831772,69007800
2022-12-28,129.669998,131.029999,125.870003,126.040001,125.847855,85438400
2022-12-29,127.989998,130.479996,127.730003,129.610001,129.412415,75703700
2022-12-30,128.410004,129.949997,127.43,129.929993,129.731918,76960600
2023-01-03,130.279999,130.899994,124.169998,125.07,124.879326,112117500
2023-01-04,126.889999,128.660004,125.080002,126.360001,126.167366,89113600
2023-01-05,127.129997,127.769997,124.760002,125.019997,124.829399,80962700


In [544]:
def arrayToString(arr):
    print("Starting array to list")
    listToStr = ' '.join([str(elem) for elem in arr])
    return listToStr


In [545]:
def getMultiStockDataDaily(symbols, day):
    if len(symbols) > 1:
        print("Length of symbols array is more than 1. STARTING ARRAYTOSTRING")
        symbols = arrayToString(symbols)
    print("Getting stock data for stock $"+symbols)
    df = yf.download(symbols, start=day, period = "1d", group_by='ticker')
    return df

getMultiStockDataDaily(['AAPL', 'TSLA'], "2022-12-21")

Length of symbols array is more than 1. STARTING ARRAYTOSTRING
Starting array to list
Getting stock data for stock $AAPL TSLA
[*********************100%***********************]  2 of 2 completed


Unnamed: 0_level_0,TSLA,TSLA,TSLA,TSLA,TSLA,TSLA,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2022-12-21,139.339996,141.259995,135.889999,137.570007,137.570007,145417400,132.979996,136.809998,132.75,135.449997,135.2435,85928000
2022-12-22,136.0,136.630005,122.260002,125.349998,125.349998,210090300,134.350006,134.559998,130.300003,132.229996,132.028412,77852100
2022-12-23,126.370003,128.619995,121.019997,123.150002,123.150002,166989700,130.919998,132.419998,129.639999,131.860001,131.658981,63814900
2022-12-27,117.5,119.669998,108.760002,109.099998,109.099998,208643400,131.380005,131.410004,128.720001,130.029999,129.831772,69007800
2022-12-28,110.349998,116.269997,108.239998,112.709999,112.709999,221070500,129.669998,131.029999,125.870003,126.040001,125.847855,85438400
2022-12-29,120.389999,123.57,117.5,121.82,121.82,221923300,127.989998,130.479996,127.730003,129.610001,129.412415,75703700
2022-12-30,119.949997,124.480003,119.75,123.18,123.18,157304500,128.410004,129.949997,127.43,129.929993,129.731918,76960600
2023-01-03,118.470001,118.800003,104.639999,108.099998,108.099998,231402800,130.279999,130.899994,124.169998,125.07,124.879326,112117500
2023-01-04,109.110001,114.589996,107.519997,113.639999,113.639999,180389000,126.889999,128.660004,125.080002,126.360001,126.167366,89113600
2023-01-05,110.510002,111.75,107.160004,110.339996,110.339996,157986300,127.129997,127.769997,124.760002,125.019997,124.829399,80962700


In [546]:
def getMonthlyStockData(symbol, day = datetime.date.today() - datetime.timedelta(days = 1), interval = 30):
    print("Getting stock data for stock $"+symbol)
    yfinterval = ''
    if interval == 30:
        yfinterval = '31d'
    elif interval == 60:
        yfinterval = '61d'
    else:
        print("INVALID INTERVAL")
    aapl = yf.Ticker("AAPL")
    # get historical market data
    #hist = aapl.history(period="1mo")
    df = yf.download(symbol, period = yfinterval, group_by='ticker')
    return df

getMonthlyStockData('AAPL', "2022-11-21")

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-28,129.669998,131.029999,125.870003,126.040001,125.847855,85438400
2022-12-29,127.989998,130.479996,127.730003,129.610001,129.412415,75703700
2022-12-30,128.410004,129.949997,127.43,129.929993,129.731918,76960600
2023-01-03,130.279999,130.899994,124.169998,125.07,124.879326,112117500
2023-01-04,126.889999,128.660004,125.080002,126.360001,126.167366,89113600
2023-01-05,127.129997,127.769997,124.760002,125.019997,124.829399,80962700
2023-01-06,126.010002,130.289993,124.889999,129.619995,129.422394,87686600
2023-01-09,130.470001,133.410004,129.889999,130.149994,129.951584,70790800
2023-01-10,130.259995,131.259995,128.119995,130.729996,130.530701,63896200
2023-01-11,131.25,133.509995,130.460007,133.490005,133.286499,69458900


# Gathering FinViz Data (Today's News) #

In [547]:
# Parameters 
n = 3 #the # of article headlines displayed per ticker
tickers = ['AAPL']

In [548]:
from finvizfinance.news import News
fnews = News()
all_news = fnews.get_news()
all_news

{'news':        Date                                              Title  \
 0   02:06PM  How State Street Has Used AI to Find ‘Hidden G...   
 1   01:03PM  Biden admin. leaning toward blocking JetBlue-S...   
 2   12:56PM  Crypto Firms’ In-House Tokens Are Coming Under...   
 3   12:48PM  America’s Post-Pandemic Corporate Profit Boom ...   
 4   12:03PM  Plane and Bus Collide at Los Angeles Airport, ...   
 ..      ...                                                ...   
 85   Feb-10    Zara owner Inditex agrees 20% pay hike in Spain   
 86   Feb-10  EuroGroup Laminations pares early gains in Mil...   
 87   Feb-10  Futures fall amid rising yields; Lyft sinks on...   
 88   Feb-10                          AMERICAS Volatility stirs   
 89   Feb-10  Lyft falls as forecast exacerbates worries of ...   
 
                Source                                               Link  
 0   www.bloomberg.com  https://www.bloomberg.com/news/articles/2023-0...  
 1     foxbusiness.com  https://fo

In [549]:
# Get Data
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

for ticker in tickers:
    url = finwiz_url + ticker
    print("current url is: " +url)
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36."}
    req = Request(url=url,headers=header) 
    resp = urlopen(req)    
    html = BeautifulSoup(resp, features="lxml")
    news_table = html.find(id='news-table')
    news_tables[ticker] = news_table

try:
    for ticker in tickers:
        df = news_tables[ticker]
        df_tr = df.findAll('tr')
    
        print ('\n')
        print ('Recent News Headlines for {}: '.format(ticker))
        
        for i, table_row in enumerate(df_tr):
            a_text = table_row.a.text
            td_text = table_row.td.text
            td_text = td_text.strip()
            print(a_text,'(',td_text,')')
            if i == n-1:
                break
except KeyError:
    pass

current url is: https://finviz.com/quote.ashx?t=AAPL


Recent News Headlines for AAPL: 
The Smartest Investors Are Buying These 3 Beaten-Down Stocks ( Feb-11-23 01:26PM )
Big Oil raked in record profits last year, but check out Big Tech ( 11:15AM )
Super Bowl 2023: Everything you need to know about the finances and the big game ( 09:42AM )


In [550]:
# Iterate through the news
parsed_news = []
for file_name, news_table in news_tables.items():
    for x in news_table.findAll('tr'):
        #print(x.get_text())
        text = x.get_text() 
        date_scrape = x.td.text.split()

        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        ticker = file_name.split('_')[0]
        
        parsed_news.append([ticker, date, time, text ])
print(parsed_news)

[['AAPL', 'Feb-11-23', '01:26PM', 'Feb-11-23 01:26PMThe Smartest Investors Are Buying These 3 Beaten-Down Stocks Motley Fool'], ['AAPL', 'Feb-11-23', '11:15AM', '11:15AMBig Oil raked in record profits last year, but check out Big Tech Yahoo Finance'], ['AAPL', 'Feb-11-23', '09:42AM', '09:42AMSuper Bowl 2023: Everything you need to know about the finances and the big game Yahoo Finance Video'], ['AAPL', 'Feb-11-23', '09:00AM', '09:00AM3 Artificial Intelligence Stocks That Could Beat the Market in 2023 Motley Fool'], ['AAPL', 'Feb-11-23', '08:07AM', '08:07AMWhere Will Skyworks Solutions Stock Be in 1 Year? Motley Fool'], ['AAPL', 'Feb-11-23', '05:50AM', '\n05:50AM\nLoading…\n'], ['AAPL', 'Feb-11-23', '05:50AM', '05:50AMOpinion: These Will Be the 3 Largest Stocks by 2030 Motley Fool'], ['AAPL', 'Feb-10-23', '06:44PM', 'Feb-10-23 06:44PMWeekly Roundup TheStreet.com'], ['AAPL', 'Feb-10-23', '05:55PM', "05:55PMHow to Buy Stocks Like Warren Buffett's Right-Hand Man Charlie Munger Zacks"], ['A

# Gathering Data From AlphaAdvantage for Historical News #

In [551]:
from decouple import config
import requests
import urllib.parse
import json
import datetime
AAapikey = config('AAKey')

In [552]:
#Function to convert user provided date to date required by AlphaAdvantage
def toAADate(oldDate):
    newDate = oldDate.strftime("%Y%m%dT0130")
    return str(newDate)


In [553]:
#End Date is yesterday
def getHistoricNewsData(ticker, endDate = datetime.date.today() - datetime.timedelta(days = 1), interval = 30):
    #print("Starting the breakdown")
    # Format for parsed_news is [Ticker, Date, Time, Headlines]
    historic_parsed_news = []
    url = 'https://www.alphavantage.co/query?'
    window = 10
    startDate = endDate - datetime.timedelta(days = interval)
    rollingEndDate = startDate + datetime.timedelta(days = window)
    delta = datetime.timedelta(days = window)
    while (rollingEndDate <= endDate):
        startDateAA = toAADate(startDate)
        rollingEndDateAA = toAADate(rollingEndDate)
        Myparams = {'function': 'NEWS_SENTIMENT', 'tickers': ticker, 'time_from': startDateAA, 'time_to': rollingEndDateAA, 'sort': 'EARLIEST','limit': 200, 'apikey': AAapikey}
        r = requests.get(url, params = Myparams)
        data = r.json()
        #print(data)
        for i in data.get("feed"):
            test_date = i.get("time_published")
            test_date = test_date[:8]
            newDate = datetime.datetime.strptime(test_date, '%Y%m%d').date() 
            historic_parsed_news.append([ticker, newDate,'0001', i.get("title")])
        startDate = rollingEndDate
        rollingEndDate = rollingEndDate + delta
    #print(historic_parsed_news)
    return historic_parsed_news


#historic_parsed_news = getHistoricNewsData('AAPL', interval = 30)

In [554]:
print(historic_parsed_news)



In [555]:
# Get data from AlphaAdvantage for one ticker for a particular day
def oldGetHistoricNewsData(ticker, endDate = datetime.date.today() - datetime.timedelta(days = 1) , interval = '1mo'):
    url = 'https://www.alphavantage.co/query?'
    print("endDate is: " +str(endDate))
    if interval == '1mo':
        days_to_add = 30
    else:
        days_to_add = 60
    delta_days = datetime.timedelta(days = days_to_add)
    startDate = endDate - delta_days
    print("startDate is: " +str(startDate))
    if startDate and endDate:
        startDate = toAADate(startDate)
        endDate = toAADate(endDate)
        #print("startDate is: " +str(startDate))
        #print("endDate is: " +str(endDate))
        Myparams = {'function': 'NEWS_SENTIMENT', 'tickers': ticker, 'time_from': startDate, 'time_to': endDate, 'sort': 'EARLIEST','limit': 200, 'apikey': AAapikey}
    #else:
        #print("NEED DATES")
        #Myparams = {'function': 'NEWS_SENTIMENT', 'tickers': ticker, 'sort': 'LATEST','limit': 100, 'apikey': AAapikey}
    r = requests.get(url, params = Myparams)
    data = r.json()
    #return data
    historic_news = pd.DataFrame(columns=['Date', 'Headline', 'Ticker'])
    for i in data.get("feed"):
        test_date = i.get("time_published")
        test_date = test_date[:8]
        newDate = datetime.datetime.strptime(test_date, '%Y%m%d').date()
        row = [newDate, i.get("title"), ticker]
        new_df = pd.DataFrame([row],columns=['Date', 'Headline', 'Ticker'])
        historic_news = pd.concat([historic_news, new_df], axis=0, ignore_index=True)
    return historic_news

#historic_news = getHistoricNewsData('AAPL', '2022-10-10', '2mo')
#historic_news = getHistoricNewsData('AAPL', interval = '1mo')
#print(historic_news)

# Sentiment Analysis of News data #

In [556]:
def SentimentAnalysisNewsData(parsedNews, printOut = False):
    #Downloading Vader Lexicon for Sentiment Analysis
    nltk.download('vader_lexicon')
    # Initializing Sentiment Analysis
    analyzer = SentimentIntensityAnalyzer()

    #Declaring Column Names
    columns = ['Ticker', 'Date', 'Time', 'Headline']
    #Creating dataframe from news
    news = pd.DataFrame(parsedNews, columns=columns)
    #Getting scores for headlines
    scores = news['Headline'].apply(analyzer.polarity_scores).tolist()

    #Creating Dataframe of Scores
    df_scores = pd.DataFrame(scores)
    #Joining scores to news dataframe
    news = news.join(df_scores, rsuffix='_right')
    #Converting Date column to pd datetime date
    news['Date'] = pd.to_datetime(news.Date).dt.date

    #List of unique tickers
    unique_ticker = news['Ticker'].unique().tolist()
    #Creating dict for news based on ticker
    news_dict = {name: news.loc[news['Ticker'] == name] for name in unique_ticker}
    #Initializing List of values
    # og values = []
    values = []
    df = pd.DataFrame()
    for ticker in tickers: 
        dataframe = news_dict[ticker]
        dataframe = dataframe.set_index('Ticker')
        #Dropping headlines column since we only need scores now
        dataframe = dataframe.drop(columns = ['Headline'])
        #mean = round(dataframe['compound'].mean(), 2)
        #Finding compound number for news of every day
        df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)
        #og values.append(mean)
        
    #print(values)
    #Combining tickers and values into new dataframe
    df['Ticker'] = ticker
    df = df.sort_values('Date', ascending=True)
    if printOut:
        print("-----------DF")
        print(df)
    #Returning the dataframe
    return df

In [562]:
#print("HISTORIC SENTIMENT")
HistoricSentiment = SentimentAnalysisNewsData(historic_parsed_news, printOut=True)
#print("\n")
#print("TODAYS SENTIMENT")
TodaysSentiment = SentimentAnalysisNewsData(parsed_news)

-----------DF
            compound Ticker
Date                       
2023-01-10      0.10   AAPL
2023-01-11      0.03   AAPL
2023-01-12      0.01   AAPL
2023-01-13      0.08   AAPL
2023-01-14     -0.07   AAPL
2023-01-15     -0.05   AAPL
2023-01-16      0.18   AAPL
2023-01-17      0.10   AAPL
2023-01-20      0.11   AAPL
2023-01-21      0.02   AAPL
2023-01-22     -0.00   AAPL
2023-01-23      0.15   AAPL
2023-01-24      0.07   AAPL
2023-01-25      0.05   AAPL
2023-01-26      0.18   AAPL
2023-01-27      0.20   AAPL
2023-01-30     -0.03   AAPL
2023-01-31      0.07   AAPL
2023-02-01      0.02   AAPL
2023-02-02      0.11   AAPL
2023-02-03     -0.07   AAPL


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)


# Creating Dataset #

In [558]:
def createDataset(tickers, interval = 30):
    for i in tickers:
        print("Creating dataset for $" +i)
        #Get historic stock data
        historic_stock = getMonthlyStockData(i, interval = interval)
        #Get historic news data
        historic_news = getHistoricNewsData(i, interval = interval)
        #Use news to get sentiment
        HistoricSentiment = SentimentAnalysisNewsData(historic_news)
        #Dropping ticker since all the tickers in the for loop are the same
        HistoricSentiment = HistoricSentiment.drop(columns=['Ticker'])
        #Merge as training set
        
        #Get today's stock data
        #Get today's news data
        #Use news to get sentiment


#createDataset(['AAPL'])

Creating dataset for $AAPL
Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)


In [579]:
print("Creating dataset for $")
#Get historic stock data
historic_stock = getMonthlyStockData('AAPL', interval = 30)
#Get historic news data
historic_news = getHistoricNewsData('AAPL', interval = 30)
#Use news to get sentiment
HistoricSentiment = SentimentAnalysisNewsData(historic_news)
#Dropping ticker since all the tickers in the for loop are the same
HistoricSentiment = HistoricSentiment.drop(columns=['Ticker'])

Creating dataset for $
Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)


In [585]:
#Merge as training set
newHistoricSentiment = HistoricSentiment
newHistoricStock = historic_stock
print(newHistoricStock.index)
df_merged = pd.concat([newHistoricSentiment, newHistoricStock], axis=1)
print(df_merged)


DatetimeIndex(['2022-12-28', '2022-12-29', '2022-12-30', '2023-01-03',
               '2023-01-04', '2023-01-05', '2023-01-06', '2023-01-09',
               '2023-01-10', '2023-01-11', '2023-01-12', '2023-01-13',
               '2023-01-17', '2023-01-18', '2023-01-19', '2023-01-20',
               '2023-01-23', '2023-01-24', '2023-01-25', '2023-01-26',
               '2023-01-27', '2023-01-30', '2023-01-31', '2023-02-01',
               '2023-02-02', '2023-02-03', '2023-02-06', '2023-02-07',
               '2023-02-08', '2023-02-09', '2023-02-10'],
              dtype='datetime64[ns]', name='Date', freq=None)
                     compound        Open        High         Low       Close  \
Date                                                                            
2023-01-11               0.03  131.250000  133.509995  130.460007  133.490005   
2023-01-12               0.01  133.880005  134.259995  131.440002  133.410004   
2023-01-13               0.08  132.029999  134.919998  131.

  df_merged = pd.concat([newHistoricSentiment, newHistoricStock], axis=1)


In [None]:
#Get today's stock data
#Get today's news data
#Use news to get sentiment