<a href="https://colab.research.google.com/github/ishaanpaul98/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports #

In [665]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import re

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import datetime
import yfinance as yf
import time
import requests
import io

In [666]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [667]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [668]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report
from sklearn.metrics import plot_confusion_matrix

# Stock Data Helper Functions #

In [669]:
def getStockDataDaily(symbol, day):
    print("Getting stock data for stock $"+symbol)
    df = yf.download(symbol, start=day, period = "1d")
    return df

getStockDataDaily('AAPL', "2022-12-21")

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-21,132.979996,136.809998,132.75,135.449997,135.449997,85928000
2022-12-22,134.350006,134.559998,130.300003,132.229996,132.229996,77852100
2022-12-23,130.919998,132.419998,129.639999,131.860001,131.860001,63814900
2022-12-27,131.380005,131.410004,128.720001,130.029999,130.029999,69007800
2022-12-28,129.669998,131.029999,125.870003,126.040001,126.040001,85438400
2022-12-29,127.989998,130.479996,127.730003,129.610001,129.610001,75703700
2022-12-30,128.410004,129.949997,127.43,129.929993,129.929993,76960600
2023-01-03,130.279999,130.899994,124.169998,125.07,125.07,112117500
2023-01-04,126.889999,128.660004,125.080002,126.360001,126.360001,89113600
2023-01-05,127.129997,127.769997,124.760002,125.019997,125.019997,80962700


In [670]:
def arrayToString(arr):
    print("Starting array to list")
    listToStr = ' '.join([str(elem) for elem in arr])
    return listToStr


In [671]:
def getMultiStockDataDaily(symbols, day):
    if len(symbols) > 1:
        print("Length of symbols array is more than 1. STARTING ARRAYTOSTRING")
        symbols = arrayToString(symbols)
    print("Getting stock data for stock $"+symbols)
    df = yf.download(symbols, start=day, period = "1d", group_by='ticker')
    return df

getMultiStockDataDaily(['AAPL', 'TSLA'], "2022-12-21")

Length of symbols array is more than 1. STARTING ARRAYTOSTRING
Starting array to list
Getting stock data for stock $AAPL TSLA
[*********************100%***********************]  2 of 2 completed


Unnamed: 0_level_0,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,TSLA,TSLA,TSLA,TSLA,TSLA,TSLA
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2022-12-21,132.979996,136.809998,132.75,135.449997,135.449997,85928000,139.339996,141.259995,135.889999,137.570007,137.570007,145417400
2022-12-22,134.350006,134.559998,130.300003,132.229996,132.229996,77852100,136.0,136.630005,122.260002,125.349998,125.349998,210090300
2022-12-23,130.919998,132.419998,129.639999,131.860001,131.860001,63814900,126.370003,128.619995,121.019997,123.150002,123.150002,166989700
2022-12-27,131.380005,131.410004,128.720001,130.029999,130.029999,69007800,117.5,119.669998,108.760002,109.099998,109.099998,208643400
2022-12-28,129.669998,131.029999,125.870003,126.040001,126.040001,85438400,110.349998,116.269997,108.239998,112.709999,112.709999,221070500
2022-12-29,127.989998,130.479996,127.730003,129.610001,129.610001,75703700,120.389999,123.57,117.5,121.82,121.82,221923300
2022-12-30,128.410004,129.949997,127.43,129.929993,129.929993,76960600,119.949997,124.480003,119.75,123.18,123.18,157304500
2023-01-03,130.279999,130.899994,124.169998,125.07,125.07,112117500,118.470001,118.800003,104.639999,108.099998,108.099998,231402800
2023-01-04,126.889999,128.660004,125.080002,126.360001,126.360001,89113600,109.110001,114.589996,107.519997,113.639999,113.639999,180389000
2023-01-05,127.129997,127.769997,124.760002,125.019997,125.019997,80962700,110.510002,111.75,107.160004,110.339996,110.339996,157986300


In [672]:
def getMonthlyStockData(symbol, day = datetime.date.today() - datetime.timedelta(days = 1), interval = '1mo'):
    print("Getting stock data for stock $"+symbol)
    df = yf.download(symbol, start=day, period = interval, group_by='ticker')
    return df

#getMonthlyStockData('AAPL', "2022-11-21", '1mo')

# Gathering FinViz Data (Today's News) #

In [673]:
# Parameters 
n = 3 #the # of article headlines displayed per ticker
tickers = ['AAPL']

In [674]:
from finvizfinance.news import News
fnews = News()
all_news = fnews.get_news()
all_news

{'news':        Date                                              Title  \
 0   04:51PM                 Morning Bid: Calm before the storm   
 1   04:22PM  Moving company tried to block negative reviews...   
 2   04:22PM  Apple, Amazon earnings, jobs data and Fed deci...   
 3   04:09PM  Could Big Tech layoffs keep growing? Apple, Am...   
 4   04:00PM  Chinese Travel Is Set to Return. The Question ...   
 ..      ...                                                ...   
 85   Jan-27  S&P 500 jumps to end at nearly 2-month high as...   
 86   Jan-27  : Musk under SEC probe connected with Autopilo...   
 87   Jan-27  FDA approves Eli Lilly's drug for rare blood c...   
 88   Jan-27  China Stocks Veteran Hao Hong Sees Hong Kong S...   
 89   Jan-27  : U.S. oil prices settle at their lowest in mo...   
 
                  Source                                               Link  
 0       www.reuters.com  https://www.reuters.com/markets/asia/global-ma...  
 1       foxbusiness.com  http

In [675]:
# Get Data
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

for ticker in tickers:
    url = finwiz_url + ticker
    print("current url is: " +url)
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36."}
    req = Request(url=url,headers=header) 
    resp = urlopen(req)    
    html = BeautifulSoup(resp, features="lxml")
    news_table = html.find(id='news-table')
    news_tables[ticker] = news_table

try:
    for ticker in tickers:
        df = news_tables[ticker]
        df_tr = df.findAll('tr')
    
        print ('\n')
        print ('Recent News Headlines for {}: '.format(ticker))
        
        for i, table_row in enumerate(df_tr):
            a_text = table_row.a.text
            td_text = table_row.td.text
            td_text = td_text.strip()
            print(a_text,'(',td_text,')')
            if i == n-1:
                break
except KeyError:
    pass

current url is: https://finviz.com/quote.ashx?t=AAPL


Recent News Headlines for AAPL: 
Why This May Be A 'Life Changing' Market Rally; Apple, Fed Meeting Loom As Tesla Run Hits 75% ( Jan-29-23 04:04PM )
Could Big Tech layoffs keep growing? Apple, Amazon, Facebook and Google may give hints in biggest week of earnings ( 03:01PM )
15 Most Famous Hedge Fund Managers and Their Top Stock Picks ( 02:35PM )


In [676]:
# Iterate through the news
parsed_news = []
for file_name, news_table in news_tables.items():
    for x in news_table.findAll('tr'):
        print(x.get_text())
        text = x.get_text() 
        date_scrape = x.td.text.split()

        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        ticker = file_name.split('_')[0]
        
        parsed_news.append([ticker, date, time, text ])

Jan-29-23 04:04PMWhy This May Be A 'Life Changing' Market Rally; Apple, Fed Meeting Loom As Tesla Run Hits 75% Investor's Business Daily
03:01PMCould Big Tech layoffs keep growing? Apple, Amazon, Facebook and Google may give hints in biggest week of earnings MarketWatch
02:35PM15 Most Famous Hedge Fund Managers and Their Top Stock Picks Insider Monkey
10:30AMTarget, Amazon and 4 More Retailers That Will Reward You for Turning in Your Old Stuff GOBankingRates
07:53AMFed meeting, jobs data, Apple earnings: What to know this week Yahoo Finance

07:00AM
Loading…

07:00AM$50 AirPods Pro? Nope. Heres How to Spot Fake Apple Earbuds. The Wall Street Journal
06:00AMApple (NASDAQ:AAPL) stock performs better than its underlying earnings growth over last five years Simply Wall St.
05:45AM5 Top Stocks for February Motley Fool
01:09AMChina's 2022 smartphone shipments the lowest in 10 years - research firm Reuters
01:00AMChina's 2022 smartphone shipments the lowest in 10 years - research firm Reuters

# Gathering Data From AlphaAdvantage for Historical News #

In [677]:
from decouple import config
import requests
import urllib.parse
import json
import datetime
AAapikey = config('AAKey')

In [678]:
#Function to convert user provided date to date required by AlphaAdvantage
def toAADate(oldDate):
    newDate = oldDate.strftime("%Y%m%dT0130")
    return str(newDate)


In [679]:
# Get data from AlphaAdvantage for one ticker for a particular day
def getHistoricNewsData(ticker, endDate = datetime.date.today() - datetime.timedelta(days = 1) , interval = '1mo'):
    url = 'https://www.alphavantage.co/query?'
    print("endDate is: " +str(endDate))
    if interval == '1mo':
        days_to_add = 30
    else:
        days_to_add = 60
    delta_days = datetime.timedelta(days = days_to_add)
    startDate = endDate - delta_days
    print("startDate is: " +str(startDate))
    if startDate and endDate:
        startDate = toAADate(startDate)
        endDate = toAADate(endDate)
        print("startDate is: " +str(startDate))
        print("endDate is: " +str(endDate))
        Myparams = {'function': 'NEWS_SENTIMENT', 'tickers': ticker, 'time_from': startDate, 'time_to': endDate, 'sort': 'EARLIEST','limit': 200, 'apikey': AAapikey}
    else:
        print("NEED DATES")
        #Myparams = {'function': 'NEWS_SENTIMENT', 'tickers': ticker, 'sort': 'LATEST','limit': 100, 'apikey': AAapikey}
    r = requests.get(url, params = Myparams)
    data = r.json()
    #return data
    historic_news = pd.DataFrame(columns=['Date', 'Headline', 'Ticker'])
    for i in data.get("feed"):
        test_date = i.get("time_published")
        test_date = test_date[:8]
        newDate = datetime.datetime.strptime(test_date, '%Y%m%d').date()
        row = [newDate, i.get("title"), ticker]
        new_df = pd.DataFrame([row],columns=['Date', 'Headline', 'Ticker'])
        historic_news = pd.concat([historic_news, new_df], axis=0, ignore_index=True)
    return historic_news

#historic_news = getHistoricNewsData('AAPL', '2022-10-10', '2mo')
historic_news = getHistoricNewsData('AAPL', interval = '2mo')
print(historic_news)

endDate is: 2023-01-28
startDate is: 2022-11-29
startDate is: 20221129T0130
endDate is: 20230128T0130
           Date                                           Headline Ticker
0    2022-11-29   Asia shares take comfort in China property rally   AAPL
1    2022-11-29  Asia shares take comfort in China property ral...   AAPL
2    2022-11-29  Global markets: Asia shares take comfort in Ch...   AAPL
3    2022-11-29  App Store Awards Celebrate the Best Apps and G...   AAPL
4    2022-11-29  iPhone 15 Might Push HD Photography To New Hei...   AAPL
..          ...                                                ...    ...
195  2022-12-05  Foxconn says it's restoring production at the ...   AAPL
196  2022-12-05  The 3 Most Popular Robinhood Stocks Right Now:...   AAPL
197  2022-12-05  US Stocks Start New Trading Week On Negative N...   AAPL
198  2022-12-05  Apple In A Rush To Diversify iPhone Production...   AAPL
199  2022-12-05  Should SPDR S&P 500 ETF  ( SPY )  Be on Your I...   AAPL

[200 rows

# Sentiment Analysis of News data #

In [682]:
def SentimentAnalysisNewsData(parsedNews, printOut = False):
    #Downloading Vader Lexicon for Sentiment Analysis
    nltk.download('vader_lexicon')
    # Initializing Sentiment Analysis
    analyzer = SentimentIntensityAnalyzer()

    #Declaring Column Names
    columns = ['Ticker', 'Date', 'Time', 'Headline']
    #Creating dataframe from news
    news = pd.DataFrame(parsedNews, columns=columns)
    #Getting scores for headlines
    scores = news['Headline'].apply(analyzer.polarity_scores).tolist()

    #Creating Dataframe of Scores
    df_scores = pd.DataFrame(scores)
    #Joining scores to news dataframe
    news = news.join(df_scores, rsuffix='_right')
    #Converting Date column to pd datetime date
    news['Date'] = pd.to_datetime(news.Date).dt.date

    #List of unique tickers
    unique_ticker = news['Ticker'].unique().tolist()
    #Creating dict for news based on ticker
    news_dict = {name: news.loc[news['Ticker'] == name] for name in unique_ticker}
    #Initializing List of values
    values = []
    for ticker in tickers: 
        dataframe = news_dict[ticker]
        dataframe = dataframe.set_index('Ticker')
        #Dropping headlines column since we only need scores now
        dataframe = dataframe.drop(columns = ['Headline'])
        #if printOut:
            #print ('\n')
            #print (dataframe.head())
        
        #mean = round(dataframe['compound'].mean(), 2)
        #Finding compound number for news of every day
        mean = round(dataframe.groupby('Date')['compound'].mean(), 2)
        #Adding values to values list
        values.append(mean)
    
   
    #print(round(dataframe.groupby('Date')['compound'].mean(), 2))
    print("VALUES------------")
    print(values)
        
    #Combining tickers and values into new dataframe
    #df = pd.DataFrame(list(zip(tickers, values)), columns =['Ticker', 'Mean Sentiment']) 
    #df = df.set_index('Ticker')
    #df = df.sort_values('date', ascending=False)
    if printOut:
        print("-----------DF")
        print(df.head())
        print(df.shape)
    #Returning the dataframe
    return df
    #if printOut:
        #print ('\n')
        #display (df)
    #return df

In [681]:
#print("HISTORIC SENTIMENT")
HistoricSentiment = SentimentAnalysisNewsData(historic_news, True)
#print("\n")
#print("TODAYS SENTIMENT")
#TodaysSentiment = SentimentAnalysisNewsData(parsed_news)

VALUES------------
[Date
2022-11-29    0.16
2022-11-30    0.03
2022-12-01    0.04
2022-12-02    0.12
2022-12-03   -0.06
2022-12-04   -0.03
2022-12-05   -0.02
Name: compound, dtype: float64]
-----------DF


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


TypeError: 'NoneType' object is not callable

# Creating Dataset #

In [None]:
def createDataset(date_from, int):
    for i in tickers:
        print("Creating dataset for $" +i)
        #Get historic stock data
        historic_stock = getMonthlyStockData(i, interval = int)
        print(historic_stock.head())
        #Get historic news data
        historic_news = getHistoricNewsData(i, interval = int)
        print(historic_news)
        #Use news to get sentiment
        HistoricSentiment = SentimentAnalysisNewsData(historic_news)
        print(HistoricSentiment.shape)
        #Merge as training set
        #Get today's stock data
        #Get today's news data
        #Use news to get sentiment


createDataset('2022-10-10', '2mo')

Creating dataset for $AAPL
Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2023-01-25  140.889999  142.429993  138.809998  141.860001  141.860001   
2023-01-26  143.169998  144.250000  141.899994  143.960007  143.960007   

              Volume  
Date                  
2023-01-25  65799300  
2023-01-26  54003800  
endDate is: 2023-01-25
startDate is: 2022-11-26
startDate is: 20221126T0130
endDate is: 20230125T0130
           Date                                           Headline Ticker
0    2022-11-26  10 exchange-traded funds to take exposure in U...   AAPL
1    2022-11-26  Market Rally Strong, But Here's Why You Should...   AAPL
2    2022-11-26  Motley Fool Investors Look Back at 2022 and Fo...   AAPL
3    2022-11-26  Black Friday Online Sales Hit Record, But Grow...   AAPL
4    202

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
