<a href="https://colab.research.google.com/github/ishaanpaul98/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports #

In [116]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import re

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import datetime
import yfinance as yf
import time
import requests
import io

In [117]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [118]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [119]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report
from sklearn.metrics import plot_confusion_matrix

# Stock Data Helper Functions #

In [120]:
def getStockDataDaily(symbol, day):
    print("Getting stock data for stock $"+symbol)
    df = yf.download(symbol, start=day, period = "1d")
    return df

getStockDataDaily('AAPL', "2022-12-21")

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-21,132.979996,136.809998,132.75,135.449997,135.449997,85928000
2022-12-22,134.350006,134.559998,130.300003,132.229996,132.229996,77852100
2022-12-23,130.919998,132.419998,129.639999,131.860001,131.860001,63814900
2022-12-27,131.380005,131.410004,128.720001,130.029999,130.029999,69007800
2022-12-28,129.669998,131.029999,125.870003,126.040001,126.040001,85438400
2022-12-29,127.989998,130.479996,127.730003,129.610001,129.610001,75703700
2022-12-30,128.410004,129.949997,127.43,129.929993,129.929993,76960600
2023-01-03,130.279999,130.899994,124.169998,125.07,125.07,112117500
2023-01-04,126.889999,128.660004,125.080002,126.360001,126.360001,89113600
2023-01-05,127.129997,127.769997,124.760002,125.019997,125.019997,80962700


In [121]:
def arrayToString(arr):
    print("Starting array to list")
    listToStr = ' '.join([str(elem) for elem in arr])
    return listToStr


In [122]:
def getMultiStockDataDaily(symbols, day):
    if len(symbols) > 1:
        print("Length of symbols array is more than 1. STARTING ARRAYTOSTRING")
        symbols = arrayToString(symbols)
    print("Getting stock data for stock $"+symbols)
    df = yf.download(symbols, start=day, period = "1d", group_by='ticker')
    return df

getMultiStockDataDaily(['AAPL', 'TSLA'], "2022-12-21")

Length of symbols array is more than 1. STARTING ARRAYTOSTRING
Starting array to list
Getting stock data for stock $AAPL TSLA
[*********************100%***********************]  2 of 2 completed


Unnamed: 0_level_0,TSLA,TSLA,TSLA,TSLA,TSLA,TSLA,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2022-12-21,139.339996,141.259995,135.889999,137.570007,137.570007,145417400,132.979996,136.809998,132.75,135.449997,135.449997,85928000
2022-12-22,136.0,136.630005,122.260002,125.349998,125.349998,210090300,134.350006,134.559998,130.300003,132.229996,132.229996,77852100
2022-12-23,126.370003,128.619995,121.019997,123.150002,123.150002,166989700,130.919998,132.419998,129.639999,131.860001,131.860001,63814900
2022-12-27,117.5,119.669998,108.760002,109.099998,109.099998,208643400,131.380005,131.410004,128.720001,130.029999,130.029999,69007800
2022-12-28,110.349998,116.269997,108.239998,112.709999,112.709999,221070500,129.669998,131.029999,125.870003,126.040001,126.040001,85438400
2022-12-29,120.389999,123.57,117.5,121.82,121.82,221923300,127.989998,130.479996,127.730003,129.610001,129.610001,75703700
2022-12-30,119.949997,124.480003,119.75,123.18,123.18,157304500,128.410004,129.949997,127.43,129.929993,129.929993,76960600
2023-01-03,118.470001,118.800003,104.639999,108.099998,108.099998,231402800,130.279999,130.899994,124.169998,125.07,125.07,112117500
2023-01-04,109.110001,114.589996,107.519997,113.639999,113.639999,180389000,126.889999,128.660004,125.080002,126.360001,126.360001,89113600
2023-01-05,110.510002,111.75,107.160004,110.339996,110.339996,157986300,127.129997,127.769997,124.760002,125.019997,125.019997,80962700


In [123]:
def getMonthlyStockData(symbol, day = datetime.date.today() - datetime.timedelta(days = 1), interval = '1mo'):
    print("Getting stock data for stock $"+symbol)
    df = yf.download(symbol, start=day, period = interval, group_by='ticker')
    return df

#getMonthlyStockData('AAPL', "2022-11-21", '1mo')

# Gathering FinViz Data (Today's News) #

In [124]:
# Parameters 
n = 3 #the # of article headlines displayed per ticker
tickers = ['AAPL']

In [125]:
from finvizfinance.news import News
fnews = News()
all_news = fnews.get_news()
all_news

{'news':        Date                                              Title  \
 0   12:20AM  Stocks Rise Ahead of Powell, Aussie Extends Ga...   
 1   12:13AM  Hong Kong-China Travel Traffic Spikes as Borde...   
 2   12:04AM  Canada's Ivey PMI rebounds to 8-month high as ...   
 3   12:04AM  Post-QE bond losses rising reality for central...   
 4   12:00AM  How Russia Is Surviving the Tightening Grip on...   
 ..      ...                                                ...   
 85   Feb-06      These maps show the extent of the aftershocks   
 86   Feb-06  How the nurses' strike on Tuesday 7 February w...   
 87   Feb-06  AI startup Cohere in talks to raise funding at...   
 88   Feb-06  Bank of America 'still forecasting' 2023 reces...   
 89   Feb-06  Immigration Rebound Eases Shortage of Workers,...   
 
                Source                                               Link  
 0   www.bloomberg.com  https://www.bloomberg.com/news/articles/2023-0...  
 1   www.bloomberg.com  https://ww

In [126]:
# Get Data
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

for ticker in tickers:
    url = finwiz_url + ticker
    print("current url is: " +url)
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36."}
    req = Request(url=url,headers=header) 
    resp = urlopen(req)    
    html = BeautifulSoup(resp, features="lxml")
    news_table = html.find(id='news-table')
    news_tables[ticker] = news_table

try:
    for ticker in tickers:
        df = news_tables[ticker]
        df_tr = df.findAll('tr')
    
        print ('\n')
        print ('Recent News Headlines for {}: '.format(ticker))
        
        for i, table_row in enumerate(df_tr):
            a_text = table_row.a.text
            td_text = table_row.td.text
            td_text = td_text.strip()
            print(a_text,'(',td_text,')')
            if i == n-1:
                break
except KeyError:
    pass

current url is: https://finviz.com/quote.ashx?t=AAPL


Recent News Headlines for AAPL: 
Younger Customers Are Driving American Express' Results ( Feb-06-23 08:13PM )
Time to Buy PayPal Stock Before Earnings? ( 07:30PM )
Why Taiwan Semiconductor Manufacturing Is Falling Today ( 11:14AM )


In [127]:
# Iterate through the news
parsed_news = []
for file_name, news_table in news_tables.items():
    for x in news_table.findAll('tr'):
        #print(x.get_text())
        text = x.get_text() 
        date_scrape = x.td.text.split()

        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        ticker = file_name.split('_')[0]
        
        parsed_news.append([ticker, date, time, text ])
print(parsed_news)

[['AAPL', 'Feb-06-23', '08:13PM', "Feb-06-23 08:13PMYounger Customers Are Driving American Express' Results Motley Fool"], ['AAPL', 'Feb-06-23', '07:30PM', '07:30PMTime to Buy PayPal Stock Before Earnings? Zacks'], ['AAPL', 'Feb-06-23', '11:14AM', '11:14AMWhy Taiwan Semiconductor Manufacturing Is Falling Today Motley Fool'], ['AAPL', 'Feb-06-23', '11:04AM', '11:04AMStock market today: Dow slips as Treasury yields jump to dent tech; Powell eyed Investing.com'], ['AAPL', 'Feb-06-23', '11:00AM', '11:00AMApple iPhone 14 being sold at discount in China: Report Yahoo Finance Video'], ['AAPL', 'Feb-06-23', '10:57AM', '\n10:57AM\nLoading…\n'], ['AAPL', 'Feb-06-23', '10:57AM', '10:57AM1 Supercharged Nasdaq Stock to Buy Hand Over Fist Before It Jumps Higher Motley Fool'], ['AAPL', 'Feb-06-23', '10:48AM', '10:48AMS&P 500 off lows but remains pressured as Fed jitters drive up Treasury yields Investing.com'], ['AAPL', 'Feb-06-23', '10:00AM', '10:00AMApple Heads the Management Top 250 All-Stars The 

# Gathering Data From AlphaAdvantage for Historical News #

In [128]:
from decouple import config
import requests
import urllib.parse
import json
import datetime
AAapikey = config('AAKey')

In [129]:
#Function to convert user provided date to date required by AlphaAdvantage
def toAADate(oldDate):
    newDate = oldDate.strftime("%Y%m%dT0130")
    return str(newDate)


In [147]:
#End Date is yesterday
def breakdownofhistoric(ticker, endDate = datetime.date.today() - datetime.timedelta(days = 1), interval = 30, runs = 1):
    print("Starting the breakdown")
    # Format for parsed_news is [Ticker, Date, Time, Headlines]
    historic_parsed_news = []
    url = 'https://www.alphavantage.co/query?'
    window = 10
    startDate = endDate - datetime.timedelta(days = interval)
    rollingEndDate = startDate + datetime.timedelta(days = window)
    delta = datetime.timedelta(days = window)
    while (rollingEndDate <= endDate):
        startDateAA = toAADate(startDate)
        rollingEndDateAA = toAADate(rollingEndDate)
        Myparams = {'function': 'NEWS_SENTIMENT', 'tickers': ticker, 'time_from': startDateAA, 'time_to': rollingEndDateAA, 'sort': 'EARLIEST','limit': 200, 'apikey': AAapikey}
        r = requests.get(url, params = Myparams)
        data = r.json()
        print(data)
        for i in data.get("feed"):
            test_date = i.get("time_published")
            test_date = test_date[:8]
            newDate = datetime.datetime.strptime(test_date, '%Y%m%d').date() 
            historic_parsed_news.append([ticker, newDate,'0001', i.get("title")])
        startDate = rollingEndDate
        rollingEndDate = rollingEndDate + delta
    #print(historic_parsed_news)
    return historic_parsed_news


historic_parsed_news = breakdownofhistoric('AAPL', interval = 30)

Starting the breakdown
{'items': '200', 'sentiment_score_definition': 'x <= -0.35: Bearish; -0.35 < x <= -0.15: Somewhat-Bearish; -0.15 < x < 0.15: Neutral; 0.15 <= x < 0.35: Somewhat_Bullish; x >= 0.35: Bullish', 'relevance_score_definition': '0 < x <= 1, with a higher score indicating higher relevance.', 'feed': [{'title': 'Samsung profit slumps by most in decade on weak demand for memory chips', 'url': 'https://www.scmp.com/abacus/tech/article/3205773/samsung-profit-tumbles-69-cent-historic-drop-memory-chip-price-slump', 'time_published': '20230106T020301', 'authors': ['Bloomberg'], 'summary': 'Profit slumped last quarter by the most in more than a decade as memory chip prices crashed. Samsung may have to rein in capital expenditure to conserve cash, Citigroup says.', 'banner_image': 'https://img.i-scmp.com/cdn-cgi/image/fit=contain,width=425,format=auto/sites/default/files/styles/768x768/public/d8/images/canvas/2023/01/06/1cfe3e68-f139-4b3a-b067-740c0ed93a1f_d37d3833.jpg?itok=hA7js

In [None]:
# Get data from AlphaAdvantage for one ticker for a particular day
def getHistoricNewsData(ticker, endDate = datetime.date.today() - datetime.timedelta(days = 1) , interval = '1mo'):
    url = 'https://www.alphavantage.co/query?'
    print("endDate is: " +str(endDate))
    if interval == '1mo':
        days_to_add = 30
    else:
        days_to_add = 60
    delta_days = datetime.timedelta(days = days_to_add)
    startDate = endDate - delta_days
    print("startDate is: " +str(startDate))
    if startDate and endDate:
        startDate = toAADate(startDate)
        endDate = toAADate(endDate)
        print("startDate is: " +str(startDate))
        print("endDate is: " +str(endDate))
        Myparams = {'function': 'NEWS_SENTIMENT', 'tickers': ticker, 'time_from': startDate, 'time_to': endDate, 'sort': 'EARLIEST','limit': 200, 'apikey': AAapikey}
    else:
        print("NEED DATES")
        #Myparams = {'function': 'NEWS_SENTIMENT', 'tickers': ticker, 'sort': 'LATEST','limit': 100, 'apikey': AAapikey}
    r = requests.get(url, params = Myparams)
    data = r.json()
    #return data
    historic_news = pd.DataFrame(columns=['Date', 'Headline', 'Ticker'])
    for i in data.get("feed"):
        test_date = i.get("time_published")
        test_date = test_date[:8]
        newDate = datetime.datetime.strptime(test_date, '%Y%m%d').date()
        row = [newDate, i.get("title"), ticker]
        new_df = pd.DataFrame([row],columns=['Date', 'Headline', 'Ticker'])
        historic_news = pd.concat([historic_news, new_df], axis=0, ignore_index=True)
    return historic_news

#historic_news = getHistoricNewsData('AAPL', '2022-10-10', '2mo')
historic_news = getHistoricNewsData('AAPL', interval = '1mo')
print(historic_news)

endDate is: 2023-02-05
startDate is: 2023-01-06
startDate is: 20230106T0130
endDate is: 20230205T0130
           Date                                           Headline Ticker
0    2023-01-06  Samsung profit slumps by most in decade on wea...   AAPL
1    2023-01-06  Samsung estimates quarterly profit sank to 8-y...   AAPL
2    2023-01-06  Apple's Iconic 1970s Trade Sign, Steve Wozniak...   AAPL
3    2023-01-06  Apple's Mixed Reality Headset 'Behind Schedule...   AAPL
4    2023-01-06  My Top Stock to Buy for 2023  ( and It's Not E...   AAPL
..          ...                                                ...    ...
195  2023-01-12  TSMC Stock Higher On Record Profits, But Muted...   AAPL
196  2023-01-12  Is Meta Platforms Sitting on a $1 Trillion Opp...   AAPL
197  2023-01-12  Should You Invest in the Invesco DWA Technolog...   AAPL
198  2023-01-12                  My Top Tech IPO to Buy in January   AAPL
199  2023-01-12  Taiwan Semiconductor Posts Mixed Fourth-Quarte...   AAPL

[200 rows

# Sentiment Analysis of News data #

In [143]:
def SentimentAnalysisNewsData(parsedNews, printOut = False):
    #Downloading Vader Lexicon for Sentiment Analysis
    nltk.download('vader_lexicon')
    # Initializing Sentiment Analysis
    analyzer = SentimentIntensityAnalyzer()

    #Declaring Column Names
    columns = ['Ticker', 'Date', 'Time', 'Headline']
    #Creating dataframe from news
    news = pd.DataFrame(parsedNews, columns=columns)
    print(news)
    #Getting scores for headlines
    scores = news['Headline'].apply(analyzer.polarity_scores).tolist()

    #Creating Dataframe of Scores
    df_scores = pd.DataFrame(scores)
    #Joining scores to news dataframe
    news = news.join(df_scores, rsuffix='_right')
    #Converting Date column to pd datetime date
    news['Date'] = pd.to_datetime(news.Date).dt.date

    #List of unique tickers
    unique_ticker = news['Ticker'].unique().tolist()
    #Creating dict for news based on ticker
    news_dict = {name: news.loc[news['Ticker'] == name] for name in unique_ticker}
    #Initializing List of values
    # og values = []
    values = pd.DataFrame(columns = ['Ticker', 'Date', 'Compound'])
    for ticker in tickers: 
        dataframe = news_dict[ticker]
        dataframe = dataframe.set_index('Ticker')
        #Dropping headlines column since we only need scores now
        dataframe = dataframe.drop(columns = ['Headline'])
        #if printOut:
            #print ('\n')
            #print (dataframe.head())
        
        #mean = round(dataframe['compound'].mean(), 2)
        #Finding compound number for news of every day
        testdf = pd.DataFrame(columns = ['Date', 'Mean Sentiment'])
        testdf = round(dataframe.groupby('Date')['compound'].mean(), 2)
        print(testdf.shape)
        #Adding values to values list
        #og values.append(mean)
    
   
    #print(round(dataframe.groupby('Date')['compound'].mean(), 2))
    #print("VALUES------------")
    #print(values)
        
    #Combining tickers and values into new dataframe
    #df = pd.DataFrame(list(zip(tickers, values)), columns =['Ticker', 'Mean Sentiment']) 
    #df = df.set_index('Ticker')
    #df = df.sort_values('date', ascending=False)
    if printOut:
        print("-----------DF")
        #print(df.head())
        #print(df.shape)
    #Returning the dataframe
    return df
    #if printOut:
        #print ('\n')
        #display (df)
    #return df

In [148]:
#print("HISTORIC SENTIMENT")
HistoricSentiment = SentimentAnalysisNewsData(historic_parsed_news, True)
#print("\n")
#print("TODAYS SENTIMENT")
#TodaysSentiment = SentimentAnalysisNewsData(parsed_news)

    Ticker        Date  Time  \
0     AAPL  2023-01-06  0001   
1     AAPL  2023-01-06  0001   
2     AAPL  2023-01-06  0001   
3     AAPL  2023-01-06  0001   
4     AAPL  2023-01-06  0001   
..     ...         ...   ...   
595   AAPL  2023-02-01  0001   
596   AAPL  2023-02-01  0001   
597   AAPL  2023-02-01  0001   
598   AAPL  2023-02-01  0001   
599   AAPL  2023-02-01  0001   

                                              Headline  
0    Samsung profit slumps by most in decade on wea...  
1    Samsung estimates quarterly profit sank to 8-y...  
2    Apple's Iconic 1970s Trade Sign, Steve Wozniak...  
3    Apple's Mixed Reality Headset 'Behind Schedule...  
4    My Top Stock to Buy for 2023  ( and It's Not E...  
..                                                 ...  
595  23 Things That Didn't Exist When Tom Brady Ent...  
596  Meta stock spikes despite earnings miss, as Fa...  
597                 Morning Bid: Riding the Fed dragon  
598  Meta Revenue Beats As Company Announces 

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Creating Dataset #

In [None]:
def createDataset(date_from, int):
    for i in tickers:
        print("Creating dataset for $" +i)
        #Get historic stock data
        historic_stock = getMonthlyStockData(i, interval = int)
        print(historic_stock.head())
        #Get historic news data
        historic_news = getHistoricNewsData(i, interval = int)
        print(historic_news)
        #Use news to get sentiment
        HistoricSentiment = SentimentAnalysisNewsData(historic_news)
        print(HistoricSentiment.shape)
        #Merge as training set
        #Get today's stock data
        #Get today's news data
        #Use news to get sentiment


createDataset('2022-10-10', '2mo')

Creating dataset for $AAPL
Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2023-02-06  152.574997  153.100006  150.779999  151.729996  151.729996   

              Volume  
Date                  
2023-02-06  67990412  
endDate is: 2023-02-05
startDate is: 2022-12-07
startDate is: 20221207T0130
endDate is: 20230205T0130


JSONDecodeError: Expecting value: line 1 column 1 (char 0)