<a href="https://colab.research.google.com/github/ishaanpaul98/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports #

In [70]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import re

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import datetime
import yfinance as yf
import time
import requests
import io

In [71]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [72]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [73]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report
from sklearn.metrics import plot_confusion_matrix

# Stock Data Helper Functions #

In [74]:
def getStockDataDaily(symbol, day = datetime.date.today()):
    print("Getting stock data for stock $"+symbol)
    df = yf.download(symbol, start=day, period = "1d")
    return df

getStockDataDaily('AAPL')

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-02-22,148.869995,149.5,147.160004,149.149994,149.149994,26899690


In [77]:
def getYearlyStockData(symbol, day = datetime.date.today() - datetime.timedelta(days = 1), interval = 30):
    print("Getting stock data for stock $"+symbol)
    yfinterval = ''
    if interval == 30:
        yfinterval = '31d'
    elif interval == 60:
        yfinterval = '61d'
    elif interval == 365:
        yfinterval = '1y'
    else:
        print("INVALID INTERVAL")
    aapl = yf.Ticker("AAPL")
    # get historical market data
    #hist = aapl.history(period="1mo")
    df = yf.download(symbol, period = yfinterval, group_by='ticker')
    return df

#getMonthlyStockData('AAPL', "2022-11-21")

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-09,130.470001,133.410004,129.889999,130.149994,129.951584,70790800
2023-01-10,130.259995,131.259995,128.119995,130.729996,130.530701,63896200
2023-01-11,131.25,133.509995,130.460007,133.490005,133.286499,69458900
2023-01-12,133.880005,134.259995,131.440002,133.410004,133.206619,71379600
2023-01-13,132.029999,134.919998,131.660004,134.759995,134.55455,57758000
2023-01-17,134.830002,137.289993,134.130005,135.940002,135.732758,63646600
2023-01-18,136.820007,138.610001,135.029999,135.210007,135.003876,69672800
2023-01-19,134.080002,136.25,133.770004,135.270004,135.063782,58280400
2023-01-20,135.279999,138.020004,134.220001,137.869995,137.659805,79972200
2023-01-23,138.119995,143.320007,137.899994,141.110001,140.894882,81760300


# Gathering Data From AlphaAdvantage for Historical News #

In [97]:
#Importing data from CSV file
import csv

with open('datasets/AAPL-hist.csv', newline='') as f:
    reader = csv.reader(f)
    historic_news_list = list(reader)

In [98]:
temp_df = pd.DataFrame(historic_news_list,columns=['Time','Date','Headline', 'Ticker'])


In [99]:
temp_df = temp_df.iloc[:, [3, 1, 0, 2]]
temp_df

Unnamed: 0,Ticker,Date,Time,Headline
0,Ticker,Date,,Headline
1,AAPL,2022-03-01,0,"US stocks fall, oil tops $105 as Ukraine crisi..."
2,AAPL,2022-03-02,1,Apple halts product sales in Russia after Ukra...
3,AAPL,2022-03-02,2,Russia says its economy is taking 'serious blo...
4,AAPL,2022-03-02,3,"Rich Russians turn to luxury jewellery, watche..."
...,...,...,...,...
4597,AAPL,2023-02-01,4596,23 Things That Didn't Exist When Tom Brady Ent...
4598,AAPL,2023-02-01,4597,"Meta stock spikes despite earnings miss, as Fa..."
4599,AAPL,2023-02-01,4598,Morning Bid: Riding the Fed dragon
4600,AAPL,2023-02-01,4599,Meta Revenue Beats As Company Announces $40 Bi...


In [100]:
#historic_parsed_news = temp_df.drop('Ind',axis=1).values.tolist()
historic_parsed_news = temp_df.values.tolist()
historic_parsed_news.remove(['Ticker', 'Date', '', 'Headline'])

print(historic_parsed_news)



# Sentiment Analysis of News data #

In [86]:
def SentimentAnalysisNewsData(parsedNews, printOut = False):
    #Downloading Vader Lexicon for Sentiment Analysis
    nltk.download('vader_lexicon')
    # Initializing Sentiment Analysis
    analyzer = SentimentIntensityAnalyzer()

    #Declaring Column Names
    columns = ['Ticker', 'Date', 'Time', 'Headline']
    #Creating dataframe from news
    news = pd.DataFrame(parsedNews, columns=columns)
    #Getting scores for headlines
    scores = news['Headline'].apply(analyzer.polarity_scores).tolist()

    #Creating Dataframe of Scores
    df_scores = pd.DataFrame(scores)
    #Joining scores to news dataframe
    news = news.join(df_scores, rsuffix='_right')
    #Converting Date column to pd datetime date
    news['Date'] = pd.to_datetime(news.Date).dt.date

    #List of unique tickers
    unique_ticker = news['Ticker'].unique().tolist()
    #Creating dict for news based on ticker
    news_dict = {name: news.loc[news['Ticker'] == name] for name in unique_ticker}
    #Initializing List of values
    # og values = []
    values = []
    df = pd.DataFrame()
    for ticker in tickers: 
        dataframe = news_dict[ticker]
        dataframe = dataframe.set_index('Ticker')
        #Dropping headlines column since we only need scores now
        dataframe = dataframe.drop(columns = ['Headline'])
        #mean = round(dataframe['compound'].mean(), 2)
        #Finding compound number for news of every day
        df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)
        #og values.append(mean)
        
    #print(values)
    #Combining tickers and values into new dataframe
    df['Ticker'] = ticker
    df = df.sort_values('Date', ascending=True)
    if printOut:
        print("-----------DF")
        print(df)
    #Returning the dataframe
    return df

In [101]:
HistoricSentiment = SentimentAnalysisNewsData(historic_parsed_news)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)


# Creating Dataset #

In [88]:
print("Creating dataset for $")
#Get historic stock data
historic_stock = getYearlyStockData('AAPL', interval = 365)
#Get historic news data
historic_news = historic_parsed_news
#Use news to get sentiment
HistoricSentiment = SentimentAnalysisNewsData(historic_news)
#Dropping ticker since all the tickers in the for loop are the same
HistoricSentiment = HistoricSentiment.drop(columns=['Ticker'])

Creating dataset for $
Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)


In [89]:
#Merge as training set
newHistoricSentiment = HistoricSentiment
newHistoricStock = historic_stock
#print(newHistoricStock.index)
df_merged = pd.concat([newHistoricSentiment, newHistoricStock], axis=1)
#print(df_merged)
HistoricDataset = df_merged.dropna()
print(HistoricDataset)

            compound        Open        High         Low       Close  \
Date                                                                   
2022-03-01     -0.20  164.699997  166.600006  161.970001  163.199997   
2022-03-02      0.03  164.389999  167.360001  162.949997  166.559998   
2022-03-03      0.12  168.470001  168.910004  165.550003  166.229996   
2022-03-04      0.06  164.490005  165.550003  162.100006  163.169998   
2022-03-07     -0.02  163.360001  165.020004  159.039993  159.300003   
...              ...         ...         ...         ...         ...   
2023-01-26      0.18  143.169998  144.250000  141.899994  143.960007   
2023-01-27      0.08  143.160004  147.229996  143.080002  145.929993   
2023-01-30     -0.02  144.960007  145.550003  142.850006  143.000000   
2023-01-31      0.07  142.699997  144.339996  142.279999  144.289993   
2023-02-01      0.02  143.970001  146.610001  141.320007  145.429993   

             Adj Close      Volume  
Date                      

  df_merged = pd.concat([newHistoricSentiment, newHistoricStock], axis=1)


In [90]:
#Get today's stock data
todaysStockData = getStockDataDaily("AAPL")
#Get today's news data
todaysNews = parsed_news
#Use news to get sentiment
TodaysSentiment = SentimentAnalysisNewsData(parsed_news)

Getting stock data for stock $AAPL
[*********************100%***********************]  1 of 1 completed


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ishaan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)
  df = round(dataframe.groupby('Date')['Date', 'compound'].mean(), 2)


In [91]:
todaysDataset = pd.concat([newHistoricSentiment, newHistoricStock], axis=1)
todaysDataset = todaysDataset.dropna()
print(todaysDataset)

            compound        Open        High         Low       Close  \
Date                                                                   
2022-03-01     -0.20  164.699997  166.600006  161.970001  163.199997   
2022-03-02      0.03  164.389999  167.360001  162.949997  166.559998   
2022-03-03      0.12  168.470001  168.910004  165.550003  166.229996   
2022-03-04      0.06  164.490005  165.550003  162.100006  163.169998   
2022-03-07     -0.02  163.360001  165.020004  159.039993  159.300003   
...              ...         ...         ...         ...         ...   
2023-01-26      0.18  143.169998  144.250000  141.899994  143.960007   
2023-01-27      0.08  143.160004  147.229996  143.080002  145.929993   
2023-01-30     -0.02  144.960007  145.550003  142.850006  143.000000   
2023-01-31      0.07  142.699997  144.339996  142.279999  144.289993   
2023-02-01      0.02  143.970001  146.610001  141.320007  145.429993   

             Adj Close      Volume  
Date                      

  todaysDataset = pd.concat([newHistoricSentiment, newHistoricStock], axis=1)


In [96]:
# Find overlap between both sets
for dates in todaysDataset.index:
    print(dates)

2022-03-01
2022-03-02
2022-03-03
2022-03-04
2022-03-07
2022-03-08
2022-03-09
2022-03-10
2022-03-17
2022-03-18
2022-03-21
2022-03-22
2022-03-23
2022-04-01
2022-04-04
2022-04-05
2022-04-06
2022-04-07
2022-04-18
2022-04-19
2022-04-20
2022-04-21
2022-04-22
2022-05-02
2022-05-03
2022-05-04
2022-05-05
2022-05-06
2022-05-09
2022-05-16
2022-05-17
2022-05-18
2022-05-19
2022-05-20
2022-05-23
2022-05-31
2022-06-01
2022-06-02
2022-06-03
2022-06-06
2022-06-07
2022-06-15
2022-06-16
2022-06-17
2022-06-21
2022-06-22
2022-06-23
2022-06-24
2022-06-30
2022-07-01
2022-07-05
2022-07-06
2022-07-07
2022-07-08
2022-07-15
2022-07-18
2022-07-19
2022-07-20
2022-07-21
2022-08-01
2022-08-02
2022-08-03
2022-08-04
2022-08-05
2022-08-08
2022-08-15
2022-08-16
2022-08-17
2022-08-18
2022-08-19
2022-08-22
2022-08-29
2022-08-30
2022-08-31
2022-09-01
2022-09-02
2022-09-06
2022-09-13
2022-09-14
2022-09-15
2022-09-16
2022-09-19
2022-09-28
2022-09-29
2022-09-30
2022-10-03
2022-10-13
2022-10-14
2022-10-17
2022-10-18
2022-10-19

# Preprocessing Dataset #

In [93]:
#Dataset is already split into historical (train) and current (test)
#  

# Model Creation #

In [94]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [95]:
model = keras.Sequential()
model.add(layers.LSTM(100, return_sequences=True, input_shape=(trainingDataset.shape[1], 1)))
model.add(layers.LSTM(100, return_sequences=False))
model.add(layers.Dense(25))
model.add(layers.Dense(1))
model.summary()

NameError: name 'trainingDataset' is not defined

In [None]:
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(trainingDataset, testSet, batch_size= 1, epochs=3)

ValueError: Data cardinality is ambiguous:
  x sizes: 33
  y sizes: 112
Make sure all arrays contain the same number of samples.