<a href="https://colab.research.google.com/github/ikikika/data_science/blob/stock_market_sentiment_analysis/stock_market_sentiment_analysis/stock_market_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Description: This notebook predicts whether a stock price (here, the DJIA data loaded below) will increase or decrease, based on the top news headlines

In [1]:
pip install vaderSentiment

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl (125kB)
[K     |██▋                             | 10kB 17.3MB/s eta 0:00:01[K     |█████▏                          | 20kB 2.2MB/s eta 0:00:01[K     |███████▉                        | 30kB 2.9MB/s eta 0:00:01[K     |██████████▍                     | 40kB 3.1MB/s eta 0:00:01[K     |█████████████                   | 51kB 2.5MB/s eta 0:00:01[K     |███████████████▋                | 61kB 2.8MB/s eta 0:00:01[K     |██████████████████▏             | 71kB 3.1MB/s eta 0:00:01[K     |████████████████████▉           | 81kB 3.4MB/s eta 0:00:01[K     |███████████████████████▍        | 92kB 3.6MB/s eta 0:00:01[K     |██████████████████████████      | 102kB 3.5MB/s eta 0:00:01[K     |████████████████████████████▋   | 112kB 3.5MB/s eta 0:00:01[K     |███████████████████████████████▏| 12

In [2]:
# Import libraries
import pandas as pd
import numpy as np
from textblob import TextBlob
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [8]:
# Read the two source CSV files straight from the project's GitHub repository
NEWS_CSV_URL = 'https://raw.githubusercontent.com/ikikika/data_science/stock_market_sentiment_analysis/stock_market_sentiment_analysis/Combined_News_DJIA.csv'
DJIA_TABLE_CSV_URL = 'https://raw.githubusercontent.com/ikikika/data_science/stock_market_sentiment_analysis/stock_market_sentiment_analysis/upload_DJIA_table.csv'

df1 = pd.read_csv(NEWS_CSV_URL)
df2 = pd.read_csv(DJIA_TABLE_CSV_URL)

In [9]:
# Merge the two datasets on their shared 'Date' column, keeping only dates
# present in both (inner join).
# `on` already names the join key, so `left_index=True` must not be passed as
# well: current pandas rejects that combination with a MergeError, and on older
# versions that accepted it the result index was not guaranteed to be 0..n-1.
# Joining on the column alone yields a fresh 0..n-1 index.
merge = df1.merge(df2, how='inner', on='Date')

In [11]:
# For every row, join the values in column positions 2..26 (presumably the
# individual headline columns - confirm against the CSV) into a single
# space-separated string. str() is applied so non-string cells can be joined.
headlines = [
  ' '.join(map(str, merge.iloc[row_pos, 2:27]))
  for row_pos in range(len(merge.index))
]

In [13]:
# Clean data
clean_headlines = []

for headline in headlines:
  # Strip the b' / b" prefix (presumably left over from bytes literals in the
  # source data) only where `b` starts a token. The earlier patterns "b[(')]"
  # and 'b[(")]' also matched inside words (e.g. the b' in "club's") and
  # matched "b(" / "b)", because parentheses are literal inside [...].
  text = re.sub(r"\bb['\"]", '', headline)
  # Remove every remaining apostrophe, as the former "\'" substitution did.
  clean_headlines.append(text.replace("'", ''))

In [16]:
# Attach the cleaned, combined headline text to the merged dataset as a new
# 'Combined_News' column. A list is assigned by position, so clean_headlines
# must hold one entry per row of `merge`, in row order.

merge['Combined_News'] = clean_headlines

In [19]:
# Create function to get subjectivity

def getSubjectivity(text):
  """Return the subjectivity component of TextBlob's sentiment for `text`."""
  blob = TextBlob(text)
  return blob.sentiment.subjectivity

# Create function to get polarity

def getPolarity(text):
  """Return the polarity component of TextBlob's sentiment for `text`."""
  blob = TextBlob(text)
  return blob.sentiment.polarity

In [20]:
# Add one column per TextBlob score, each computed from the combined headline
# text of the same row.
for column, scorer in (('Subjectivity', getSubjectivity), ('Polarity', getPolarity)):
  merge[column] = merge['Combined_News'].apply(scorer)

In [22]:
# Create function to get sentiment scores
def getSIA(text):
  """Return the VADER sentiment scores for `text`.

  The result is whatever `SentimentIntensityAnalyzer.polarity_scores` returns;
  the cells below read its 'compound', 'neg', 'neu' and 'pos' entries.

  A single analyzer is created on first use and reused afterwards, so calling
  this once per row does not rebuild the analyzer every time.
  """
  sia = getattr(getSIA, '_analyzer', None)
  if sia is None:
    # Cache the instance on the function itself to avoid a new module-level name
    sia = getSIA._analyzer = SentimentIntensityAnalyzer()
  return sia.polarity_scores(text)

In [23]:
# Get sentiment scores for each day
compound = []
neg = []
pos = []
neu = []

# Iterate over the column's values in row order. The previous
# `merge['Combined_News'][i]` looked rows up by index *label*, which only
# matches row position when the index is exactly 0..n-1; otherwise the scores
# end up attached to the wrong rows when the lists are assigned back by position.
for text in merge['Combined_News']:
  scores = getSIA(text)
  compound.append(scores['compound'])
  neg.append(scores['neg'])
  neu.append(scores['neu'])
  pos.append(scores['pos'])

In [24]:
# Write each list of sentiment scores into its own column of the merged
# dataset; lists are assigned by position, one value per row.
for column, scores in (
  ('Compound', compound),
  ('Negative', neg),
  ('Neutral', neu),
  ('Positive', pos),
):
  merge[column] = scores

In [26]:
# Columns carried forward to modelling: price/volume fields, the sentiment
# scores and the target 'Label'. Note that 'Neutral' is computed above but is
# not part of this selection.
keep_columns = [
  'Open', 'High', 'Low', 'Volume',
  'Subjectivity', 'Polarity',
  'Compound', 'Negative', 'Positive',
  'Label',
]
df = merge.loc[:, keep_columns]

In [29]:
# Create feature dataset: every kept column except the target.
# `columns=` is used because passing the axis positionally (`drop([...], 1)`)
# is no longer accepted by pandas 2.x.
X = np.array(df.drop(columns=['Label']))

# Create target dataset
y = np.array(df['Label'])

In [31]:
# Hold out 20% of the rows for testing and train on the remaining 80%; the
# fixed random_state makes the split repeatable.
# NOTE(review): train_test_split shuffles rows by default. If the rows are in
# date order, this mixes later days into the training set - confirm that this
# is intended.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [32]:
# Build a Linear Discriminant Analysis classifier, then fit it on the
# training split.
model = LinearDiscriminantAnalysis()
model = model.fit(x_train, y_train)

In [33]:
# Predict a label for every row of the held-out test features; the result is
# stored for the metrics cell rather than displayed here.
predictions = model.predict(x_test)

In [35]:
# Compare the predictions with the true test labels and print the resulting
# classification report.
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.79      0.83       193
           1       0.82      0.88      0.85       205

    accuracy                           0.84       398
   macro avg       0.84      0.84      0.84       398
weighted avg       0.84      0.84      0.84       398

