## for runtime purposes

In [1]:
import time
a = time.time()

# Sentiment Analysis

## Import statements

In [2]:
import pandas as pd
import re
from functools import reduce
from textblob import TextBlob



## Save excel files from data cleaning stage into DataFrames and name the timestamp column

In [3]:
nike = pd.read_excel('nike.xlsx')
starbucks = pd.read_excel('starbucks.xlsx')
target = pd.read_excel('target.xlsx')

nike = nike.rename(columns = {'Unnamed: 0': 'timestamp'})
starbucks = starbucks.rename(columns = {'Unnamed: 0': 'timestamp'})
target = target.rename(columns = {'Unnamed: 0': 'timestamp'})

## Define getPolarity function that uses the TextBlob library to perform sentiment analysis on the comments

In [4]:
def getPolarity(text):
   return TextBlob(text).sentiment.polarity

## Define sentiment analysis function that cleans the string data and assigns each date a polarity score using the TextBlob library

In [5]:
def sentiment_analysis(df):
    list = []
    for x in df['comments']:
    
        # changes all of the characters to lowercase
        x = str.lower(x)
    
        # removes all special characters
        x = re.sub(r'[^a-zA-Z0-9\s]+', '', x)
    
        # adds the modified column values to a list
        list.append(x)

        # adds the nike_list to a new dataframe
        new_df = pd.DataFrame(list, columns =['comments'])

    # adds the timestamps to the new dataframe
    new_df = new_df.join(df['timestamp'])

    # performs sentiment analysis on the comments from each day
    new_df['polarity_score'] = new_df['comments'].apply(getPolarity)

    # rearrange column order
    new_df = new_df[['timestamp', 'polarity_score','comments']]

    # set timestamp as index
    new_df.set_index('timestamp')
    
    return new_df

## Creates the new dataframe using the sentiment analysis function on each company's dataframe

In [6]:
nike = sentiment_analysis(nike)

In [7]:
starbucks = sentiment_analysis(starbucks)

In [8]:
target = sentiment_analysis(target)

# Machine Learning

## Import Statements

In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_absolute_error

2023-12-07 22:34:39.360256: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## save the stock data into csv files

In [10]:
nike_stock = pd.read_csv('nike_stock.csv')
starbucks_stock = pd.read_csv('starbucks_stock.csv')
target_stock = pd.read_csv('target_stock.csv')

## combine the stock data and sentiment analysis data together. we chose the close price to be our output variable for the LSTM models

In [11]:
def combine_and_clean(stock, sa):
    
    # drop columns not needed from initial data retreival
    stock = stock.drop(['Open', 'High', 'Low', 'Adj Close', 'Volume'], axis = 1)
    
    # change columns to datetime type
    sa['timestamp'] = pd.to_datetime(sa['timestamp'])
    stock['Date'] = pd.to_datetime(stock['Date'])
    
    # perform an inner join to combine the dataframes by the dates available for both the sentiment analysis and the stock data
    sa = sa.merge(stock, how = 'inner', left_on='timestamp', right_on = 'Date')
    
    # drop the date column as it is the same as the timestamp column (since the join was performed) and drop the comments column as the sentiment analysis was already completed
    sa = sa.drop(['Date', 'comments'], axis = 1)
    
    # rename the Close column to close to match the lowercase in the other column names
    sa = sa.rename(columns = {'Close': 'close'})
    
    # rename the timestamp column to date
    sa = sa.rename(columns = {'timestamp':'date'})
    
    # Convert 'date' to numerical feature (number of days since the start)
    sa['days_since_start'] = (sa['date'] - sa['date'].min()).dt.days
        
    #return the cleaned dataframe
    return sa

In [12]:
nike = combine_and_clean(nike_stock, nike)
starbucks = combine_and_clean(starbucks_stock, starbucks)
target = combine_and_clean(target_stock, target)

## Create a long short term memory model for each of the companies that uses the date (normalized as date since the earliest date) and polarity_score from sentiment analysis as input variables and the close price as the output variable

In [13]:
def model(df):
    # feature selection
    features = df[['days_since_start', 'polarity_score', 'close']].values
    
    # scale the values between 0 and 1 (this is optimal for LSTM)
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(features)

    # Define nested function to create dataset with input features and target variable
    def create_dataset(data, length):
        # x represents the input features
        # y represents the output variables
        x, y = [], []
        for i in range(len(data) - length):
            a = data[i:(i + length), :]
            x.append(a)
            y.append(data[i + length, 2])  # 'close' is the third column
        return np.array(x), np.array(y)

    # determines the sequence length for each input
    length = 10

    # create dataset
    x, y = create_dataset(scaled, length)

    # split data into training and testing
    # test size is 20% of data
    # random_state = 42 means that the same training and testing sets are used across executions
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, shuffle=False)

    # build LSTM model
    # 2 LSTM layers 
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])))
    model.add(LSTM(units=50))
    # dense output layer
    model.add(Dense(units=1))

    # compile
    model.compile(optimizer='adam', loss='mean_squared_error')

    # train
    model.fit(x_train, y_train, epochs=50, batch_size=32)

    # evaluate
    test_loss = model.evaluate(x_test, y_test)

    # define variable for test_loss output
    test_loss_output = f'Test Loss: {test_loss}'

    # predict
    predictions = model.predict(x_test)

    # unscale the data so that it is back in the original form
    predictions = scaler.inverse_transform(np.concatenate((x_test[:, -1, 0:2], predictions.reshape(-1, 1)), axis=1))[:, 2]
    
    # evaluate
    actual_close = scaler.inverse_transform(np.concatenate((x_test[:, -1, 0:2], y_test.reshape(-1, 1)), axis=1))[:, 2]
  
    mae = mean_absolute_error(actual_close, predictions)
    mae_output = f'Mean Absolute Error: {mae}'
    return predictions, test_loss_output, mae_output

In [14]:
model(nike)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


(array([109.48851782, 109.03080442, 108.54427431, 108.08426195,
        107.66473546, 107.63872813, 107.26443003, 107.01468175,
        106.92678231, 107.06728977, 107.53760384, 108.6310498 ,
        109.62676993, 110.72901827, 111.79984684, 113.22537815,
        112.93016116, 111.95745611]),
 'Test Loss: 0.02926524542272091',
 'Mean Absolute Error: 4.644756034481897')

In [15]:
model(starbucks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


(array([102.08050462, 101.93891967, 101.76612167, 101.79896583,
        101.58724029, 102.19354372, 102.71302512, 103.0236493 ,
        103.64531514, 103.31294003, 102.45362562, 101.50394401,
        100.48122299,  99.77033838,  99.02616161,  99.39032537,
         99.61892862, 100.68793105, 102.90757821, 103.91374955,
        103.26707749, 102.76861607]),
 'Test Loss: 0.04375164210796356',
 'Mean Absolute Error: 4.073746967501189')

In [16]:
model(target)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


(array([133.08718162, 132.74066199, 130.88496608, 131.05109824,
        131.42109176, 131.54368264, 131.35630176, 130.21453942,
        130.28784351, 130.42551232, 131.00896165, 130.3600436 ,
        130.93389633, 130.45707288, 129.52146365, 130.83743408,
        130.51574579, 129.63202655, 129.1026258 , 129.26300171,
        129.59862743, 130.95766877, 130.62926199, 130.94545822,
        132.98812262, 135.43499155, 139.92760061]),
 'Test Loss: 0.015270217321813107',
 'Mean Absolute Error: 5.4967920476471575')

In [17]:
# runtime in seconds
b= time.time()
b - a

27.282397031784058