In [3]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu,True)

In [4]:
gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
import numpy as np
import pandas as pd

%matplotlib inline
from sklearn import metrics

In [6]:
from numpy.random import seed

seed(1)
from tensorflow import random

random.set_seed(2)

In [7]:
df = pd.read_csv('/content/drive/MyDrive/Futurense Hackaton/AAPL_sentiment.csv', index_col="Date", infer_datetime_format=True, parse_dates=True)

df.dropna(inplace=True)
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,ts_polarity,twitter_volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-08-26,51.47,51.8,51.26,51.62,51.12,104174400,0.07234,888.0
2019-08-27,51.97,52.14,50.88,51.04,50.54,103493200,0.117541,962.0
2019-08-28,51.03,51.43,50.83,51.38,50.88,63755200,0.061477,895.0
2019-08-29,52.13,52.33,51.67,52.25,51.74,83962000,0.05646,1083.0
2019-08-30,52.54,52.61,51.8,52.19,51.67,84573600,0.106096,1005.0


In [8]:
df.index

DatetimeIndex(['2016-01-04', '2016-01-05', '2016-01-06', '2016-01-07',
               '2016-01-08', '2016-01-11', '2016-01-12', '2016-01-13',
               '2016-01-14', '2016-01-15',
               ...
               '2019-08-19', '2019-08-20', '2019-08-21', '2019-08-22',
               '2019-08-23', '2019-08-26', '2019-08-27', '2019-08-28',
               '2019-08-29', '2019-08-30'],
              dtype='datetime64[ns]', name='Date', length=922, freq=None)

In [9]:
# Dataframe with Adj close, ts_polarity, twitter_volume of APPL
df = df[["Adj Close", "ts_polarity", "twitter_volume"]]
df.head()

Unnamed: 0_level_0,Adj Close,ts_polarity,twitter_volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-04,24.44,0.070389,1133.0
2016-01-05,23.83,0.133635,1430.0
2016-01-06,23.36,0.072042,1949.0
2016-01-07,22.38,0.074369,2289.0
2016-01-08,22.5,0.051595,2235.0


In [10]:
df.shape

(922, 3)

In [11]:
# pct change based on Adj close value
df["Pct_change"] = df["Adj Close"].pct_change()

# Drop null values
df.dropna(inplace = True)
df.head()

Unnamed: 0_level_0,Adj Close,ts_polarity,twitter_volume,Pct_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-05,23.83,0.133635,1430.0,-0.024959
2016-01-06,23.36,0.072042,1949.0,-0.019723
2016-01-07,22.38,0.074369,2289.0,-0.041952
2016-01-08,22.5,0.051595,2235.0,0.005362
2016-01-11,22.86,0.019443,1222.0,0.016


In [12]:
# This function "window_data" accepts the column number for the features (X) and the target (y)
# It chunks the data up with a rolling window of Xt-n to predict Xt
# It returns a numpy array of X any y
def window_data(df, window, feature_col_number1, feature_col_number2, feature_col_number3, target_col_number):
    # Create empty lists "X_close", "X_polarity", "X_volume" and y
    X_close = []
    X_polarity = []
    X_volume = []
    y = []
    for i in range(len(df) - window):
        
        # Get close, ts_polarity, tw_vol, and target in the loop
        close = df.iloc[i:(i + window), feature_col_number1]
        ts_polarity = df.iloc[i:(i + window), feature_col_number2]
        tw_vol = df.iloc[i:(i + window), feature_col_number3]
        target = df.iloc[(i + window), target_col_number]
        
        # Append values in the lists
        X_close.append(close)
        X_polarity.append(ts_polarity)
        X_volume.append(tw_vol)
        y.append(target)
        
    return np.hstack((X_close,X_polarity,X_volume)), np.array(y).reshape(-1, 1)

In [13]:
window_size = 3

feature_col_number1 = 0
feature_col_number2 = 1
feature_col_number3 = 2
target_col_number = 0
X, y = window_data(df, window_size, feature_col_number1, feature_col_number2, feature_col_number3, target_col_number)

In [14]:
X_split = int(0.7 * len(X))
y_split = int(0.7 * len(y))



X_train = X[: X_split]
X_test = X[X_split:]
y_train = y[: y_split]
y_test = y[y_split:]

In [15]:
X_split

642

In [16]:
df.iloc[641]

Adj Close          46.540000
ts_polarity         0.183509
twitter_volume    515.000000
Pct_change          0.000860
Name: 2018-07-23 00:00:00, dtype: float64

**Scaling Data**

In [17]:
from sklearn.preprocessing import MinMaxScaler

In [18]:
# Use the MinMaxScaler to scale data between 0 and 1.
x_train_scaler = MinMaxScaler()
x_test_scaler = MinMaxScaler()
y_train_scaler = MinMaxScaler()
y_test_scaler = MinMaxScaler()

# Fit the scaler for the Training Data
x_train_scaler.fit(X_train)
y_train_scaler.fit(y_train)

# Scale the training data
X_train = x_train_scaler.transform(X_train)
y_train = y_train_scaler.transform(y_train)

# Fit the scaler for the Testing Data
x_test_scaler.fit(X_test)
y_test_scaler.fit(y_test)

# Scale the y_test data
X_test = x_test_scaler.transform(X_test)
y_test = y_test_scaler.transform(y_test)

**Reshaping Data**

In [19]:
# Reshape the features for the model
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [21]:
model = Sequential()

number_units = 9
dropout_fraction = 0.2

model.add(LSTM(
    units=number_units,
    return_sequences=True,
    input_shape=(X_train.shape[1], 1))
    )
    
model.add(Dropout(dropout_fraction))


model.add(LSTM(units=number_units, return_sequences=True))
model.add(Dropout(dropout_fraction))

model.add(LSTM(units=number_units))
model.add(Dropout(dropout_fraction))

model.add(Dense(1))

In [22]:
model.compile(optimizer="adam", loss="mean_squared_error")

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 9, 9)              396       
                                                                 
 dropout (Dropout)           (None, 9, 9)              0         
                                                                 
 lstm_1 (LSTM)               (None, 9, 9)              684       
                                                                 
 dropout_1 (Dropout)         (None, 9, 9)              0         
                                                                 
 lstm_2 (LSTM)               (None, 9)                 684       
                                                                 
 dropout_2 (Dropout)         (None, 9)                 0         
                                                                 
 dense (Dense)               (None, 1)                 1

In [57]:
model.fit(X_train, y_train, epochs=30, shuffle=False, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fa83e5bfc70>

In [58]:
model.evaluate(X_test, y_test)



0.0048074619844555855

In [59]:
predicted = model.predict(X_test)



In [60]:
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, predicted)))
print('R-squared :', metrics.r2_score(y_test, predicted))



Root Mean Squared Error: 0.06933586269481862
R-squared : 0.9135548828495731


In [61]:
predicted_prices = y_test_scaler.inverse_transform(predicted)
real_prices = y_test_scaler.inverse_transform(y_test.reshape(-1, 1))

In [62]:
stocks = pd.DataFrame({
    "Real": real_prices.ravel(),
    "Predicted": predicted_prices.ravel()
}, index = df.index[-len(real_prices): ]) 

stocks.head()

Unnamed: 0_level_0,Real,Predicted
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-07-27,46.39,48.784634
2018-07-30,46.13,47.357277
2018-07-31,46.22,48.026573
2018-08-01,48.95,47.004257
2018-08-02,50.38,47.857052


In [63]:
stocks.tail()

Unnamed: 0_level_0,Real,Predicted
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-08-26,51.12,50.395844
2019-08-27,50.54,49.593517
2019-08-28,50.88,49.833157
2019-08-29,51.74,49.615726
2019-08-30,51.67,49.770031


In [64]:
print(stocks.columns)

Index(['Real', 'Predicted'], dtype='object')


In [65]:
# select a date range using the loc function
stock_monthly = stocks.loc['2019-06-27':'2019-08-30']

In [68]:
stock_monthly.to_csv('/content/drive/MyDrive/Futurense Hackaton/real-pred_2month_sentiment.csv', index=True)

In [None]:
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(30,10))
plt.plot(stock_monthly.index, stock_monthly["Real"], label="Real")
plt.plot(stock_monthly.index, stock_monthly["Predicted"], label="Predicted")

# add axis labels and legend
plt.xlabel("Date")
plt.ylabel("Price")
plt.legend()


# display the plot
plt.show()

In [50]:
import os
model.save(os.path.join('/content/drive/MyDrive/models','Apple_stock_market_model'))

