# Imports

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
import matplotlib.pylab as plt
from pandas.plotting import register_matplotlib_converters
%matplotlib inline
import re
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import seaborn as sns
import snscrape.modules.twitter as sntwitter
import pandas
from statsmodels.tsa.seasonal import seasonal_decompose
from arch.unitroot import *
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from statsmodels.tsa.forecasting.theta import ThetaModel
from statsmodels.tsa.arima_model import ARIMA

# Load Data

In [None]:
# Silence pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None

# Download daily AAPL prices for the four years ending now.
end = datetime.now()
start = datetime(end.year-4, end.month, end.day)
yf.pdr_override()
df = yf.download("AAPL", start=start, end=end, interval="1d")

# Keep only the closing price, fill gaps, then upsample to one row per
# calendar day; days without a quote repeat the previous close.
df = df[['Close']].ffill()
df = df.resample('D').ffill()
# Index by daily periods rather than timestamps.
df.index = df.index.to_period(freq='D')
df

In [None]:
#https://www.macrotrends.net/2516/wti-crude-oil-prices-10-year-daily-chart
dateColumn = "date"
new = "Oil"

# header=8: column names are read from row index 8 of the file; the lines
# above it are ignored.
temp = pd.read_csv("crudeOilPrices.csv", header = 8)
temp[dateColumn] = pd.DatetimeIndex(temp[dateColumn]).to_period('D')

# Build a date -> value lookup from the second CSV column. Duplicate dates
# keep their first value so the lookup index is unique.
prices = pd.Series(temp.iloc[:, 1].to_numpy(), index=pd.PeriodIndex(temp[dateColumn]))
prices = prices[~prices.index.duplicated(keep="first")]
# Use the same key type as df's index (daily periods or timestamps) so the
# lookup works whichever form the index currently has.
if not isinstance(df.index, pd.PeriodIndex):
    prices.index = prices.index.to_timestamp()

# One aligned assignment instead of a chained `df[new].loc[date] = ...`
# per row, which is not guaranteed to write back into df.
df[new] = prices.reindex(df.index)

# Share of days without a quote, reported before gaps are filled.
print(df[new].isna().sum()/len(df))
df[new] = df[new].ffill()
df

In [None]:
#https://stockanalysis.com/stocks/amzn/financials/balance-sheet/trailing/
# NOTE(review): the URL above points at AMZN while the file read below is
# named for Apple - confirm which source the sheet was exported from.
temp = pd.read_csv("apple stock analysis - Sheet1.csv")

# Flip the sheet so each row is one reporting date, and promote the first
# row of the flipped sheet (presumably the metric names) to the header.
temp = temp.transpose()
temp.columns = temp.iloc[0]
temp = temp.iloc[1:]
temp = temp.drop(["Other Current Assets","Net Cash / Debt Growth"], axis = 1)
temp = temp.iloc[::-1]
# Copy the last row into a row labelled 2022-12-31.
temp.loc["2022-12-31"] = temp.iloc[len(temp)-1]
# Snap every reporting date to the first day of its calendar quarter.
temp.index= pd.DatetimeIndex(temp.index).to_period('Q').to_timestamp()

# Drop quarters that end before the price history starts. The quarter that
# contains the first price date is kept so those days still get values.
first_day = df.index[0]
if isinstance(first_day, pd.Period):
    first_day = first_day.to_timestamp()
first_quarter = first_day.to_period("Q").to_timestamp()
temp = temp[temp.index >= first_quarter]

# Strip thousands separators and percent signs, then convert to numbers.
temp = temp.replace(r"[,%]", "", regex=True).astype(float)

analysisData = list(temp.columns)

# Give every day in df the figures of the quarter it falls in; days whose
# quarter is not in the table stay NaN.
quarterly = temp.copy()
quarterly.index = quarterly.index.to_period("Q")
quarterly = quarterly[~quarterly.index.duplicated(keep="first")]
if isinstance(df.index, pd.PeriodIndex):
    day_quarters = df.index.asfreq("Q")
else:
    day_quarters = df.index.to_period("Q")
aligned = quarterly.reindex(day_quarters)
for name in analysisData:
    df[name] = aligned[name].to_numpy()

df
    

In [None]:
# Draw every fundamentals column in a grid, numRowPlot panels per grid row.
numRowPlot = 4
numPanels = len(analysisData)
# Just enough grid rows to hold all panels (ceiling division, at least one).
numGridRows = max(1, -(-numPanels // numRowPlot))
# Keep the per-row height of the original 100-inch figure, which allotted
# one grid row per metric.
figHeight = 100 * numGridRows / max(numPanels, 1)
# squeeze=False keeps axs two-dimensional even when there is a single row.
fig, axs = plt.subplots(numGridRows, numRowPlot, sharex=True, sharey=True,
                        figsize = (10, figHeight), squeeze=False)

for i, name in enumerate(analysisData):
    ax = axs[i // numRowPlot, i % numRowPlot]
    ax.plot(df.index, df.loc[:, name])
    ax.set_title(name)

In [None]:
#https://fred.stlouisfed.org/series/DAAA
dateColumn = "DATE"
new = "Bonds"

temp = pd.read_csv("DAAA.csv")
temp[dateColumn] = pd.DatetimeIndex(temp[dateColumn]).to_period('D').to_timestamp()

# The value column marks missing observations with "."; turn those into NaN
# and carry the previous row's value forward, leaving a numeric column.
yields = pd.to_numeric(temp.iloc[:, 1], errors="coerce").ffill()
yields = pd.Series(yields.to_numpy(), index=pd.DatetimeIndex(temp[dateColumn]))
yields = yields[~yields.index.duplicated(keep="first")]
# Use the same key type as df's index (daily periods or timestamps) so the
# lookup works whichever form the index currently has.
if isinstance(df.index, pd.PeriodIndex):
    yields.index = yields.index.to_period("D")

# One aligned assignment instead of a chained `df[new].loc[date] = ...`
# per row, which is not guaranteed to write back into df.
df[new] = yields.reindex(df.index)

# Share of days without an observation, reported before gaps are filled.
print(df[new].isna().sum()/len(df))
df[new] = df[new].ffill()
df

# Gathering Sentiment Data from Twitter

In [None]:
# Accounts to read from and keywords to search for within each account.
sources = ["ETTelecom", "CNBCnow", "business", "GoldmanSachs", "pkedrosky", "ritholz", "DavidSchawel", "wallstreetmojo", "howardlindzon", "conorsen", "ReformedBroker", "mark_dow"]
topics = ["market", "stocks", "prices", "employment"]
# Maximum number of tweets kept per (source, topic) pair.
numTweets = 500
tweets_list = []

end = datetime.now()
# Six years back. DateOffset moves Feb 29 to Feb 28 when the target year is
# not a leap year, where datetime(year - 6, month, day) would raise.
start = (pd.Timestamp(end) - pd.DateOffset(years=6)).normalize()
formattedEnd = end.strftime("%Y-%m-%d")
formattedStart = start.strftime("%Y-%m-%d")
for source in sources:
    for topic in topics:
        query = f'{topic} since:{formattedStart} until:{formattedEnd} from:{source}'
        for i,tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            # i is 0-based, so stop once numTweets items have been kept.
            if i >= numTweets:
                break
            tweets_list.append([tweet.date, tweet.id, tweet.content, tweet.user.username])

def cleanTxt(text):
    """Return *text* with Twitter-specific noise removed.

    Removes, in order: @mentions, '#' characters (the tag word itself is
    kept), 'RT' followed by whitespace, http/https links, and characters in
    the emoji/symbol ranges listed below.
    """
    # Handles may contain letters, digits and underscores. The previous
    # class used an en dash instead of '-', so digits 1-8 were not matched.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text) #Removing @mentions
    text = re.sub(r'#', '', text) # Removing '#' hash tag
    text = re.sub(r'RT[\s]+', '', text) # Removing RT
    text = re.sub(r'https?://\S+', '', text) # Removing hyperlink
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002500-\U00002BEF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
        "]+", re.UNICODE)
    return emoj.sub('', text)
# Tabulate the scraped tweets, clean the text column, and save the table.
raw_tweets = pd.DataFrame(tweets_list, columns=['Datetime', 'Tweet Id', 'Text', "user"])
tweets_df = raw_tweets.assign(Text=raw_tweets["Text"].map(cleanTxt))
tweets_df.to_csv("tweets.csv")

In [None]:
def cleanTxt(text):
    """Return *text* with Twitter-specific noise removed.

    Removes, in order: @mentions, '#' characters (the tag word itself is
    kept), 'RT' followed by whitespace, http/https links, and characters in
    the emoji/symbol ranges listed below.
    """
    # Handles may contain letters, digits and underscores. The previous
    # class used an en dash instead of '-', so digits 1-8 were not matched.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text) #Removing @mentions
    text = re.sub(r'#', '', text) # Removing '#' hash tag
    text = re.sub(r'RT[\s]+', '', text) # Removing RT
    text = re.sub(r'https?://\S+', '', text) # Removing hyperlink
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002500-\U00002BEF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
        "]+", re.UNICODE)
    return emoj.sub('', text)
# Tabulate the scraped tweets, clean the text column, and save the table.
raw_tweets = pd.DataFrame(tweets_list, columns=['Datetime', 'Tweet Id', 'Text', "user"])
tweets_df = raw_tweets.assign(Text=raw_tweets["Text"].map(cleanTxt))
tweets_df.to_csv("tweets.csv")

In [None]:
# Reload the saved tweets and keep only the timestamp and text columns
# ("Unnamed: 0" is the row index written by the earlier to_csv call).
tweets = pd.read_csv("tweets.csv")
tweets = tweets.drop(["Tweet Id", "user", "Unnamed: 0"], axis = 1)

sia = SentimentIntensityAnalyzer()
# A tweet whose text was cleaned down to nothing is written as an empty CSV
# field and read back as NaN; score it as empty text instead of passing a
# float to the analyzer.
texts = tweets["Text"].fillna("").astype(str)
polarity = [sia.polarity_scores(text)["compound"] for text in texts]
tweets["Polarity"] = polarity

# Index by calendar day: take the date part, then convert to daily periods.
tweets.index = pd.to_datetime(tweets["Datetime"]).dt.date
tweets.index = pd.DatetimeIndex(tweets.index).to_period('D')
tweets = tweets.drop(["Datetime", "Text"], axis = 1)
tweets = tweets.sort_index()
tweets.to_csv("tweetsPolarity.csv")
tweets

In [None]:
# Reload the per-tweet polarity scores and index them by calendar day,
# expressed as timestamps.
polarity_table = pd.read_csv('tweetsPolarity.csv')
day_index = pd.DatetimeIndex(polarity_table["Datetime"]).to_period('D').to_timestamp()
tweets = polarity_table.drop(columns="Datetime")
tweets.index = day_index.rename("Datetime")
tweets

In [None]:
# Average the polarity of all tweets that share a day. Every tweet counts,
# including the last tweet of each day and the final row of the table, both
# of which the hand-written pointer walk left out.
dailyPolarity = tweets.groupby(level=0)["Polarity"].mean()
# Same layout as before: one row per day, columns "Polarity" and "Date",
# numbered from 0.
compiledTweets = pd.DataFrame({"Polarity": dailyPolarity.to_numpy(),
                               "Date": dailyPolarity.index})
compiledTweets

In [None]:
# Build a day -> average polarity lookup from the aggregated tweets.
dailyScores = pd.Series(compiledTweets["Polarity"].to_numpy(dtype=float),
                        index=pd.DatetimeIndex(compiledTweets["Date"]))
dailyScores = dailyScores[~dailyScores.index.duplicated(keep="first")]
# Use the same key type as df's index (daily periods or timestamps) so the
# lookup works whichever form the index currently has.
if isinstance(df.index, pd.PeriodIndex):
    dailyScores.index = dailyScores.index.to_period("D")

# One aligned assignment instead of a chained `df["Polarity"].loc[date] = ...`
# per row, which is not guaranteed to write back into df.
df["Polarity"] = dailyScores.reindex(df.index)

# Share of days without any tweet, reported before gaps are filled.
print(df["Polarity"].isna().sum()/len(df))
# NOTE(review): back-filling copies a later day's sentiment onto earlier
# days; confirm this look-ahead is acceptable for the forecasting setup.
df["Polarity"] = df["Polarity"].bfill()
df

In [None]:
import dataframe_image as dfi

# Save a picture of the first five rows of the merged table.
preview = df.head(5)
dfi.export(preview, "mytable.png")

In [None]:
# Switch df's index from daily periods to timestamps. Only convert when it
# is still a PeriodIndex, so re-running the cell does not raise.
if isinstance(df.index, pd.PeriodIndex):
    df.index = df.index.to_timestamp()

# Analysis and Transformations

In [None]:
# Run the ADF test (presumably provided by the `arch.unitroot` star import)
# on the untransformed closing prices and show its summary.
adf_ct = ADF(df["Close"])
adf_ct.summary()

In [None]:
# Line plot of the closing price against df's index.
plt.plot(df["Close"])

In [None]:
# Decompose the closing price with a multiplicative model and plot the
# components.
result = seasonal_decompose(df["Close"], model='multiplicative')
# result.plot() creates its own figure, so no separate plt.figure() call is
# made (it only produced an extra empty figure).
fig = result.plot()
fig.set_size_inches(16, 9)
# Keep the trend component available for later cells.
trend = result.trend

In [None]:
# Transform the closing price as sqrt(log(price)) and keep it as a
# one-column frame.
df_transform = np.sqrt(np.log(df["Close"])).to_frame()
plt.figure(figsize = (10,6))
plt.plot(df_transform["Close"])

# First difference of the transformed price; the first row has no
# predecessor and is dropped.
df_shift = df_transform.diff().dropna()

# Copy every non-price column of df, unchanged, into both frames.
for column in df.columns:
    if column == "Close":
        continue
    df_transform[column] = df[column]
    df_shift[column] = df[column]

In [None]:
# Repeat the ADF test on the differenced, transformed closing price and
# show its summary.
adf_ct = ADF(df_shift["Close"])
adf_ct.summary()

In [None]:
sc = StandardScaler()
mm = MinMaxScaler(feature_range =(-1, 1))

# Rescale each listed feature to [-1, 1] and store it in df_shift.
features = ["Oil", "Bonds"]
for feature in features:
    # Select df's rows by df_shift's index rather than by position, so the
    # result always has df_shift's length even after rows have been dropped.
    # NOTE(review): the scaler is fitted on every row, including the rows
    # later used for testing - confirm this is intended.
    scaled = mm.fit_transform(df.loc[df_shift.index, [feature]])
    df_shift[feature] = scaled.ravel()

In [None]:
# Make sure the Bonds column is stored as floats.
df["Bonds"] = df["Bonds"].astype(float)

In [None]:
features = ["Oil", "Bonds"]

for feature in features:
    # NOTE(review): this cell used to assign log(feature) and then overwrite
    # it with sqrt(feature) on the next line, so only the square root ever
    # took effect. That result is kept; confirm whether sqrt(log(x)), as
    # used for Close, was intended.
    df_transform[feature] = np.sqrt(df[feature])
    # First difference of the transformed feature.
    df_shift[feature] = df_transform[feature].diff()

# Drop the rows that differencing left without a value.
df_shift.dropna(inplace=True)


# df_transform["Oil"] = df["Oil"].apply(np.log)
# df_transform["Oil"] = df["Oil"].apply(np.sqrt)

# #DIFFERENCING
# df_shift["Oil"] = pd.DataFrame(df_transform["Oil"] - df_transform["Oil"].shift())
# df_shift.dropna(inplace=True)

In [None]:
# Plot every column of df_shift in a grid, numRows panels per grid row.
numRows = 4
numPanels = len(df_shift.columns)
# Just enough grid rows to hold all panels (ceiling division, at least one).
gridRows = max(1, -(-numPanels // numRows))
# squeeze=False keeps axs two-dimensional even when there is a single row.
fig, axs = plt.subplots(gridRows, numRows, sharex=True, sharey=True,
                        figsize = (10,5), squeeze=False)

# Iterate over df_shift's own columns, since that is the frame being drawn.
for i, name in enumerate(df_shift.columns):
    ax = axs[i // numRows, i % numRows]
    ax.plot(df_shift.index, df_shift.iloc[:, i])
    ax.set_title(name)

for ax in fig.get_axes():
    ax.label_outer()
    ax.tick_params(labelrotation=45)


# Building the Model

In [None]:
# NOTE(review): `options` is assigned in the following cell; run that cell
# first, otherwise this line raises NameError.
print(options)

In [None]:
# Model inputs are the selected differenced columns; the target is the
# differenced closing price.
options = ["Close", "Polarity", "Bonds"]
SETX = df_shift[options]
SETY = df_shift["Close"].to_numpy()

nPast = 60
nFuture = 20

# Slide over the series: each sample is nPast consecutive rows of inputs
# followed by the next nFuture target values.
windowStarts = range(len(df_shift) - (nPast+nFuture))
X = np.array([SETX[t: t+nPast] for t in windowStarts])
y = np.array([SETY[t+nPast: t+nPast+nFuture] for t in windowStarts])

nWindows, nSteps, nFeatures = X.shape
X = X.reshape((nWindows, nSteps, nFeatures))

# Size a 90/10 split would give the test set, printed for reference only.
print(len(X) - int(0.9*(len(X))))

# The split actually used: the last 150 windows are held out.
split = 150
X_train, X_test = X[:-split], X[-split:]
y_train, y_test = y[:-split], y[-split:]

train_original = df["Close"].iloc[:-split]
test_original = df["Close"].iloc[-split:]

plt.figure(figsize=(10,6))
plt.grid(True)
plt.xlabel('Dates')
plt.ylabel('Closing Prices')
for series, fmt, caption in ((train_original, 'b', 'Train data'),
                             (test_original, 'g', 'Test data')):
    plt.plot(series, fmt, label=caption)
plt.legend()

In [None]:
def basicLSTM():
    """Build an uncompiled model: one 50-unit LSTM layer and a Dense output.

    Reads the notebook-level nPast, nFuture and X_train: the input is nPast
    time steps of X_train.shape[2] features, the output has nFuture values.
    """
    model = Sequential()
    # Window length comes from nPast (the value used to build X) instead of
    # a hard-coded 60, matching vanilla_multistep_LSTM.
    model.add(LSTM(units=50, input_shape=(nPast, X_train.shape[2])))
    model.add(Dense(nFuture))
    return model

# Build the network, show its layers, then train it with MSE loss while
# tracking MAE and RMSE.
model = basicLSTM()
model.summary()
trackedMetrics = [tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.RootMeanSquaredError()]
model.compile(loss='mean_squared_error', optimizer='adam', metrics=trackedMetrics)
model.fit(X_train, y_train, batch_size=32, epochs=20)

In [None]:
# Score the trained model on the held-out windows; `results` holds the loss
# followed by the two compiled metrics.
print("Evaluate on test data")
results = model.evaluate(X_test, y_test, batch_size=32)
test_mse, test_mae, test_rmse = results[0], results[1], results[2]
print("Test MSE:", test_mse)
print("Test MAE:", test_mae)
print(f"Test RMSE: {test_rmse}")

# 10 epochs; test set size = 150 days

## Just close
Test MSE: 1.791499016690068e-05
Test MAE: 0.0029031261801719666
Test RMSE: 0.004232610110193491

## Close and polarity
Test MSE: 1.778067962732166e-05
Test MAE: 0.0028643100522458553
Test RMSE: 0.004216714296489954

## Close and polarity 
Test MSE: 1.811951915442478e-05
Test MAE: 0.00288763246499002
Test RMSE: 0.004256702959537506

## Close and oil
Test MSE: 2.3337508537224494e-05
Test MAE: 0.003509829519316554
Test RMSE: 0.0048308908008039

## Close, oil, polarity
Test MSE: 2.134706301148981e-05
Test MAE: 0.003339627990499139
Test RMSE: 0.00462028803303837


## Close, polarity, bonds (differenced)
Test MSE: 1.7875701814773493e-05
Test MAE: 0.002903253072872758
Test RMSE: 0.0042279670014977455

In [None]:
# Prediction
y_pred = model.predict(X_test)

# the_day selects which test window to inspect (0 is the first row of
# X_test); its nFuture-step forecast is compared with the true values.
the_day = 0
y_pred_days = y_pred[the_day,:]

plt.figure(figsize=(10,6))
plt.grid(True)
plt.plot(y_test[the_day,:],label='Orginal data - transformed')
plt.plot(y_pred_days, color='red',label='Predictions - transformed')
plt.xlabel('Time (days)')
plt.ylabel('Closing Prices amplitude in the transformed space')
plt.title('Original data vs predictions in the transformed space')
# Without this call the labels given above are never displayed.
plt.legend()

In [None]:
train_original = df["Close"].iloc[:-split]
test_original = df["Close"].iloc[-split:]

pred_diff_cumsum = y_pred_days.cumsum()
base_number = df_transform["Close"].values[-split+the_day+nPast-1]
idx = test_original.iloc[the_day:the_day+nFuture].index

pred_tf = pd.Series(base_number, index=idx)
pred_tf = pred_tf.add(pred_diff_cumsum,fill_value=0)

pred_log = pred_tf.apply(np.square)
pred = pred_log.apply(np.exp)
print(pred)

In [None]:
# Plot actual prices vs predicted prices 
plt.figure(figsize=(10,6))
plt.grid(True)
plt.xlabel('Dates')
plt.ylabel('Closing Prices')
plt.plot(test_original.iloc[max(0,the_day-30):the_day+nFuture],'b',label='Actual prices')
plt.plot(pred, '-o',color='orange',label='Predicted prices')

plt.legend()

# LSTM with Theta

### LSTM Component

In [None]:
# Display the differenced frame that the model inputs are built from.
df_shift

In [None]:
#where the test will occur; set to 0 if forecasting out-of-sample :)
# The last `boundary` rows are held out; the forecast starts right after the
# rows that remain.
boundary = 20

SETX = df_shift[["Close", "Polarity", "Bonds"]]
SETY = df_shift["Close"].to_numpy()

nPast = 60
nFuture = 20

featureRows = SETX.to_numpy()
# Position of the first held-out row (equals len(df_shift) when boundary is
# 0). Assumes df_shift and df end on the same date, as the cells that
# consume the forecast count `boundary` rows back from the end of df.
cutoff = len(df_shift) - boundary
if cutoff < nPast + nFuture:
    raise ValueError("boundary leaves too few rows to build a training window")

# Every nPast-row input window that ends at or before the cutoff. The last
# one ends exactly at the cutoff and is the forecast origin.
X = np.array([featureRows[i: i+nPast] for i in range(cutoff - nPast + 1)])
# Targets exist only for windows whose next nFuture rows also lie before the
# cutoff, so no held-out value is used for training.
y = np.array([SETY[i+nPast: i+nPast+nFuture]
              for i in range(cutoff - nPast - nFuture + 1)])

X_train = X[:len(y)]
y_train = y
# A single test window: the nPast rows immediately before the cutoff.
X_test = X[-1:]
# Held-out truth for that window; it has fewer than nFuture columns (or
# none) when boundary < nFuture.
y_test = SETY[cutoff: cutoff+nFuture][np.newaxis, :]

def vanilla_multistep_LSTM():
    """Build an uncompiled model: one 50-unit LSTM layer and a Dense output.

    Reads the notebook-level nPast, nFuture and X_train: the input is nPast
    time steps of X_train.shape[2] features, the output has nFuture values.
    """
    recurrent = LSTM(units=50, input_shape=(nPast, X_train.shape[2]))
    head = Dense(nFuture)
    model = Sequential()
    for layer in (recurrent, head):
        model.add(layer)
    return model

# Build the network, show its layers, train it with MSE loss while tracking
# MAE and RMSE, then predict for the test windows.
model = vanilla_multistep_LSTM()
model.summary()
trackedMetrics = [tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.RootMeanSquaredError()]
model.compile(loss='mean_squared_error', optimizer='adam', metrics=trackedMetrics)
model.fit(X_train, y_train, batch_size=32, epochs=10)

y_pred = model.predict(X_test)

### Adding Theta Model Component

In [None]:
# Fit the Theta model on every close before the held-out tail. The slice end
# is computed from the length so that boundary = 0 keeps the whole series
# ([:-0] would select nothing).
tm = ThetaModel(df["Close"].iloc[:len(df)-boundary])
theta = tm.fit()
print(theta.summary())
# Forecast as many steps as the LSTM predicts.
thetaResults = theta.forecast(steps = nFuture)
print(thetaResults)

In [None]:
y_pred_days = y_pred[0,:]

# Hybrid forecast: move the Theta forecast into the transformed space
# (sqrt(log(price))), add the LSTM's cumulative predicted differences, then
# map back to prices by squaring and exponentiating.
thetaResultsTransform = thetaResults.apply(np.log)
thetaResultsTransform = pd.Series(thetaResultsTransform.apply(np.sqrt))
pred_diff_cumsum = y_pred_days.cumsum()
pred_tf = thetaResultsTransform.add(pred_diff_cumsum,fill_value=0)
pred_log = pred_tf.apply(np.square)
pred = pred_log.apply(np.exp)

# LSTM-only forecast: start from the last transformed close before the
# held-out tail. .iloc selects by position; a bare integer key on a
# date-indexed Series relies on a deprecated fallback.
baseNum = df_transform["Close"].iloc[-boundary-1]
# First forecast day is the day after the last training row. This also
# holds when boundary is 0, where df.index[-boundary] would be the first row.
forecastStart = df.index[-boundary-1] + timedelta(days=1)
idx = [forecastStart + timedelta(days=i) for i in range(nFuture)]
testPredSeries = pd.Series(baseNum, index = idx)
testPred = testPredSeries.add(pred_diff_cumsum, fill_value = 0)
testPred_log = testPred.apply(np.square)
testPredFinal = testPred_log.apply(np.exp)


plt.figure(figsize=(10,6))
plt.grid(True)
plt.xlabel('Dates')
plt.ylabel('Closing Prices')
plt.plot(df["Close"].iloc[-nPast-boundary:],'b',label='Actual prices')
plt.plot(pred, '-o',color='orange',label='Predicted prices')
# Labels so every line appears in the legend.
plt.plot(thetaResults, color = "green", label='Theta forecast')
plt.plot(testPredFinal, color = "red", label='LSTM-only forecast')

plt.legend()

PREDICTIONS = pred