# Imports

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
import matplotlib.pylab as plt
from pandas.plotting import register_matplotlib_converters
%matplotlib inline
import re
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import seaborn as sns
import snscrape.modules.twitter as sntwitter
import pandas
from statsmodels.tsa.seasonal import seasonal_decompose
from arch.unitroot import *
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from statsmodels.tsa.forecasting.theta import ThetaModel

In [None]:
# Ticker symbol of the stock to download and forecast; also used to name the
# exported predictions file at the end of the notebook.
STOCK= "TSLA"

# Load Data

In [None]:
# Silence pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None

# Download window: the four years ending today.
end = datetime.now()
start = datetime(end.year-4, end.month, end.day)

# pdr_override only patches pandas_datareader, which yf.download does not need.
# It is called only when the installed yfinance still provides it, so a
# release without the helper does not break this cell.
if hasattr(yf, "pdr_override"):
    yf.pdr_override()
df = yf.download(STOCK, start=start, end=end, interval = "1d")

# Keep only the closing price and forward-fill gaps inside the trading data.
df = df[['Close']].ffill()
# Expand to one row per calendar day; days without a quote (weekends, holidays)
# repeat the previous close.
df = df.resample('D').ffill()
# Daily periods make the date-based alignment with the other data sets exact.
df.index = df.index.to_period(freq='D')
df

### Run every cell from here down to the heading that says STOP once; after the first complete run-through you do not need to run them again

In [None]:
#https://fred.stlouisfed.org/series/DAAA
dateColumn = "DATE"
new = "Bonds"

temp = pd.read_csv("DAAA.csv")
temp[dateColumn] = pd.DatetimeIndex(temp[dateColumn]).to_period('D')

# The value column (second column) uses "." for missing observations. Parse it
# as numbers, turning "." into NaN, then carry the previous observation
# forward so a missing day inherits the last published value.
bond_values = pd.to_numeric(temp.iloc[:, 1], errors="coerce").ffill()
bond_values.index = pd.PeriodIndex(temp[dateColumn], freq="D")
# reindex needs unique labels; keep the first row if a date is repeated.
bond_values = bond_values[~bond_values.index.duplicated(keep="first")]

# Align on df's daily index in a single step. This replaces the row-by-row
# `df[new].loc[date] = ...` chained assignment, which does not write through
# to df when pandas Copy-on-Write is active, and it cannot run past the end of
# the CSV when every CSV date precedes the first date in df.
df[new] = bond_values.reindex(df.index)

# Share of days without a bond observation before filling.
print(df[new].isna().sum()/len(df))
df[new] = df[new].ffill()
df

# Gathering Sentiment Data from Twitter

In [None]:
# Read the pre-scored tweets and index them by the calendar day they were posted.
tweets = pd.read_csv('tweetsPolarity.csv')
tweet_days = pd.DatetimeIndex(tweets["Datetime"]).to_period('D')
# Overwrite the column with its daily period, then move it into the index
# (set_index drops the column from the frame).
tweets = tweets.assign(Datetime=tweet_days).set_index("Datetime")
tweets

In [None]:
# Collapse the tweets to one row per day holding the mean polarity of that day.
# Grouping on the (daily) index includes every tweet of a day in its average,
# keeps a final day that has only one tweet, and cannot index past the end of
# the frame. Groups come out sorted by date, which the later date-alignment
# step relies on.
daily_polarity = tweets["Polarity"].groupby(level=0).mean()

compiledTweets = pd.DataFrame({
    "Polarity": daily_polarity.to_numpy(),
    "Date": daily_polarity.index,
})
compiledTweets

In [None]:
# Build a date-indexed series of daily polarity and align it with df's index.
daily_polarity_by_date = pd.Series(
    pd.to_numeric(compiledTweets["Polarity"]).to_numpy(),
    index=pd.PeriodIndex(compiledTweets["Date"], freq="D"),
)
# reindex needs unique labels; keep the first row if a date is repeated.
daily_polarity_by_date = daily_polarity_by_date[
    ~daily_polarity_by_date.index.duplicated(keep="first")
]

# One aligned assignment instead of `df["Polarity"].loc[date] = ...`: the
# chained form does not write through to df under pandas Copy-on-Write, and the
# manual pointer scan raised when every tweet date preceded df's first date.
df["Polarity"] = daily_polarity_by_date.reindex(df.index)

# Share of days without any tweet before filling.
print(df["Polarity"].isna().sum()/len(df))
# Days without tweets take the polarity of the next day that has one.
df["Polarity"] = df["Polarity"].bfill()
df

In [None]:
# Convert the daily PeriodIndex back to timestamps for the analysis and plots.
# Guarded so that re-running the cell is harmless: a DatetimeIndex has no
# to_timestamp method, so an unguarded second run would raise.
if isinstance(df.index, pd.PeriodIndex):
    df.index = df.index.to_timestamp()

# Analysis and Transformations

In [None]:
# Run the ADF unit-root test (from arch.unitroot) on the raw closing prices and
# display its summary table as the cell output.
adf_ct = ADF(df["Close"])
adf_ct.summary()

In [None]:
# Quick look at the raw closing-price series over the full download window.
plt.plot(df["Close"])

In [None]:
# Multiplicative seasonal decomposition of the closing price.
result = seasonal_decompose(df["Close"], model='multiplicative')
# result.plot() creates and returns its own figure, so no plt.figure() call is
# made beforehand (it would only leave an empty extra figure in the output).
fig = result.plot()
fig.set_size_inches(16, 9)
# Keep the trend component available for later cells.
trend = result.trend

In [None]:
# Transform the closing price with log followed by sqrt and keep the result as
# a one-column DataFrame named "Close".
df_transform = df["Close"].apply(np.log).apply(np.sqrt).to_frame()
plt.figure(figsize = (10,6))
plt.plot(df_transform["Close"])

#DIFFERENCING
# First difference of the transformed price; the first row has no predecessor
# and is removed.
df_shift = df_transform.diff()
df_shift.dropna(inplace=True)

# Copy every non-price column of df, untransformed, into both frames
# (assignment aligns on the date index).
extra_columns = [name for name in df.columns if name != "Close"]
for name in extra_columns:
    df_transform[name] = df[name]
    df_shift[name] = df[name]

In [None]:
# Repeat the ADF unit-root test on the transformed, differenced closing price
# and display its summary table.
adf_ct = ADF(df_shift["Close"])
adf_ct.summary()

In [None]:
sc = StandardScaler()
mm = MinMaxScaler(feature_range =(-1, 1))

# Scale the bond yields to [-1, 1] for the rows that survive in df_shift.
# Rows are selected by df_shift's own index rather than the positional slice
# [1:], so the lengths still match if differencing dropped more than the first
# row. ravel() turns the (n, 1) scaler output into the 1-D column pandas expects.
bonds_for_shift = df.loc[df_shift.index, ["Bonds"]]
df_shift["Bonds"] = mm.fit_transform(bonds_for_shift).ravel()


In [None]:
# Make sure the bond yields are floats before the numeric transforms that
# follow; values taken from the CSV may be stored as strings because the file
# uses "." for missing observations.
df["Bonds"] = df["Bonds"].astype(float)

In [None]:
# Apply the same log-then-sqrt transform that is used for Close. The log has to
# feed into the sqrt: taking the sqrt of the raw column would silently discard
# the log step.
# NOTE(review): yields <= 1 give NaN here (log <= 0) and those rows are then
# removed by the dropna below — confirm the yield range of the data.
df_transform["Bonds"] = np.sqrt(np.log(df["Bonds"]))

#DIFFERENCING
df_shift["Bonds"] = df_transform["Bonds"].diff()
# Drops every row of df_shift that still has a NaN in any column.
df_shift.dropna(inplace=True)

In [None]:
# 2x2 grid with shared axes; the first three columns of df_shift each get one
# panel and panel (1, 0) stays empty.
fig, axs = plt.subplots(2,2, sharex=True, sharey=True, figsize = (10,10))

# Grid position (row, col) for df_shift column 0, 1 and 2 respectively.
panel_positions = [(0, 1), (1, 1), (0, 0)]
for column_number, (grid_row, grid_col) in enumerate(panel_positions):
    panel = axs[grid_row, grid_col]
    panel.plot(df_shift.index, df_shift.iloc[:, column_number])
    panel.set_title(df_shift.columns[column_number])

# Show tick labels only on the outer edge of the grid.
for ax in fig.get_axes():
    ax.label_outer()


### LSTM Component

In [None]:
#where the test will occur; set to 0 if forecasting out-of-sample :)
boundary = 0

# Model inputs (three features per day) and the target (differenced Close).
SETX = df_shift[["Close", "Polarity", "Bonds"]]
SETY = df_shift["Close"].to_numpy()

# Each sample uses nPast consecutive days to predict the following nFuture days.
nPast = 60
nFuture = 20

# Window i covers rows [i, i+nPast) as input and [i+nPast, i+nPast+nFuture) as
# target, so the last valid start is len - nPast - nFuture (hence the +1).
feature_matrix = SETX.to_numpy()
n_windows = len(df_shift) - (nPast + nFuture) + 1
if n_windows < 1:
    raise ValueError(
        f"Need at least {nPast + nFuture} rows to build one training window, "
        f"got {len(df_shift)}"
    )

# X_train: (n_windows, nPast, n_features); y_train: (n_windows, nFuture).
X_train = np.stack([feature_matrix[i: i+nPast] for i in range(n_windows)])
y_train = np.stack([SETY[i+nPast: i+nPast+nFuture] for i in range(n_windows)])


def vanilla_multistep_LSTM(n_past=None, n_features=None, n_future=None, units=50):
    """Build an uncompiled single-layer LSTM that emits a multi-step forecast.

    Parameters
    ----------
    n_past : int, optional
        Number of time steps in each input window. Defaults to the
        notebook-level ``nPast``.
    n_features : int, optional
        Number of features per time step. Defaults to ``X_train.shape[2]``.
    n_future : int, optional
        Number of steps predicted at once (size of the output layer).
        Defaults to the notebook-level ``nFuture``.
    units : int, optional
        Number of LSTM units. Defaults to 50.

    Returns
    -------
    Sequential
        LSTM layer followed by a Dense layer with ``n_future`` outputs.
    """
    # Fall back to the notebook-level settings so a bare call keeps working.
    if n_past is None:
        n_past = nPast
    if n_features is None:
        n_features = X_train.shape[2]
    if n_future is None:
        n_future = nFuture

    model = Sequential()
    model.add(LSTM(units=units, input_shape=(n_past, n_features)))
    model.add(Dense(n_future))
    return model

# Fix the Python, NumPy and TensorFlow seeds before the weights are created so
# that a fresh "Restart & Run All" gives the same fit and the same forecast.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

model = vanilla_multistep_LSTM()
model.summary()
# MSE is the training loss; MAE and RMSE are reported for monitoring only.
model.compile(optimizer='adam', 
              loss='mean_squared_error',
              metrics=[tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.RootMeanSquaredError()])
model.fit(X_train, 
          y_train, 
          epochs=20, 
          batch_size = 32)

# In-sample predictions: one nFuture-step forecast per training window.
y_pred = model.predict(X_train)

### Adding Theta Model Component

In [None]:
# Fit a Theta model on the raw closing prices.
tm = ThetaModel(df["Close"])
theta = tm.fit()
print(theta.summary())
# Forecast the same horizon as the LSTM output; the two forecasts are combined
# element-wise later, so their lengths must match.
thetaResults = theta.forecast(steps = nFuture)
print(thetaResults)

In [None]:
# Predict the next nFuture differenced values from the most recent nPast rows.
# (y_pred[0] is the model output for the oldest training window, i.e. a stretch
# of history, so it cannot serve as the out-of-sample forecast.)
last_window = SETX.iloc[-nPast:].to_numpy()[np.newaxis, ...]
y_pred_days = model.predict(last_window)[0]

# Bring the Theta forecast into the transformed space (log, then sqrt), add the
# accumulated predicted differences, and invert the transform (square, then exp).
thetaResultsTransform = thetaResults.apply(np.log).apply(np.sqrt)
pred_diff_cumsum = y_pred_days.cumsum()
pred_tf = thetaResultsTransform.add(pred_diff_cumsum,fill_value=0)
pred_log = pred_tf.apply(np.square)
pred = pred_log.apply(np.exp)

# LSTM-only forecast: start from the last observed transformed close and add
# the accumulated predicted differences. .iloc[-1] selects the last row by
# position (integer [] lookup on a date-indexed Series is ambiguous).
baseNum = df_transform["Close"].iloc[-1]
# The first forecast step belongs to the day after the last observation.
idx = pd.date_range(df.index[-1] + timedelta(days=1), periods=nFuture, freq="D")
testPredSeries = pd.Series(baseNum, index = idx)
testPred = testPredSeries.add(pred_diff_cumsum, fill_value = 0)
testPred_log = testPred.apply(np.square)
testPredFinal = testPred_log.apply(np.exp)


plt.figure(figsize=(10,6))
plt.grid(True)
plt.xlabel('Dates')
plt.ylabel('Closing Prices')
plt.plot(df["Close"][-nPast-boundary:],'b',label='Actual prices')
plt.plot(pred, '-o',color='orange',label='Predicted prices')
# Labels added so these two lines appear in the legend as well.
plt.plot(thetaResults, color = "green", label='Theta forecast')
plt.plot(testPredFinal, color = "red", label='LSTM-only forecast')

plt.legend()

# Write the combined forecast to disk, with the column named after the ticker.
export = pd.DataFrame(pred)
export = export.rename(columns = {"forecast": STOCK})
export.to_csv(f"{STOCK} predictions")