# Imports

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
import matplotlib.pylab as plt
from pandas.plotting import register_matplotlib_converters
%matplotlib inline
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import seaborn as sns
import snscrape.modules.twitter as sntwitter
import pandas
from statsmodels.tsa.seasonal import seasonal_decompose
from arch.unitroot import *
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM

# Load Stock Data

In [None]:
# Silence pandas' SettingWithCopyWarning for the whole notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None

# Four-year window of daily AAPL prices ending now.
end = datetime.now()
start = datetime(end.year-4, end.month, end.day)
yf.pdr_override()
df = yf.download("AAPL", start=start, end=end, interval="1d")

# Keep only the closing price and fill gaps inside the downloaded rows.
df = df[['Close']].ffill()
# Upsample to one row per calendar day (weekends/holidays repeat the last close)
# so the price series can later be joined with daily sentiment data.
df = df.resample('D').ffill()
df.index = df.index.to_period(freq='D')
df

In [None]:
# Load the S&P 500 history. The first CSV column holds the observation date;
# assumes exactly one value column follows it — TODO confirm against the file.
temp = pd.read_csv("SP500 (2).csv")
sp500_dates = pd.to_datetime(temp.iloc[:, 0], format='%Y-%m-%d')
temp = temp.drop(temp.columns[0], axis="columns")
temp.index = pd.DatetimeIndex(sp500_dates)

# "." marks a missing observation in the CSV: turn it into NaN before casting.
temp = temp.mask(temp == ".").astype(float)

# One row per calendar day, carrying the last known value forward
# (the second ffill closes the gaps left by the "." markers).
temp = temp.resample('D').ffill().ffill()
temp.index = temp.index.to_period('D')

# Align on the date index rather than on a positional slice, so that every
# day present in df receives the matching index level.
df["SP500"] = temp.iloc[:, 0].reindex(df.index)

temp

# Gathering Sentiment Data from Twitter 

In [None]:
# Accounts to scrape and the keywords searched within each account's timeline.
sources = ["ETTelecom", "CNBCnow", "business", "GoldmanSachs", "pkedrosky", "ritholz", "DavidSchawel", "wallstreetmojo", "howardlindzon", "conorsen", "ReformedBroker", "mark_dow"]
topics = ["market", "stocks", "prices", "employment"]
# Upper bound on tweets kept per (source, topic) pair.
numTweets = 500
tweets_list = []

# Six-year search window ending today. DateOffset clamps 29 February to
# 28 February instead of raising as datetime(end.year - 6, ...) would.
end = datetime.now()
start = pd.Timestamp(end).normalize() - pd.DateOffset(years=6)
formattedEnd = end.strftime("%Y-%m-%d")
formattedStart = start.strftime("%Y-%m-%d")


for source in sources:
    for topic in topics:
        for i,tweet in enumerate(sntwitter.TwitterSearchScraper(f'{topic} since:{formattedStart} until:{formattedEnd} from:{source}').get_items()):
            # i is zero-based, so stop once exactly numTweets items were stored.
            if i >= numTweets:
                break
            tweets_list.append([tweet.date, tweet.id, tweet.content, tweet.user.username])

def cleanTxt(text):
    """Strip Twitter-specific noise from a tweet.

    Removes, in this order: @mentions, '#' characters (the tag word itself is
    kept), the retweet marker "RT" followed by whitespace, http(s) links, and
    emoji / pictographic symbols.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Surrounding whitespace is not trimmed.
    """
    # Handles may contain letters, digits and underscores; \w covers all of them.
    text = re.sub(r'@\w+', '', text)
    # Drop only the hash sign so the tag word still contributes to sentiment.
    text = re.sub(r'#', '', text)
    # \b keeps words that merely end in "RT" (e.g. "SMART") intact.
    text = re.sub(r'\bRT\s+', '', text)
    text = re.sub(r'https?://\S+', '', text)
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002500-\U00002BEF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
        "]+", re.UNICODE)
    return emoj.sub('', text)

# Turn the scraped rows into a frame, clean the tweet text, and save the
# result so the sentiment cells can run without scraping again.
tweet_columns = ['Datetime', 'Tweet Id', 'Text', "user"]
tweets_df = pd.DataFrame(tweets_list, columns=tweet_columns)
cleaned_text = tweets_df["Text"].apply(cleanTxt)
tweets_df["Text"] = cleaned_text
tweets_df.to_csv("tweets.csv")

In [None]:
# Reload the scraped tweets and drop the columns that are not needed for scoring.
tweets = pd.read_csv("tweets.csv")
tweets = tweets.drop(["Tweet Id", "user", "Unnamed: 0"], axis = 1)
# A tweet that cleaning reduced to an empty string is read back from the CSV
# as NaN; there is nothing to score, and passing NaN to VADER would raise.
tweets = tweets.dropna(subset=["Text"])

# VADER's "compound" score summarises each tweet as one polarity value.
sia = SentimentIntensityAnalyzer()
polarity = [sia.polarity_scores(text)["compound"] for text in tweets["Text"]]
tweets["Polarity"] = polarity

# Index by calendar day (time of day discarded) so tweets can be grouped per day.
tweet_days = pd.to_datetime(tweets["Datetime"]).dt.date
tweets.index = pd.DatetimeIndex(tweet_days).to_period('D')
tweets = tweets.drop(["Datetime", "Text"], axis = 1)
tweets = tweets.sort_index()
tweets

In [None]:
# Average the polarity of all tweets posted on the same day.
# groupby includes every tweet of each day; the previous hand-written pointer
# loop left the last tweet of a multi-tweet day out of that day's average.
# The result keeps the original layout: columns "Polarity" and "Date" on a
# 0..n-1 integer index, one row per day, in chronological order.
compiledTweets = (
    tweets.groupby(level=0)["Polarity"]
    .mean()
    .rename_axis("Date")
    .reset_index()[["Polarity", "Date"]]
)
compiledTweets

In [None]:
# Daily polarity as a Series keyed by day, so it can be aligned with df's index.
daily_polarity = pd.Series(
    pd.to_numeric(compiledTweets["Polarity"]).to_numpy(),
    index=pd.PeriodIndex(compiledTweets["Date"], freq="D"),
)
# reindex needs unique labels; keep the first entry should a day appear twice.
daily_polarity = daily_polarity[~daily_polarity.index.duplicated(keep="first")]

# Label-based alignment replaces the chained assignment
# df.loc[date]["Polarity"] = ..., which may write to a temporary copy and
# leave df unchanged. Days without tweets become NaN.
df["Polarity"] = daily_polarity.reindex(df.index)

# Share of days that have no tweet-based polarity.
print(df["Polarity"].isna().sum()/len(df))
# Fill tweet-less days with the next available value.
# NOTE(review): back-filling copies later sentiment onto earlier days, and any
# days after the last tweet stay NaN — confirm this is intended.
df["Polarity"] = df["Polarity"].bfill()

In [None]:
# Convert the day-period index to timestamps for the analysis and plots below.
# The guard makes the cell safe to re-run after the index is already converted.
if isinstance(df.index, pd.PeriodIndex):
    df.index = df.index.to_timestamp()

# Analysis and Transformations

In [None]:
# Augmented Dickey-Fuller unit-root test on the untransformed closing prices.
close_prices = df["Close"]
adf_ct = ADF(close_prices)
adf_ct.summary()

In [None]:
# Closing-price history with a title and axis labels so the figure stands alone.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df["Close"])
ax.set(title="AAPL closing price", xlabel="Date", ylabel="Close")
ax.grid(True);

In [None]:
# Multiplicative seasonal decomposition of the closing price.
result = seasonal_decompose(df["Close"], model='multiplicative')
# result.plot() creates its own figure; the separate plt.figure() call only
# produced an extra empty figure, so it is dropped.
fig = result.plot()
fig.set_size_inches(16, 9)

In [None]:
# Compress the price scale: logarithm first, then square root.
df_transform = np.sqrt(np.log(df["Close"]))

plt.figure(figsize = (10,6))
plt.plot(df_transform)

# First difference of the transformed series; the leading NaN produced by
# differencing is removed.
df_shift = df_transform.diff().dropna()

In [None]:
# Repeat the Augmented Dickey-Fuller test on the differenced, transformed series.
differenced_series = df_shift
adf_ct = ADF(differenced_series)
adf_ct.summary()

# Building the Model

In [None]:
def preprocess_multistep_lstm(sequence, n_steps_in, n_steps_out, features):
    """Cut a sequence into overlapping (input window, target window) pairs.

    Window ``i`` uses ``sequence[i : i + n_steps_in]`` as input and the
    ``n_steps_out`` values that immediately follow it as target.

    Parameters
    ----------
    sequence : array-like
        Ordered observations; the first axis is time.
    n_steps_in : int
        Number of past steps in each input window.
    n_steps_out : int
        Number of future steps in each target window.
    features : int
        Number of features per time step; sets the last axis of ``X``.

    Returns
    -------
    X : numpy.ndarray
        Inputs of shape ``(n_windows, n_steps_in, features)``.
    y : numpy.ndarray
        Targets with ``n_windows`` rows of ``n_steps_out`` steps each.
        Both arrays have zero rows when the sequence is shorter than
        ``n_steps_in + n_steps_out``.
    """
    seq = np.asarray(sequence)
    n_windows = len(seq) - n_steps_in - n_steps_out + 1

    # Too short for even one window: return correctly shaped empty arrays
    # instead of failing in the reshape below.
    if n_windows <= 0:
        empty_X = np.empty((0, n_steps_in, features))
        empty_y = np.empty((0, n_steps_out) + seq.shape[1:])
        return empty_X, empty_y

    X = np.array([seq[i:i + n_steps_in] for i in range(n_windows)])
    y = np.array([seq[i + n_steps_in:i + n_steps_in + n_steps_out] for i in range(n_windows)])

    # Use the `features` argument, not the notebook-level n_features global,
    # so the function works on its own.
    X = X.reshape((X.shape[0], X.shape[1], features))

    return X, y

# Shape of the supervised learning problem.
n_features = 1    # one value per time step (the differenced, transformed close)
nb_days = 60      # length of each input window, in days
n_steps_out = 10  # number of future days predicted per window

X, y = preprocess_multistep_lstm(
    df_shift.to_numpy(),
    n_steps_in=nb_days,
    n_steps_out=n_steps_out,
    features=n_features,
)

In [None]:
# Number of most recent windows held out for testing.
test_days = 150

# Chronological split: everything except the last test_days windows trains the model.
# NOTE(review): consecutive windows overlap, so the last training targets share
# days with the first test windows.
X_train, X_test = X[:-test_days], X[-test_days:]
y_train, y_test = y[:-test_days], y[-test_days:]

# Matching split of the raw closing prices, used for plotting.
close_series = df["Close"]
train_original = close_series.iloc[:-test_days]
test_original = close_series.iloc[-test_days:]

plt.figure(figsize=(10,6))
plt.plot(train_original, 'b', label='Train data')
plt.plot(test_original, 'g', label='Test data')
plt.xlabel('Dates')
plt.ylabel('Closing Prices')
plt.grid(True)
plt.legend()


In [None]:
def vanilla_multistep_LSTM():
    """Build the uncompiled forecasting network.

    A single 50-unit LSTM layer reads windows of shape (nb_days, n_features)
    and feeds a Dense layer with n_steps_out outputs. The three sizes are read
    from notebook-level variables.

    Returns
    -------
    Sequential
        The assembled, uncompiled model.
    """
    recurrent = LSTM(units=50, input_shape=(nb_days, n_features))
    head = Dense(n_steps_out)
    network = Sequential()
    for layer in (recurrent, head):
        network.add(layer)
    return network

In [None]:
# Fix TensorFlow's global seed before the layers are created so that weight
# initialisation is repeatable between runs.
tf.random.set_seed(42)

model = vanilla_multistep_LSTM()
model.summary()
# Train on mean squared error; mean absolute error is reported alongside it.
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=[tf.keras.metrics.MeanAbsoluteError()])

In [None]:
# Train on the training windows only.
fit_options = {"epochs": 10, "batch_size": 32}
model.fit(X_train, y_train, **fit_options)

In [None]:
# Score the trained model on the held-out windows.
print("Evaluate on test data")
results = model.evaluate(X_test, y_test, batch_size=32)

# results[0] is the loss (MSE), results[1] the compiled metric (MAE).
test_mse, test_mae = results[0], results[1]
for metric_label, metric_value in (("Test MSE:", test_mse), ("Test MAE:", test_mae)):
    print(metric_label, metric_value)

In [None]:
# Prediction
y_pred = model.predict(X_test)

# the_day selects which test window to inspect: its n_steps_out predicted
# steps are compared with the true values. The first test window is index 0.
the_day = 0
y_pred_days = y_pred[the_day,:]

plt.figure(figsize=(10,6))
plt.grid(True)
plt.plot(y_test[the_day,:],label='Original data - transformed')
plt.plot(y_pred_days, color='red',label='Predictions - transformed')
plt.xlabel('Time (days)')
plt.ylabel('Closing Prices amplitude in the transformed space')
plt.title('Original data vs predictions in the transformed space')
# The labels above are only shown once a legend is drawn.
plt.legend();

In [None]:
# The model predicts day-to-day differences of the transformed series, so the
# forecast level is the last observed level plus the running sum of differences.
assert 0 <= the_day < test_days, "the_day must index one of the test windows"
pred_diff_cumsum = y_pred_days.cumsum()

# Test window the_day is window (number of windows - test_days + the_day) of
# df_shift. Its targets are the differences leading to the n_steps_out
# observations that start right after position base_pos of df_transform, so
# base_pos is the last observation known before the forecast begins.
base_pos = len(df_transform) - (test_days + n_steps_out) + the_day
base_number = df_transform.values[base_pos]
idx = df_transform.index[base_pos + 1:base_pos + 1 + n_steps_out]

pred_tf = pd.Series(base_number, index=idx)
print(pred_tf)
pred_tf = pred_tf.add(pred_diff_cumsum,fill_value=0)

print(pred_tf)

In [None]:
# Undo the sqrt(log(.)) transform: square first, then exponentiate.
pred_log = np.square(pred_tf)
pred = np.exp(pred_log)
print(pred)

In [None]:
# Plot actual prices vs predicted prices
plt.figure(figsize=(10,6))
plt.grid(True)
plt.xlabel('Dates')
plt.ylabel('Closing Prices')

# Take the actual prices from the forecast's own dates plus some history
# before them, so both curves always overlap and there is context to read the
# forecast against (also when the_day is 0).
history_days = 30
actual_window = df["Close"].loc[:pred.index[-1]].iloc[-(history_days + len(pred)):]
plt.plot(actual_window,'b',label='Actual prices')
plt.plot(pred, '-o',color='orange',label='Predicted prices')

plt.legend()

# Scratch Experiments

In [None]:
pd.options.mode.chained_assignment = None

# Hourly AAPL prices. Yahoo only serves intraday bars for roughly the last
# 730 days, so the window is kept inside that limit instead of reaching back
# to 2012.
end = datetime.now()
start = end - timedelta(days=729)
yf.pdr_override()
df = yf.download("AAPL", start=start, end=end, interval = "1h")

# Closing price only, one row per hour, gaps filled with the last known close.
df = df[['Close']].ffill()
df = df.resample('h').ffill()
df.index = df.index.to_period(freq='h')

# Move the time index into a column and add the integer time index and the
# single-group id column used by the TimeSeriesDataSet cell below.
# NOTE(review): this overwrites the daily df built earlier in the notebook.
df = df.reset_index()
df["time_idx"] = df.index
df["Constant"] = 0
df

In [None]:
# Wrap the hourly frame in a TimeSeriesDataSet: one group (the "Constant"
# column), target "Close", 5-step encoder windows and 2-step prediction windows.
# NOTE(review): TimeSeriesDataSet is only imported in a later cell (the
# pytorch_forecasting import near the end of the notebook), so this cell fails
# on a fresh kernel when the notebook is run top to bottom.
dataset = TimeSeriesDataSet(
    df,
    group_ids = ["Constant"],
    target="Close",
    time_idx="time_idx",
    min_encoder_length=5,
    max_encoder_length=5,
    min_prediction_length=2,
    max_prediction_length=2,
)

In [None]:
# Draft of a train/validation split for df, using 60 encoder steps and
# 20 prediction steps.
# NOTE(review): this cell mixes df with names from the synthetic-data example
# further down. `data`, the "series" group column and the "value" column are
# not created for df anywhere in the visible notebook, and
# TimeSeriesDataSet / NaNLabelEncoder are imported only in a later cell.
max_encoder_length = 60
max_prediction_length = 20

# Everything up to this index value is used for training.
training_cutoff = df.index.max() - max_prediction_length

context_length = max_encoder_length
prediction_length = max_prediction_length

training = TimeSeriesDataSet(
    df[lambda x: x.index <= training_cutoff],
    time_idx="time_idx",
    target="Close",
    categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
    group_ids=["series"],
    # only unknown variable is "value" - and N-HiTS can also not take any additional variables
    time_varying_unknown_reals=["value"],
    max_encoder_length=context_length,
    max_prediction_length=prediction_length,
)

# Validation set built from the training configuration, predicting from the
# first index after the cutoff.
validation = TimeSeriesDataSet.from_dataset(training, data, min_prediction_idx=training_cutoff + 1)
batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

In [None]:
import random

# Experiment: forecast "Close" 1..36 steps ahead with a ForecastX pipeline
# (VAR for the exogenous series, ARIMA for the target).
# NOTE(review): ForecastX, VAR and ARIMA are not imported anywhere in the
# visible notebook. The cell also overwrites X, y and y_pred from the LSTM
# section.
y = df["Close"]
# A single random integer repeated for every row, i.e. a constant exogenous series.
X = pd.Series([random.randint(3, 9)]*len(df))
X.index = df.index
# Forecasting horizon: steps 1 to 36.
fh = np.arange(1, 37)

pipe = ForecastX(  
    forecaster_X=VAR(),
    forecaster_y=ARIMA(),
)
pipe = pipe.fit(y, X=X, fh=fh)  
y_pred = pipe.predict(X=X)

y_pred

In [None]:
# Experiment: forecast "Close" 1..36 steps ahead with NaiveForecaster using
# the "drift" strategy.
# NOTE(review): NaiveForecaster is not imported anywhere in the visible
# notebook. The cell overwrites y and y_pred.
y = df["Close"]
fh = np.arange(1, 37)
forecaster = NaiveForecaster(strategy="drift")
forecaster.fit(y, fh=fh)
y_pred = forecaster.predict()
y_pred

In [None]:
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_relevant_features

# Experiment: tsfresh feature extraction, using "SP500" as the series id
# column and "time" as the sort column.
# NOTE(review): no "time" column is created for df in the visible notebook,
# and `timeseries` below is never defined — confirm the intended inputs.
extracted_features = extract_features(df, column_id="SP500", column_sort="time")

# impute is called without using its return value — presumably it modifies
# extracted_features in place; confirm against the tsfresh documentation.
impute(extracted_features)
features_filtered = select_features(extracted_features, y)

features_filtered_direct = extract_relevant_features(timeseries, y,
                                                     column_id='SP500', column_sort='time')

In [None]:
# Move the current index into a column and add "Ints", a copy of the new
# 0..n-1 integer index.
# NOTE(review): each re-run of this cell adds another column from the index.
df = df.reset_index()
df["Ints"] = df.index
df

In [None]:
# Drop the "Date" column if it is present; errors="ignore" keeps the cell
# re-runnable once the column is gone.
df = df.drop(columns="Date", errors="ignore")

In [None]:
# Display the current state of df.
df

In [None]:
# Second TimeSeriesDataSet attempt: grouped by "SP500", target "Close",
# time index "Ints", 5-step encoder windows and 2-step prediction windows.
# NOTE(review): "value" is not a column created for df in the visible
# notebook, and TimeSeriesDataSet is only imported in the next cell.
dataset = TimeSeriesDataSet(
    df,
    group_ids=["SP500"],
    target="Close",
    time_idx="Ints",
    min_encoder_length=5,
    max_encoder_length=5,
    min_prediction_length=2,
    max_prediction_length=2,
    time_varying_unknown_reals=["value"],
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
import torch

from pytorch_forecasting import Baseline, NHiTS, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder
from pytorch_forecasting.data.examples import generate_ar_data
from pytorch_forecasting.metrics import SMAPE, QuantileLoss, MQF2DistributionLoss




# Example data from pytorch_forecasting's generate_ar_data helper
# (100 series, 400 time steps, seed 42) — presumably synthetic autoregressive
# series; confirm against the library documentation.
data = generate_ar_data(seasonality=10.0, timesteps=400, n_series=100, seed=42)
data["static"] = 2
# Calendar date for each step: 2020-01-01 plus time_idx days.
data["date"] = pd.Timestamp("2020-01-01") + pd.to_timedelta(data.time_idx, "D")
# Not the last expression of the cell, so this head() is not displayed.
data.head()
max_encoder_length = 60
max_prediction_length = 20

# The last max_prediction_length time steps are held out for validation.
training_cutoff = data["time_idx"].max() - max_prediction_length

context_length = max_encoder_length
prediction_length = max_prediction_length

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="value",
    categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
    group_ids=["series"],
    # only unknown variable is "value" - and N-HiTS can also not take any additional variables
    time_varying_unknown_reals=["value"],
    max_encoder_length=context_length,
    max_prediction_length=prediction_length,
)

# Validation set reuses the training configuration and predicts from the
# first time index after the cutoff.
validation = TimeSeriesDataSet.from_dataset(training, data, min_prediction_idx=training_cutoff + 1)
batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

# Seed the run, then search for a learning rate with a throwaway trainer and network.
# NOTE(review): `gpus=` and `trainer.tuner.lr_find` depend on the installed
# pytorch_lightning version — confirm they exist in the version in use.
pl.seed_everything(42)
trainer = pl.Trainer(gpus=0, gradient_clip_val=1.0)
net = NHiTS.from_dataset(
    training,
    learning_rate=3e-2,
    weight_decay=1e-2,
    loss=MQF2DistributionLoss(prediction_length=max_prediction_length),
    backcast_loss_ratio=0.0,
    hidden_size=64,
)

res = trainer.tuner.lr_find(net, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-5)
print(f"suggested learning rate: {res.suggestion()}")
fig = res.plot(show=True, suggest=True)
fig.show()
# The suggestion is stored on this network only; the next block builds a new
# network with learning_rate=0.09 and does not use it.
net.hparams.learning_rate = res.suggestion()

# Training run: at most 5 epochs of 30 batches each, stopping early when
# "val_loss" fails to improve by 1e-4 for 10 validation checks.
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
trainer = pl.Trainer(
    max_epochs=5,
    gpus=0,
    enable_model_summary=True,
    gradient_clip_val=1.0,
    callbacks=[early_stop_callback],
    limit_train_batches=30,
    enable_checkpointing=True,
)


# Fresh network for the actual training, with a hard-coded learning rate of 0.09.
net = NHiTS.from_dataset(
    training,
    learning_rate=0.09,
    log_interval=10,
    log_val_interval=1,
    weight_decay=1e-2,
    backcast_loss_ratio=0.0,
    hidden_size=64,
    loss=MQF2DistributionLoss(prediction_length=max_prediction_length),
)

trainer.fit(
    net,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

# Reload the checkpoint that the trainer's checkpoint callback reports as best.
best_model_path = trainer.checkpoint_callback.best_model_path
best_model = NHiTS.load_from_checkpoint(best_model_path)


# Mean absolute difference between the validation targets and the predictions.
# Only displayed if it is the cell's last expression, which it is not here.
actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])
predictions = best_model.predict(val_dataloader)
(actuals - predictions).abs().mean()


# Raw predictions plus the matching network inputs, needed for plotting.
raw_predictions, x = best_model.predict(val_dataloader, mode="raw", return_x=True)

for idx in range(10):  # plot 10 examples
    best_model.plot_prediction(x, raw_predictions, idx=idx, add_loss_to_title=True);

# sample 500 paths
samples = best_model.loss.sample(raw_predictions["prediction"][[0]], n_samples=500)[0]

# plot prediction
fig = best_model.plot_prediction(x, raw_predictions, idx=0, add_loss_to_title=True)
ax = fig.get_axes()[0]
# plot first two sampled paths
ax.plot(samples[:, 0], color="g", label="Sample 1")
ax.plot(samples[:, 1], color="r", label="Sample 2")
fig.legend();

In [None]:
# NOTE(review): runs the shell command "juyp", which looks like an unfinished
# "jupyter ..." command — confirm the intent or delete this cell.
!juyp