<a href="https://colab.research.google.com/github/iasonkoutsoulis/Tralgo/blob/main/Tralgo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# This is the algorithm I'll use to do automated trading.
from bs4 import BeautifulSoup as bs
from datetime import datetime
import json
import re
import requests

def link_collect(soop):
    all_linx = []
    for link in soop.find_all('a'):
        nlink = link.get('href')
        all_linx.append(nlink)
        all_linx = list(filter(lambda item: item is not None, all_linx))
    return all_linx

def year_collect(soop):
    years = []
    for timet in soop.find_all('time', {'class': 'fc-date-headline'}):
        years.append(re.findall(r'\d+', timet.string)[-1])
    years = list(dict.fromkeys(years))
    return years

def tl_collect(all_links, years):
    for yeart in years:
        expr = r'https:\/\/www\.theguardian\.com\/\S+\/' + yeart + r'\/\S+'
        text_links = []
        for link in all_links:
            if re.search(r'/all$', link):
                pass
            elif re.search(expr, link):
                text_links.append(link)
    return text_links

#
# main script

bimon_arts = dict()
for page in range(287, 0, -1):
    print(str(page))

    #
    # initialize using the front-page links

    url = 'https://www.theguardian.com/business/stock-markets?page=' + str(page) # total pages = 287
    html = requests.get(url).text
    soup = bs(html, 'lxml')

    #
    # get all article links from the page we've opened (we use the year they include to identify them)

    all_links = link_collect(soup)
    years = year_collect(soup)
    text_links = tl_collect(all_links, years)

    #
    # now we open all of the articles on the page and collect them into our bimonthly datasets
    # we create a dictionary/log entry which holds all text for a span of 15 days.

    for tlink in text_links:
        subhtml = requests.get(tlink).text
        subsoup = bs(subhtml, 'lxml')
        texpr = r'^.*?(?= \||$)'
        try:
          title = re.search(texpr, subsoup.title.string).group(0)
        except Exception:
          pass

        timet = subsoup.find('meta', {'property':'article:published_time'})
        try:
            fdate = timet['content']
        except Exception:
            pass
        dt_date = datetime.strptime(fdate, '%Y-%m-%dT%H:%M:%S.%fZ')
        art_date = str(dt_date.year) + '-' + str(dt_date.month)
        bimon = 'B2' if dt_date.day >= 15 else 'B1'

        if not (art_date + '-' + bimon) in bimon_arts:
            bimon_arts[art_date + '-' + bimon] = []
        else:
            pass

        article = [title]
        for textlink in subsoup.find_all('p'):
            article.append(textlink.string)
            article = list(filter(lambda item: item is not None, article))
            art_str = " ".join(article)
        if art_str in bimon_arts[art_date + '-' + bimon]:
            pass
        else:
            bimon_arts[art_date + '-' + bimon].append(art_str)

#
# instead of text files I'll try the JSON stuff now

with open('/content/drive/MyDrive/Tralgo articles/article_container.json', "w") as f:
    json.dump(bimon_arts, f)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
# this fetches the financial data and tests for stationarity
import numpy as np
import pyarrow.feather as feather
from statsmodels.tsa.stattools import adfuller
import yfinance as yf

tics = 'msft aapl goog tsla'

df = yf.download(tics, interval = "1wk", start='2008-10-01')
df = df[['Adj Close']].dropna()
df.columns = df.columns.droplevel()
df.index = df.index.strftime('%Y-%m') + '-' + np.where(df.index.day>=15, 'B2', 'B1')
df = df.groupby(df.index).mean()
for col in df:
    df[col + '_dif'] = df[col].diff()
    df[col + '_indicator'] = np.where(df[col + '_dif'] >=0, 1, 0)
    df[col + '_future_indicator'] = df[col + '_indicator'].shift(-1)

adfuller(df['GOOG_dif'].dropna())

feather.write_feather(df, '/content/drive/MyDrive/Tralgo articles/financial_container.csv')

[*********************100%***********************]  4 of 4 completed


In [42]:
'''
with the present we'll train the text data along with the financial data and get our trained net.
this will require some manipulation of the current state of the data, which makes sense that is done here,
in order to promote homogeneity and peace of mind...
'''
from datetime import datetime
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import json
import numpy as np
import pandas as pd
import pyarrow.feather as feather
import re
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim

#
# Fetch articles data

with open('/content/drive/MyDrive/Tralgo articles/article_container.json', 'r') as f:
    articles_text = json.load(f)


In [6]:
#
# we tokenize our text data with their relevant tags from the dictionary

tagged_arts = []
for period, articles in articles_text.items():
    for article in articles:
        tag = period
        tagged_arts.append(TaggedDocument(words=article.lower().split(), tags=[tag]))

#
# here we train our Doc2Vec model on every word of every article with attention to tags

st = time.time()
print('Running doc2vec, maybe consider getting the google stuff')
doc2vec_model = Doc2Vec(tagged_arts, vector_size=256, min_count=10, epochs=100, workers=2)
en = time.time()
print('Time elapsed: ', en-st)

doc2vec_model.save('/content/drive/MyDrive/Tralgo articles/d2v_M.model')


Running doc2vec, maybe consider getting the google stuff
Time elapsed:  862.9126558303833


In [None]:
doc2vec_model = Doc2Vec.load('/content/drive/MyDrive/Tralgo articles/d2v_M.model')
#
# next we'll work on the embeddings a bit

doc_embeds = {}
for period, embeds in articles_text.items():
    doc_embeds[period] = doc2vec_model.dv[period]

doc_embeds_tens = {}
for period, embeds in doc_embeds.items():
    doc_embeds_tens[period] = np.array(embeds)

#
# in this part we transform words into vectors

vectorizer = TfidfVectorizer()
vectors = {period: vectorizer.fit_transform(articles).toarray().astype(np.float32).flatten() for period, articles in articles_text.items()}

#
# and now we combine document embeddings with TF-IDF vectors for each period's articles

X_data = {}
for period in articles_text.keys():
    embeds_vecs = np.concatenate((vectors[period], doc_embeds_tens[period]), axis=0)
    X_data[period] = torch.tensor(embeds_vecs, dtype=torch.float32)

#
# pad the tensors to the same length before creating the dataframe

padding = pad_sequence(list(X_data.values()), batch_first=True)

#
# here we create a date function to sort our index similarly in X and Y

def Datelist(input_dates):
    datelist = list(input_dates)
    datelist.sort(key=lambda date: datetime.strptime(date.zfill(2), "%Y-%m-B%d"))
    datelist = [re.sub(r'(?<=\d{4}-)\d{1}(?=-)', lambda match: match.group(0).zfill(2), date) for date in datelist]
    return datelist

#
# create our X data

X = pd.DataFrame(padding.numpy(), index=Datelist(articles_text.keys()), columns=[period for period in range(padding.shape[1])])
feat_names = vectorizer.get_feature_names_out()
X.columns = feat_names.tolist() + [f"col_{i}" for i in range(len(feat_names), len(X.columns))]
X.index.name = "Date"

#
# create our Y data and intersect our datasets

Y_data = feather.read_feather('/content/drive/MyDrive/Tralgo articles/financial_container.csv')
Y = pd.DataFrame(Y_data['GOOG_future_indicator'], index=Datelist(Y_data.index))
Y.index.name = "Date"

tot_df = pd.merge(X, Y, how='inner', on='Date')
X = tot_df.iloc[0:-1,0:-1]
Y = tot_df.iloc[0:-1,-1]

#
# now we'll do some preprocessing of our data

mm_scaler = preprocessing.MinMaxScaler()
X_scale = mm_scaler.fit_transform(X)

X_train, X_val_test, Y_train, Y_val_test = train_test_split(X_scale, Y, test_size=0.2)
X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, test_size=0.5)

#
# after this point we begin to code the neural network!

class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.l1 = nn.Linear(len(X_train[0]), 64)
        self.l2 = nn.Linear(64, 128)
        self.l3 = nn.Linear(128, 64)
        self.l4 = nn.Linear(64, 32)
        self.l5 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout_prob = dropout_prob

    def forward(self, x):
        x = self.relu(self.l1(x))
        x = F.dropout(x, p=self.dropout_prob, training=self.training)
        x = self.relu(self.l2(x))
        x = F.dropout(x, p=self.dropout_prob, training=self.training)
        x = self.relu(self.l3(x))
        x = F.dropout(x, p=self.dropout_prob, training=self.training)
        x = self.relu(self.l4(x))
        x = F.dropout(x, p=self.dropout_prob, training=self.training)
        x = self.sigmoid(self.l5(x))
        return x

dropout_prob = 0.3
weight_decay = 0.01

model = NeuralNet()

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), weight_decay=weight_decay)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32).view(-1, 1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
Y_val_tensor = torch.tensor(Y_val, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
Y_test_tensor = torch.tensor(Y_test, dtype=torch.float32).view(-1, 1)

num_epochs = 100
batch_size = 4
best_val_loss = float('inf')
patience = 10
for epoch in range(num_epochs):
    model.train()
    num_batches = len(X_train_tensor) // batch_size
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        inputs = X_train_tensor[start_idx:end_idx]
        targets = Y_train_tensor[start_idx:end_idx]

        # Forward pass
        outputs = model(inputs)

        # Compute the loss
        loss = criterion(outputs, targets)

        # Zero the gradients, backward pass, and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, Y_val_tensor)

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")

    # Early stopping based on validation loss
    if val_loss.item() < best_val_loss:
        best_val_loss = val_loss.item()
        patience = 10  # Reset patience
    else:
        patience -= 1
        if patience == 0:
            print("Early stopping...")
            break

# model.load_state_dict(torch.load('/content/drive/MyDrive/Tralgo articles/model.pt'))

model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_accuracy = ((test_outputs >= 0.5).float() == Y_test_tensor).float().mean()

print(f"Test Accuracy: {test_accuracy.item():.4f}")

torch.save(model.state_dict(), '/content/drive/MyDrive/Tralgo articles/model.pt')

# if I ever want to load:
# model.load_state_dict(torch.load('E:/Tralgo/model.pt'))

Epoch [1/100], Loss: 0.5427, Val Loss: 0.7239
Epoch [2/100], Loss: 0.6260, Val Loss: 0.6917
Epoch [3/100], Loss: 0.6840, Val Loss: 0.6917
Epoch [4/100], Loss: 0.5995, Val Loss: 0.6860
Epoch [5/100], Loss: 0.6774, Val Loss: 0.6855
Epoch [6/100], Loss: 0.7608, Val Loss: 0.6918
Epoch [7/100], Loss: 0.6398, Val Loss: 0.6869
Epoch [8/100], Loss: 0.7543, Val Loss: 0.6816
Epoch [9/100], Loss: 0.7073, Val Loss: 0.6841
Epoch [10/100], Loss: 0.6864, Val Loss: 0.6854
Epoch [11/100], Loss: 0.7231, Val Loss: 0.6871
Epoch [12/100], Loss: 0.6109, Val Loss: 0.6809
