In [8]:
import pandas as pd
import datetime
from time import sleep
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="ProsusAI/finbert")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

In [9]:
def summar_to_score(summ):
    # negative => < 0
    # positive => > 0
    # neutral  => = 0
    if len(summ.split()) > 350:
        print('summary too long, summarizing...')
        summ = summarizer(summ)[0]['summary_text']
    result = pipe(summ)[0]
    match result['label']:
        case 'positive':
            return result['score']
        case 'negative':
            return -result['score']
        case 'neutral':
            return 0

news = pd.read_csv('data/news.csv')
news['score'] = news['summary'].apply(summar_to_score)

summary too long, summarizing...
summary too long, summarizing...
summary too long, summarizing...
summary too long, summarizing...
summary too long, summarizing...
summary too long, summarizing...


# Combine prices normolized by z-score with sentient score

In [37]:
import pandas as pd

# Reading CSV files
stocks = pd.read_csv('data/5min.csv')
news = pd.read_csv('data/news.csv')

# Convert 't' and 'datetime' columns to datetime type
stocks['date'] = pd.to_datetime(stocks['t'])
news['date'] = pd.to_datetime(news['datetime'])

# Initialize 'score' column in stocks with empty lists
stocks['score'] = stocks.apply(lambda x: [], axis=1)

# Sort dataframes by date for efficient searching
stocks.sort_values('date', inplace=True)
news.sort_values('date', inplace=True)

# Iterate through each row in news to find the closest stock entry
for i, news_row in news.iterrows():
    news_time = news_row['date']

    # Find the closest stock time
    closest_stock_time = stocks['date'].iloc[(stocks['date'] - news_time).abs().argsort()[:1]]
    closest_index = closest_stock_time.index[0]

    # Append the score to the closest stock entry
    stocks.at[closest_index, 'score'].append(news_row['score'])

# Replace empty lists with zero
stocks['score'] = stocks['score'].apply(lambda x: 0 if len(x) == 0 else x)

# Save the updated stocks DataFrame to CSV
stocks.to_csv('data/5min_score.csv', index=False)

In [35]:
df = pd.read_csv('data/5min_score_mean.csv')
df['o'].std() * 0.035

0.034363252645622754

In [1]:
from torch.utils.data import Dataset, DataLoader, random_split
import torch
import pandas as pd

class StockDataset(Dataset):
    def __init__(self, array_data, window):
        self.array_data = array_data
        self.window = window

    def __len__(self):
        return len(self.array_data) - self.window

    def __getitem__(self, idx):
        x = torch.tensor(self.array_data[idx:idx+self.window], dtype=torch.float32)
        y = torch.tensor(self.array_data[idx+self.window], dtype=torch.float32)
        return x,y

# Load your data into a DataFrame and then convert to a NumPy array
df = pd.read_csv('data/5min_score_mean.csv')
array_data = df[['z_o', 'z_c', 'z_v', 'score']].to_numpy()

# Create a dataset object
window = 25  # Specify your window size
dataset = StockDataset(array_data, window)

# Create a DataLoader object
batch_size = 32

# Compute lengths of splits
total_len = len(dataset)
train_len = int(0.7 * total_len)
val_len = int(0.2 * total_len)
test_len = total_len - train_len - val_len

# Randomly split dataset
train_set, val_set, test_set = random_split(dataset, [train_len, val_len, test_len])

# Create DataLoader objects for each split
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_set, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_set, batch_size=batch_size, shuffle=True)

# TRAIN AI FINALLY!!!

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import os
from torch.optim.lr_scheduler import StepLR

if os.name == 'posix':
    if os.uname()[0] == 'Linux':
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    elif os.uname()[0] == 'Darwin':
        device = torch.device('mps')  # 'mps' is not a recognized device in PyTorch, default to CPU on macOS

input_dim = 4  # 'z_o', 'z_c', 'z_v', 'score'
output_dim = 2  # 'z_o', 'z_c'
class StockPredictor(nn.Module):
    def __init__(self, input_dim):
        super(StockPredictor, self).__init__()
        self.lstm_step1 = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, 4096),
            nn.ReLU(),
            nn.LSTM(4096, 8192, batch_first=True, num_layers=7),
        )
        self.conv_step2 = nn.Sequential(
            nn.Conv2d(1, 128, kernel_size=1),
            nn.Sigmoid(),
            nn.Conv2d(128, 512, kernel_size=3),
            nn.Sigmoid(),
            nn.Conv2d(512, 32, kernel_size=3),
            nn.MaxPool2d(kernel_size=2),
            # Expand
            nn.Conv2d(32, 2048, kernel_size=3),
            nn.MaxPool2d(kernel_size=2),
            nn.SELU(),
            nn.Conv2d(2048, 4096, kernel_size=3),
            nn.MaxPool2d(kernel_size=2),
        )
        self.lstm_step3 = nn.LSTM(14, 32, batch_first=True, num_layers=20)

    def forward(self, x):

        # LSTM step 1
        x, (h,_) = self.lstm_step1(x)
        h = h[-1:].permute(1,0,2)
        x = torch.concat((x,h), dim=1)

        # CONV step 2
        x = x.unsqueeze(1)
        x = self.conv_step2(x)
        # x = x.squeeze(2)

        # # LSTM step 3
        # x, (h,_) = self.lstm_step3(x)
        return x


#test:
model = StockPredictor(input_dim).to(device)
x,y = next(iter(train_loader))
x = x.to(device); y = y.to(device)
print("x:",torch.flatten(x, start_dim=1).shape)

output = model(x)

print("y:", output.shape)
total_neurons = 0
for name, param in model.named_parameters():
    if 'weight' in name:
        neurons = param.size(0)  # Number of neurons in the current layer
        total_neurons += neurons
        print(f"{name} has {neurons} neurons")

print(f"Total number of neurons: {total_neurons}")

RuntimeError: MPS backend out of memory (MPS allocated: 17.20 GB, other allocations: 407.33 MB, max allowed: 18.13 GB). Tried to allocate 1024.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [4]:
# Initialize model, loss, and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00046)
scheduler = StepLR(optimizer, step_size=6, gamma=0.7)  # Reduce lr by a factor of 0.7 every 10 epochs

# Function to compute "accuracy"
def compute_accuracy(pred, target, threshold=0.035):
    return (torch.abs(pred - target) < threshold).float().mean() * 100

# Training loop
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_accuracy = 0
    count = 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        output = model(x_batch)
        loss = criterion(output, y_batch)
        
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_accuracy += compute_accuracy(output[:][:][:2], y_batch[:][:][:2])
        count += 1

    scheduler.step()
    
    print(f'Train Epoch: {epoch}, Loss: {total_loss/count:.4f}, Accuracy: {total_accuracy/count:.2f}%')
    
    # Validation
    model.eval()
    with torch.no_grad():
        total_val_loss = 0
        total_val_accuracy = 0
        count = 0
        for x_batch, y_batch in val_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            
            output = model(x_batch)
            val_loss = criterion(output, y_batch)

            total_val_loss += val_loss.item()
            total_val_accuracy += compute_accuracy(output, y_batch)
            count += 1

    print(f'Val Epoch: {epoch}, Loss: {total_val_loss/count:.4f}, Accuracy: {total_val_accuracy/count:.2f}%')

RuntimeError: input.size(-1) must be equal to input_size. Expected 4, got 256