### Notebook Summary
In this notebook I used a similar to the notebook in which I selected features, an LSTM algorithm that only on a sequence of the last X days (depending on the type of label) also, similar to the previous notebooks, I resampled the data to create a balance between the labeling and scaling of the data itself. You can see after each run the results of the models in different indices

###results
The model results are better in all parameters than the results of the base algorithm, this time the model succeeds in the opposite way, better prediction for a larger time range. My hypothesis is that the weekly and monthly range are less noisy compared to the smaller range but still not "too far"

In [1]:
from google.colab import drive
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import torch

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
Consecutive_stocks =  pd.read_csv('/content/drive/MyDrive/stocks_data/Consecutive_stocks.csv')
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.utils import resample
from collections import Counter

def preprocess_data_train_val(df, label_col='label_week', seq_length=10, start_date='2022-12-31', exclude_columns=[], alpha=0.8,device = 'cpu'):

    df = df.copy()
    df = df[df[label_col].isna()==False].fillna(-1)

    all_columns = df.columns
    columns_to_keep = list(set(all_columns) - set(exclude_columns))

    df = df[pd.to_datetime(df['date']) >= pd.to_datetime(start_date)]
    min_date = pd.to_datetime(df['date']).min()
    max_date = pd.to_datetime(df['date']).max()
    num_days = (max_date - min_date).days
    split_date = min_date + pd.DateOffset(days=int(alpha * num_days))
    print(f'Train dates are {min_date} - {split_date}, test dates are {split_date} - {max_date}')

    df.set_index(['date', 'Symbol'], inplace=True)
    label_col_copy = df[label_col].copy()

    scaler = StandardScaler()
    df[columns_to_keep] = scaler.fit_transform(df[columns_to_keep])
    df[label_col] = label_col_copy

    train_data = df[pd.to_datetime(df.index.get_level_values('date')) <= split_date].loc[:, columns_to_keep]
    test_data = df[pd.to_datetime(df.index.get_level_values('date')) > split_date].loc[:, columns_to_keep]

    sequences_train, labels_train = [], []
    for symbol in train_data.index.get_level_values('Symbol').unique():
        symbol_data = train_data.loc[train_data.index.get_level_values('Symbol') == symbol]
        symbol_sequences, symbol_labels = create_sequences(symbol_data[train_data.columns[:-1]], symbol_data[label_col], seq_length)
        sequences_train.append(symbol_sequences)
        labels_train.append(symbol_labels)

    sequences_test, labels_test = [], []
    for symbol in test_data.index.get_level_values('Symbol').unique():
        symbol_data = test_data.loc[test_data.index.get_level_values('Symbol') == symbol]
        symbol_sequences, symbol_labels = create_sequences(symbol_data[test_data.columns[:-1]], symbol_data[label_col], seq_length)
        sequences_test.append(symbol_sequences)
        labels_test.append(symbol_labels)

    sequences_test = torch.cat(sequences_test).to(device)
    labels_test = torch.cat(labels_test).to(device)
    sequences_train = torch.cat(sequences_train).to(device)
    labels_train = torch.cat(labels_train).to(device)
    #Handlling unbalanced data with taking randonly equaly nuber of samples from each class

    # Extract unique class labels
    unique_classes = set(labels_train.cpu().numpy())

    # Find the minimum count among classes
    min_samples = min(labels_train.tolist().count(c) for c in unique_classes)

    class_sequences = []
    class_labels = []
    for c in unique_classes:
        indices = [i for i, label in enumerate(labels_train) if torch.equal(label, torch.tensor(c))]
        indices_tensor = torch.tensor(indices)
        random_indices_tensor = torch.randperm(indices_tensor.size(0))[:min_samples]
        random_indices = indices_tensor[random_indices_tensor]

        class_sequences.extend([sequences_train[i] for i in random_indices])
        class_labels.extend([labels_train[i] for i in random_indices])

    # Combine the sequences and labels for each class
    sequences_train = torch.stack(class_sequences)
    labels_train = torch.stack(class_labels)
    unique_classes = set(labels_train.cpu().numpy())

    return sequences_train, labels_train, sequences_test, labels_test


def create_sequences(data, labels, seq_length=7):
    sequences, next_labels = [], []
    for i in range(len(data) - seq_length):
        seq = data.iloc[i:i + seq_length]
        label = labels.iloc[i + seq_length]
        if label == -1:
          continue
        sequences.append(seq.values)
        next_labels.append(label)

    sequences = np.array(sequences, dtype=np.float32)
    next_labels = np.array(next_labels, dtype=np.int64)

    return torch.tensor(sequences).to(device), torch.tensor(next_labels).to(device)



In [4]:
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score, classification_report,confusion_matrix

def lstm_pipline(df, label_col='label_week',columns_to_use=[], seq_length=10, hidden_size=64, num_layers=2, num_classes=8, lr=0.01, num_epochs=5, batch_size=128, alpha_lasso=0.01, alpha=0.8, start_date='2022-12-31',device = 'cpu'):
    # Preprocess data and get train and test data
    # Exclude some columns from the 'df' DataFrame
    if label_col == 'label_week':
        columns_to_exclude = ['label_day',  'label_month', 'Symbol', 'date','pct_change_week','pct_change_month', 'stock']
    elif label_col == 'label_day':
        columns_to_exclude = [ 'label_week', 'label_month', 'Symbol', 'date','pct_change_week','pct_change_month', 'stock']
    elif label_col == 'label_month':
        columns_to_exclude = ['label_day', 'label_week',  'Symbol', 'date','pct_change_week','pct_change_month', 'stock']


    all_columns = df.columns
    columns_to_exclude = list(set(all_columns) - set(columns_to_use+[label_col]))

    print(f'Preprocessing and creating sequences, label column is {label_col}...')
    sequences_train, labels_train, sequences_test, labels_test = preprocess_data_train_val(df, label_col=label_col, seq_length=seq_length, start_date=start_date, exclude_columns=columns_to_exclude, alpha=alpha)

    # Create DataLoader for training
    train_dataset = TensorDataset(sequences_train, labels_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Create DataLoader for testing
    test_dataset = TensorDataset(sequences_test, labels_test)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    class LSTM(nn.Module):
        def __init__(self, input_size, hidden_size, num_layers, num_classes):
            super(LSTM, self).__init__()
            self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
            self.fc = nn.Linear(hidden_size, num_classes)

        def forward(self, x):
            out, _ = self.lstm(x)
            out = self.fc(out[:, -1, :])
            return out

    # Automatically determine the input size from the data
    input_size = sequences_train.size(2)

    model = LSTM(input_size, hidden_size=hidden_size, num_layers=num_layers, num_classes=num_classes)
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_loss = float('inf')  # Initialize with positive infinity
    best_epoch = 0
    best_model_state_dict = None

    print('Start training')

    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader, 1):
            optimizer.zero_grad()
            outputs = model(inputs)

            # Total loss with L1 regularization
            loss = F.cross_entropy(outputs, labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            if i % 1500 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{i}/{len(train_loader)}], Loss: {running_loss / i:.4f}')

        epoch_loss = running_loss / len(train_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

        # Check if the current epoch's loss is the best so far
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            best_epoch = epoch
            best_model_state_dict = model.state_dict()

    print(f'Best Loss: {best_loss:.4f} at Epoch [{best_epoch+1}/{num_epochs}]')

    # Load the best model state
    model.load_state_dict(best_model_state_dict)
    print('Evaluating')

    # In case i want to use this part to get predictions and there probabilities for the stocks
    model.eval()
    correct, total = 0, 0
    all_predictions, all_probabilities, all_real_labels = [], [], []

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            probabilities = F.softmax(outputs, dim=1)

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            probabilities = F.softmax(outputs, dim=1)
            all_predictions.extend(predicted.cpu().numpy())
            all_probabilities.extend(probabilities.cpu().numpy())
            all_real_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_real_labels, all_predictions)
    f1 = f1_score(all_real_labels, all_predictions, average='weighted')
    report = classification_report(all_real_labels, all_predictions)
    unique_classes = list(set(all_real_labels))
    cm = confusion_matrix(all_real_labels, all_predictions, labels=unique_classes)

    print(f'Accuracy {accuracy* 100:.2f}')
    print(f'F1-{f1}')
    print(f'report-')
    print(report)
    print('confusion_matrix-')
    print(cm)

    return model,accuracy, f1, report,cm




In [20]:
train_test_ratio=0.8
top_X_features = 20
Nepochs = 5
start_date='2021-12-31'
hidden_size = 64
batch_size = 64
learning_rate = 0.01

In [21]:
loaded_df = pd.read_csv('/content/drive/MyDrive/stocks_data/selected_columns.csv')

# Retrieve lists from loaded DataFrame
columns_to_use_day = loaded_df['columns_to_use_day'].tolist()
columns_to_use_week = loaded_df['columns_to_use_week'].tolist()
columns_to_use_month = loaded_df['columns_to_use_month'].tolist()

###Day

In [22]:
model_day,accuracy_day, f1_day, report_day,cm_day  = lstm_pipline(df=Consecutive_stocks, label_col='label_day',columns_to_use=columns_to_use_day ,seq_length=7, hidden_size=hidden_size, num_layers=2, num_classes=8, lr=learning_rate, num_epochs=Nepochs, batch_size=batch_size, alpha_lasso=0.01, alpha=train_test_ratio, start_date=start_date)


Preprocessing and creating sequences, label column is label_day...
Start training
Epoch [1/5], Batch [1500/1679], Loss: 1.9234
Epoch [1/5], Loss: 1.9204
Epoch [2/5], Batch [1500/1679], Loss: 1.8774
Epoch [2/5], Loss: 1.8747
Epoch [3/5], Batch [1500/1679], Loss: 1.8145
Epoch [3/5], Loss: 1.8107
Epoch [4/5], Batch [1500/1679], Loss: 1.7513
Epoch [4/5], Loss: 1.7497
Epoch [5/5], Batch [1500/1679], Loss: 1.7100
Epoch [5/5], Loss: 1.7081
Best Loss: 1.7081 at Epoch [5/5]
Evaluating
Accuracy 28.51
F1-0.2978056493854241
report-
              precision    recall  f1-score   support

           0       0.04      0.17      0.07      2784
           1       0.07      0.10      0.09     13728
           2       0.11      0.06      0.08     36673
           3       0.42      0.19      0.26    146096
           4       0.46      0.47      0.46    156735
           5       0.11      0.25      0.15     33878
           6       0.07      0.08      0.08     14853
           7       0.05      0.39      0.

###Week

In [23]:
model_week,accuracy_week, f1_week, report_week, cm_week = lstm_pipline(df=Consecutive_stocks, label_col='label_week',columns_to_use=columns_to_use_week, seq_length=12, hidden_size=hidden_size*2, num_layers=2, num_classes=8, lr=learning_rate, num_epochs=Nepochs, batch_size=batch_size*2, alpha_lasso=0.01, alpha=train_test_ratio, start_date=start_date)


Preprocessing and creating sequences, label column is label_week...
Start training
Epoch [1/5], Batch [1500/6863], Loss: 1.3847
Epoch [1/5], Batch [3000/6863], Loss: 1.3555
Epoch [1/5], Batch [4500/6863], Loss: 1.3412
Epoch [1/5], Batch [6000/6863], Loss: 1.3309
Epoch [1/5], Loss: 1.3253
Epoch [2/5], Batch [1500/6863], Loss: 1.2802
Epoch [2/5], Batch [3000/6863], Loss: 1.2771
Epoch [2/5], Batch [4500/6863], Loss: 1.2744
Epoch [2/5], Batch [6000/6863], Loss: 1.2720
Epoch [2/5], Loss: 1.2705
Epoch [3/5], Batch [1500/6863], Loss: 1.2523
Epoch [3/5], Batch [3000/6863], Loss: 1.2492
Epoch [3/5], Batch [4500/6863], Loss: 1.2466
Epoch [3/5], Batch [6000/6863], Loss: 1.2441
Epoch [3/5], Loss: 1.2429
Epoch [4/5], Batch [1500/6863], Loss: 1.2303
Epoch [4/5], Batch [3000/6863], Loss: 1.2285
Epoch [4/5], Batch [4500/6863], Loss: 1.2265
Epoch [4/5], Batch [6000/6863], Loss: 1.2257
Epoch [4/5], Loss: 1.2255
Epoch [5/5], Batch [1500/6863], Loss: 1.2144
Epoch [5/5], Batch [3000/6863], Loss: 1.2157
Epo

###Month

In [24]:
model_month,accuracy_month, f1_month, report_month, cm_month = lstm_pipline(df=Consecutive_stocks, label_col='label_month',columns_to_use=columns_to_use_month, seq_length=20, hidden_size=hidden_size*4, num_layers=2, num_classes=8, lr=learning_rate, num_epochs=Nepochs, batch_size=batch_size*4, alpha_lasso=0.01, alpha=train_test_ratio, start_date=start_date)


Preprocessing and creating sequences, label column is label_month...
Start training
Epoch [1/5], Batch [1500/3589], Loss: 1.2331
Epoch [1/5], Batch [3000/3589], Loss: 1.2141
Epoch [1/5], Loss: 1.2086
Epoch [2/5], Batch [1500/3589], Loss: 1.1682
Epoch [2/5], Batch [3000/3589], Loss: 1.1622
Epoch [2/5], Loss: 1.1593
Epoch [3/5], Batch [1500/3589], Loss: 1.1347
Epoch [3/5], Batch [3000/3589], Loss: 1.1293
Epoch [3/5], Loss: 1.1274
Epoch [4/5], Batch [1500/3589], Loss: 1.1129
Epoch [4/5], Batch [3000/3589], Loss: 1.1099
Epoch [4/5], Loss: 1.1100
Epoch [5/5], Batch [1500/3589], Loss: 1.0989
Epoch [5/5], Batch [3000/3589], Loss: 1.1044
Epoch [5/5], Loss: 1.1047
Best Loss: 1.1047 at Epoch [5/5]
Evaluating
Accuracy 65.03
F1-0.6523654576011914
report-
              precision    recall  f1-score   support

           0       0.85      0.87      0.86     37726
           1       0.53      0.60      0.56     22524
           2       0.35      0.35      0.35     14269
           3       0.37      0

In [36]:
import numpy as np

def calculate_probability_for_individual_classes(original_conf_matrix, interval = 'week'):

    # Initialize an array to store the probability for each class
    probability_per_class_dict = {}

    for i in range(8):
        # Extract the confusion matrix for the current class
        class_conf_matrix = original_conf_matrix[:, i]

        # Calculate the total counts for the current class
        total_class = class_conf_matrix.sum()
        if i in [0,1,2,3]:
          class_count = class_conf_matrix[:4].sum()

        else:
          class_count = class_conf_matrix[4:].sum()

        # Calculate the probability of the current class being positive
        probability_class = class_count / total_class

        # Append the probability array for the current class to the result
        probability_per_class_dict[interval +" "+ str(i)]= probability_class

    print(f"\n{interval}ly probability for Each Class to be in its category- up/down")
    print(probability_per_class_dict)

    return probability_per_class_dict


In [37]:
class_probabilities_day = calculate_probability_for_individual_classes(cm_day, 'Day')

class_probabilities_week = calculate_probability_for_individual_classes(cm_week,'Week')

class_probabilities_month = calculate_probability_for_individual_classes(cm_month,'Month')




Dayly probability for Each Class to be in its category- up/down
{'Day 0': 0.4916719759716028, 'Day 1': 0.4939886658545628, 'Day 2': 0.48204709127126, 'Day 3': 0.4906336293753265, 'Day 4': 0.5278302214420569, 'Day 5': 0.5033673224416563, 'Day 6': 0.48684654300168634, 'Day 7': 0.4930342825177806}

Weekly probability for Each Class to be in its category- up/down
{'Week 0': 0.9274033428396087, 'Week 1': 0.8864368193285067, 'Week 2': 0.7519157088122606, 'Week 3': 0.6346312660004955, 'Week 4': 0.684865619666592, 'Week 5': 0.8363111694774675, 'Week 6': 0.9109460788980926, 'Week 7': 0.9030425317262253}

Monthly probability for Each Class to be in its category- up/down
{'Month 0': 0.9875832986255727, 'Month 1': 0.9372711163614885, 'Month 2': 0.841461680648372, 'Month 3': 0.6000229476220527, 'Month 4': 0.7664181753638623, 'Month 5': 0.9081868154924259, 'Month 6': 0.973199852528289, 'Month 7': 0.9947665758645214}
