In [2]:
#import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import warnings
warnings.filterwarnings("ignore")



# Define the Transformer model
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Normalization and Attention
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs)
    x = tf.keras.layers.MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(x, x)
    x = tf.keras.layers.Dropout(dropout)(x)
    res = x + inputs

    # Feed Forward Part
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(res)
    x = tf.keras.layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    x = tf.keras.layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    return x + res

# Build and train the model
input_shape = (7,1)
head_size = 46
num_heads = 60
ff_dim = 55
num_transformer_blocks = 5
mlp_units = [256]
dropout = 0.14
mlp_dropout = 0.4

def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0,
):
    inputs = tf.keras.Input(shape=input_shape)
    x = inputs
    

    for _ in range(num_transformer_blocks):  # This is what stacks our transformer blocks
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

    x = tf.keras.layers.GlobalAveragePooling1D(data_format="channels_first")(x)

    for dim in mlp_units:
        x = tf.keras.layers.Dense(dim, activation="relu")(x)
        x = tf.keras.layers.Dropout(mlp_dropout)(x)

    outputs = tf.keras.layers.Dense(1, activation="softmax")(x)  # this is a pass-through

    return tf.keras.Model(inputs, outputs)

# Define the learning rate scheduler
def lr_scheduler(epoch, lr, warmup_epochs=30, decay_epochs=100, initial_lr=1e-6, base_lr=1e-3, min_lr=5e-5):
    if epoch <= warmup_epochs:
        pct = epoch / warmup_epochs
        return ((base_lr - initial_lr) * pct) + initial_lr

    if epoch > warmup_epochs and epoch < warmup_epochs+decay_epochs:
        pct = 1 - ((epoch - warmup_epochs) / decay_epochs)
        return ((base_lr - min_lr) * pct) + min_lr

    return min_lr

def fetch_ticker_data(symbol, start_date, end_date):
    """Fetches stock data for a given symbol using yfinance."""
    ticker = yf.Ticker(symbol)
    data = ticker.history(start='2000-01-01', end=end_date)
    return data

# def label_data(data):
#     # Calculate the percentage change in price from one day to the next
#     data['Percentage Change'] = data['Close'].pct_change()
#     data['Percentage Change'] = data['Percentage Change'].shift(-1)
#     data['Sentiment'] = pd.Series(np.where(data['Percentage Change'] > 0.025, 1, np.where(data['Percentage Change'] < -0.025, -1, 0)), index=data.index)
#     # Drop any rows with missing values
#     data.dropna(inplace=True)
#     data.drop('Percentage Change',axis=1 , inplace=True)
#     return data

def train_transformer(symbol_to_fetch,start_date ,end_date,no_model = None):
    #fetching data 
    stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)
    stock = stock.fillna(method="ffill", axis=0)
    stock = stock.fillna(method="bfill", axis=0)
    # stock.index = stock.index.date

    # Split the data into training and test sets
    train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
    train_data = stock.iloc[:train_data_index]
    test_data = stock.loc[start_date:]
    train_data = label_data(train_data)
    test_data = label_data(test_data)

    #trian & test data
    X_train_data = train_data.iloc[:,:-1]
    y_train_data = train_data.iloc[:,-1]
    X_test_data = test_data.iloc[:,:-1]
    y_test_data = test_data.iloc[:,-1]
    print(len(X_test_data))
    # Normalize the data
    normalizer = MinMaxScaler()
    X_train_data_normalizer = normalizer.fit_transform(X_train_data)
    X_test_data_normalizer = normalizer.transform(X_test_data)

    # # Reshape X_train_data_normalizer
    X_train = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
    X_test = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)
    if not no_model :
        model = build_model(
            input_shape,
            head_size=head_size,
            num_heads=num_heads,
            ff_dim=ff_dim,
            num_transformer_blocks=num_transformer_blocks,
            mlp_units=mlp_units,
            mlp_dropout=mlp_dropout,
            dropout=dropout,
        )

        model.compile(
            loss="mean_squared_error",
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
            metrics=["mean_squared_error"],
        )

        callbacks = [
            tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
            tf.keras.callbacks.LearningRateScheduler(lr_scheduler)
        ]

        # model.summary()
        history = model.fit(
            X_train,
            y_train_data,
            validation_split=0.2,
            epochs=100,
            batch_size=20,
            callbacks=callbacks,
        )
        model.save('models/transformer_'+f"{symbol_to_fetch}"+"_model.h5")
        model.save('models/transformer_'+f"{symbol_to_fetch}"+"_model.keras")
        no_model = model
    return no_model,X_test,test_data
    #predictions
def prepare_sentiment_from_transformer(symbol_to_fetch,start_date,end_date):
    try :
        print("entered")
        model = tf.keras.models.load_model('models/transformer_'+f"{symbol_to_fetch}"+"_model.keras")
        print("passed")
        _,X_test,test_data = train_transformer(symbol_to_fetch = symbol_to_fetch, start_date = start_date, end_date = end_date,no_model=model)
    
    except:
        model,X_test,test_data = train_transformer(symbol_to_fetch = symbol_to_fetch, start_date = start_date, end_date = end_date)
    y_pred = model.predict(X_test) # this is the sentiment data 

    test_data['transformer_sentiment'] = y_pred

    test_data.index = test_data.index.date
    test_data.to_csv('data/transformer_sentiment.csv')
    return test_data,'data/transformer_sentiment.csv'
    """next steps :  we need to additionally train the model if model is already present
    or take nearly 30 stocks and train the model with the huge data 
    or take every 5 mins data nad trian with it, and at last mix the test data with day wise"""

In [63]:
def label_data(data):
    # Calculate the percentage change in price from one day to the next
    data['Percentage Change'] = data['Close'].pct_change()
    data['Percentage Change'] = data['Percentage Change'].shift(-1)
    data['Sentiment'] = pd.Series(np.where(data['Percentage Change'] > 0.025, 2, np.where(data['Percentage Change'] < -0.025, 0, 1)), index=data.index)
    # Drop any rows with missing values
    data.dropna(inplace=True)
    data.drop('Percentage Change',axis=1 , inplace=True)
    return data


import xgboost as xgb
from sklearn.metrics import accuracy_score
#fetching data 
symbol_to_fetch = 'AAPL'
start_date = '2022-05-01'
end_date = '2022-08-01'
stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)

# Calculate deltas for open, high, low, and close columns
for i in range(1, 90):  # Calculate deltas up to 5 days
    stock[f"open_delta_{i}day"] = stock["Open"].diff(periods=i)
    stock[f"high_delta_{i}day"] = stock["High"].diff(periods=i)
    stock[f"low_delta_{i}day"] = stock["Low"].diff(periods=i)
    stock[f"close_delta_{i}day"] = stock["Close"].diff(periods=i)
        # Rolling mean and standard deviation of OHLC prices
    stock['Rolling_Mean_Open_{i}day'] = stock['Open'].rolling(window=i).mean()
    stock['Rolling_Mean_High_{i}day'] = stock['High'].rolling(window=i).mean()
    stock['Rolling_Mean_Low_{i}day'] = stock['Low'].rolling(window=i).mean()
    stock['Rolling_Mean_Close_{i}day'] = stock['Close'].rolling(window=i).mean()

    stock['Rolling_Std_Open_{i}day'] = stock['Open'].rolling(window=i).std()
    stock['Rolling_Std_High_{i}day'] = stock['High'].rolling(window=i).std()
    stock['Rolling_Std_Low_{i}day'] = stock['Low'].rolling(window=i).std()
    stock['Rolling_Std_Close_{i}day'] = stock['Close'].rolling(window=i).std()
stock = stock.fillna(method="ffill", axis=0)
stock = stock.fillna(method="bfill", axis=0)
# stock.index = stock.index.date
# Add date-related features
stock['Year'] = stock.index.year
stock['Month'] = stock.index.month
stock['Day'] = stock.index.day
stock['Weekday'] = stock.index.weekday

# Split the data into training and test sets
train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
train_data = stock.iloc[:train_data_index]
test_data = stock.loc[start_date:]
train_data = label_data(train_data)
test_data = label_data(test_data)

#trian & test data
X_train_data = train_data.iloc[:,:-1]
y_train_data = train_data.iloc[:,-1]
X_test_data = test_data.iloc[:,:-1]
y_test_data = test_data.iloc[:,-1]
print(len(X_test_data))
# Normalize the data
normalizer = MinMaxScaler()
X_train_data_normalizer = normalizer.fit_transform(X_train_data)
X_test_data_normalizer = normalizer.transform(X_test_data)

# # Reshape X_train_data_normalizer
X_train = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
X_test = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)


61


In [64]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Create an XGBoost classifier
xgb_model = XGBClassifier(random_state=42)

# Train the model on the training data
xgb_model.fit(X_train_data_normalizer, y_train_data)

# Predict labels for the test set
y_pred = xgb_model.predict(X_test_data_normalizer)

# Calculate accuracy
accuracy = accuracy_score(y_test_data, y_pred)
print("Accuracy:", accuracy)

print(y_pred)

Accuracy: 0.6229508196721312
[0 1 1 1 1 1 1 1 2 2 2 1 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 2 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create a logistic regression model
logistic_model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model on the training data
logistic_model.fit(X_train_data_normalizer, y_train_data)

# Predict labels for the test set
y_pred = logistic_model.predict(X_test_data_normalizer)

# Calculate accuracy
accuracy = accuracy_score(y_test_data, y_pred)
print("Accuracy:", accuracy)
y_pred

Accuracy: 0.36065573770491804


array([1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 2, 0, 0, 1, 2,
       0, 2, 2, 2, 1, 2, 2, 2, 0, 0, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1])

In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
rf_model.fit(X_train_data_normalizer, y_train_data)

# Predict labels for the test set
y_pred = rf_model.predict(X_test_data_normalizer)

# Calculate accuracy
accuracy = accuracy_score(y_test_data, y_pred)
print("Accuracy:", accuracy)
y_pred


Accuracy: 0.3770491803278688


array([1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1])

In [67]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Create an SVM classifier
svm_model = SVC(kernel='linear', random_state=42)

# Train the model on the training data
svm_model.fit(X_train_data_normalizer, y_train_data)

# Predict labels for the test set
y_pred = svm_model.predict(X_test_data_normalizer)

# Calculate accuracy
accuracy = accuracy_score(y_test_data, y_pred)
print("Accuracy:", accuracy)
print(y_pred)


Accuracy: 0.6721311475409836
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [68]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Create a LightGBM classifier
lgbm_model = LGBMClassifier(random_state=42)

# Train the model on the training data
lgbm_model.fit(X_train_data_normalizer, y_train_data)

# Predict labels for the test set
y_pred = lgbm_model.predict(X_test_data_normalizer)

# Calculate accuracy
accuracy = accuracy_score(y_test_data, y_pred)
print("Accuracy:", accuracy)
print(y_pred)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 94177
[LightGBM] [Info] Number of data points in the train set: 5617, number of used features: 374
[LightGBM] [Info] Start training from score -2.303832
[LightGBM] [Info] Start training from score -0.244420
[LightGBM] [Info] Start training from score -2.145869
Accuracy: 0.6557377049180327
[1 1 1 1 2 2 1 2 2 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [69]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Create a Gradient Boosting classifier
gb_model = GradientBoostingClassifier(random_state=42)

# Train the model on the training data
gb_model.fit(X_train_data_normalizer, y_train_data)

# Predict labels for the test set
y_pred = gb_model.predict(X_test_data_normalizer)

# Calculate accuracy
accuracy = accuracy_score(y_test_data, y_pred)
print("Accuracy:", accuracy)
y_pred

Accuracy: 0.2459016393442623


array([2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1])

In [71]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import accuracy_score

# Define the LSTM model with more layers
lstm_model = Sequential([
    LSTM(256, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
    Dropout(0.3),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32, return_sequences=True),
    Dropout(0.3),
     LSTM(16, return_sequences=True),
    Dropout(0.3),
     LSTM(8, return_sequences=True),
    Dropout(0.3),
     LSTM(4, return_sequences=True),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='softmax')  # Adjust num_classes according to your problem
])

# Compile the model
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(patience=10, monitor='val_loss', mode='min', restore_best_weights=True)

# Define model checkpoint to save the best model during training
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min')

# Train the model
history = lstm_model.fit(X_train_data_normalizer, y_train_data, epochs=100, batch_size=64, validation_split=0.1, callbacks=[early_stopping, model_checkpoint])

# Load the best saved model
best_model = tf.keras.models.load_model('best_model.keras')

# Predict labels for the test set using the best model
y_pred_probs = best_model.predict(X_test_data_normalizer)
y_pred = tf.argmax(y_pred_probs, axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_test_data, y_pred)
print("Accuracy:", accuracy)

y_pred

Epoch 1/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m462s[0m 6s/step - accuracy: 0.7859 - loss: 5.9269 - val_accuracy: 0.7936 - val_loss: 5.9269
Epoch 2/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m402s[0m 5s/step - accuracy: 0.7833 - loss: 5.9269 - val_accuracy: 0.7936 - val_loss: 5.9269
Epoch 3/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m402s[0m 5s/step - accuracy: 0.7774 - loss: 5.9269 - val_accuracy: 0.7936 - val_loss: 5.9269
Epoch 4/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m412s[0m 5s/step - accuracy: 0.7747 - loss: 5.9269 - val_accuracy: 0.7936 - val_loss: 5.9269
Epoch 5/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 5s/step - accuracy: 0.7806 - loss: 5.9269 - val_accuracy: 0.7936 - val_loss: 5.9269
Epoch 6/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m 5s/step - accuracy: 0.7769 - loss: 5.9269 - val_accuracy: 0.7936 - val_loss: 5.9269
Epoch 7/100
[1m79/79[0m [

In [52]:
X_train.shape

(5617, 27, 1)

In [54]:
X_train_data.shape

(5617, 27)