<a href="https://colab.research.google.com/github/kushsharma2910/FGQ/blob/kush/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount the user's Google Drive at /content/drive; later cells read their
# input files from paths under "drive/My Drive/".
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [0]:
#######################################################################################
################################# CREATING LARGEDF  ###################################
#######################################################################################

def change_index_to_time(df):
    """Make the "Time (UTC)" column of *df* its datetime index, in place.

    A frame that has no "Time (UTC)" column is left untouched. Nothing is
    returned: the caller's object itself is modified.
    """
    time_col = "Time (UTC)"
    if time_col not in df.columns:
        return
    df[time_col] = pd.to_datetime(df[time_col])
    # drop=True removes the column once it has become the index.
    df.set_index(time_col, inplace=True, drop=True)
    
def create_empty_df(dir_name):
    """Return an empty DataFrame indexed like the longest asset CSV.

    *dir_name* is expected to contain one sub-directory per asset class, each
    holding the asset CSV files. Every CSV is read once; the one with the most
    rows supplies the index of the returned, column-less frame.

    Args:
        dir_name: path of the root data directory.

    Returns:
        A DataFrame with no columns whose index is the index of the longest
        CSV after ``change_index_to_time`` has been applied to it.

    Raises:
        FileNotFoundError: if no CSV with at least one row is found.
    """
    largest_timeframe = 0
    largest_dataframe = None

    for asset_class in os.listdir(dir_name):
        class_dir = os.path.join(dir_name, asset_class)
        if not os.path.isdir(class_dir):
            # Stray files next to the asset-class folders cannot be listed.
            continue
        for asset in os.listdir(class_dir):
            df = pd.read_csv(os.path.join(class_dir, asset))
            # Strict ">" keeps the first file encountered when several tie.
            if df.shape[0] > largest_timeframe:
                largest_timeframe = df.shape[0]
                # Keep the frame itself so it need not be read a second time.
                largest_dataframe = df

    if largest_dataframe is None:
        raise FileNotFoundError(
            "no non-empty asset CSV found under " + repr(dir_name)
        )

    change_index_to_time(largest_dataframe)
    return pd.DataFrame(index=largest_dataframe.index)

#######################################################################################
################################# FEATURE BUILDING  ###################################
#######################################################################################

def trim_fft(df):
    """Return a copy of *df* without any column whose name contains "fft"."""
    fft_columns = [name for name in df.columns if "fft" in name]
    return df.drop(columns=fft_columns)

def fill_large_df(dir_name, large_df):
    """Join every asset CSV under *dir_name* onto *large_df*.

    Each CSV is re-indexed by time (``change_index_to_time``), stripped of
    its "fft" columns (``trim_fft``) and joined on the index with every
    column renamed to ``<column>@<asset name>``.

    Args:
        dir_name: root directory holding one sub-directory per asset class.
        large_df: frame supplying the index to join on; it is not modified.

    Returns:
        ``(joined_frame, asset_names)`` where ``asset_names`` lists the asset
        name derived from each file, in the order the files were read.
    """
    asset67 = []
    for asset_class in os.listdir(dir_name):
        class_dir = os.path.join(dir_name, asset_class)
        if not os.path.isdir(class_dir):
            # Stray files next to the asset-class folders cannot be listed.
            continue
        for asset in os.listdir(class_dir):
            # Asset name = text before the first "_" minus its last four
            # characters (presumably a ".csv" extension -- TODO confirm
            # against the actual file naming).
            asset_name = asset.split("_")[0][:-4]
            asset67.append(asset_name)

            df = pd.read_csv(os.path.join(class_dir, asset))
            change_index_to_time(df)
            df = trim_fft(df)
            # Tag the columns before joining: this avoids renaming (and
            # copying) the whole accumulated frame for every asset and can
            # never touch columns that were already in large_df.
            large_df = large_df.join(df.add_suffix("@" + asset_name))

    return large_df, asset67

def add_lag_features(only_feat_df, target_asset=None):
    """Append lagged rolling statistics of the target asset's OHLC columns.

    For each window in (4, 24, 128, 256) the ``Open@``, ``High@``, ``Low@``
    and ``Close@`` columns of *target_asset* are shifted down by one row and
    their rolling mean, max, min and std are joined as new columns named
    ``<column>_window_<window>_<stat>``.

    Args:
        only_feat_df: frame containing the four base columns.
        target_asset: asset whose columns are used. When omitted, the
            module-level ``target_asset`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        A new DataFrame; *only_feat_df* is not modified.

    Raises:
        NameError: if *target_asset* is omitted and no module-level
            ``target_asset`` exists.
        KeyError: if a base column is missing from *only_feat_df*.
    """
    if target_asset is None:
        try:
            target_asset = globals()["target_asset"]
        except KeyError:
            raise NameError("name 'target_asset' is not defined") from None

    base_features = [
        prefix + target_asset for prefix in ("Open@", "High@", "Low@", "Close@")
    ]
    n_window = [4, 24, 128, 256]
    # Shift by one row so that a row's statistics only cover earlier rows.
    prevlag = 1
    shifted = only_feat_df[base_features].shift(prevlag)

    lag_feat = only_feat_df.copy()
    for window in n_window:
        rolled = shifted.rolling(window=window)
        for stat in ("mean", "max", "min", "std"):
            values = getattr(rolled, stat)()
            lag_feat = lag_feat.join(values.add_suffix(f"_window_{window}_{stat}"))
    return lag_feat


def remove_spread(df, target_asset):
    """Drop the spread columns of every asset except *target_asset*.

    A column is dropped when its name contains "spread" and it is not one of
    the target asset's own ``<feature>_spread@<target_asset>`` columns.

    Args:
        df: frame to filter; it is not modified.
        target_asset: asset whose spread columns are kept.

    Returns:
        A new DataFrame without the other assets' spread columns.
    """
    # "Volume " carries a trailing space, as in the original column list.
    features = ["Open", "High", "Low", "Close", "Volume "]
    target_asset_spread = {
        feature + "_spread@" + target_asset for feature in features
    }

    # Inspect the frame that was passed in, not a notebook-level global.
    remove_columns = [
        column
        for column in df.columns
        if "spread" in column and column not in target_asset_spread
    ]
    return df.drop(columns=remove_columns)


def extract_features(large_df, target_asset, target_window, asset67, add_lag = False, keep_spread = True):
    """Build the feature/label frame for one asset and one forecast window.

    Args:
        large_df: frame whose columns are named ``<feature>@<asset>``; it is
            copied, never modified.
        target_asset: asset whose labels are predicted.
        target_window: forecast window in hours; selects the
            ``H-<window>hr@<asset>`` / ``L-<window>hr@<asset>`` label columns.
        asset67: names of all assets present in *large_df*.
        add_lag: when true, add the rolling lag features of the target asset.
        keep_spread: when false, drop the spread columns of all other assets.

    Returns:
        A DataFrame whose last two columns are the labels ``H-<window>hr`` and
        ``L-<window>hr`` (in that order); every other label column is removed.
        If a "Time (UTC)" column is present it becomes the index.

    Raises:
        KeyError: if the target labels, or any 4/12/24-hour label column of
            another asset listed in *asset67*, is missing.
    """
    df = large_df.copy()

    # Remove the spread columns of assets other than the target asset.
    if not keep_spread:
        df = remove_spread(df, target_asset)

    if add_lag:
        # Pass the asset explicitly so the lag features always describe the
        # asset being extracted, whatever the notebook globals contain.
        df = add_lag_features(df, target_asset)

    # Move the target labels to the end under asset-free names; later code
    # relies on them being the last two columns.
    for side in ("H", "L"):
        label = side + "-" + str(target_window) + "hr"
        df[label] = df.pop(label + "@" + target_asset)

    windows = ("4", "12", "24")

    # Labels of every other asset: gathered first so the frame is copied by
    # a single drop instead of once per asset. A missing column still raises
    # KeyError, as the per-asset drop did.
    other_labels = [
        side + "-" + window + "hr@" + asset
        for asset in asset67
        if asset != target_asset
        for window in windows
        for side in ("H", "L")
    ]
    df = df.drop(columns=other_labels)

    # Labels of the target asset for the windows that are not being
    # predicted; these are dropped only if present.
    leftover = [
        side + "-" + window + "hr@" + target_asset
        for window in windows
        if window != str(target_window)
        for side in ("H", "L")
    ]
    df = df.drop(columns=[column for column in leftover if column in df.columns])

    change_index_to_time(df)
    return df

def normalize_df(df):
    """Min-max scale every column of *df* to the range [0, 1].

    Each column is transformed as ``(value - min) / (max - min)``. A column
    whose values are all equal has a zero range; it is mapped to 0.0 instead
    of being turned into NaN by a 0/0 division.

    Args:
        df: frame of numeric columns; it is not modified.

    Returns:
        A new DataFrame with the same index and columns.
    """
    lows = df.min()
    spans = df.max() - lows
    # Dividing by 1 leaves the (all-zero) numerator of a constant column as 0.
    spans = spans.replace(0, 1)
    return (df - lows) / spans

#######################################################################################
################################# MODEL BUILDING  #####################################
#######################################################################################


def create_baseline(only_feat_df, direction, target_window):
    """Naive baseline: predict each label as the previous row's label.

    The label column is ``<direction>-<target_window>hr``. The first row has
    no predecessor, so its prediction is back-filled from the second one,
    which makes it equal to the first actual value.

    Returns:
        ``(preds, actual)`` as two numpy arrays of equal length.
    """
    label_col = f"{direction}-{target_window}hr"
    labels = only_feat_df[label_col]
    previous = pd.Series(labels).shift(1).bfill(axis=0)
    return np.array(previous), np.array(labels)

#######################################################################################
############################# ACCURACY CALCULATIONS  ##################################
#######################################################################################

def acc_abs(preds, actual, tolerance):
    """Count predictions whose relative error is within *tolerance* percent.

    The relative error of a prediction is ``|pred / actual - 1|``.
    """
    limit = tolerance / 100
    relative_error = np.abs(preds / actual - 1)
    within = relative_error <= limit
    return np.sum(within)

def acc_dir(preds, actual, tolerance, direction):
    """Count rows where the actual value breaks past the prediction.

    With direction "H" a row counts when the actual value lies above the
    prediction by more than *tolerance* percent; with direction "L" when it
    lies below the prediction by more than *tolerance* percent. Any other
    direction yields 0.
    """
    frac = tolerance / 100
    hits = 0
    for ix in range(preds.shape[0]):
        pred, real = preds[ix], actual[ix]
        broke_above = direction == "H" and real > pred * (1 + frac)
        broke_below = direction == "L" and real < pred * (1 - frac)
        if broke_above or broke_below:
            hits += 1
    return hits

def min_pips(preds, actual, opens, tolerance, pips, direction):
    """Count acceptable predictions on rows that moved at least *pips*.

    *tolerance* and *pips* are both divided by 100 before use. A row counts:

    * direction "H": when ``pred < actual * (1 + tolerance)`` and
      ``actual - open >= pips``;
    * direction "L": when ``pred > actual * (1 + tolerance)`` and
      ``open - actual >= pips``.

    Any other direction yields 0.
    """
    min_move = pips / 100
    frac = tolerance / 100
    hits = 0
    for ix in range(preds.shape[0]):
        # NOTE(review): both directions compare against actual * (1 + tol);
        # acc_dir uses (1 - tol) for "L" -- confirm this asymmetry is wanted.
        ceiling = actual[ix] * (1 + frac)
        if direction == "H":
            counted = preds[ix] < ceiling and (actual[ix] - opens[ix]) >= min_move
        elif direction == "L":
            counted = preds[ix] > ceiling and (opens[ix] - actual[ix]) >= min_move
        else:
            counted = False
        if counted:
            hits += 1
    return hits

In [0]:
# Root data directory: one sub-folder per asset class, each holding the per-asset CSV files (ask/bid pairs).
# dir_name = "Data"
# large_df = create_empty_df(dir_name)

In [0]:
# asset67 lists the 67 selected assets, i.e. those with more than 6 years of data.
# large_df_, asset67 = fill_large_df(dir_name, large_df)

In [0]:
#Filling in previous values for values in middle
# large_df_ = large_df_.ffill(axis=0)

#Filling in next value for initial nan values
#This is used for assets which do not have initial data
# large_df_ = large_df_.bfill(axis=0)

In [0]:
#to extract to a given path
# import zipfile
# with zipfile.ZipFile("drive/My Drive/large_df_.zip", 'r') as zip_ref:
#     zip_ref.extractall("drive/My Drive/")

# Extract the archive into the current working directory (IPython shell magic).
# NOTE(review): when large_df_.csv already exists, unzip stops at an
# interactive "replace ...?" prompt, as the recorded cell output shows.
!unzip "drive/My Drive/large_df_.zip"

Archive:  drive/My Drive/large_df_.zip
replace large_df_.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [0]:
# When using Google Drive: load the feature table that the unzip cell
# extracted into the working directory.
large_df_ = pd.read_csv("large_df_.csv")

# List of the assets present in large_df_, taken from the second column
# (iloc[:, 1]) of asset67.csv.
asset67 = np.array(pd.read_csv("drive/My Drive/asset67.csv").iloc[:, 1])

In [0]:
# Input variables
# Asset whose labels are forecast; matches the "@<asset>" column suffix.
target_asset = "XAGUSD"
# Forecast window in hours; extract_features knows the windows 4, 12 and 24.
target_window = 12
# "H" or "L": selects the "H-<window>hr" or "L-<window>hr" label column.
direction = "L"
# Tolerance in percent (the accuracy functions divide it by 100).
tolerance = 0.04

# Build the feature/label frame for the chosen asset, without lag features
# and without the spread columns of the other assets.
only_feat_df = extract_features(large_df_, target_asset, target_window, asset67, add_lag=False, keep_spread = False)

In [0]:
# In the xlsx file, the first row and the first columns have been removed so
# that it can be read directly.
pip_xl = pd.read_excel("drive/My Drive/Labels to Forecast.xlsx")

# Look up the pip value for the target asset: the row whose "Label" equals
# target_asset, in the "<target_window> Hour" column.
pips = pip_xl[pip_xl["Label"] == target_asset][str(target_window) + " Hour"].values[0]

In [0]:
# preds, actual = create_baseline(only_feat_df, direction, target_window)

# #calculate accuracies
# print(acc_abs(preds, actual, tolerance))
# print(acc_dir(preds, actual, tolerance, direction))
# print(min_pips(preds, actual, only_feat_df["Open@" + target_asset], tolerance, pips, direction))

In [0]:
# norm_feat = normalize_df(only_feat_df.iloc[:, :-2])
# target = only_feat_df[direction + "-" + str(target_window) + "hr"]

# Min-max scale every column except the last two, which are the H/L label
# columns that extract_features moves to the end of the frame.
only_norm = normalize_df(only_feat_df.iloc[:, :-2])
# Re-attach the label for the chosen direction as the last column; it is
# copied from only_feat_df unscaled.
only_norm[direction + "-" + str(target_window) + "hr"] = only_feat_df[direction + "-" + str(target_window) + "hr"]

In [0]:
import os
import time
from tqdm._tqdm_notebook import tqdm_notebook
import pickle
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
from keras import optimizers
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import logging

# Training hyper-parameters.
# NOTE(review): the visible cells read only "batch_size", "lr" and
# "time_steps"; "epochs" is presumably used by a training cell not shown here.
params = {
    "batch_size": 256, 
    "epochs": 300,
    "lr": 0.00010000,
    "time_steps": 512
}

# NOTE(review): looks like a free-text tag describing this experiment run;
# it is not read anywhere in the visible cells -- confirm its use.
iter_changes = "dropout_layers_0.4_0.4"

# Module-level shortcuts read by build_timeseries and create_model.
TIME_STEPS = params["time_steps"]
BATCH_SIZE = params["batch_size"]

def trim_dataset(mat,batch_size):
    """Drop trailing rows so the row count is a multiple of *batch_size*.

    When the row count already divides evenly, *mat* itself is returned.
    """
    leftover = mat.shape[0] % batch_size
    # mat[:-0] would be empty, hence the explicit no-op branch.
    return mat[:-leftover] if leftover > 0 else mat

def build_timeseries(mat, y_col_index = -1, time_steps = None):
    """Slice a 2-D table into overlapping windows and their next-row targets.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` (all columns) and its
    target is column *y_col_index* of row ``i + time_steps``.

    Args:
        mat: 2-D array-like (numpy array or DataFrame) of shape
            (rows, columns). It is converted with ``np.asarray`` so that a
            DataFrame can be indexed positionally.
        y_col_index: column holding the value to predict.
        time_steps: window length; defaults to the module-level TIME_STEPS.

    Returns:
        ``(x, y)`` with ``x`` of shape (rows - time_steps, time_steps,
        columns) and ``y`` of shape (rows - time_steps,), both float arrays.

    Raises:
        ValueError: if *mat* has fewer rows than *time_steps*.
    """
    if time_steps is None:
        time_steps = TIME_STEPS
    mat = np.asarray(mat)

    dim_0 = mat.shape[0] - time_steps
    if dim_0 < 0:
        raise ValueError(
            f"need at least {time_steps} rows to build a window, got {mat.shape[0]}"
        )
    dim_1 = mat.shape[1]

    x = np.zeros((dim_0, time_steps, dim_1))
    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:time_steps+i]
    # The targets are simply the label column from row time_steps onward.
    y = mat[time_steps:, y_col_index].astype(float)
    return x, y

def create_model():
    """Build and compile the stacked-LSTM regression model of this notebook.

    Reads the module-level BATCH_SIZE, TIME_STEPS, params["lr"] and x_t (only
    x_t.shape[2], the number of features per time step), so all of them must
    exist before this function is called.

    Returns:
        The compiled Sequential model (mean-squared-error loss, RMSprop).
    """
    lstm_model = Sequential()
    # Input shape is (batch_size, timesteps, data_dim). The batch size is
    # fixed here, which is why the notebook trims its arrays to a multiple of
    # BATCH_SIZE with trim_dataset before using the model.
    lstm_model.add(LSTM(100, batch_input_shape=(BATCH_SIZE, TIME_STEPS, x_t.shape[2]),
                        dropout=0.0, recurrent_dropout=0.0, stateful=True, return_sequences=True,
                        kernel_initializer='random_uniform'))
    lstm_model.add(Dropout(0.4))
    lstm_model.add(LSTM(60, dropout=0.0))
    lstm_model.add(Dropout(0.4))
    lstm_model.add(Dense(20,activation='relu'))
    # NOTE(review): a sigmoid output lies in (0, 1), while the label column
    # appended to only_norm is copied unscaled -- confirm the target scale.
    lstm_model.add(Dense(1,activation='sigmoid'))
    optimizer = optimizers.RMSprop(lr=params["lr"])
    # Alternative optimizer kept for reference:
    # optimizer = optimizers.SGD(lr=0.000001, decay=1e-6, momentum=0.9, nesterov=True)
    lstm_model.compile(loss='mean_squared_error', optimizer=optimizer)
    return lstm_model

In [0]:
# Chronological 80/20 split (shuffle=False keeps the time order intact).
df_train, df_test = train_test_split(only_norm, train_size=0.8, test_size=0.2, shuffle=False)
print("Train--Test size", len(df_train), len(df_test))

# build_timeseries indexes its input positionally like a 2-D ndarray
# (mat[row, col]), so hand it the raw values rather than the DataFrames.
x_train = df_train.values
x_test = df_test.values

# Training windows, trimmed to a whole number of batches.
x_t, y_t = build_timeseries(x_train, -1)
x_t = trim_dataset(x_t, BATCH_SIZE)
y_t = trim_dataset(y_t, BATCH_SIZE)
print("Batch trimmed size",x_t.shape, y_t.shape)

# Hold-out windows: the first half becomes the validation set and the second
# half the test set. Trimming first guarantees an even number of rows, which
# np.split needs for an equal split.
x_temp, y_temp = build_timeseries(x_test, -1)
x_val, x_test_t = np.split(trim_dataset(x_temp, BATCH_SIZE),2)
y_val, y_test_t = np.split(trim_dataset(y_temp, BATCH_SIZE),2)
print("Test size", x_test_t.shape, y_test_t.shape, x_val.shape, y_val.shape)

# create_model reads x_t.shape[2], so it must run after x_t exists.
model = create_model()

# NOTE(review): no model.fit call is visible before this prediction; confirm
# that the model is trained (or weights are loaded) elsewhere.
y_pred = model.predict(trim_dataset(x_test_t, BATCH_SIZE), batch_size=BATCH_SIZE)
y_pred = y_pred.flatten()