In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [47]:
#######################################################################################
################################# CREATING LARGEDF  ###################################
#######################################################################################

def change_index_to_time(df):
    df['Time (UTC)'] = pd.to_datetime(df['Time (UTC)'])
    df.set_index('Time (UTC)', inplace = True, drop = True)


#Finds the dataframe with largest number of time stamps and returns an empty dataframe with same index
def create_empty_df(dir_name):
    largest_timeframe = 0
    path_to_largest = ""

    for asset_class in os.listdir(dir_name):
        for asset in os.listdir(dir_name +  "/" + asset_class):    
            df = pd.read_csv(dir_name +  "/" + asset_class + "/" + asset)
            if df.shape[0] > largest_timeframe:
                largest_timeframe = df.shape[0]
                path_to_largest = dir_name +  "/" + asset_class + "/" + asset

    largest_dataframe = pd.read_csv(path_to_largest)
    change_index_to_time(largest_dataframe)
    largest_dataframe = pd.DataFrame(index = largest_dataframe.index)
    return largest_dataframe

#######################################################################################
################################# FEATURE BUILDING  ###################################
#######################################################################################

def trim_fft(df):
    all_fft = []
    for column in df.columns:
        if "fft" in column:
            all_fft.append(column)
    df = df.drop(columns = all_fft)
    return df

def fill_large_df(dir_name, large_df):
    asset67 = []
    for asset_class in os.listdir(dir_name):
        for asset in os.listdir(dir_name +  "/" + asset_class): 
            extract_name = asset.split("_")
            asset67.append(extract_name[0][:-4])
            df = pd.read_csv(dir_name +  "/" + asset_class + "/" + asset)
            change_index_to_time(df)
            df = trim_fft(df)
            large_df = large_df.join(df)
            rename_dic = {}
            for column in df.columns:    
                rename_dic[column] = column + "@" + extract_name[0][:-4]
            large_df = large_df.rename(columns = rename_dic) 
            
    return large_df, asset67

def extract_only_features(large_df, target_asset, target_window, asset67):
    df = large_df
    df["H-" + str(target_window) + "hr" ] = df.pop("H-" + str(target_window) + "hr@" + target_asset)
    df["L-" + str(target_window) + "hr" ] = df.pop("L-" + str(target_window) + "hr@" + target_asset)
    
    for asset in asset67:
        if asset != target_asset:
            df = df.drop(columns = ["H-4hr@" + asset, "L-4hr@" + asset, "H-12hr@" + asset,
                               "L-12hr@" + asset, "H-24hr@" + asset,"L-24hr@" + asset])
    return df


#######################################################################################
################################# MODEL BUILDING  #####################################
#######################################################################################


#predicting the H as previous value
def create_baseline(only_feat_df, direction, target_window):     
    preds = np.array(pd.Series(only_feat_df[direction + "-" + str(target_window) + "hr"]).shift(1).bfill(axis = 0))
    actual = np.array(only_feat_df[direction + "-" + str(target_window) + "hr"])
    return preds, actual

def model:
    pass

#######################################################################################
############################# ACCURACY CALCULATIONS  ##################################
#######################################################################################

def acc_abs(preds, actual, tolerance):
    tolerance /= 100
    return np.sum(np.abs(preds/actual-1) <= tolerance)

def acc_dir(preds, actual, tolerance, direction):
    tolerance /= 100
    score = 0
    for ix in range(records):
        if actual[ix] > preds[ix]*(1+tolerance)  and direction == "H":
            score += 1
        elif actual[ix] < preds[ix]*(1-tolerance)  and direction == "L":
             score += 1
    return score

def min_pips(preds, actual, opens, tolerance, pips, direction):
    pips /= 100
    tolerance /= 100
    score = 0
    for ix in range(records):
        if preds[ix] < actual[ix]*(1+tolerance)  and (direction == "H" and (actual[ix] - opens[ix])>=pips):
            score += 1 
        elif preds[ix] > actual[ix]*(1+tolerance)  and (direction == "L" and (opens[ix] - actual[ix])>=pips):
             score += 1
    return score

In [3]:
#name of the root directory which contains asset classes which further contain asset csv ask and bid pairs
dir_name = "Data"
large_df = create_empty_df(dir_name)

In [4]:
#asset67 contains the 67 selected assets which have data for more than 6 years
large_df_, asset67 = fill_large_df(dir_name, large_df)

In [5]:
#Filling in previous values for values in middle
large_df_ = large_df_.ffill(axis=0)

#Filling in next value for initial nan values
#This is used for assets which do not have initial data
large_df_ = large_df_.bfill(axis=0)

In [94]:
#input variables
target_asset = "XAGUSD"
target_window = 12
direction = "L"

only_feat_df = extract_only_features(large_df_, target_asset, target_window, asset67)
preds, actual = create_baseline(only_feat_df, direction, target_window)

In [96]:
#in the xlsx file, first row and columns have been removed for efficient reading
pip_xl = pd.read_excel("Labels to Forecast.xlsx")

#give tolerance in percent
tolerance = 0.04

#querying pips
pips = pip_xl[pip_xl["Label"] == target_asset][str(target_window) + " Hour"].values[0]

#calculate accuracies
print(acc_abs(preds, actual, tolerance))
print(acc_dir(preds, actual, tolerance, direction))
print(min_pips(preds, actual, only_feat_df["Open@" + target_asset], tolerance, pips, direction))

105908
7871
999
