In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data
file_path = 'trades(final).csv'
data = pd.read_csv(file_path)

# Split into train (80%), validation (10%) and test (10%) WITHOUT shuffling.
# The interpolation and shift(1) steps below work on neighbouring rows, so the
# rows of every split must stay in their original order; a shuffled split
# would pair each row with the features of an arbitrary other row.
# NOTE(review): assumes the rows in the CSV are already in chronological
# order - TODO confirm, or sort by the time column before splitting.
train_data, temp_data = train_test_split(data, test_size=0.2, shuffle=False)
val_data, test_data = train_test_split(temp_data, test_size=0.5, shuffle=False)

# Define features to be shifted
features_to_shift = [
    'Open', 'High', 'Low', 'Last', 'MACD_At_Entry', 'Day_Of_Week_At_Entry', 'ROC14_At_Entry',
    'SMA5_At_Entry', 'SMA7_At_Entry', 'SMA10_At_Entry', 'SMA15_At_Entry',
    'EMA5_At_Entry', 'EMA7_At_Entry', 'EMA10_At_Entry', 'EMA15_At_Entry',
    'RSI5_At_Entry', 'RSI10_At_Entry', 'RSI15_At_Entry',
    'ATR5_At_Entry', 'ATR15_At_Entry',
    'BB5_High_At_Entry', 'BB5_Low_At_Entry', 'BB5_MAvg_At_Entry',
    'BB10_High_At_Entry', 'BB10_Low_At_Entry', 'BB10_MAvg_At_Entry',
    'BB15_High_At_Entry', 'BB15_Low_At_Entry', 'BB15_MAvg_At_Entry',
    'Stoch5_K_At_Entry', 'Stoch5_D_At_Entry', 'Stoch7_K_At_Entry', 'Stoch7_D_At_Entry',
    'Stoch21_K_At_Entry', 'Stoch21_D_At_Entry'
]

# Apply the same three steps to every split, each split on its own:
#   1. linearly interpolate missing values (numeric columns only - newer
#      pandas releases warn about, or reject, interpolating object columns),
#   2. shift the features back by one period to prevent lookahead bias,
#   3. drop the rows that still contain NaNs (at least the first row, which
#      has no predecessor to take its features from).
processed_splits = []
for split in (train_data, val_data, test_data):
    split = split.copy()  # work on an independent frame, never on a view of `data`
    numeric_cols = split.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        split[numeric_cols] = split[numeric_cols].interpolate(
            method='linear', limit_direction='both'
        )
    split[features_to_shift] = split[features_to_shift].shift(1)
    processed_splits.append(split.dropna())
train_data, val_data, test_data = processed_splits

# Save the splits into separate files
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

print("Data split, interpolated, shifted, and saved successfully.")


Data split, interpolated, shifted, and saved successfully.


Redoing the split so that the temporal order of the data is respected. This version also includes
a hold-out set between the training and validation sets.


In [5]:
import pandas as pd

# Location of the raw trades file (relative to the notebook's working directory)
file_path = 'trades(final).csv'

# Define the sizes for train, hold-out, validation, and test sets
train_size = 0.75  # 75% of the original data for training
holdout_size = 0.05  # 5% of the original data for holdout
val_size = 0.10  # 10% of the original data for validation
test_size = 0.10  # 10% for testing; the test set is whatever remains after the three cuts above

# Define features to be shifted
features_to_shift = [
    'Open', 'High', 'Low', 'Last', 'MACD_At_Entry', 'Day_Of_Week_At_Entry', 'ROC14_At_Entry',
    'SMA5_At_Entry', 'SMA7_At_Entry', 'SMA10_At_Entry', 'SMA15_At_Entry',
    'EMA5_At_Entry', 'EMA7_At_Entry', 'EMA10_At_Entry', 'EMA15_At_Entry',
    'RSI5_At_Entry', 'RSI10_At_Entry', 'RSI15_At_Entry',
    'ATR5_At_Entry', 'ATR15_At_Entry',
    'BB5_High_At_Entry', 'BB5_Low_At_Entry', 'BB5_MAvg_At_Entry',
    'BB10_High_At_Entry', 'BB10_Low_At_Entry', 'BB10_MAvg_At_Entry',
    'BB15_High_At_Entry', 'BB15_Low_At_Entry', 'BB15_MAvg_At_Entry',
    'Stoch5_K_At_Entry', 'Stoch5_D_At_Entry', 'Stoch7_K_At_Entry', 'Stoch7_D_At_Entry',
    'Stoch21_K_At_Entry', 'Stoch21_D_At_Entry'
]


def prepare_temporal_splits(frame, feature_cols, time_col='Time',
                            train_frac=0.75, holdout_frac=0.05, val_frac=0.10):
    """Split ``frame`` chronologically and lag ``feature_cols`` by one row.

    The rows are ordered by ``time_col`` and cut into four consecutive
    segments: train, hold-out, validation and test (the test segment is
    whatever remains after the first three fractions).  Every segment is then
    processed on its own, so no value is carried across a segment boundary:

    1. numeric columns are linearly interpolated in both directions,
    2. ``feature_cols`` are shifted down by one row,
    3. rows that still contain a NaN in any column are dropped (this always
       removes the first row of a segment, which has no predecessor).

    Parameters
    ----------
    frame : pandas.DataFrame
        Input rows; it is not modified.
    feature_cols : sequence of str
        Columns to shift by one row. All of them must exist in ``frame``.
    time_col : str
        Column used to order the rows before cutting.
    train_frac, holdout_frac, val_frac : float
        Fractions of the rows assigned to the first three segments.

    Returns
    -------
    dict
        Keys ``'train'``, ``'holdout'``, ``'val'`` and ``'test'`` (in that
        order), each mapping to the processed DataFrame of that segment.

    Raises
    ------
    ValueError
        If a fraction is negative or the three fractions add up to more than 1.
    """
    fractions = (train_frac, holdout_frac, val_frac)
    if min(fractions) < 0 or sum(fractions) > 1 + 1e-9:
        raise ValueError(
            "train_frac, holdout_frac and val_frac must be non-negative and sum to at most 1"
        )

    # mergesort is stable: rows sharing a timestamp keep their incoming order.
    # NOTE(review): the column is sorted as-is; if it is read as text in a
    # non-ISO format the order is lexicographic, not chronological - TODO
    # confirm the format or parse it with pd.to_datetime before calling.
    ordered = frame.sort_values(by=time_col, kind='mergesort')

    # Cut points are cumulative fractions of the row count, truncated to int.
    n_rows = len(ordered)
    train_end = int(train_frac * n_rows)
    holdout_end = int((train_frac + holdout_frac) * n_rows)
    val_end = int((train_frac + holdout_frac + val_frac) * n_rows)
    bounds = {
        'train': (0, train_end),
        'holdout': (train_end, holdout_end),
        'val': (holdout_end, val_end),
        'test': (val_end, n_rows),
    }

    feature_cols = list(feature_cols)
    splits = {}
    for name, (start, stop) in bounds.items():
        # Copy so the column assignments below never write into a view of `ordered`.
        segment = ordered.iloc[start:stop].copy()

        # Interpolate numeric columns only: newer pandas releases warn about,
        # or reject, interpolating object-dtype (e.g. text) columns.
        numeric_cols = segment.select_dtypes(include='number').columns
        if len(numeric_cols) > 0:
            segment[numeric_cols] = segment[numeric_cols].interpolate(
                method='linear', limit_direction='both'
            )

        # Shifting the features back by one period to prevent lookahead bias
        segment[feature_cols] = segment[feature_cols].shift(1)

        # Drop rows with NaNs after shifting
        splits[name] = segment.dropna()
    return splits


# In a notebook kernel __name__ is '__main__', so this section runs as part of
# the cell; the guard only keeps the file I/O out of the way when the helper
# above is imported from elsewhere.
if __name__ == '__main__':
    # Load the data and keep it sorted by time (assuming 'Time' is your time column)
    data = pd.read_csv(file_path).sort_values(by='Time', kind='mergesort')

    splits = prepare_temporal_splits(
        data,
        features_to_shift,
        time_col='Time',
        train_frac=train_size,
        holdout_frac=holdout_size,
        val_frac=val_size,
    )
    train_data = splits['train']
    holdout_data = splits['holdout']
    val_data = splits['val']
    test_data = splits['test']

    # Save the splits into separate files
    train_data.to_csv('train_data(new).csv', index=False)
    val_data.to_csv('val_data(new).csv', index=False)
    test_data.to_csv('test_data(new).csv', index=False)
    holdout_data.to_csv('holdout_data(new).csv', index=False)

    print("Data split, interpolated, shifted, and saved successfully.")

FileNotFoundError: [Errno 2] No such file or directory: 'trades(final).csv'