Retrying splitting, ensuring temporatility respected and included a hold-out set

In [1]:
import pandas as pd

# Load the data
file_path = 'trades(final).csv'
data = pd.read_csv(file_path)

# Ensure data is sorted by Entry_Date
data['Entry_Date'] = pd.to_datetime(data['Entry_Date'])  # Convert to datetime if not already
data = data.sort_values(by='Entry_Date')

# Define the sizes for train, hold-out, validation, and test sets
train_size = 0.75  # 75% of the original data for training
holdout_size = 0.05  # 5% of the original data for holdout
val_size = 0.10  # 10% of the original data for validation
test_size = 0.10  # 10% of the original data for testing

# Calculate the index boundaries for each split
n = len(data)
train_end = int(train_size * n)
holdout_end = int((train_size + holdout_size) * n)
val_end = int((train_size + holdout_size + val_size) * n)

# Split the data while maintaining temporal order
train_data = data.iloc[:train_end]
holdout_data = data.iloc[train_end:holdout_end]
val_data = data.iloc[holdout_end:val_end]
test_data = data.iloc[val_end:]

# Interpolating missing values
train_data = train_data.interpolate(method='linear', limit_direction='both')
val_data = val_data.interpolate(method='linear', limit_direction='both')
test_data = test_data.interpolate(method='linear', limit_direction='both')
holdout_data = holdout_data.interpolate(method='linear', limit_direction='both')

# Define features to be shifted
features_to_shift = [
    'Open', 'High', 'Low', 'Last', 'MACD_At_Entry', 'Day_Of_Week_At_Entry', 'ROC14_At_Entry',
    'SMA5_At_Entry', 'SMA7_At_Entry', 'SMA10_At_Entry', 'SMA15_At_Entry',
    'EMA5_At_Entry', 'EMA7_At_Entry', 'EMA10_At_Entry', 'EMA15_At_Entry',
    'RSI5_At_Entry', 'RSI10_At_Entry', 'RSI15_At_Entry',
    'ATR5_At_Entry', 'ATR15_At_Entry',
    'BB5_High_At_Entry', 'BB5_Low_At_Entry', 'BB5_MAvg_At_Entry',
    'BB10_High_At_Entry', 'BB10_Low_At_Entry', 'BB10_MAvg_At_Entry',
    'BB15_High_At_Entry', 'BB15_Low_At_Entry', 'BB15_MAvg_At_Entry',
    'Stoch5_K_At_Entry', 'Stoch5_D_At_Entry', 'Stoch7_K_At_Entry', 'Stoch7_D_At_Entry',
    'Stoch21_K_At_Entry', 'Stoch21_D_At_Entry'
]

# Shifting the features back by one period to prevent lookahead bias
train_data[features_to_shift] = train_data[features_to_shift].shift(1)
val_data[features_to_shift] = val_data[features_to_shift].shift(1)
test_data[features_to_shift] = test_data[features_to_shift].shift(1)
holdout_data[features_to_shift] = holdout_data[features_to_shift].shift(1)

# Drop rows with NaNs after shifting
train_data.dropna(inplace=True)
val_data.dropna(inplace=True)
test_data.dropna(inplace=True)
holdout_data.dropna(inplace=True)

# Save the splits into separate files
train_data.to_csv('train_data(new).csv', index=False)
val_data.to_csv('val_data(new).csv', index=False)
test_data.to_csv('test_data(new).csv', index=False)
holdout_data.to_csv('holdout_data(new).csv', index=False)

print("Data split, interpolated, shifted, and saved successfully.")


  train_data = train_data.interpolate(method='linear', limit_direction='both')
  val_data = val_data.interpolate(method='linear', limit_direction='both')
  test_data = test_data.interpolate(method='linear', limit_direction='both')
  holdout_data = holdout_data.interpolate(method='linear', limit_direction='both')


Data split, interpolated, shifted, and saved successfully.


Checking the original data. 

In [6]:
import pandas as pd

# Load the dataset
file_path = 'trades(final).csv'
df = pd.read_csv(file_path)

# Group by Entry_Date to find instances where multiple strategies traded on the same day
grouped = df.groupby('Entry_Date')

# Initialize a list to store any discrepancies found
discrepancies = []

# Iterate over each group
for entry_date, group in grouped:
    if len(group) > 1:  # Only consider dates where more than one trade occurred
        # Drop the columns that are not supposed to match, like Trade_ID and Strategy
        comparison_columns = [col for col in df.columns if col not in ['Strategy', 'Trade_ID']]
        
        # Check if all rows in the group are identical
        for col in comparison_columns:
            if not group[col].nunique() == 1:
                discrepancies.append({
                    'Entry_Date': entry_date,
                    'Column': col,
                    'Unique Values': group[col].unique(),
                    'Rows': group[[col, 'Strategy', 'Trade_ID']].to_dict(orient='records')
                })

# Output the result
if discrepancies:
    print("Discrepancies found:")
    for discrepancy in discrepancies:
        print(f"\nEntry Date: {discrepancy['Entry_Date']}, Column: {discrepancy['Column']}, Unique Values: {discrepancy['Unique Values']}")
        print("Affected Rows:")
        for row in discrepancy['Rows']:
            print(row)
else:
    print("No discrepancies found. All columns match for trades on the same entry date.")


No discrepancies found. All columns match for trades on the same entry date.


Changin splitting to ensure that mutiple trades on the same day have the same feature values when shiting

In [11]:
import pandas as pd

# Load the data
file_path = 'trades(final).csv'
data = pd.read_csv(file_path)

# Ensure data is sorted by Entry_Date
data['Entry_Date'] = pd.to_datetime(data['Entry_Date'])  # Convert to datetime if not already
data = data.sort_values(by='Entry_Date')

# Define the sizes for train, hold-out, validation, and test sets
train_size = 0.75  # 75% of the original data for training
holdout_size = 0.05  # 5% of the original data for holdout
val_size = 0.10  # 10% of the original data for validation
test_size = 0.10  # 10% of the original data for testing

# Calculate the index boundaries for each split
n = len(data)
train_end = int(train_size * n)
holdout_end = int((train_size + holdout_size) * n)
val_end = int((train_size + holdout_size + val_size) * n)

# Split the data while maintaining temporal order
train_data = data.iloc[:train_end]
holdout_data = data.iloc[train_end:holdout_end]
val_data = data.iloc[holdout_end:val_end]
test_data = data.iloc[val_end:]

# Interpolating missing values
train_data = train_data.interpolate(method='linear', limit_direction='both')
val_data = val_data.interpolate(method='linear', limit_direction='both')
test_data = test_data.interpolate(method='linear', limit_direction='both')
holdout_data = holdout_data.interpolate(method='linear', limit_direction='both')

# Shift the target variable (e.g., Profit_Loss) up by one period
target_variable = 'Target'  # Change this to the actual target column name
train_data[target_variable] = train_data[target_variable].shift(-1)
val_data[target_variable] = val_data[target_variable].shift(-1)
test_data[target_variable] = test_data[target_variable].shift(-1)
holdout_data[target_variable] = holdout_data[target_variable].shift(-1)

# Drop rows with NaNs after shifting (these will be the last rows in each set)
train_data.dropna(inplace=True)
val_data.dropna(inplace=True)
test_data.dropna(inplace=True)
holdout_data.dropna(inplace=True)

# Save the splits into separate files
train_data.to_csv('train_data(new2).csv', index=False)
val_data.to_csv('val_data(new2).csv', index=False)
test_data.to_csv('test_data(new2).csv', index=False)
holdout_data.to_csv('holdout_data(new2).csv', index=False)

print("Data split, interpolated, target shifted, and saved successfully.")


  train_data = train_data.interpolate(method='linear', limit_direction='both')
  val_data = val_data.interpolate(method='linear', limit_direction='both')
  test_data = test_data.interpolate(method='linear', limit_direction='both')
  holdout_data = holdout_data.interpolate(method='linear', limit_direction='both')


Data split, interpolated, target shifted, and saved successfully.


In [12]:
import pandas as pd

# Load the dataset
file_path = 'train_data(new2).csv'
df = pd.read_csv(file_path)

# Group by Entry_Date to find instances where multiple strategies traded on the same day
grouped = df.groupby('Entry_Date')

# Initialize a list to store any discrepancies found
discrepancies = []

# Iterate over each group
for entry_date, group in grouped:
    if len(group) > 1:  # Only consider dates where more than one trade occurred
        # Drop the columns that are not supposed to match, like Trade_ID and Strategy
        comparison_columns = [col for col in df.columns if col not in ['Strategy', 'Trade_ID']]
        
        # Check if all rows in the group are identical
        for col in comparison_columns:
            if not group[col].nunique() == 1:
                discrepancies.append({
                    'Entry_Date': entry_date,
                    'Column': col,
                    'Unique Values': group[col].unique(),
                    'Rows': group[[col, 'Strategy', 'Trade_ID']].to_dict(orient='records')
                })

# Output the result
if discrepancies:
    print("Discrepancies found:")
    for discrepancy in discrepancies:
        print(f"\nEntry Date: {discrepancy['Entry_Date']}, Column: {discrepancy['Column']}, Unique Values: {discrepancy['Unique Values']}")
        print("Affected Rows:")
        for row in discrepancy['Rows']:
            print(row)
else:
    print("No discrepancies found. All columns match for trades on the same entry date.")


Discrepancies found:

Entry Date: 1990-03-13, Column: Target, Unique Values: [1. 0.]
Affected Rows:
{'Target': 1.0, 'Strategy': 'Strategy_2', 'Trade_ID': 'Strategy_2_T1'}
{'Target': 0.0, 'Strategy': 'Strategy_4', 'Trade_ID': 'Strategy_4_T2'}

Entry Date: 1990-03-21, Column: Target, Unique Values: [1. 0.]
Affected Rows:
{'Target': 1.0, 'Strategy': 'Strategy_2', 'Trade_ID': 'Strategy_2_T10'}
{'Target': 0.0, 'Strategy': 'Strategy_4', 'Trade_ID': 'Strategy_4_T11'}

Entry Date: 1990-03-22, Column: Target, Unique Values: [0. 1.]
Affected Rows:
{'Target': 0.0, 'Strategy': 'Strategy_2', 'Trade_ID': 'Strategy_2_T12'}
{'Target': 1.0, 'Strategy': 'Strategy_4', 'Trade_ID': 'Strategy_4_T13'}

Entry Date: 1990-04-02, Column: Target, Unique Values: [1. 0.]
Affected Rows:
{'Target': 1.0, 'Strategy': 'Strategy_2', 'Trade_ID': 'Strategy_2_T21'}
{'Target': 0.0, 'Strategy': 'Strategy_4', 'Trade_ID': 'Strategy_4_T22'}

Entry Date: 1990-05-23, Column: Target, Unique Values: [1. 0.]
Affected Rows:
{'Target':