Before running each cell, be sure to change the names of the input files and of the files to be generated, as well as the number of cores that were used and the number of files in the range.

In [None]:
import csv
import os

#########################################################
# This script merges all the training-data CSV files
# generated inside each core, for each race simulation
# specified, into one single CSV for that particular core.

#########################################################


# Define the base path and core directories
base_path = "/Users/joesouber/XGBoost_TBBE"
cores = ["Core1", "Core2", "Core3", "Core4", "Core5", "Core6", "Core7", "Core8"]

for core in cores:
    # Generate filenames programmatically for each core
    # (indices 0..399; adjust the range to the number of files generated).
    files = [os.path.join(base_path, core, "TBBE_OD_XGboost", "Application", "getXGBOOstTrainingData_{}.csv".format(i)) for i in range(400)]

    # Specify the merged file location for each core
    merged_filename = os.path.join(base_path, core, "TBBE_OD_XGboost", "Application", "merged_result_2_{}.csv".format(core))

    # The header is copied from the first file that can actually be read,
    # not from the file at position 0: if that file is missing or empty the
    # merged CSV would otherwise end up with no header at all.
    header_written = False

    with open(merged_filename, 'w', newline='') as outfile:
        writer = csv.writer(outfile)

        for filename in files:
            try:
                # newline='' lets the csv module do its own newline handling,
                # as recommended by the csv documentation.
                with open(filename, 'r', newline='') as infile:
                    reader = csv.reader(infile)

                    # The first row of every input file is treated as its header
                    header = next(reader, None)
                    if header is None:
                        # Completely empty file: nothing to copy
                        continue

                    if not header_written:
                        writer.writerow(header)
                        header_written = True

                    # Write the remaining (data) rows to the output file
                    writer.writerows(reader)
            except FileNotFoundError:
                print(f"Warning: {filename} not found and will be skipped.")
            except OSError as e:
                print(f"Error: {e} - {filename} could not be processed.")

In [None]:
import csv
import os

#####################
# This script merges all those merged files above for each core into one single csv file for the entire simulation.
#####################


# Define the base path and the cores
base_path = "/Users/joesouber/XGBoost_TBBE"
cores = ["Core1", "Core2", "Core3", "Core4", "Core5", "Core6", "Core7", "Core8"]

# Generate filenames for the merged core files
merged_files = [os.path.join(base_path, core, "TBBE_OD_XGboost", "Application", "merged_result_2_{}.csv".format(core)) for core in cores]

# Specify the final merged file location
final_merged_filename = os.path.join(base_path, "final_merged_result_2.csv")

# The header is copied from the first per-core file that can actually be
# read, not from the file at position 0: if that file is missing or empty
# the final CSV would otherwise end up with no header at all.
header_written = False

with open(final_merged_filename, 'w', newline='') as outfile:
    writer = csv.writer(outfile)

    for filename in merged_files:
        try:
            # newline='' lets the csv module do its own newline handling,
            # as recommended by the csv documentation.
            with open(filename, 'r', newline='') as infile:
                reader = csv.reader(infile)

                # The first row of every input file is treated as its header
                header = next(reader, None)
                if header is None:
                    # Completely empty file: nothing to copy
                    continue

                if not header_written:
                    writer.writerow(header)
                    header_written = True

                # Write the remaining (data) rows to the output file
                writer.writerows(reader)
        except FileNotFoundError:
            print(f"Warning: {filename} not found and will be skipped.")
        except OSError as e:
            print(f"Error: {e} - {filename} could not be processed.")


In [None]:
import csv
import os

#####################
# This script is to be used when an entirely new simulation has been run and the two functions above have been used. 
# It merges the old combined simulation csv with the newly generated one, so one csv of all simulations is formed.
#####################



# Inputs: the previously combined simulation CSV and the newly generated one
file1, file2 = (
    "/Users/joesouber/XGBoost_TBBE/final_merged_result_1.csv",
    "/Users/joesouber/XGBoost_TBBE/final_merged_result_2.csv",
)

# Output: the CSV that will contain the rows of both inputs
final_merged_filename = "/Users/joesouber/XGBoost_TBBE/final_merged_combined.csv"

# Function to merge the files
def merge_files(file_list, output_file):
    """Concatenate several CSV files that share a header into one CSV.

    The first row of every readable input file is treated as its header.
    The header of the first file that can actually be read is written once
    at the top of ``output_file``; the header rows of all later files are
    dropped and only their data rows are appended.

    Args:
        file_list: Paths of the CSV files to merge, in output order.
        output_file: Path of the CSV file to create (overwritten if present).

    Missing or unreadable input files are reported with ``print`` and
    skipped; they do not abort the merge.
    """
    # Track the header explicitly instead of relying on list position:
    # if the first listed file is missing or empty, the header must come
    # from the next readable file or the output would have none.
    header_written = False

    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile)

        for filename in file_list:
            try:
                # newline='' lets the csv module do its own newline handling,
                # as recommended by the csv documentation.
                with open(filename, 'r', newline='') as infile:
                    reader = csv.reader(infile)

                    header = next(reader, None)
                    if header is None:
                        # Completely empty file: nothing to copy
                        continue

                    if not header_written:
                        writer.writerow(header)
                        header_written = True

                    # Write the remaining (data) rows to the output file
                    writer.writerows(reader)
            except FileNotFoundError:
                print(f"Warning: {filename} not found and will be skipped.")
            except OSError as e:
                print(f"Error: {e} - {filename} could not be processed.")

# Combine the two simulation CSVs, then report where the result was written
merge_files(file_list=[file1, file2], output_file=final_merged_filename)

print(f"Merged files into {final_merged_filename}")

# Now for training data preprocessing.

In [None]:
import pandas as pd
import sys


#########################################################
# This script extracts the top 20% and bottom 20% of the
# dataset based on the 'balance' column and assigns a label
# of 1 to the top 20% and 0 to the bottom 20%. The new
# dataset is arranged by time and saved to a CSV file.
#########################################################


def extract_training_data(file_path, output_path='new_dataset.csv'):
    """Build a labelled dataset from the extremes of the 'balance' column.

    Reads the CSV at ``file_path``, keeps the 20% of rows with the highest
    'balance' (labelled 1) and the 20% with the lowest (labelled 0), orders
    the result by 'time' and writes it to ``output_path`` without the index.

    Args:
        file_path: Path of the input CSV; it must contain 'balance' and
            'time' columns.
        output_path: Where to write the result. Defaults to
            'new_dataset.csv' in the current working directory.
    """
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Sort by 'balance', highest first. A stable sort keeps the ordering of
    # tied balances deterministic, so the same input gives the same split.
    df_sorted = df.sort_values(by='balance', ascending=False, kind='mergesort')

    # Number of rows in each 20% slice (rounded down; 0 for fewer than 5 rows)
    slice_size = int(len(df_sorted) * 0.2)

    # .assign returns new frames, so the labels are never written onto a
    # slice of df_sorted (which triggers SettingWithCopyWarning).
    top_20_df = df_sorted.head(slice_size).assign(label=1)
    bottom_20_df = df_sorted.tail(slice_size).assign(label=0)

    # Combine both slices and order them chronologically
    combined_df = pd.concat([top_20_df, bottom_20_df])
    combined_df = combined_df.sort_values(by='time', kind='mergesort')

    # Move 'label' to the penultimate position, i.e. just before whatever
    # column was last in the input (presumably 'decision' - verify against
    # the files being processed).
    cols = list(combined_df.columns)
    cols.remove('label')
    cols.insert(-1, 'label')
    combined_df = combined_df[cols]

    # Save the new dataset to a CSV file
    combined_df.to_csv(output_path, index=False)

if __name__ == "__main__":
    # Exactly one command-line argument is expected: the CSV to process.
    # Anything else just prints the usage line.
    if len(sys.argv) == 2:
        extract_training_data(sys.argv[1])
    else:
        print("Usage: python extract_training_data.py <file_path>")


In [None]:
import pandas as pd
import boto3
from sklearn.preprocessing import MinMaxScaler
import os

#########################################################
# This script preprocesses the data by dropping the 'type'
# column, mapping the 'decision' column to 1 and 0, and
# performing MinMax scaling. The preprocessed data is saved
# to a CSV file and optionally uploaded to an S3 bucket.
#########################################################



def preprocess_data(input_path, output_path, bucket_name=None):
    """Clean and scale a training CSV, save it, and optionally upload it.

    Steps: read ``input_path``, drop the 'type' column, drop rows that
    contain NaN, map 'decision' ('backer' -> 1, 'layer' -> 0), MinMax-scale
    every column, and write the result to
    ``<output_path>/preprocessed_data.csv`` without the index.

    Args:
        input_path: Path of the CSV file to preprocess.
        output_path: Directory in which 'preprocessed_data.csv' is written;
            it is created if it does not exist.
        bucket_name: Name of the S3 bucket to upload the result to under the
            key 'preprocessed-output/preprocessed_data.csv'. When empty or
            None the upload is skipped.
    """
    # Load data
    data = pd.read_csv(input_path)

    # Drop 'type' column and rows containing NaNs
    data = data.drop(columns=['type']).dropna()

    # Map 'decision' column to numeric labels.
    # NOTE(review): values other than 'backer'/'layer' become NaN here and
    # are not removed afterwards - confirm the input never contains any.
    data = data.assign(decision=data['decision'].map({'backer': 1, 'layer': 0}))

    # MinMax Scaling (the scaler returns a plain array, so rebuild the frame)
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data)
    data = pd.DataFrame(scaled_data, columns=data.columns)

    # Save preprocessed data, creating the output directory when needed.
    # An empty output_path means the current directory, which always exists.
    if output_path:
        os.makedirs(output_path, exist_ok=True)
    output_file = os.path.join(output_path, 'preprocessed_data.csv')
    data.to_csv(output_file, index=False)

    # Upload to S3 only when a bucket was given, so the function can also be
    # used for purely local preprocessing.
    if bucket_name:
        s3 = boto3.client('s3')
        s3.upload_file(output_file, bucket_name, 'preprocessed-output/preprocessed_data.csv')  # Ensure this path is correct

if __name__ == "__main__":
    import argparse

    # Command-line entry point: all three string options are mandatory
    cli = argparse.ArgumentParser()
    for flag in ('--input-path', '--output-path', '--bucket-name'):
        cli.add_argument(flag, type=str, required=True)

    opts = cli.parse_args()
    preprocess_data(opts.input_path, opts.output_path, opts.bucket_name)