In [1]:
import pandas as pd
import os
from functools import reduce

In [16]:
# Source directories for the two classes of captures.
# The trailing slash is kept so that a file name can be appended directly.
idle_directory = "../data/combined data files/idle/"
attack_directory = "../data/combined data files/spectre/"


def _list_data_files(directory):
    """Return the sorted names of the regular, non-hidden files in `directory`.

    `os.listdir` yields entries in arbitrary order and also returns
    sub-directories and hidden entries (for example `.ipynb_checkpoints` or
    `.DS_Store`). Sorting makes the downstream row order reproducible across
    machines, and the filter keeps non-data entries away from `pd.read_csv`.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if not name.startswith(".")
        and os.path.isfile(os.path.join(directory, name))
    )


idle_files = _list_data_files(idle_directory)
attack_files = _list_data_files(attack_directory)


In [20]:
def combine_in_batches(files, isIdle, output_file, batch_size, directory=None):
    """Concatenate the full batches of several CSV files and save the result.

    Every file is read with pandas and truncated to the largest multiple of
    `batch_size` rows, so a batch never straddles two source files and any
    trailing partial batch is dropped. The truncated frames are concatenated
    in the order given by `files` and written to `output_file` without the
    index column.

    Parameters
    ----------
    files : iterable of str
        File names, relative to the source directory.
    isIdle : bool
        Selects the source directory when `directory` is None: the
        module-level `idle_directory` if true, otherwise `attack_directory`.
    output_file : str
        Path of the CSV file to write.
    batch_size : int
        Number of rows per batch; must be positive.
    directory : str, optional
        Explicit source directory. When given it takes precedence over the
        `isIdle` lookup, which lets the function run without the module-level
        directory variables.

    Returns
    -------
    pandas.DataFrame
        The combined rows, re-indexed from 0.

    Raises
    ------
    ValueError
        If `batch_size` is not positive, or if no file contributes at least
        one full batch (in which case nothing is written).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if directory is None:
        directory = idle_directory if isIdle else attack_directory

    full_batches = []
    for file in files:
        df = pd.read_csv(os.path.join(directory, file))
        # Consecutive full batches are exactly the first
        # (len // batch_size) * batch_size rows, so one slice per file is enough.
        usable_rows = len(df) - len(df) % batch_size
        if usable_rows:
            full_batches.append(df.iloc[:usable_rows])

    if not full_batches:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(
            f"No full batch of {batch_size} rows found in the given files; "
            f"nothing written to {output_file!r}"
        )

    combined_df = pd.concat(full_batches, ignore_index=True)
    combined_df.to_csv(output_file, index=False)  # Save to master file
    return combined_df

In [21]:
# Number of rows that make up one batch.
batch_size = 5

# Idle captures: merge every file into a single CSV and keep the frame.
idle_combined_df = combine_in_batches(
    idle_files,
    True,
    "../data/combined data files/combined_idle_batches.csv",
    batch_size,
)
print("Idle files combined into '../data/combined data files/combined_idle_batches.csv'")

# Attack captures: same treatment, written to their own CSV.
attack_combined_df = combine_in_batches(
    attack_files,
    False,
    "../data/combined data files/combined_attack_batches.csv",
    batch_size,
)
print("Attack files combined into '../data/combined data files/combined_attack_batches.csv'")

Idle files combined into '../data/combined data files/combined_idle_batches.csv'
Attack files combined into '../data/combined data files/combined_attack_batches.csv'


In [23]:
# Report the (rows, columns) shape of each combined frame.
idle_shape = idle_combined_df.shape
attack_shape = attack_combined_df.shape
print(f"Size of idle_combined_df: {idle_shape}")
print(f"Size of attack_combined_df: {attack_shape}")


Size of idle_combined_df: (242670, 21)
Size of attack_combined_df: (260155, 21)


In [31]:
import pandas as pd
import numpy as np
import csv
import sys

def progress_bar(current, total, bar_length=50):
    """Redraw a single-line text progress bar on stdout.

    Parameters
    ----------
    current : int or float
        Amount of work completed so far.
    total : int or float
        Total amount of work. A non-positive total means there is nothing to
        do, so the bar is drawn as complete instead of dividing by zero.
    bar_length : int, optional
        Width of the bar in characters (default 50).

    The completed fraction is clamped to [0, 1] so that an out-of-range
    `current` can neither over-fill nor under-fill the bar.
    """
    fraction = current / total if total > 0 else 1.0
    fraction = min(max(fraction, 0.0), 1.0)

    filled = int(fraction * bar_length)
    bar = '=' * filled + '-' * (bar_length - filled)

    # '\r' moves back to the start of the line so each call overwrites the last.
    sys.stdout.write(f'\r[{bar}] {fraction * 100:.2f}%')
    sys.stdout.flush()

def write_shuffled_batches_to_csv(idle_df, attack_df, batch_size, output_file, seed=None):
    """Write a class-balanced, batch-shuffled, labelled CSV.

    The same number of consecutive `batch_size`-row batches is taken from the
    start of each frame. Idle rows get `Label` 0 and attack rows get `Label` 1.
    The batches are shuffled as whole units (row order inside a batch is kept)
    and written to `output_file` under a single header row.

    Parameters
    ----------
    idle_df, attack_df : pandas.DataFrame
        Source frames; they must have identical columns. Neither is modified.
    batch_size : int
        Rows per batch; must be positive.
    output_file : str
        Path of the CSV file to write.
    seed : int, optional
        Seed for a dedicated random generator, making the batch order
        reproducible. When None, the global NumPy random state is used.

    Raises
    ------
    ValueError
        If `batch_size` is not positive or the two frames' columns differ.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if list(idle_df.columns) != list(attack_df.columns):
        # The header is shared, so differing columns would mislabel attack rows.
        raise ValueError("idle_df and attack_df must have identical columns")

    # Use the same number of batches from each class so the output is balanced.
    batches_per_class = min(len(idle_df), len(attack_df)) // batch_size
    rows_per_class = batches_per_class * batch_size

    # Label each class once; `assign` returns new frames, so the inputs stay untouched.
    idle_labeled = idle_df.iloc[:rows_per_class].assign(Label=0)
    attack_labeled = attack_df.iloc[:rows_per_class].assign(Label=1)

    print("\nCreating batches...")
    batches = []
    for i in range(batches_per_class):
        start = i * batch_size
        stop = start + batch_size
        batches.append(idle_labeled.iloc[start:stop])
        batches.append(attack_labeled.iloc[start:stop])

        # Update progress bar for batch creation
        progress_bar(i + 1, batches_per_class)

    # Shuffle the batches
    print("\n\nShuffling batches...")
    if seed is None:
        np.random.shuffle(batches)
    else:
        order = np.random.default_rng(seed).permutation(len(batches))
        batches = [batches[k] for k in order]

    # Write the shuffled batches to CSV
    print("\nWriting batches to CSV...")
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)

        # Header comes from the labelled frame so it always matches the row width.
        writer.writerow(list(idle_labeled.columns))

        for i, batch in enumerate(batches):
            # itertuples keeps each column's own dtype; `.values` would upcast
            # mixed int/float columns and write the integer label as 0.0 / 1.0.
            writer.writerows(batch.itertuples(index=False, name=None))

            # Update progress bar for writing batches
            progress_bar(i + 1, len(batches))

    print("\nShuffled data successfully written to CSV.")

# Batch length and destination for the shuffled, labelled data set.
batch_size = 5
output_file = "../data/combined data files/shuffled_master_data.csv"

# Interleave, shuffle and stream the batches straight into the CSV file.
write_shuffled_batches_to_csv(
    idle_df=idle_combined_df,
    attack_df=attack_combined_df,
    batch_size=batch_size,
    output_file=output_file,
)



Creating batches...

Shuffling batches...

Writing batches to CSV...
Shuffled data successfully written to CSV.


In [27]:
# Save the master DataFrame to a new CSV file
# NOTE(review): `master_df` is not assigned in any cell visible here, and this
# cell's execution count (27) is lower than the preceding cell's (31), so it
# presumably relied on kernel state from a cell that was later removed or
# edited. Confirm where `master_df` is built; on a fresh kernel this line
# would otherwise raise NameError.
master_df.to_csv("../data/combined data files/master_data.csv", index=False)
print("Master DataFrame saved to '../data/combined data files/master_data.csv'")

Master DataFrame saved to '../data/combined data files/master_data.csv'
