In [1]:
import json
import random
from sklearn.model_selection import train_test_split

def load_data(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.
        An empty (or all-blank) file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (e.g. a trailing empty line) carry no record; passing
        # them to json.loads would raise, so they are skipped.
        return [json.loads(line) for line in stripped_lines if line]

def modify_labels_and_remove_key(data):
    """Rewrite numeric labels as strings and drop the 'src' key, in place.

    Each item must have a 'label' key. A label equal to 0 becomes 'machine'
    and a label equal to 1 becomes 'human'; any other label is left as is.
    The 'src' key is removed when present. Nothing is returned.
    """
    # (numeric label, replacement string) pairs, checked in this order
    renames = ((0, 'machine'), (1, 'human'))

    for record in data:
        current = record['label']
        for numeric, text in renames:
            if current == numeric:
                record['label'] = text
                break

        # Discard 'src' without failing when the key is absent
        record.pop('src', None)

def add_ids_to_data(data):
    """Assign a sequential 'id' to every item in place, counting from 1.

    An existing 'id' value is overwritten. Nothing is returned.
    """
    next_id = 1
    for record in data:
        record['id'] = next_id
        next_id += 1

def downsize_data(data, label0_count=300, label1_count=0):
    """Return a shuffled random subset with a capped number of items per label.

    Args:
        data: Items carrying a string 'label' of 'machine' or 'human'; items
            with any other label are never selected.
        label0_count: Maximum number of 'machine' items to keep.
        label1_count: Maximum number of 'human' items to keep.

    Returns:
        A new list holding the sampled items in random order. If a group has
        fewer items than requested, the whole group is kept.
    """
    # Bucket the items by their string label
    machine_items = []
    human_items = []
    for record in data:
        if record['label'] == 'machine':
            machine_items.append(record)
        if record['label'] == 'human':
            human_items.append(record)

    # Never ask for more items than a bucket actually holds
    keep_machine = min(label0_count, len(machine_items))
    keep_human = min(label1_count, len(human_items))

    # Sample 'machine' first, then 'human', then shuffle the merged result
    subset = random.sample(machine_items, keep_machine)
    subset.extend(random.sample(human_items, keep_human))
    random.shuffle(subset)
    return subset

def split_data(data, train_size=0.7, test_size=0.15, validation_size=0.15, random_state=None):
    """Randomly split `data` into train, test and validation subsets.

    Args:
        data: Sequence of items to split.
        train_size: Fraction of `data` assigned to the training subset.
        test_size: Relative weight of the test subset within the remainder.
        validation_size: Relative weight of the validation subset within the
            remainder.
        random_state: Optional seed forwarded to both `train_test_split`
            calls for reproducible splits; None keeps the unseeded behavior.

    Returns:
        A `(train_data, test_data, val_data)` tuple.
    """
    train_data, holdout = train_test_split(
        data, train_size=train_size, random_state=random_state
    )

    # Share of the holdout that belongs to the test subset.
    test_share = test_size / (test_size + validation_size)

    # train_test_split returns (first, second) where `train_size` sizes the
    # FIRST part. The first part is bound to test_data, so the test share is
    # passed as train_size; passing it as test_size would swap the test and
    # validation proportions whenever the two sizes differ.
    test_data, val_data = train_test_split(
        holdout, train_size=test_share, random_state=random_state
    )
    return train_data, test_data, val_data

def save_data(data, filename):
    """Write `data` to `filename` as JSON Lines.

    Each item is serialized with json.dumps and written on its own line.
    An existing file is overwritten. Nothing is returned.
    """
    with open(filename, 'w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in data)

def process_dataset(file_path, output_dir='../DFTData'):
    """Run the full preparation pipeline on one JSONL file.

    Loads the records, converts labels to strings and drops 'src', assigns
    sequential ids, splits the records into train/test/validation subsets
    and writes each subset to its own JSONL file.

    Args:
        file_path: Path of the input JSONL file.
        output_dir: Directory receiving the three output files. Defaults to
            '../DFTData', the location that was previously hard-coded.
    """
    data = load_data(file_path)
    modify_labels_and_remove_key(data)

    # NOTE: downsize_data() is not applied here, so every record is kept.

    # Ids are assigned before splitting, so each record's id reflects its
    # position in the input file rather than in a subset.
    add_ids_to_data(data)

    train_data, test_data, val_data = split_data(data)

    # Output file name -> subset written to it
    outputs = (
        ('train_data_perturbed.jsonl', train_data),
        ('test_data_perturbed.jsonl', test_data),
        ('val_data_perturbed.jsonl', val_data),
    )
    for name, subset in outputs:
        save_data(subset, f"{output_dir}/{name}")

    print("Data processing complete and files saved.")

# Run the pipeline on the input file below; replace the path to process a
# different JSONL file. The train/test/validation files are written by
# process_dataset itself.
process_dataset('../DFTData/Perturbed_Train_Data.jsonl')

Data processing complete and files saved.
