# Preprocess the raw dataset

The dataset is available on Kaggle: https://www.kaggle.com/datasets/finalboolean/aim-dataset-for-minecraft

In [None]:
import pandas as pd
import gc
from os import listdir
from os.path import isfile, join
import tqdm
import sys

In [None]:
# Load the raw cheater and legit CSV files into DataFrames
cheater_data, legit_data = (
    pd.read_csv("data/cheater.csv"),
    pd.read_csv("data/legit.csv"),
)

In [None]:
# Sort the datasets into their groups with label
# Then free the memory associated with the dataframes
def _group_rows(frame, label):
    """Bucket the rows of ``frame`` by the value in their first column.

    Args:
        frame: DataFrame whose first column identifies the group of each row.
        label: Value stored alongside every group's rows.

    Returns:
        A dict mapping each distinct first-column value to a
        ``(rows, label)`` tuple, where ``rows`` lists the remaining columns
        of every matching row in the order they appear in ``frame``.
    """
    grouped = dict()
    for row in frame.values:
        key = row[0]
        entry = grouped.get(key)
        if entry is None:
            # First row seen for this group: start its bucket
            entry = grouped[key] = ([], label)
        entry[0].append(row[1:])
    return grouped


cheat_groups = _group_rows(cheater_data, 1)
print("Finished sorting cheat_groups")

legit_groups = _group_rows(legit_data, 0)
print("Finished sorting legit_groups")

# This will free the memory from the two large dataframes
del cheater_data
del legit_data
gc.collect()
# Rebind both names to empty DataFrames so the names stay defined
cheater_data = pd.DataFrame()
legit_data = pd.DataFrame()

In [None]:
# Write the data to the processed_data folder
import os

# makedirs creates the missing "processed_data" parent as well, and
# exist_ok=True keeps this cell from raising FileExistsError when it is
# re-run or when the folders are already present
os.makedirs("./processed_data/cheater", exist_ok=True)
os.makedirs("./processed_data/legit", exist_ok=True)

def write_groups(groups, category, time_index=13):
    """Write each group to ``./processed_data/{category}/{group}.csv``.

    Args:
        groups: Mapping from group id to a tuple whose first element is the
            list of sample rows for that group. Each list is sorted in place
            by its timestamp before being written.
        category: Name of the sub-folder of ``./processed_data`` that
            receives the files; it must already exist.
        time_index: Position of the timestamp within a sample row. The
            default of 13 lines up with the ``time`` column of the header
            written below.
    """
    header = "yaw,pitch,delta_yaw,delta_pitch,accel_yaw,accel_pitch,target_x,target_y,target_z,position_x,position_y,position_z,sensitivity,time,new_sequence\n"
    with tqdm.tqdm(total=len(groups), file=sys.stdout) as pbar:
        pbar.set_description(f'Processing: {category}')
        for group, entry in groups.items():
            samples = entry[0]

            # Sort the rows by time stamp to ensure the samples are in order
            samples.sort(key=lambda row: row[time_index])

            # The context manager closes the file even if a write fails
            with open(f"./processed_data/{category}/{group}.csv", "w") as f:
                # Write the csv header, then every sample as one csv line
                f.write(header)
                f.writelines(
                    ','.join(map(str, sample)) + "\n" for sample in samples
                )
            pbar.update(1)
    
# Export the cheater groups first, then the legit groups
for _groups, _category in ((cheat_groups, "cheater"), (legit_groups, "legit")):
    write_groups(_groups, _category)