Install xror package

In [None]:
pip install git+https://github.com/MetaGuard/xror.git#egg=xror

Import necessary libraries and set up logging

In [2]:
import pandas as pd
import os
import glob
import logging
from xror import XROR  # Ensure xror module is available in the notebook environment

logging.basicConfig(filename= 'xror_parsing_errors.log', level=logging.INFO, format= '%(asctime)s - %(levelname)s - %(message)s')

# Define the column names based on the number of sensors/measurements
frame_columns = [
    "spawnTime", "saberSpeed", "saberDirX", "saberDirY", "saberDirZ",
    "cutDirDeviation", "cutPointX", "cutPointY", "cutPointZ",
    "cutNormalX", "cutNormalY", "cutNormalZ", "cutDistanceToCenter",
    "cutAngle", "beforeCutRating", "afterCutRating", "noteID",
    "speedOK", "directionOK", "saberTypeOK", "wasCutTooSoon", "saberType"
]

# List of root folders containing subdirectories with XROR files
root_folders = ['chunk1', 'chunk2', 'chunk3']

# Check if CSV files already exist
def csv_files_exist(root_folders):
    for root_folder in root_folders:
        csv_files = glob.glob(os.path.join(root_folder, '**/*.csv'), recursive=True)
        if not csv_files:
            return False
    return True

Process XROR files if CSV files don't exist

In [3]:
def process_xror_files(root_folder_path):
    # Recursively find all .xror files in the root folder and its subdirectories
    xror_files = glob.glob(os.path.join(root_folder_path, '**/*.xror'), recursive=True)

    # Process each XROR file
    for file_path in xror_files:
        try:
            base_name = os.path.splitext(file_path)[0]
            csv_file_path = base_name + '.csv'

            if os.path.exists(csv_file_path):
                logging.info(f"CSV file already exists for {file_path}. Skipping.")
                continue

            with open(file_path, 'rb') as f:
                binary_data = f.read()

            xror_data = XROR.unpack(binary_data)

            if len(xror_data.data['frames'][0]) != len(frame_columns):
                logging.warning(f"Column mismatch in {file_path}. Expected {len(frame_columns)} columns, found {len(xror_data.data['frames'][0])}.")
                continue

            df_frames = pd.DataFrame(xror_data.data['frames'], columns=frame_columns)
            df_frames['directionOK'] = df_frames.apply(lambda row: row['saberDirX'] > 0 and row['saberDirY'] > 0, axis=1)
            df_frames.to_csv(csv_file_path, index=False)
            logging.info(f"DataFrame saved to CSV successfully for file: {file_path}")

        except Exception as e:
            logging.error(f"Error processing {file_path}: {e}")

# Process files in all specified root folders if CSV files don't exist
if not csv_files_exist(root_folders):
    for root_folder in root_folders:
        process_xror_files(root_folder)
else:
    print("CSV files already exist. Skipping XROR processing.")

print the length

In [4]:
# Verify that the CSV files were created successfully
created_csv_files = []
for root_folder in root_folders:
    csv_files = glob.glob(os.path.join(root_folder, '**/*.csv'), recursive=True)
    created_csv_files.extend(csv_files)

print(f"Total CSV files created: {len(created_csv_files)}")

Total CSV files created: 241196


Normalize CSV files if not already normalized

In [None]:
def normalize_csv_files(root_folders):
    for root_folder in root_folders:
        # Recursively find all .csv files in the root folder and its subdirectories
        csv_files = glob.glob(os.path.join(root_folder, '**/*.csv'), recursive=True)
        csv_files = [f for f in csv_files if '_normalized.csv' not in f]
        
        # Process each CSV file
        for file_path in csv_files:
            try:
                # Construct the path for the normalized file
                normalized_file_path = file_path.replace('.csv', '_normalized.csv')
                
                # If normalized file already exists, skip to avoid reprocessing
                if os.path.exists(normalized_file_path):
                    continue
                
                # Load the data
                data = pd.read_csv(file_path)
                
                # Normalize the data
                scaler = StandardScaler()
                numeric_cols = data.select_dtypes(include=['float64', 'int']).columns
                data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
                
                # Save the normalized data back to disk
                data.to_csv(normalized_file_path, index=False)
                print(f"Data normalized and saved successfully for file: {file_path}")

            except Exception as e:
                print(f"Error normalizing file {file_path}: {e}")

# Check if normalization is already done
def normalization_done(root_folders):
    for root_folder in root_folders:
        normalized_files = glob.glob(os.path.join(root_folder, '**/*_normalized.csv'), recursive=True)
        if not normalized_files:
            return False
    return True

if not normalization_done(root_folders):
    normalize_csv_files(root_folders)
else:
    print("CSV files are already normalized. Skipping normalization.")

Print current Dask configuration

In [6]:
import dask

# Print current Dask configuration
print("Current Dask configuration:")
print(dask.config.config)

Current Dask configuration:
{'temporary-directory': None, 'visualization': {'engine': None}, 'tokenize': {'ensure-deterministic': False}, 'dataframe': {'shuffle-compression': None, 'parquet': {'metadata-task-size-local': 512, 'metadata-task-size-remote': 16}}, 'array': {'svg': {'size': 120}, 'slicing': {'split-large-chunks': None}}, 'optimization': {'fuse': {'active': None, 'ave-width': 1, 'max-width': None, 'max-height': inf, 'max-depth-new-edges': None, 'subgraphs': None, 'rename-keys': True}}}


Aggregate and feature engineer data if not already done

In [None]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import os

def aggregate_and_feature_engineer(root_folders):
    file_patterns = [os.path.join(root, '**/*_normalized.csv') for root in root_folders]
    ddf = dd.read_csv(file_patterns, include_path_column=True)
    
    ddf['user_id'] = ddf['path'].str.extract(r'/([^/]+)/[^/]+\.csv$')[0]
    ddf['chunk_id'] = ddf['path'].str.extract(r'/(chunk\d+)/')[0]
    
    ddf = ddf.drop('path', axis=1)
    
    aggregations = {
        'saberSpeed': ['mean', 'std', 'min', 'max'],
        'saberDirX': ['mean', 'std', 'min', 'max'],
        'saberDirY': ['mean', 'std', 'min', 'max'],
        'saberDirZ': ['mean', 'std', 'min', 'max']
    }
    
    grouped_ddf = ddf.groupby(['chunk_id', 'user_id']).agg(aggregations)
    grouped_ddf.columns = ['_'.join(col).strip() for col in grouped_ddf.columns.values]

    with ProgressBar():
        # Save each part after computing
        results = []
        for i, part in enumerate(grouped_ddf.to_delayed()):
            part_df = part.compute()
            if not part_df.empty:
                part_df.to_csv(f'aggregated_and_featured_data_part_{i}.csv', index=False)
                results.append(part_df)
                print(f"Part {i} processed and saved.")

            # Optionally save a checkpoint for every 10000 parts processed
            if i % 10000 == 0 and results:
                dd.concat(results).to_csv('aggregated_and_featured_data_checkpoint.csv', index=False)
                results = []  # Clear the results list to free up memory

        # Final save
        if results:
            dd.concat(results).to_csv('aggregated_and_featured_data_final.csv', index=False)
            print("Final data processing complete and saved.")

# Check if aggregation is already done
def aggregation_done():
    final_file_path = 'aggregated_and_featured_data_final.csv'
    return os.path.exists(final_file_path)

if not aggregation_done():
    aggregate_and_feature_engineer(root_folders)
else:
    print("Data is already aggregated and feature engineered. Skipping this step.")

[########################################] | 100% Completed | 30hr 27m
Part 0 processed and saved.


Reverse Lookup of a user

In [None]:
import pandas as pd
import glob
import os
from dask import delayed, compute
from dask.diagnostics import ProgressBar
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance

# Function to load normalized data and compute features
def compute_features(file_path):
    data = pd.read_csv(file_path)
    user_id = os.path.basename(file_path).split('_')[0]
    chunk_id = os.path.basename(file_path).split('_')[1]
    
    features = {
        'user_id': user_id,
        'chunk_id': chunk_id,
        'saberSpeed_mean': data['saberSpeed'].mean(),
        'saberSpeed_std': data['saberSpeed'].std(),
        'saberSpeed_min': data['saberSpeed'].min(),
        'saberSpeed_max': data['saberSpeed'].max(),
        'saberDirX_mean': data['saberDirX'].mean(),
        'saberDirX_std': data['saberDirX'].std(),
        'saberDirX_min': data['saberDirX'].min(),
        'saberDirX_max': data['saberDirX'].max(),
        'saberDirY_mean': data['saberDirY'].mean(),
        'saberDirY_std': data['saberDirY'].std(),
        'saberDirY_min': data['saberDirY'].min(),
        'saberDirY_max': data['saberDirY'].max(),
        'saberDirZ_mean': data['saberDirZ'].mean(),
        'saberDirZ_std': data['saberDirZ'].std(),
        'saberDirZ_min': data['saberDirZ'].min(),
        'saberDirZ_max': data['saberDirZ'].max(),
    }
    
    return features

# Function to find the closest match using parallel processing
def find_closest_match(given_row, root_folders):
    file_paths = []
    for root_folder in root_folders:
        file_paths.extend(glob.glob(os.path.join(root_folder, '**/*_normalized.csv'), recursive=True))
    
    # Compute features in parallel
    delayed_results = [delayed(compute_features)(file_path) for file_path in file_paths]
    
    with ProgressBar():
        all_features = compute(*delayed_results)
    
    # Convert all_features to DataFrame
    df_features = pd.DataFrame(all_features)
    
    # Compute the distance between the given row and each user's features
    min_distance = float('inf')
    closest_user = None
    for index, row in df_features.iterrows():
        user_features = row.drop(['user_id', 'chunk_id']).values
        dist = distance.euclidean(user_features, given_row)
        if dist < min_distance:
            min_distance = dist
            closest_user = row[['user_id', 'chunk_id']]
    
    return closest_user

# Given row values (as an example)
given_row = [
    -8.631126502990014e-17, 1.000000700190627, -11.797579026170633, 4.126330121837754,
    1.0614630720503478e-16, 1.000000700190627, -111.94098228264774, 3.6317406424051977,
    8.844734305656087e-17, 1.000000700190627, -6.758512157339628, 6.857955228245537,
    -3.6887394447039675e-16, 1.000000700190627, -9.91977109738808, 10.866403895736536
]

# Root folders
root_folders = ['chunk1', 'chunk2', 'chunk3']

# Find the closest match
closest_user = find_closest_match(given_row, root_folders)
print("The closest user match is:", closest_user)

deep motion masking

In [None]:
import pandas as pd
import dask.dataframe as dd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from dask.diagnostics import ProgressBar

# Load the data in chunks using Dask
file_path = 'aggregated_and_featured_data_final.csv'
dask_df = dd.read_csv(file_path)

# Normalize function
def normalize_chunk(chunk):
    scaler = StandardScaler()
    chunk_scaled = pd.DataFrame(scaler.fit_transform(chunk), columns=chunk.columns)
    return chunk_scaled, scaler

# Autoencoder architecture
def build_autoencoder(input_dim):
    encoding_dim = int(input_dim / 2)
    input_layer = Input(shape=(input_dim,))
    encoder = Dense(encoding_dim, activation="relu")(input_layer)
    decoder = Dense(input_dim, activation="sigmoid")(encoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss="mean_squared_error")
    return autoencoder

# Train autoencoder on a subset of the data (initial training)
sample_df = dask_df.sample(frac=0.1).compute()  # Take a sample of 10% of the data for training
features = sample_df.drop(['user_id', 'chunk_id'], axis=1)
features_scaled, scaler = normalize_chunk(features)

input_dim = features_scaled.shape[1]
autoencoder = build_autoencoder(input_dim)
autoencoder.fit(features_scaled, features_scaled, epochs=50, batch_size=256, shuffle=True, validation_split=0.2)

# Function to process each chunk
def process_chunk(chunk, autoencoder, scaler, chunk_id):
    user_ids = chunk['user_id']
    chunk_ids = chunk['chunk_id']
    chunk = chunk.drop(['user_id', 'chunk_id'], axis=1)
    
    chunk_scaled = scaler.transform(chunk)
    encoded_data = autoencoder.predict(chunk_scaled)
    
    masked_chunk = pd.DataFrame(encoded_data, columns=chunk.columns)
    masked_chunk['user_id'] = user_ids
    masked_chunk['chunk_id'] = chunk_ids
    
    masked_chunk.to_csv(f'masked_data_chunk_{chunk_id}.csv', index=False)
    print(f"Processed and saved chunk {chunk_id}")

# Process data in chunks
def process_data_in_chunks(dask_df, autoencoder, scaler):
    with ProgressBar():
        chunk_id = 0
        for chunk in dask_df.to_delayed():
            chunk_df = chunk.compute()
            process_chunk(chunk_df, autoencoder, scaler, chunk_id)
            chunk_id += 1

process_data_in_chunks(dask_df, autoencoder, scaler)
print("Deep motion masking applied and data saved in chunks.")

GAN Architecture

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LeakyReLU, BatchNormalization
from tensorflow.keras.optimizers import Adam
import pandas as pd
import dask.dataframe as dd
from sklearn.preprocessing import StandardScaler
from dask.diagnostics import ProgressBar

# Generator model
def build_generator(input_dim, output_dim):
    model = tf.keras.Sequential()
    model.add(Dense(128, input_dim=input_dim))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(256))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(output_dim, activation='tanh'))
    return model

# Discriminator model
def build_discriminator(input_dim):
    model = tf.keras.Sequential()
    model.add(Dense(256, input_dim=input_dim))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(128))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(1, activation='sigmoid'))
    return model

# Combined GAN model
def build_gan(generator, discriminator):
    discriminator.trainable = False
    gan_input = Input(shape=(generator.input_shape[1],))
    x = generator(gan_input)
    gan_output = discriminator(x)
    gan = Model(gan_input, gan_output)
    gan.compile(optimizer=Adam(0.0002, 0.5), loss='binary_crossentropy')
    return gan

# Normalize function
def normalize_chunk(chunk):
    scaler = StandardScaler()
    chunk_scaled = pd.DataFrame(scaler.fit_transform(chunk), columns=chunk.columns)
    return chunk_scaled, scaler

# Load the data in chunks using Dask
file_path = 'aggregated_and_featured_data_final.csv'
dask_df = dd.read_csv(file_path)

# Sample a subset of the data for training
sample_df = dask_df.sample(frac=0.1).compute()
features = sample_df.drop(['user_id', 'chunk_id'], axis=1)
features_scaled, scaler = normalize_chunk(features)

input_dim = features_scaled.shape[1]

# Build and compile the GAN
generator = build_generator(input_dim=input_dim, output_dim=input_dim)
discriminator = build_discriminator(input_dim=input_dim)
discriminator.compile(optimizer=Adam(0.0002, 0.5), loss='binary_crossentropy', metrics=['accuracy'])
gan = build_gan(generator, discriminator)

# Train the GAN
epochs = 10000
batch_size = 256

# Training loop
for epoch in range(epochs):
    # Select a random half batch of data
    idx = np.random.randint(0, features_scaled.shape[0], batch_size)
    real_data = features_scaled[idx]

    # Generate a half batch of new data
    noise = np.random.normal(0, 1, (batch_size, input_dim))
    generated_data = generator.predict(noise)

    # Train the discriminator
    d_loss_real = discriminator.train_on_batch(real_data, np.ones((batch_size, 1)))
    d_loss_fake = discriminator.train_on_batch(generated_data, np.zeros((batch_size, 1)))
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    # Train the generator
    noise = np.random.normal(0, 1, (batch_size, input_dim))
    g_loss = gan.train_on_batch(noise, np.ones((batch_size, 1)))

    # Print the progress
    if epoch % 1000 == 0:
        print(f"{epoch} [D loss: {d_loss[0]}, acc.: {100*d_loss[1]}%] [G loss: {g_loss}]")

# Function to process each chunk
def process_chunk(chunk, generator, scaler, chunk_id):
    user_ids = chunk['user_id']
    chunk_ids = chunk['chunk_id']
    chunk = chunk.drop(['user_id', 'chunk_id'], axis=1)
    
    chunk_scaled = scaler.transform(chunk)
    noise = np.random.normal(0, 1, (chunk_scaled.shape[0], input_dim))
    generated_data = generator.predict(noise)
    
    anonymized_chunk = pd.DataFrame(generated_data, columns=chunk.columns)
    anonymized_chunk['user_id'] = user_ids
    anonymized_chunk['chunk_id'] = chunk_ids
    
    anonymized_chunk.to_csv(f'anonymized_data_chunk_{chunk_id}.csv', index=False)
    print(f"Processed and saved chunk {chunk_id}")

# Process data in chunks
def process_data_in_chunks(dask_df, generator, scaler):
    with ProgressBar():
        chunk_id = 0
        for chunk in dask_df.to_delayed():
            chunk_df = chunk.compute()
            process_chunk(chunk_df, generator, scaler, chunk_id)
            chunk_id += 1

process_data_in_chunks(dask_df, generator, scaler)
print("GAN-based anonymization applied and data saved in chunks.")