In [1]:
pip install git+https://github.com/MetaGuard/xror.git#egg=xror

Collecting xror
  Cloning https://github.com/MetaGuard/xror.git to /tmp/pip-install-bcdstqwa/xror_9b00d4a304474e7b92b240e797c0649f
  Running command git clone --filter=blob:none --quiet https://github.com/MetaGuard/xror.git /tmp/pip-install-bcdstqwa/xror_9b00d4a304474e7b92b240e797c0649f
  Resolved https://github.com/MetaGuard/xror.git to commit b2177253d97066038ab52bf59714f1966ece903e
  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import os
import glob
import logging
from xror import XROR  # Ensure xror module is available in the notebook environment

logging.basicConfig(filename= 'xror_parsing_errors.log', level=logging.INFO, format= '%(asctime)s - %(levelname)s - %(message)s')
# Define the column names based on the number of sensors/measurements
frame_columns = [
    "spawnTime", "saberSpeed", "saberDirX", "saberDirY", "saberDirZ",
    "cutDirDeviation", "cutPointX", "cutPointY", "cutPointZ",
    "cutNormalX", "cutNormalY", "cutNormalZ", "cutDistanceToCenter",
    "cutAngle", "beforeCutRating", "afterCutRating", "noteID",
    "speedOK", "directionOK", "saberTypeOK", "wasCutTooSoon", "saberType"
]

# List of root folders containing subdirectories with XROR files
root_folders = ['chunk1', 'chunk2', 'chunk3']

In [3]:
def process_xror_files(root_folder_path):
    # Recursively find all .xror files in the root folder and its subdirectories
    xror_files = glob.glob(os.path.join(root_folder_path, '**/*.xror'), recursive=True)

    # Process each XROR file
    for file_path in xror_files:
        try:
            base_name = os.path.splitext(file_path)[0]
            csv_file_path = base_name + '.csv'

            if os.path.exists(csv_file_path):
                logging.info(f"CSV file already exists for {file_path}. Skipping.")
                continue

            with open(file_path, 'rb') as f:
                binary_data = f.read()

            xror_data = XROR.unpack(binary_data)

            if len(xror_data.data['frames'][0]) != len(frame_columns):
                logging.warning(f"Column mismatch in {file_path}. Expected {len(frame_columns)} columns, found {len(xror_data.data['frames'][0])}.")
                continue

            df_frames = pd.DataFrame(xror_data.data['frames'], columns=frame_columns)
            df_frames['directionOK'] = df_frames.apply(lambda row: row['saberDirX'] > 0 and row['saberDirY'] > 0, axis=1)
            df_frames.to_csv(csv_file_path, index=False)
            logging.info(f"DataFrame saved to CSV successfully for file: {file_path}")

        except Exception as e:
            logging.error(f"Error processing {file_path}: {e}")

# Process files in all specified root folders
for root_folder in root_folders:
    process_xror_files(root_folder)

In [4]:
# Verify that the CSV files were created successfully
created_csv_files = []
for root_folder in root_folders:
    csv_files = glob.glob(os.path.join(root_folder, '**/*.csv'), recursive=True)
    created_csv_files.extend(csv_files)

print(f"Total CSV files created: {len(created_csv_files)}")

Total CSV files created: 241196


['chunk1/0062dfc1-f071-4690-aeb1-8b1a72fb18a4/da3b7041-59d3-42ef-88b9-46a224fc9c44_normalized.csv',
 'chunk1/0062dfc1-f071-4690-aeb1-8b1a72fb18a4/b7cb0530-7a3b-4c38-a87c-8fd5d90ddc03.csv',
 'chunk1/0062dfc1-f071-4690-aeb1-8b1a72fb18a4/10db8741-e69e-4258-9889-3c156483c3c3.csv',
 'chunk1/0062dfc1-f071-4690-aeb1-8b1a72fb18a4/8bf7cba6-6c70-4cff-952a-d619817bc604.csv',
 'chunk1/0062dfc1-f071-4690-aeb1-8b1a72fb18a4/19585113-7981-4b0b-ad0d-833a5cb3f781.csv',
 'chunk1/0062dfc1-f071-4690-aeb1-8b1a72fb18a4/75bc5ed3-e258-4daf-935f-fcc9d556cf06_normalized.csv',
 'chunk1/0062dfc1-f071-4690-aeb1-8b1a72fb18a4/d363d14d-5df8-4bb3-b4fb-10aebfcf831b.csv',
 'chunk1/0062dfc1-f071-4690-aeb1-8b1a72fb18a4/23c2a162-a4ab-46f5-b9a0-1886aef35f5f.csv',
 'chunk1/0062dfc1-f071-4690-aeb1-8b1a72fb18a4/42cf12a6-16b8-4931-b936-16db720ab32d.csv',
 'chunk1/0062dfc1-f071-4690-aeb1-8b1a72fb18a4/ea5c169f-a403-4abc-beec-f8114386f8d8.csv']

In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os
import glob

def normalize_csv_files(root_folders):
    for root_folder in root_folders:
        # Recursively find all .csv files in the root folder and its subdirectories
        csv_files = glob.glob(os.path.join(root_folder, '**/*.csv'), recursive=True)
        csv_files = [f for f in csv_files if '_normalized.csv' not in f]
        
        # Process each CSV file
        for file_path in csv_files:
            try:
                # Construct the path for the normalized file
                normalized_file_path = file_path.replace('.csv', '_normalized.csv')
                
                # If normalized file already exists, skip to avoid reprocessing
                if os.path.exists(normalized_file_path):
                    #print(f"Normalized file already exists for {file_path}. Skipping.")
                    continue
                
                # Load the data
                data = pd.read_csv(file_path)
                
                # Normalize the data
                scaler = StandardScaler()
                numeric_cols = data.select_dtypes(include=['float64', 'int']).columns
                data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
                
                # Save the normalized data back to disk
                data.to_csv(normalized_file_path, index=False)
                print(f"Data normalized and saved successfully for file: {file_path}")

            except Exception as e:
                print(f"Error normalizing file {file_path}: {e}")

# Specify the root folder paths where your CSV files are located
root_folders = ['chunk1', 'chunk2', 'chunk3']  # Update this path to where your CSV files are stored
normalize_csv_files(root_folders)

In [6]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import os
import glob

def aggregate_and_feature_engineer(root_folders):
    # Create a list to hold all file patterns
    file_patterns = [os.path.join(root, '**/*_normalized.csv') for root in root_folders]
    
    # Read data from all chunks using Dask
    ddf = dd.read_csv(file_patterns, include_path_column=True)
    
    # Extract user and chunk identifiers from the file path
    ddf['user_id'] = ddf['path'].apply(lambda x: x.split('/')[2], meta=('x', 'str'))  # Adjust indexing based on your path structure
    ddf['chunk_id'] = ddf['path'].apply(lambda x: x.split('/')[1], meta=('x', 'str'))  # Adjust indexing based on your path structure
    
    # Drop the path column if no longer needed
    ddf = ddf.drop('path', axis=1)
    
    # Perform feature engineering
    # Calculate mean, std, min, and max for saberSpeed, saberDirX, saberDirY for each user in each chunk
    aggregation = {
        'saberSpeed': ['mean', 'std', 'min', 'max'],
        'saberDirX': ['mean', 'std', 'min', 'max'],
        'saberDirY': ['mean', 'std', 'min', 'max'],
    }

    # Perform the aggregation per user and chunk
    grouped_ddf = ddf.groupby(['chunk_id', 'user_id']).agg(aggregations)

    # Flatten the multi-level column index
    grouped_ddf.columns = ['_'.join(col) for col in grouped_ddf.columns.values]

    # Compute a small sample to print
    sample_df = grouped_ddf.head()
    print(sample_df)

    # Compute and save the results
    with ProgressBar():
        result_df = grouped_ddf.compute()
        result_df.to_csv('aggregated_and_featured_data.csv', index=False)

    print("Data processing, features engineered, and results saved.")

# Specify the root folder path where your CSV files are located
root_folders = ['chunk1', 'chunk2', 'chunk3']
aggregate_and_feature_engineer(root_folders)

[########################################] | 100% Completed | 33hr 55m
Data processed, features engineered, and results saved.
