# Part 1A Data Preparation and Setup

In [None]:
# Install required packages
!pip install polars==0.20.31
!pip install tqdm



In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import os
import json
import pandas as pd
import polars as pl
from tqdm import tqdm
import gc
import zipfile

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Configuration

In [None]:
# Configuration - Update these paths according to your Google Drive structure
class DataConfig:
    """Filesystem layout used by this data-preparation notebook.

    Every location is derived from ``DRIVE_BASE_PATH``. The extraction, data
    and output directories are created while the class body is evaluated, so
    they exist as soon as this cell has run.
    """

    # Google Drive folder that holds the uploaded OTTO dataset zip file
    DRIVE_BASE_PATH = '/content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content'

    # Raw inputs: the uploaded archive and the folder it is unpacked into
    DATASET_ZIP_PATH = f'{DRIVE_BASE_PATH}/otto-recommender-system.zip'
    EXTRACT_PATH = f'{DRIVE_BASE_PATH}/otto-data-extracted/'

    # Outputs: converted data files and everything else this notebook produces
    DATA_PATH = f'{DRIVE_BASE_PATH}/otto-data/'
    OUTPUT_PATH = f'{DRIVE_BASE_PATH}/otto-output/'

    # Make sure every working directory exists (no-op when already present)
    for _directory in (EXTRACT_PATH, DATA_PATH, OUTPUT_PATH):
        os.makedirs(_directory, exist_ok=True)
    del _directory  # loop helper only; keep it out of the class namespace

config = DataConfig()

# Echo the resolved locations so a wrong Drive layout is spotted immediately
print("Configuration complete")
for label, location in (
    ("Dataset zip path", config.DATASET_ZIP_PATH),
    ("Extract path", config.EXTRACT_PATH),
    ("Data path", config.DATA_PATH),
    ("Output path", config.OUTPUT_PATH),
):
    print(f"{label}: {location}")

Configuration complete
Dataset zip path: /content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-recommender-system.zip
Extract path: /content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-data-extracted/
Data path: /content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-data/
Output path: /content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-output/


## Dataset Download and Extraction

In [None]:
def download_and_extract_data():
    """
    Make sure the OTTO JSONL files are available under ``config.EXTRACT_PATH``.

    Despite its name this function does not download anything: the archive
    must already be present at ``config.DATASET_ZIP_PATH``. If train.jsonl and
    test.jsonl are not both present in the extract folder, the archive is
    unpacked there and the two files are checked again.

    Returns:
        bool: True when both train.jsonl and test.jsonl exist in the extract
        folder; False when the archive is missing, extraction raised, or the
        expected files are still absent after extraction.
    """
    print("Checking if dataset exists...")

    # Check if zip file exists
    if not os.path.exists(config.DATASET_ZIP_PATH):
        print(f"❌ Dataset not found at {config.DATASET_ZIP_PATH}")
        print("Please upload the otto-recommender-system.zip file to your Google Drive")
        print("You can download it from: https://www.kaggle.com/competitions/otto-recommender-system/data")
        return False

    print(f"✅ Dataset found at {config.DATASET_ZIP_PATH}")

    # Files the rest of the notebook expects to find after extraction
    expected_files = [
        os.path.join(config.EXTRACT_PATH, name)
        for name in ('train.jsonl', 'test.jsonl')
    ]

    # Extract if not already extracted
    if all(os.path.exists(path) for path in expected_files):
        print("✅ Dataset already extracted")
        return True

    print("Extracting dataset...")
    try:
        with zipfile.ZipFile(config.DATASET_ZIP_PATH, 'r') as zip_ref:
            zip_ref.extractall(config.EXTRACT_PATH)
    except Exception as e:
        print(f"❌ Error extracting dataset: {e}")
        return False

    # extractall() succeeding does not prove the files landed where the
    # conversion step looks for them (e.g. the archive may nest them in a
    # sub-folder), so verify before reporting success.
    missing_files = [path for path in expected_files if not os.path.exists(path)]
    if missing_files:
        print("❌ Extraction finished but expected files are missing:")
        for path in missing_files:
            print(f"   {path}")
        return False

    print("✅ Dataset extracted successfully")
    return True

# Run the extraction step; nothing below can work without the raw JSONL files
extraction_success = download_and_extract_data()

if not extraction_success:
    failure_message = "Failed to extract dataset. Please check the file path and try again."
    raise RuntimeError(failure_message)

Checking if dataset exists...
✅ Dataset found at /content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-recommender-system.zip
✅ Dataset already extracted


## Data Format Conversion and Optimization

In [None]:
def _parse_session_events(line):
    """Flatten one JSONL session record into a list of per-event row dicts.

    The whole record is parsed before anything is returned, so a malformed
    record contributes no rows at all. Raises ``json.JSONDecodeError`` for
    invalid JSON and ``KeyError`` when a required field is absent.
    """
    record = json.loads(line.strip())
    session_id = record['session']
    return [
        {
            'session': session_id,
            'aid': event['aid'],
            'ts': event['ts'],
            'type': event['type']
        }
        for event in record['events']
    ]


def _write_parquet_chunk(rows, chunk_file):
    """Cast one batch of event rows to the compact dtypes and write it to ``chunk_file``."""
    chunk_df = pl.DataFrame(rows).with_columns([
        pl.col("session").cast(pl.UInt32),
        pl.col("aid").cast(pl.UInt32),
        pl.col("ts").cast(pl.UInt64),
        pl.col("type").cast(pl.Categorical)
    ])
    chunk_df.write_parquet(chunk_file)

    # Release the frame before the caller starts filling the next chunk
    del chunk_df
    gc.collect()


def _discard_file(path):
    """Delete ``path`` if it exists; report, but do not raise, other OS errors."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as e:
        print(f"  Warning: could not remove {path}: {e}")


def convert_jsonl_to_optimized_format_fixed(input_path: str, output_path: str, dataset_name: str):
    """
    Convert a session-per-line JSONL file into a flat, sorted event table.

    The input is read line by line; every event becomes one row with the
    columns session, aid, ts and type. Rows are written to temporary parquet
    chunks of ``CHUNK_SIZE`` rows, the chunks are concatenated, sorted by
    (session, ts) and saved twice: as parquet (``output_path`` with '.jsonl'
    replaced by '.parquet') and as NDJSON at ``output_path``.

    Args:
        input_path: Path of the raw JSONL file (one session record per line).
        output_path: Destination of the flat NDJSON file; the parquet path is
            derived from it.
        dataset_name: Label used in progress messages and in the name of the
            temporary chunk directory.

    Returns:
        bool: True when the parquet output already exists or the conversion
        finished; False when the input is missing, contains no valid events,
        or any step raised.

    Notes:
        - Records that cannot be parsed are skipped with a warning and
          contribute no rows. Failures while writing a chunk are NOT treated
          as bad input lines: they abort the conversion.
        - Both outputs are written under a '.partial' name and renamed only
          after both writes succeeded, with the parquet file renamed last, so
          the "already converted" check never sees a half-finished result.
        - Temporary chunk files are removed on success and on failure.
    """
    import tempfile  # used only here; keeps the notebook's import cell unchanged

    print(f"Converting {dataset_name} from JSONL to optimized format (Memory Optimized)...")

    if not os.path.exists(input_path):
        print(f"❌ Input file not found: {input_path}")
        return False

    # Check if output already exists
    parquet_path = output_path.replace('.jsonl', '.parquet')
    if os.path.exists(parquet_path):
        print(f"✅ {dataset_name} already converted")
        return True

    CHUNK_SIZE = 500000  # rows per temporary parquet chunk
    partial_parquet = parquet_path + '.partial'
    partial_jsonl = output_path + '.partial'
    temp_prefix = f"otto_chunks_{dataset_name.lower().replace(' ', '_')}_"

    try:
        print(f"Processing {input_path} in memory-efficient chunks...")

        # The context manager removes the chunk directory on every exit path
        with tempfile.TemporaryDirectory(prefix=temp_prefix) as temp_dir:
            chunk_files = []
            chunk_rows = []
            total_processed = 0

            with open(input_path, 'r') as f:
                for i, line in enumerate(f):
                    # Only the parsing is guarded per line; write errors must propagate
                    try:
                        session_rows = _parse_session_events(line)
                    except json.JSONDecodeError:
                        print(f"  Warning: Skipping invalid JSON at line {i+1}")
                        continue
                    except KeyError as e:
                        print(f"  Warning: Missing key at line {i+1}: {e}")
                        continue
                    except Exception as e:
                        print(f"  Warning: Error processing line {i+1}: {e}")
                        continue

                    chunk_rows.extend(session_rows)

                    # Flush full chunks; a very long session may fill several
                    while len(chunk_rows) >= CHUNK_SIZE:
                        chunk_file = os.path.join(temp_dir, f"chunk_{len(chunk_files)}.parquet")
                        _write_parquet_chunk(chunk_rows[:CHUNK_SIZE], chunk_file)
                        chunk_files.append(chunk_file)
                        del chunk_rows[:CHUNK_SIZE]

                        total_processed += CHUNK_SIZE
                        print(f"  Processed chunk {len(chunk_files)}: {total_processed:,} total rows")

                    # Progress update every 50k lines
                    if i > 0 and i % 50000 == 0:
                        print(f"  Read {i:,} lines from file...")

            # Process remaining rows
            if chunk_rows:
                chunk_file = os.path.join(temp_dir, f"chunk_{len(chunk_files)}.parquet")
                _write_parquet_chunk(chunk_rows, chunk_file)
                chunk_files.append(chunk_file)

                total_processed += len(chunk_rows)
                print(f"  Final chunk {len(chunk_files)}: {total_processed:,} total rows")
                chunk_rows = []

            if not chunk_files:
                print(f"❌ No valid events found in {input_path}")
                return False

            print(f"Created {len(chunk_files)} chunk files. Now combining...")

            # Load every chunk, then concatenate once instead of re-concatenating
            # the growing frame for each additional chunk.
            chunk_frames = []
            for i, chunk_file in enumerate(chunk_files):
                print(f"  Combining chunk {i+1}/{len(chunk_files)}")
                chunk_frames.append(pl.read_parquet(chunk_file))

            combined_df = pl.concat(chunk_frames)
            del chunk_frames
            gc.collect()

            print("Sorting and finalizing DataFrame...")
            # Sort for better compression and access patterns
            combined_df = combined_df.sort(["session", "ts"])

            print(f"Final DataFrame shape: {combined_df.shape}")

            # Write under temporary names first, publish only when both succeeded
            print(f"Saving optimized data to {parquet_path}")
            combined_df.write_parquet(partial_parquet)

            # Also save as JSONL for compatibility
            print(f"Saving as JSONL to {output_path}")
            combined_df.write_ndjson(partial_jsonl)

            os.replace(partial_jsonl, output_path)
            # Renamed last: the parquet file is what marks the dataset as converted
            os.replace(partial_parquet, parquet_path)

            # Print dataset statistics
            print(f"\n{dataset_name} Statistics:")
            print(f"  Total events: {len(combined_df):,}")
            print(f"  Unique sessions: {combined_df.select(pl.col('session')).n_unique():,}")
            print(f"  Unique items: {combined_df.select(pl.col('aid')).n_unique():,}")
            print(f"  Event types: {combined_df.select(pl.col('type')).unique().to_series().to_list()}")

            # pl.len() replaces the deprecated pl.count()
            type_counts = combined_df.group_by("type").agg(pl.len().alias("count"))
            print("  Event distribution:")
            for event_type, count in type_counts.iter_rows():
                percentage = count / len(combined_df) * 100
                print(f"    {event_type}: {count:,} ({percentage:.2f}%)")

            print("Cleaning up temporary chunk files...")

        print(f"✅ {dataset_name} conversion completed successfully!")
        return True

    except Exception as e:
        print(f"❌ Error converting {dataset_name}: {e}")
        return False

    finally:
        # After a successful run these no longer exist; after a failure they
        # are incomplete and must not be left behind.
        _discard_file(partial_parquet)
        _discard_file(partial_jsonl)

# Fallback converter: parses the input lazily with a generator and builds the
# table from small batches. The converted batches are still held in memory
# until the final sort, so this is an alternative strategy, not a guarantee
# of lower peak memory.
def convert_jsonl_streaming_approach(input_path: str, output_path: str, dataset_name: str):
    """
    Convert a session-per-line JSONL file into a flat, sorted event table.

    Events are produced by a generator, collected into batches of
    ``BATCH_SIZE`` rows, converted to typed frames and periodically merged.
    The final frame is sorted by (session, ts) and saved as parquet
    (``output_path`` with '.jsonl' replaced by '.parquet') and as NDJSON at
    ``output_path``.

    Args:
        input_path: Path of the raw JSONL file (one session record per line).
        output_path: Destination of the flat NDJSON file; the parquet path is
            derived from it.
        dataset_name: Label used in progress messages.

    Returns:
        bool: True when the parquet output already exists or the conversion
        finished; False when the input is missing, contains no valid events,
        or any step raised.

    Notes:
        - Records with invalid JSON or missing keys are skipped as a whole and
          counted; the count is reported at the end.
        - Both outputs are written under a '.partial' name and renamed only
          after both writes succeeded, with the parquet file renamed last.
    """
    print(f"Converting {dataset_name} using streaming approach...")

    if not os.path.exists(input_path):
        print(f"❌ Input file not found: {input_path}")
        return False

    parquet_path = output_path.replace('.jsonl', '.parquet')
    if os.path.exists(parquet_path):
        print(f"✅ {dataset_name} already converted")
        return True

    BATCH_SIZE = 5000                  # rows per intermediate frame
    PROGRESS_EVERY = 100 * BATCH_SIZE  # report after this many converted events
    MAX_PENDING_FRAMES = 200           # merge intermediate frames beyond this count
    partial_parquet = parquet_path + '.partial'
    partial_jsonl = output_path + '.partial'
    skipped_lines = 0

    def event_generator():
        """Yield one row dict per event, skipping records that cannot be parsed."""
        nonlocal skipped_lines
        with open(input_path, 'r') as f:
            for i, line in enumerate(f):
                try:
                    data = json.loads(line.strip())
                    session_id = data['session']
                    # Build the full list first so a broken record yields nothing
                    session_events = [
                        {
                            'session': session_id,
                            'aid': event['aid'],
                            'ts': event['ts'],
                            'type': event['type']
                        }
                        for event in data['events']
                    ]
                except (json.JSONDecodeError, KeyError):
                    skipped_lines += 1
                    session_events = []

                yield from session_events

                if i % 100000 == 0 and i > 0:
                    print(f"  Processed {i:,} lines...")

    def to_typed_frame(rows):
        """Convert a batch of row dicts into a frame with the compact dtypes."""
        return pl.DataFrame(rows).with_columns([
            pl.col("session").cast(pl.UInt32),
            pl.col("aid").cast(pl.UInt32),
            pl.col("ts").cast(pl.UInt64),
            pl.col("type").cast(pl.Categorical)
        ])

    try:
        batch = []
        pending_frames = []
        total_events = 0  # counted independently of how frames get merged

        print("Processing events in small batches...")
        for event in event_generator():
            batch.append(event)

            if len(batch) >= BATCH_SIZE:
                pending_frames.append(to_typed_frame(batch))
                total_events += len(batch)
                batch = []

                if total_events % PROGRESS_EVERY == 0:
                    print(f"  Processed {total_events:,} events...")

                # Periodically merge the intermediate frames to keep their number small
                if len(pending_frames) >= MAX_PENDING_FRAMES:
                    print("  Combining batches to manage memory...")
                    pending_frames = [pl.concat(pending_frames)]
                    gc.collect()

        # Process remaining events
        if batch:
            pending_frames.append(to_typed_frame(batch))
            total_events += len(batch)

        if not pending_frames:
            print(f"❌ No valid events found in {input_path}")
            return False

        # Final combination
        print("Final combination of all batches...")
        final_df = pl.concat(pending_frames).sort(["session", "ts"])

        # Save results under temporary names, publish only when both succeeded
        print(f"Saving to {parquet_path}")
        final_df.write_parquet(partial_parquet)
        final_df.write_ndjson(partial_jsonl)
        os.replace(partial_jsonl, output_path)
        # Renamed last: the parquet file is what marks the dataset as converted
        os.replace(partial_parquet, parquet_path)

        print(f"✅ {dataset_name} conversion completed!")
        print(f"  Total events: {len(final_df):,}")
        if skipped_lines:
            print(f"  Warning: skipped {skipped_lines:,} invalid lines")
        print("  Memory efficient processing successful!")

        return True

    except Exception as e:
        print(f"❌ Streaming conversion failed: {e}")
        return False

    finally:
        # Incomplete outputs of a failed run must not be left behind
        for leftover in (partial_parquet, partial_jsonl):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
            except OSError as e:
                print(f"  Warning: could not remove {leftover}: {e}")

# Convert the extracted train/test files. The chunk-based converter is tried
# first; the streaming converter is used only when the first one reports failure.
banner = "=" * 60
print(banner)
print("MEMORY-OPTIMIZED DATA CONVERSION")
print(banner)

print("Attempting chunk-based conversion...")

# --- Training split ---
train_input = os.path.join(config.EXTRACT_PATH, 'train.jsonl')
train_output = os.path.join(config.DATA_PATH, 'train.jsonl')
train_success = convert_jsonl_to_optimized_format_fixed(train_input, train_output, "Training Data")

if not train_success:
    print("\nChunk-based approach failed. Trying streaming approach...")
    train_success = convert_jsonl_streaming_approach(train_input, train_output, "Training Data")

# --- Test split ---
test_input = os.path.join(config.EXTRACT_PATH, 'test.jsonl')
test_output = os.path.join(config.DATA_PATH, 'test.jsonl')
test_success = convert_jsonl_to_optimized_format_fixed(test_input, test_output, "Test Data")

if not test_success:
    print("\nChunk-based approach failed for test data. Trying streaming approach...")
    test_success = convert_jsonl_streaming_approach(test_input, test_output, "Test Data")

# --- Outcome ---
if train_success and test_success:
    print("✅ Memory-optimized conversion completed successfully!")
else:
    for advice in (
        "❌ CRITICAL: Data conversion failed even with memory optimization!",
        "Suggestions:",
        "1. Restart Colab runtime and try again",
        "2. Use 'High-RAM' runtime setting",
        "3. Process train and test data separately in different sessions",
        "4. Consider using a subset of data for initial testing",
    ):
        print(advice)

# Release whatever the converters left behind before the next cell runs
gc.collect()
print("Memory cleanup completed.")

MEMORY-OPTIMIZED DATA CONVERSION
Attempting chunk-based conversion...
Converting Training Data from JSONL to optimized format (Memory Optimized)...
Processing /content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-data-extracted/train.jsonl in memory-efficient chunks...
  Processed chunk 1: 500,000 total rows
  Processed chunk 2: 1,000,000 total rows
  Processed chunk 3: 1,500,000 total rows
  Processed chunk 4: 2,000,000 total rows
  Processed chunk 5: 2,500,000 total rows
  Read 50,000 lines from file...
  Processed chunk 6: 3,000,000 total rows
  Processed chunk 7: 3,500,000 total rows
  Processed chunk 8: 4,000,000 total rows
  Processed chunk 9: 4,500,000 total rows
  Processed chunk 10: 5,000,000 total rows
  Read 100,000 lines from file...
  Processed chunk 11: 5,500,000 total rows
  Processed chunk 12: 6,000,000 total rows
  Processed chunk 13: 6,500,000 total rows
  Processed chunk 14: 7,000,000 total rows
  Processed chunk 15: 7,500,000 total rows
  Read 150,000



  Combining chunk 18/434
  Combining chunk 19/434
  Combining chunk 20/434
  Combining chunk 21/434
  Combining chunk 22/434
  Combining chunk 23/434
  Combining chunk 24/434
  Combining chunk 25/434
  Combining chunk 26/434
  Combining chunk 27/434
  Combining chunk 28/434
  Combining chunk 29/434
  Combining chunk 30/434
  Combining chunk 31/434
  Combining chunk 32/434
  Combining chunk 33/434
  Combining chunk 34/434
  Combining chunk 35/434
  Combining chunk 36/434
  Combining chunk 37/434
  Combining chunk 38/434
  Combining chunk 39/434
  Combining chunk 40/434
  Combining chunk 41/434
  Combining chunk 42/434
  Combining chunk 43/434
  Combining chunk 44/434
  Combining chunk 45/434
  Combining chunk 46/434
  Combining chunk 47/434
  Combining chunk 48/434
  Combining chunk 49/434
  Combining chunk 50/434
  Combining chunk 51/434
  Combining chunk 52/434
  Combining chunk 53/434
  Combining chunk 54/434
  Combining chunk 55/434
  Combining chunk 56/434
  Combining chunk 57/434


  type_counts = combined_df.group_by("type").agg(pl.count().alias("count"))


  Event distribution:
    carts: 16,896,191 (7.80%)
    orders: 5,098,951 (2.35%)
    clicks: 194,720,954 (89.85%)
Cleaning up temporary chunk files...
✅ Training Data conversion completed successfully!
Converting Test Data from JSONL to optimized format (Memory Optimized)...
Processing /content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-data-extracted/test.jsonl in memory-efficient chunks...
  Read 50,000 lines from file...
  Read 100,000 lines from file...
  Processed chunk 1: 500,000 total rows
  Read 150,000 lines from file...
  Read 200,000 lines from file...
  Processed chunk 2: 1,000,000 total rows
  Read 250,000 lines from file...
  Read 300,000 lines from file...
  Processed chunk 3: 1,500,000 total rows
  Read 350,000 lines from file...
  Read 400,000 lines from file...
  Read 450,000 lines from file...
  Processed chunk 4: 2,000,000 total rows
  Read 500,000 lines from file...
  Read 550,000 lines from file...
  Processed chunk 5: 2,500,000 total rows
  Read

## Data Validation and Quality Checks

In [None]:
def validate_converted_data():
    """
    Check the converted train/test files under ``config.DATA_PATH``.

    Steps, in order:
      1. all four output files (parquet and JSONL for train and test) exist;
      2. both parquet files contain the columns session, aid, ts and type;
      3. each of those columns has the same dtype in train and test;
      4. null counts per column are reported (warning only);
      5. sessions shared between train and test are counted (warning only).

    Returns:
        bool: False when a file is missing, a required column is absent, a
        dtype differs between train and test, or loading/inspecting the data
        raised; True otherwise. Nulls and overlapping sessions produce
        warnings but do not change the result.
    """
    print("Validating converted data...")

    # Check if files exist
    train_parquet = os.path.join(config.DATA_PATH, 'train.parquet')
    test_parquet = os.path.join(config.DATA_PATH, 'test.parquet')
    train_jsonl = os.path.join(config.DATA_PATH, 'train.jsonl')
    test_jsonl = os.path.join(config.DATA_PATH, 'test.jsonl')

    files_to_check = [
        ("Training Parquet", train_parquet),
        ("Test Parquet", test_parquet),
        ("Training JSONL", train_jsonl),
        ("Test JSONL", test_jsonl)
    ]

    all_exist = True
    for name, path in files_to_check:
        if os.path.exists(path):
            size_mb = os.path.getsize(path) / (1024 * 1024)
            print(f"✅ {name}: {size_mb:.1f} MB")
        else:
            print(f"❌ {name}: Not found")
            all_exist = False

    if not all_exist:
        return False

    # Validate data integrity
    try:
        print("\nValidating data integrity...")

        train_df = pl.read_parquet(train_parquet)
        test_df = pl.read_parquet(test_parquet)
        named_frames = [("Training", train_df), ("Test", test_df)]

        print(f"Training data: {train_df.shape[0]:,} rows, {train_df.shape[1]} columns")
        print(f"Test data: {test_df.shape[0]:,} rows, {test_df.shape[1]} columns")

        # Check required columns
        required_cols = ['session', 'aid', 'ts', 'type']
        for df_name, df in named_frames:
            missing_cols = set(required_cols) - set(df.columns)
            if missing_cols:
                print(f"❌ {df_name} data missing columns: {missing_cols}")
                return False
            print(f"✅ {df_name} data has all required columns")

        # Check data types
        print("\nData types validation:")
        for col in required_cols:
            train_type = train_df[col].dtype
            test_type = test_df[col].dtype
            if train_type != test_type:
                print(f"❌ Type mismatch for {col}: train={train_type}, test={test_type}")
                return False
            print(f"✅ {col}: {train_type}")

        # Check for null values
        print("\nNull values check:")
        for df_name, df in named_frames:
            # null_count() yields a single row holding one count per column
            counts_per_column = zip(df.columns, df.null_count().row(0))
            columns_with_nulls = [(col_name, count) for col_name, count in counts_per_column if count > 0]
            for col_name, count in columns_with_nulls:
                print(f"⚠️  {df_name} {col_name}: {count} null values")
            if not columns_with_nulls:
                print(f"✅ {df_name}: No null values")

        # Check session overlap between train and test. The intersection is
        # computed inside Polars so the session ids are never materialised as
        # Python sets.
        train_sessions = train_df.select(pl.col("session").unique())
        test_sessions = test_df.select(pl.col("session").unique())
        overlap_count = train_sessions.join(test_sessions, on="session", how="inner").height

        if overlap_count:
            print(f"⚠️  Session overlap between train and test: {overlap_count:,} sessions")
        else:
            print("✅ No session overlap between train and test")

        print("\n✅ Data validation completed successfully")
        return True

    except Exception as e:
        print(f"❌ Data validation failed: {e}")
        return False

# Run the checks and report the overall outcome of the preparation
validation_success = validate_converted_data()

outcome_message = (
    "✅ Data preparation completed successfully!"
    if validation_success
    else "❌ Data validation failed"
)
print(outcome_message)

Validating converted data...
✅ Training Parquet: 1535.0 MB
✅ Test Parquet: 55.3 MB
✅ Training JSONL: 14102.2 MB
✅ Test JSONL: 458.0 MB

Validating data integrity...
Training data: 216,716,096 rows, 4 columns
Test data: 6,928,123 rows, 4 columns
✅ Training data has all required columns
✅ Test data has all required columns

Data types validation:
✅ session: UInt32
✅ aid: UInt32
✅ ts: UInt64
✅ type: Categorical(ordering='physical')

Null values check:
✅ Training: No null values
✅ Test: No null values
✅ No session overlap between train and test

✅ Data validation completed successfully
✅ Data preparation completed successfully!


## Summary and Next Steps

In [None]:
def print_preparation_summary():
    """
    Print an overview of the prepared data under ``config.DATA_PATH``.

    Lists the four expected output files with their sizes, then loads both
    parquet files and prints event, session and item counts. The closing
    message reports success only when every file was found and the statistics
    could be loaded; otherwise it says the preparation is incomplete.

    Returns:
        None. All results are printed.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("DATA PREPARATION SUMMARY")
    print(divider)

    # File locations
    print(f"Prepared data location: {config.DATA_PATH}")
    print(f"Output location: {config.OUTPUT_PATH}")

    # File sizes
    files = [
        ('train.parquet', 'Training data (Parquet format)'),
        ('test.parquet', 'Test data (Parquet format)'),
        ('train.jsonl', 'Training data (JSONL format)'),
        ('test.jsonl', 'Test data (JSONL format)')
    ]

    all_files_present = True
    print("\nGenerated files:")
    for filename, description in files:
        filepath = os.path.join(config.DATA_PATH, filename)
        if os.path.exists(filepath):
            size_mb = os.path.getsize(filepath) / (1024 * 1024)
            print(f"  ✅ {filename}: {size_mb:.1f} MB - {description}")
        else:
            print(f"  ❌ {filename}: Missing - {description}")
            all_files_present = False

    # Data statistics
    statistics_loaded = False
    try:
        train_df = pl.read_parquet(os.path.join(config.DATA_PATH, 'train.parquet'))
        test_df = pl.read_parquet(os.path.join(config.DATA_PATH, 'test.parquet'))

        print("\nDataset statistics:")
        print(f"  Training events: {len(train_df):,}")
        print(f"  Test events: {len(test_df):,}")
        print(f"  Training sessions: {train_df.select('session').n_unique():,}")
        print(f"  Test sessions: {test_df.select('session').n_unique():,}")
        print(f"  Unique items (train): {train_df.select('aid').n_unique():,}")
        print(f"  Unique items (test): {test_df.select('aid').n_unique():,}")
        statistics_loaded = True

    except Exception as e:
        print(f"  Could not load statistics: {e}")

    # Only claim success when nothing above was reported as missing or unreadable
    if all_files_present and statistics_loaded:
        print("\n✅ Data preparation completed successfully!")
        print("Next step: Run the main solution notebook")
        print(f"Make sure to update the DATA_PATH in the main notebook to: {config.DATA_PATH}")
    else:
        print("\n❌ Data preparation is incomplete: see the problems reported above")
        print("Re-run the conversion and validation cells before moving on to the main notebook")
    print(divider)

print_preparation_summary()

# Save configuration for main notebook.
# 'preparation_completed' reflects the outcome of the extraction and
# conversion cells above: a failed conversion only prints an error and lets
# the notebook continue, so the flag must not be hard-coded.
config_dict = {
    'DATA_PATH': config.DATA_PATH,
    'OUTPUT_PATH': config.OUTPUT_PATH,
    'preparation_completed': bool(extraction_success and train_success and test_success),
    'validation_passed': validation_success
}

config_path = os.path.join(config.OUTPUT_PATH, 'data_config.json')
with open(config_path, 'w') as f:
    json.dump(config_dict, f, indent=2)

print(f"Configuration saved to: {config_path}")


DATA PREPARATION SUMMARY
Prepared data location: /content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-data/
Output location: /content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-output/

Generated files:
  ✅ train.parquet: 1535.0 MB - Training data (Parquet format)
  ✅ test.parquet: 55.3 MB - Test data (Parquet format)
  ✅ train.jsonl: 14102.2 MB - Training data (JSONL format)
  ✅ test.jsonl: 458.0 MB - Test data (JSONL format)

Dataset statistics:
  Training events: 216,716,096
  Test events: 6,928,123
  Training sessions: 12,899,779
  Test sessions: 1,671,803
  Unique items (train): 1,855,603
  Unique items (test): 783,486

✅ Data preparation completed successfully!
Next step: Run the main solution notebook
Make sure to update the DATA_PATH in the main notebook to: /content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-data/
Configuration saved to: /content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-output/data_config.js