%%markdown
# Dataset Generation Template

This notebook provides a template for generating and uploading new datasets to MLflow, after you have the raw scraped data.

## Data Format

Your data needs to be organized in a dictionary with three required keys:
- `inputs`: Feature matrix as NumPy array
- `target`: Binary target matrix as NumPy array
- `target_names`: List of strings naming each target feature

### Shape Requirements

| Component | Type | Shape | Description | Example |
|-----------|------|-------|-------------|----------|
| `inputs` | `np.ndarray` | `(n_samples, n_features)` | Each row is one sample, each column a feature | `(1000, 10)` for 1000 samples with 10 features |
| `target` | `np.ndarray` | `(n_samples, n_targets)` | Binary matrix where each column represents one target | `(1000, 3)` for 1000 samples with 3 possible targets |
| `target_names` | `list[str]` | `(n_targets,)` | Names for each target column | `["cat", "dog", "bird"]` for 3 targets |

## Example

```python
data = {
    "inputs": np.array([
        [0.1, 0.2, 0.3],  # Sample 1 with 3 features
        [0.4, 0.5, 0.6],  # Sample 2 with 3 features
        # ... more samples
    ]),
    "target": np.array([
        [1, 0],  # Sample 1: positive for target 1, negative for target 2
        [0, 1],  # Sample 2: negative for target 1, positive for target 2
        # ... more samples
    ]),
    "target_names": ["group_A", "group_B"]  # Names for the two target columns
}
```

## Uploading the dataset
Once your data is prepared, use `upload_dataset()` to save it to MLflow.
This will verify that the data is formatted correctly and then upload it to the server.

```python
upload_dataset(
    data=data,
    dataset_name="ftir_no_bonding_effects",  # Broader dataset for which we can have multiple versions
    version_name="initial_data",  # Description of this version
    description="FTIR dataset downloaded from the FCGFormer paper without any modifications"  # Optional details
)
```

## Accessing the Dataset

After upload, the dataset will be available in MLflow for model training with:
- NumPy arrays saved as `.npy` files
- Target names and counts (number of positive examples) in text files
- The code of this notebook saved for reproducibility (so you don't have to upload it anywhere)

You can view your dataset in MLflow by opening the link printed after calling `upload_dataset()`.

In [None]:
# This cell defines upload_dataset. You can ignore it.

# Install required packages for `upload_dataset()` with quiet flag to reduce output
%pip install -q numpy mlflow ipynbname requests tqdm scikit-learn scikit-multilearn

import os
import urllib.parse
import jupyter_client
from typing import Dict, Any
import time
from tqdm.notebook import tqdm
from skmultilearn.model_selection import iterative_train_test_split
try:
    import mlflow
except ImportError:
    %pip install -q mlflow
    import mlflow

try:
    import requests
except ImportError:
    %pip install -q requests
    import requests

try:
    import numpy as np
except ImportError:
    %pip install -q numpy
    import numpy as np

try:
    from notebook import notebookapp
except ImportError:
    from jupyter_server import serverapp as notebookapp

# MLflow connection settings.
# NOTE(review): the default credentials below are hard-coded in the notebook
# source, and upload_dataset() logs this notebook as a run artifact, so they
# are stored on the server too. Prefer supplying them through the
# MLFLOW_TRACKING_USERNAME / MLFLOW_TRACKING_PASSWORD environment variables;
# the literals are kept only as a backward-compatible fallback.
MLFLOW_DOMAIN = "mlflow.gritans.lv"
MLFLOW_USERNAME = os.environ.get("MLFLOW_TRACKING_USERNAME", "data_user")
MLFLOW_PASSWORD = os.environ.get("MLFLOW_TRACKING_PASSWORD", "ais7Rah2foo0gee9")
MLFLOW_TRACKING_URI = f"https://{MLFLOW_DOMAIN}"

# Embed the credentials in the tracking URI (https://user:pass@host).
# safe="" percent-encodes every reserved character, including "/", which
# urllib.parse.quote leaves untouched by default and which would otherwise
# break the netloc for credentials that contain it.
parsed_uri = urllib.parse.urlparse(MLFLOW_TRACKING_URI)
auth_uri = parsed_uri._replace(
    netloc=(
        f"{urllib.parse.quote(MLFLOW_USERNAME, safe='')}:"
        f"{urllib.parse.quote(MLFLOW_PASSWORD, safe='')}@{parsed_uri.netloc}"
    )
).geturl()

mlflow.set_tracking_uri(auth_uri)

def upload_dataset(
    data: Dict[str, Any],
    dataset_name: str,
    version_name: str,
    description: str | None = None,
    split: bool = True,
    test_size: float = 0.2,
    valid_size: float = 0.2,
):
    """
    Args:
        data (Dict[str, Any]): Dictionary containing the dataset with keys:
            - "inputs": NumPy array of shape (num_samples, num_input_features)
            - "target": NumPy array of shape (num_samples, num_target_features)
            - "target_names": List of target feature names, in the same order as the target array.
        dataset_name (str): Name of the dataset.
        version_name (str): A descriptive version name for the dataset. 
        description (str): An (optional) description of this dataset version.
        split (bool): Whether to create train/valid/test splits.
        test_size (float): Fraction of data for test set.
        valid_size (float): Fraction of remaining data for validation.
    """
    print(f"🚀 Starting dataset upload process for '{dataset_name}' - {version_name}")
    
    # Check dictionary
    print("✅ Validating dataset format...")
    expected_keys = {"inputs", "target", "target_names"}
    assert set(data.keys()) == expected_keys, (
        f"Invalid dataset format. Keys should be {expected_keys}."
    )

    # Check expected types
    assert isinstance(data["inputs"], np.ndarray), (
        f"Inputs must be a numpy.ndarray. Got {type(data['inputs'])}."
    )
    assert isinstance(data["target"], np.ndarray), (
        f"Targets must be a numpy.ndarray. Got {type(data['target'])}."
    )
    assert isinstance(data["target_names"], list), (
        f"target names must be a list. Got {type(data['target_names'])}."
    )
    assert all(isinstance(name, str) for name in data["target_names"]), (
        "All target names must be strings."
    )

    # Check expected shapes
    inputs: np.ndarray = data["inputs"]
    target: np.ndarray = data["target"]
    target_names = data["target_names"]

    assert inputs.ndim == 2, (
        f"Inputs must be a (num_samples, num_input_features) array. "
        f"Got {inputs.ndim} dimensions."
    )
    assert target.ndim == 2, (
        f"Targets must be a (num_samples, num_target_features) array. "
        f"Got {target.ndim} dimensions."
    )

    n_samples = inputs.shape[0]
    assert n_samples > 0, (
        f"Inputs must have at least one sample. Got {n_samples} samples."
    )
    assert n_samples == target.shape[0], (
        f"Inputs and targets must have the same number of samples. "
        f"Got {n_samples} inputs and {target.shape[0]} targets."
    )

    n_target_features = target.shape[1]
    assert n_target_features > 0 and n_target_features == len(target_names), (
        f"Targets must have the same number of features as target names. "
        f"Got {n_target_features} target features and {len(target_names)} target names."
    )

    # Create train/valid/test splits if requested
    splits = {}
    
    if split:
        print("🔪 Creating train/valid/test splits...")
        
        # For multi-label data, we need to use iterative stratification
        print("  - Using iterative stratification for multi-label data")
        
        # First split off the test set
        train_valid_inputs, train_valid_target, test_inputs, test_target = iterative_train_test_split(
            inputs, target, test_size=test_size
        )
        
        # Split remaining data into train/valid
        train_inputs, train_target, valid_inputs, valid_target = iterative_train_test_split(
            train_valid_inputs, train_valid_target, test_size=valid_size
        )
        
        # Store the splits
        splits = {
            "train": (train_inputs, train_target),
            "valid": (valid_inputs, valid_target),
            "test": (test_inputs, test_target)
        }
        
        # Print split statistics
        print("📊 Split statistics:")
        for split_name, (split_x, split_y) in splits.items():
            split_size = len(split_x)
            split_pct = split_size / n_samples * 100
            print(f"  - {split_name}: {split_size} samples ({split_pct:.1f}%)")
    else:
        # Just use the original dataset
        splits = {"full": (inputs, target)}
    
    # Connect to MLFlow
    print("🔍 Connecting to MLflow server...")
    mlflow.set_experiment(experiment_name=dataset_name)
    with mlflow.start_run(run_name=version_name, description=description) as run:
        print(f"✅ Created run with ID: {run.info.run_id}")
        local_dir = os.path.join("./runs", run.info.run_id)
        os.makedirs(local_dir, exist_ok=True)

        # Log the notebook generating this dataset
        print("📝 Locating and saving notebook source...")
        try:
            # primary: ipynbname often just works
            import ipynbname
            notebook_path = ipynbname.path()
            print(f"✅ Found notebook at: {notebook_path}")
        except Exception:
            # Fallback notebook location code - unchanged
            print("⚠️ Primary notebook path detection failed, trying alternative method...")
            # ... existing fallback code ...
            conn_file = jupyter_client.find_connection_file()
            kernel_id = os.path.basename(conn_file).split('-', 1)[1].split('.')[0]

            # 2) iterate over all running notebook servers
            for srv in notebookapp.list_running_servers():
                # build the URL for sessions
                url = srv['url'].rstrip('/') + '/api/sessions'
                token = srv.get('token', '')
                params = {'token': token} if token else {}

                try:
                    resp = requests.get(url, params=params)
                    resp.raise_for_status()
                except Exception:
                    continue

                # 3) look for our kernel in their active sessions
                for sess in resp.json():
                    if sess['kernel']['id'] == kernel_id:
                        # 4) reconstruct the full path
                        rel_path = sess['notebook']['path']       # e.g. "subdir/MyNotebook.ipynb"
                        notebook_path = os.path.join(srv['notebook_dir'], rel_path)
                        print(f"✅ Found notebook at: {notebook_path}")
                        break
                else:
                    continue
                break
            else:
                raise RuntimeError("Could not locate the current notebook path")

        mlflow.log_artifact(notebook_path)
        
        # Log dataset files - one set per split
        print("📄 Preparing and uploading dataset files...")
        
        target_names_path = os.path.join(local_dir, "target_names.txt")
        with open(target_names_path, "w") as f:
            for name in target_names:
                f.write(f"{name}\n")
        
        files_to_upload = [("target names", target_names_path)]
        
        # Save each split with its own files
        for split_name, (split_inputs, split_target) in splits.items():
            print(f"  - Processing {split_name} split...")
            
            # Calculate positive counts for this split
            split_pos_counts = split_target.sum(axis=0)
            
            # Save split-specific files
            split_inputs_path = os.path.join(local_dir, f"{split_name}_inputs.npy")
            split_target_path = os.path.join(local_dir, f"{split_name}_target.npy")
            split_pos_counts_path = os.path.join(local_dir, f"{split_name}_pos_counts.txt")
            
            # Save numpy arrays
            np.save(split_inputs_path, split_inputs)
            np.save(split_target_path, split_target)
            
            # Save positive counts
            with open(split_pos_counts_path, "w") as f:
                for i, count in enumerate(split_pos_counts):
                    f.write(f"{count}\n")
            
            # Add to upload list
            files_to_upload.extend([
                (f"{split_name} inputs", split_inputs_path),
                (f"{split_name} target", split_target_path),
                (f"{split_name} pos counts", split_pos_counts_path)
            ])

            pos_counts_dict = {name: int(count) for name, count in zip(target_names, split_pos_counts)}
            mlflow.log_param(f"{split_name}_positive_samples", pos_counts_dict)
            mlflow.log_param(f"{split_name}_num_samples", len(split_inputs))

        
        # Upload all files to MLFlow
        print("📤 Uploading files to MLflow server...")
        for desc, filepath in tqdm(files_to_upload, desc="Uploading files"):
            print(f"  - Uploading {desc}...")
            mlflow.log_artifact(filepath)
            time.sleep(0.2)  # Small delay for progress visibility

        # Log metadata
        print("📊 Logging metadata...")
        mlflow.log_param("target_names", target_names)
        mlflow.log_param("input_features", inputs.shape[1])
        mlflow.log_param("target_features", target.shape[1])
        mlflow.log_param("total_num_samples", n_samples)
        
        if description:
            mlflow.set_tag("description", description)
        
        print(f"✅ Dataset upload complete! View at: {MLFLOW_TRACKING_URI}/#/experiments/{run.info.experiment_id}/runs/{run.info.run_id}")

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Import necessary libraries
import numpy as np
import os
from glob import glob
from tqdm import tqdm

# Names of the target columns, in the order the labels appear in each .txt file.
# NOTE(review): assumes every label file holds one 0/1 token per class in this
# order — confirm against the dataset source.
class_names = ["alkane", "methyl", "alkene", "alkyne", "alcohols", "amines", "nitriles", "aromatics",
 "alkyl halides", "esters", "ketones", "aldehydes", "carboxylic acids", "ether",
 "acyl halides", "amides", "nitro"]

# Define the data directory
data_dir = "../data/ftir"

# Initialize arrays to store data
all_inputs = []
all_targets = []

# We'll combine all the data from existing splits
print("Loading FTIR dataset...")

# Get all sample IDs across all directories. Each sample is a pair
# "<id>.npy" (features) / "<id>.txt" (labels) in the same directory.
all_npy_paths = glob(os.path.join(data_dir, "**/*.npy"), recursive=True)
all_ids_with_paths = [(int(os.path.splitext(os.path.basename(path))[0]), os.path.dirname(path)) 
                      for path in all_npy_paths]
all_ids_with_paths.sort()  # Sort by ID (then directory) for a deterministic order

print(f"Found {len(all_ids_with_paths)} total samples")

# Fail fast with a clear message: without this, an empty or wrong data_dir
# only surfaces later as an opaque np.vstack error on an empty list.
if not all_ids_with_paths:
    raise ValueError(
        f"No .npy sample files found under {os.path.abspath(data_dir)!r}. "
        "Check that data_dir points to the FTIR dataset."
    )

# Load each sample
for sample_id, dir_path in tqdm(all_ids_with_paths, desc="Loading samples", unit="sample"):
    npy_path = os.path.join(dir_path, f"{sample_id}.npy")
    txt_path = os.path.join(dir_path, f"{sample_id}.txt")
    
    # Load feature vector
    x = np.load(npy_path)
    
    # Load target labels (whitespace-separated integers)
    with open(txt_path, "r") as f:
        y = np.array([int(tok) for tok in f.read().strip().split()], dtype=np.int32)
    
    all_inputs.append(x)
    all_targets.append(y)

# Convert lists to numpy arrays
inputs = np.vstack(all_inputs)  # Stack vertically to create (n_samples, n_features)
target = np.vstack(all_targets)  # Stack vertically to create (n_samples, n_classes)

print(f"Dataset loaded: {inputs.shape[0]} samples, {inputs.shape[1]} features, {target.shape[1]} classes")

# Package the data in the required format
data = {
    "inputs": inputs,
    "target": target,
    "target_names": class_names
}

In [None]:
# Upload the FTIR dataset to MLflow.
# The description is stored with the run, so the "thr" typo is fixed here.
upload_dataset(
    data=data,
    dataset_name="ftir_fcg",
    version_name="custom_split",
    description="FTIR spectroscopy dataset with 17 functional group targets from the FCGFormer paper",
    split=True  # Let upload_dataset handle the splitting
)