In [1]:
# install dependencies
! pip install -q -U numpy
! pip install -q -U pandas
! pip install -q -U mlflow
! pip install -q -U iterative-stratification

[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/dill-0.3.9-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/opt_einsum-3.4.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/looseversion-1.3.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/lightning_utilit

In [2]:
# don't modify this cell

import pandas as pd
import mlflow
import urllib.parse
import os
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# MLFlow creds
# NOTE(review): the fallback values are the shared account that was already
# committed in this notebook. Set MLFLOW_TRACKING_USERNAME /
# MLFLOW_TRACKING_PASSWORD in the environment to override them, and rotate the
# password if this notebook is shared outside the project.
MLFLOW_DOMAIN = "https://mlflow.gritans.lv"
MLFLOW_USERNAME = os.environ.get("MLFLOW_TRACKING_USERNAME", "data_user")
MLFLOW_PASSWORD = os.environ.get("MLFLOW_TRACKING_PASSWORD", "ais7Rah2foo0gee9")
MLFLOW_TRACKING_URI = f"{MLFLOW_DOMAIN}"

# Pass the credentials through MLflow's HTTP basic-auth environment variables
# instead of embedding them in the tracking URI. With "user:password@host" in
# the URI, MLflow prints the credentials in every "View run at ..." link, which
# then ends up in the saved cell outputs.
os.environ["MLFLOW_TRACKING_USERNAME"] = MLFLOW_USERNAME
os.environ["MLFLOW_TRACKING_PASSWORD"] = MLFLOW_PASSWORD

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

def upload_dataset(
    train_df: pd.DataFrame,
    valid_df: pd.DataFrame,
    dataset_name: str,
    version_name: str,
    description: str | None = None
):
    """
    Args:
        train_df (pd.DataFrame): DataFrame containing the training data.
        valid_df (pd.DataFrame): DataFrame containing the validation data.
        dataset_name (str): Name of the dataset to be used in MLFlow.
        version_name (str): Version name for the dataset.
        description (str | None): Description of the dataset. Default is None.
    """
    
    for df in [train_df, valid_df]:
        assert isinstance(df, pd.DataFrame), "df must be a pandas DataFrame"
        assert not df.empty, "df must not be empty"
        assert not df.isnull().values.any(), "df must not contain NaN values"
        assert "spectrum" in df.columns, "df must contain a 'spectrum' column"
        assert isinstance(df["spectrum"].iloc[0], np.ndarray), "spectrum column must contain numpy arrays"
        assert df["spectrum"].iloc[0].ndim == 1, "spectrum column must contain 1D numpy arrays"
        assert df["spectrum"].iloc[0].dtype in [float, np.float32, np.float64], "spectrum column must contain float values"

    mlflow.set_experiment(experiment_name=dataset_name)
    with mlflow.start_run(run_name=version_name, description=description) as run:
        local_dir = os.path.join("./runs", run.info.run_id)
        os.makedirs(local_dir, exist_ok=True)

        # Log train and valid DataFrames
        train_path = os.path.join(local_dir, "train_df.pkl")
        valid_path = os.path.join(local_dir, "valid_df.pkl")
        
        train_df.to_pickle(train_path)
        valid_df.to_pickle(valid_path)

        mlflow.log_artifact(train_path)
        mlflow.log_artifact(valid_path)

        # Log metadata
        mlflow.log_param("train_size", len(train_df))
        mlflow.log_param("valid_size", len(valid_df))
        mlflow.log_param("spectrum_len", train_df["spectrum"].iloc[0].shape[0])
        
        # column: dtype dict
        column_dtypes = {col: str(train_df[col].dtype) for col in train_df.columns}
        mlflow.log_param("columns", column_dtypes)

        # bool_column: num_positive
        for split_name, split_df in zip(["train", "valid"], [train_df, valid_df]):
            pos_counts = {}
            for col in split_df.columns:
                if split_df[col].dtype == bool:
                    pos_counts[col] = split_df[col].sum()
            mlflow.log_param(f"{split_name}_pos", pos_counts)

        # df head txt artifacts
        train_head_path = os.path.join(local_dir, "train_df_head.txt")
        valid_head_path = os.path.join(local_dir, "valid_df_head.txt")

        with open(train_head_path, "w") as f:
            f.write(str(train_df.head(n=5)))
        with open(valid_head_path, "w") as f:
            f.write(str(valid_df.head(n=5)))

        mlflow.log_artifact(train_head_path)
        mlflow.log_artifact(valid_head_path)

        if description:
            mlflow.set_tag("description", description)

def load_ftir() -> pd.DataFrame:
    """
    Return the non-augmented FTIR dataset as a DataFrame.

    The pickle is cached at ``./tmp/df.pkl``. When that file is missing it is
    first downloaded from a fixed MLFlow run; otherwise no download happens.
    """

    cache_dir = "./tmp/"
    artifact_name = "df.pkl"
    cached_file = os.path.join(cache_dir, artifact_name)

    # Fast path: a previous call already fetched the file.
    if os.path.exists(cached_file):
        return pd.read_pickle(cached_file)

    print(f"Artifact {artifact_name} not found in local path. Downloading...")
    os.makedirs(cache_dir, exist_ok=True)
    mlflow.artifacts.download_artifacts(  # type: ignore
        run_id="186a2a05ee2b4f8698b7993dc0ebddb0",
        artifact_path=artifact_name,
        dst_path=cache_dir,
    )

    # NOTE(review): read_pickle executes whatever the pickle contains, so this
    # relies on the MLFlow server being trusted.
    return pd.read_pickle(cached_file)

## 1. download the original NIST dataset


In [3]:
# this downloads the non-augmented FTIR dataset
# (the download is skipped when ./tmp/df.pkl already exists locally)
original_df = load_ftir() 
# preview the first 3 rows
original_df.head(3)

Unnamed: 0,nist_idx,spectrum,alkane,methyl,alkene,alkyne,alcohols,amines,nitriles,aromatics,alkyl halides,esters,ketones,aldehydes,carboxylic acids,ether,acyl halides,amides,nitro
0,50066,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False
1,50113,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False
2,50124,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,True,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False


## 2. add extra columns to the dataset
e.g. save molecule name, hydrogen bonding info, etc.
Here we don't create new spectra, but just add extra info to existing ones.

For example, this adds a new column `example_col` that is `True` if the sample contains both Alkane and Alcohols, and `False` otherwise.

```Python
# iterate through all samples, and build a new column
example_col = []
for i in range(len(original_df)):
    row = original_df.iloc[i]

    if row["alkane"] == True and row["alcohols"] == True:
        example_col.append(True)
    else:
        example_col.append(False)

# add the new column to the DataFrame
extra_df["example_col"] = example_col
```

In [4]:
# If you don't want to add any new columns, just leave this unchanged

# make a copy of the dataset, so that adding columns to extra_df
# leaves original_df as it was loaded
extra_df = original_df.copy()
# preview the first 3 rows
extra_df.head(3)

Unnamed: 0,nist_idx,spectrum,alkane,methyl,alkene,alkyne,alcohols,amines,nitriles,aromatics,alkyl halides,esters,ketones,aldehydes,carboxylic acids,ether,acyl halides,amides,nitro
0,50066,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False
1,50113,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False
2,50124,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,True,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False


## 3. split the dataset into training and validation parts
We split off the validation dataset after adding extra columns but before creating new samples (augmenting), because we want to test the model on real samples, not artificial ones created by e.g. LSER.

Augmented samples are only meant to improve training.

In [5]:

# Stratification targets: the columns whose label distribution should be
# similar in the training and validation sets.
#
# The functional group names are usually the right choice here. Auxiliary
# columns (e.g. hydrogen bonding) may also be predicted so that the model has
# more to learn, but they are not what we are actually interested in.
target_names = [
    'alkane', 'methyl', 'alkene', 'alkyne', 'alcohols', 'amines',
    'nitriles', 'aromatics', 'alkyl halides', 'esters', 'ketones',
    'aldehydes', 'carboxylic acids', 'ether', 'acyl halides',
    'amides', 'nitro',
]

random_seed = 42

# Multilabel target matrix (as integers) that drives the stratification
y = extra_df[target_names].astype(int).values

# One stratified shuffle split, holding out 20% of the samples for validation
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=random_seed) # type: ignore

# n_splits=1, so the first (train, valid) index pair is the only one
train_idx, valid_idx = next(msss.split(extra_df, y))

# Select the rows of each part and renumber them from 0
train_df = extra_df.iloc[train_idx].reset_index(drop=True)
valid_df = extra_df.iloc[valid_idx].reset_index(drop=True)

print(f"Train size: {len(train_df)}")
print(f"Valid size: {len(valid_df)}")

# Optional: compare the fraction of positive samples per label in both parts
print("\nLabel distribution comparison:")
for col in target_names:
    print(f"{col}: Train={train_df[col].mean():.3f}, Valid={valid_df[col].mean():.3f}")

Train size: 6926
Valid size: 1736

Label distribution comparison:
alkane: Train=0.691, Valid=0.690
methyl: Train=0.642, Valid=0.641
alkene: Train=0.135, Valid=0.134
alkyne: Train=0.026, Valid=0.026
alcohols: Train=0.270, Valid=0.270
amines: Train=0.094, Valid=0.094
nitriles: Train=0.043, Valid=0.043
aromatics: Train=0.580, Valid=0.578
alkyl halides: Train=0.278, Valid=0.277
esters: Train=0.111, Valid=0.111
ketones: Train=0.091, Valid=0.090
aldehydes: Train=0.024, Valid=0.024
carboxylic acids: Train=0.073, Valid=0.073
ether: Train=0.249, Valid=0.248
acyl halides: Train=0.011, Valid=0.011
amides: Train=0.019, Valid=0.019
nitro: Train=0.051, Valid=0.051


## 4. augment the training dataset
Here we add extra samples via e.g. LSER.

For example, this creates a copy of each existing sample, with the spectrum values scaled by a factor of 0.5 (i.e. halved).

```Python
new_rows = []

for i in range(len(train_df)):
    # get original sample
    row = train_df.iloc[i].copy() # important to copy here, otherwise original also modified
    row["scaled"] = False

    # create another sample
    row_copy = row.copy() # another copy for the modification
    row_copy["spectrum"] = row_copy["spectrum"] * 0.5
    row_copy["scaled"] = True

    # store both
    new_rows.append(row)
    new_rows.append(row_copy)

augmented_train_df = pd.DataFrame(new_rows)
```

In [6]:
# if you don't want to create new samples, just leave this unchanged

# Take a copy of every training row (important to copy here, otherwise the
# original is modified as well). With no augmentation added, the result holds
# the same rows as train_df.
new_rows = [train_df.iloc[i].copy() for i in range(len(train_df))]

# Rebuild a DataFrame from the rows and renumber the index from 0
augmented_train_df = pd.DataFrame(new_rows).reset_index(drop=True)
augmented_train_df.head(3)

Unnamed: 0,nist_idx,spectrum,alkane,methyl,alkene,alkyne,alcohols,amines,nitriles,aromatics,alkyl halides,esters,ketones,aldehydes,carboxylic acids,ether,acyl halides,amides,nitro
0,50066,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False
1,50113,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False
2,50293,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False


## 5. upload the new dataset

In [None]:
# upload_dataset(
#     train_df=augmented_train_df,
#     valid_df=valid_df,
#     dataset_name="dataset_FTIR_example",
#     version_name="v1",
#     description="Example unchanged dataset"
# )

🏃 View run v1 at: https://mlflow.gritans.lv/#/experiments/15/runs/9c694f4d281140e697069431694c06b4
🧪 View experiment at: https://mlflow.gritans.lv/#/experiments/15
