In [None]:
# don't modify this cell

import pandas as pd
import mlflow
import urllib.parse
import os
import numpy as np
import ast
# MLflow connection settings.
# Each value can be overridden with an environment variable of the same name;
# the literals are only fallbacks so the notebook keeps working unchanged.
# NOTE(review): the fallback password is still stored in plain text here.
# Consider rotating it and dropping the default once every user exports
# MLFLOW_PASSWORD.
MLFLOW_DOMAIN = os.environ.get("MLFLOW_DOMAIN", "https://mlflow.gritans.lv")
MLFLOW_USERNAME = os.environ.get("MLFLOW_USERNAME", "data_user")
MLFLOW_PASSWORD = os.environ.get("MLFLOW_PASSWORD", "ais7Rah2foo0gee9")
MLFLOW_TRACKING_URI = f"{MLFLOW_DOMAIN}"

# Embed the credentials into the tracking URI (scheme://user:password@host).
# safe="" makes quote() percent-encode "/" as well; with the default
# safe="/" a slash inside the user name or password would be copied verbatim
# and break the netloc.
parsed_uri = urllib.parse.urlparse(MLFLOW_TRACKING_URI)
quoted_username = urllib.parse.quote(MLFLOW_USERNAME, safe="")
quoted_password = urllib.parse.quote(MLFLOW_PASSWORD, safe="")
auth_uri = parsed_uri._replace(
    netloc=f"{quoted_username}:{quoted_password}@{parsed_uri.netloc}"
).geturl()

# Module-level side effect: every later mlflow call in this notebook uses this URI.
mlflow.set_tracking_uri(auth_uri)

def upload_dataset(
    train_df: pd.DataFrame,
    valid_df: pd.DataFrame,
    dataset_name: str,
    version_name: str,
    description: str | None = None
):
    """
    Args:
        train_df (pd.DataFrame): DataFrame containing the training data.
        valid_df (pd.DataFrame): DataFrame containing the validation data.
        dataset_name (str): Name of the dataset to be used in MLFlow.
        version_name (str): Version name for the dataset.
        description (str | None): Description of the dataset. Default is None.
    """
    
    for df in [train_df, valid_df]:
        assert isinstance(df, pd.DataFrame), "df must be a pandas DataFrame"
        assert not df.empty, "df must not be empty"
        assert not df.isnull().values.any(), "df must not contain NaN values"
        assert "spectrum" in df.columns, "df must contain a 'spectrum' column"
        assert isinstance(df["spectrum"].iloc[0], np.ndarray), "spectrum column must contain numpy arrays"
        assert df["spectrum"].iloc[0].ndim == 1, "spectrum column must contain 1D numpy arrays"
        assert df["spectrum"].iloc[0].dtype in [float, np.float32, np.float64], "spectrum column must contain float values"


    mlflow.set_experiment(experiment_name=dataset_name)
    with mlflow.start_run(run_name=version_name, description=description) as run:
        local_dir = os.path.join("./runs", run.info.run_id)
        os.makedirs(local_dir, exist_ok=True)

        # Log train and valid DataFrames
        train_path = os.path.join(local_dir, "train_df.csv.gz")
        valid_path = os.path.join(local_dir, "valid_df.csv.gz")

        # map to list
        train_df["spectrum"] = train_df["spectrum"].apply(lambda x: x.tolist())
        valid_df["spectrum"] = valid_df["spectrum"].apply(lambda x: x.tolist())

        # save as csv
        train_df.to_csv(train_path, index=False, compression='gzip')
        valid_df.to_csv(valid_path, index=False, compression='gzip')
        
        mlflow.log_artifact(train_path)
        mlflow.log_artifact(valid_path)

        # Log metadata
        mlflow.log_param("train_size", len(train_df))
        mlflow.log_param("valid_size", len(valid_df))
        mlflow.log_param("spectrum_len", len(train_df["spectrum"].iloc[0]))
        
        # bool_column: num_positive
        for split_name, split_df in zip(["train", "valid"], [train_df, valid_df]):
            pos_counts = {}
            for col in split_df.columns:
                if split_df[col].dtype == bool:
                    pos_counts[col] = split_df[col].sum()
            mlflow.log_param(f"{split_name}_pos", pos_counts)

        # df head txt artifacts
        train_head_path = os.path.join(local_dir, "train_df_head.txt")
        valid_head_path = os.path.join(local_dir, "valid_df_head.txt")

        with open(train_head_path, "w") as f:
            f.write(str(train_df.head(n=5)))
        with open(valid_head_path, "w") as f:
            f.write(str(valid_df.head(n=5)))

        mlflow.log_artifact(train_head_path)
        mlflow.log_artifact(valid_head_path)

        if description:
            mlflow.set_tag("description", description)

def load_ftir() -> pd.DataFrame:
    """
    Return the non-augmented FTIR dataset as a DataFrame.

    The CSV artifact is downloaded from MLflow into ``./tmp/`` when no local
    copy exists; otherwise the local file is reused. Every ``spectrum`` cell
    is parsed from its string form into a float32 numpy array.
    """

    def _parse_spectrum(text):
        # The CSV stores each spectrum as a Python-literal list string.
        return np.array(ast.literal_eval(text), dtype=np.float32)

    cache_dir = "./tmp/"
    artifact_name = "FTIR_split.csv"
    cached_csv = os.path.join(cache_dir, artifact_name)

    file_is_cached = os.path.exists(cached_csv)
    if not file_is_cached:
        print(f"Artifact {artifact_name} not found in local path. Downloading...")
        os.makedirs(cache_dir, exist_ok=True)
        mlflow.artifacts.download_artifacts(  # type: ignore
            run_id="f97846a98e434a5e907d6abce6ee1916",
            artifact_path=artifact_name,
            dst_path=cache_dir,
        )

    frame = pd.read_csv(cached_csv)
    frame["spectrum"] = frame["spectrum"].apply(_parse_spectrum)  # type: ignore
    return frame

## 1. download the original NIST dataset
This will take some time, because the spectra stored in the CSV have to be decoded.


In [None]:
# Load the non-augmented FTIR dataset with load_ftir(). The CSV is downloaded
# only when ./tmp/FTIR_split.csv does not exist yet; otherwise the local copy
# is read.
original_df = load_ftir() 
# Show the first three rows as the cell output.
original_df.head(3)

## 2. add extra data (e.g. chemmotion and graphformer)
Add chemmotion/graphformer samples and mark them

In [None]:
# Work on a copy of the original frame and tag every row with its origin.
nist_df = original_df.copy()
nist_df["source"] = "nist"

# Chemmotion
chemmotion_df = original_df.iloc[0:0].copy() # this creates an empty DataFrame with the same columns

# TODO load chemmotion

# Mark the extra samples with their origin and flag all of them is_train = True.
chemmotion_df["source"] = "chemmotion"
chemmotion_df["is_train"] = True

# Graphformer
# NOTE(review): the variable name `graphgormer_df` looks like a typo for
# "graphformer"; it is left unchanged because other cells may reference it.
graphgormer_df = original_df.iloc[0:0].copy() # empty placeholder with the same columns

# TODO load graphformer

# Same marking as above: origin tag plus is_train = True.
graphgormer_df["source"] = "graphformer"
graphgormer_df["is_train"] = True

# Stack the three sources into one frame with a fresh 0..n-1 index.
combined_df = pd.concat([nist_df, chemmotion_df, graphgormer_df], ignore_index=True)

## 3. add extra columns to the dataset
add columns for hydrogen bonding info, etc.

In [None]:
# Start from a copy so that combined_df itself is not modified when new
# columns are added below.
extra_df = combined_df.copy()
# TODO add extra columns to extra_df
# Show the first three rows as the cell output.
extra_df.head(3)

## 4. split the dataset into training and validation parts
We split off the validation dataset after adding extra columns, but before creating new samples (augmenting), because we want to test the model on real samples, not artificial ones created by e.g. LSER.

Augmented samples are only meant to improve training.

In [None]:

# Columns that are the actual prediction targets. They are used below to
# check that the training and validation sets have a similar distribution.
#
# This can probably be just left to the functional group names: auxiliary
# targets such as hydrogen bonding are predicted too (so that the model has
# more to learn), but they are not what we are actually interested in.
target_names = [
    'alkane', 'methyl', 'alkene', 'alkyne', 'alcohols', 'amines',
    'nitriles', 'aromatics', 'alkyl halides', 'esters', 'ketones',
    'aldehydes', 'carboxylic acids', 'ether', 'acyl halides',
    'amides', 'nitro',
]

# Partition on the is_train column. Both masks are explicit comparisons, so a
# row whose flag is neither True nor False ends up in neither split.
is_train_rows = extra_df["is_train"] == True
is_valid_rows = extra_df["is_train"] == False
train_df = extra_df[is_train_rows].copy()
valid_df = extra_df[is_valid_rows].copy()

# Optional: compare the mean of every target column in the two splits.
print("\nLabel distribution comparison:")
for col in target_names:
    train_mean = train_df[col].mean()
    valid_mean = valid_df[col].mean()
    print(f"{col}: Train={train_mean:.3f}, Valid={valid_mean:.3f}")

## 5. augment the training dataset
Here we add extra samples via LSER.

In [None]:
from LSER_augment2 import debug_apply_lser_shifts
from tqdm import tqdm

# One (pi_star, beta, alpha) triple per augmentation variant; the three lists
# are read in parallel, position by position.
# NOTE(review): presumably solvent parameters for the LSER shifts - confirm.
pi_stars = [-0.33, -0.08, 0.14, 0.27, 0.59, 0.71, 0.88, 1.09]
betas = [0.00, 0.00, 0.71, 0.49, 0.18, 0.48, 0.76, 0.31 ]
alphas = [0.00, 0.00, 0.00, 0.00, 0.78, 0.08, 0.00, 1.17 ]
# 3602 evenly spaced points from 400 to 4001 inclusive (step 1).
wavenumbers = np.linspace(400,4001,3602)

# Only samples with at least one of these functional groups are augmented.
# Defined once here because the list does not change between rows.
fgs = ['alcohols', 'ketones', 'aldehydes', 'esters', 'amides', 'nitriles', 'carboxylic acids', 'alkyl halides', 'nitro']

new_rows = []

for row_idx in tqdm(range(len(train_df))):
    row = train_df.iloc[row_idx].copy()
    row["lser"] = False # original sample

    if any(row[fg] for fg in fgs):
        # zip() keeps the three lists in lockstep without a hard-coded length
        # and without reusing the outer loop's index variable.
        for pi_star, beta, alpha in zip(pi_stars, betas, alphas):
            row_copy = row.copy()
            spectrum = row_copy["spectrum"]
            # Fresh dict per call, in case the helper modifies its argument.
            functional_groups = {fg: row_copy[fg] for fg in fgs}
            # The second return value is not needed here. It gets a named
            # throwaway so that IPython's `_` (last output) is not overwritten.
            row_copy['spectrum'], _unused = debug_apply_lser_shifts(
                spectrum, wavenumbers, functional_groups, pi_star, beta, alpha
            )
            row_copy["lser"] = True
            new_rows.append(row_copy)

    # The untouched sample is appended after its augmented variants.
    new_rows.append(row)

# Build the frame once from all collected rows and renumber the index 0..n-1.
augmented_train_df = pd.DataFrame(new_rows).reset_index(drop=True)
augmented_train_df.head(3)

## 6. upload the new dataset

In [None]:
# upload_dataset(
#     train_df=augmented_train_df,
#     valid_df=valid_df,
#     dataset_name="dataset_FTIR_example_csv",
#     version_name="v1",
#     description="Example unchanged dataset"
# )