Author: Joseph Ko <br>
Reproducible notebook to train scikit-learn models from: "A Machine Learning Framework for Predicting Microphysical Properties of Ice Crystals from Cloud Particle Imagery" (Ko et al. 2025) <br>
Required packages: scikit-learn, joblib, numpy, pandas

# Imports and configuration

In [1]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
import joblib
import numpy as np
import pandas as pd
import os

!!! IMPORTANT NOTE !!! <br>
Paths throughout this notebook must be set by the user! <br>
The required idx files are inside idx.tar at: https://doi.org/10.5281/zenodo.15758769 <br>
The required tabular data is inside tabular-data.tar at: https://doi.org/10.5281/zenodo.15758769

In [2]:
# Split indices: one text file per split, each loaded as an integer array.
# The arrays are used further down as positional (.iloc) row selectors.
train_idx_path = "/home/jko/synth-ros-data/idx/idx-train-sequential-subset-700k.txt"
val_idx_path = "/home/jko/synth-ros-data/idx/idx-val-sequential-subset-700k.txt"
test_idx_path = "/home/jko/synth-ros-data/idx/idx-test-sequential-subset-700k.txt"
train_idx, val_idx, test_idx = (
    np.loadtxt(idx_path, dtype=int)
    for idx_path in (train_idx_path, val_idx_path, test_idx_path)
)

# Feature columns used by the single-view models.
features_single = [
    'aspect_ratio',
    'aspect_ratio_elip',
    'extreme_pts',
    'contour_area',
    'contour_perimeter',
    'area_ratio',
    'complexity',
    'circularity',
]
# Feature columns used by the stereo models: the same eight names,
# once with the suffix _1 and once with the suffix _2.
features_stereo = [
    'aspect_ratio_1',
    'aspect_ratio_elip_1',
    'extreme_pts_1',
    'contour_area_1',
    'contour_perimeter_1',
    'area_ratio_1',
    'complexity_1',
    'circularity_1',
    'aspect_ratio_2',
    'aspect_ratio_elip_2',
    'extreme_pts_2',
    'contour_area_2',
    'contour_perimeter_2',
    'area_ratio_2',
    'complexity_2',
    'circularity_2',
]
# Target columns for the regression and classification models.
targets_reg = ['rho_eff', 'sa_eff']
targets_cls = ['n_arms']

# Random seed passed as random_state to the estimators
# (this random seed was used for analysis in paper).
n_rand = 666

# Directory where the fitted scikit-learn models are written; created if missing.
out_dir = '/home/jko/temp/skl-models'
os.makedirs(out_dir, exist_ok=True)

# Number of CPUs available; passed as n_jobs to the estimators that accept it.
ncpus = 32

As a point of reference, this whole notebook runs in a few minutes with 32 CPUs.

# Linear baseline

## Single view

In [3]:
# Location of the single-view tabular dataset (parquet) read by the cells below.
data_dir, data_file = (
    '/home/jko/synth-ros-data/tabular-data-v2/shuffled_small',
    'ros-tabular-data-shuffled-default-subset-700000.parquet',
)
data_path = os.path.join(data_dir, data_file)

### Regression

In [4]:
# Load the single-view table and keep only rows whose 'view' column equals 'default'.
df = pd.read_parquet(data_path)
df = df.loc[df['view'] == 'default']
# Feature matrix and regression targets; X / y alias the full (unsplit) frames.
X = df_features = df[features_single]
y = df_targets = df[targets_reg]
# Positional train/val/test split driven by the pre-loaded index arrays.
X_train, y_train = df_features.iloc[train_idx], df_targets.iloc[train_idx]
X_val, y_val = df_features.iloc[val_idx], df_targets.iloc[val_idx]
X_test, y_test = df_features.iloc[test_idx], df_targets.iloc[test_idx]
# Fit ordinary least squares on the training split (fit() returns the estimator).
reg = LinearRegression().fit(X_train, y_train)
# Persist the fitted model. joblib.dump is the cell's last expression, so the
# list of written file paths it returns is shown as the cell output.
out_filename = 'lin_reg.pkl'
out_path = os.path.join(out_dir, out_filename)
joblib.dump(reg, out_path)

['/home/jko/temp/skl-models/lin_reg.pkl']

### Classification

In [5]:
# Load the single-view table and keep only rows whose 'view' column equals 'default'.
df = pd.read_parquet(data_path)
df = df.loc[df['view'] == 'default']
# Feature matrix and class target; X / y alias the full (unsplit) frames.
X = df_features = df[features_single]
y = df_targets = df[targets_cls]
# Positional train/val/test split driven by the pre-loaded index arrays.
X_train, y_train = df_features.iloc[train_idx], df_targets.iloc[train_idx]
X_val, y_val = df_features.iloc[val_idx], df_targets.iloc[val_idx]
X_test, y_test = df_features.iloc[test_idx], df_targets.iloc[test_idx]
# Standardize features: the scaler is fit on the training split only and then
# applied unchanged to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = scaler.transform(X_val), scaler.transform(X_test)
# Logistic regression (linear classifier) trained on the standardized features.
# The one-column target frame is flattened to a 1-D label array for fit().
linear_classifier = LogisticRegression(
    solver='saga',
    max_iter=1000,
    random_state=n_rand,
    n_jobs=ncpus,
)
linear_classifier.fit(X_train_scaled, y_train.values.ravel())
# Persist the fitted classifier (the returned path list is the cell output).
# NOTE(review): only the classifier is written to disk; the fitted scaler is not,
# so inputs must be standardized the same way before using the saved model.
out_filename = 'lin_cls.pkl'
out_path = os.path.join(out_dir, out_filename)
joblib.dump(linear_classifier, out_path)

['/home/jko/temp/skl-models/lin_cls.pkl']

## Stereo View (2DS)

In [6]:
# Location of the 2DS stereo tabular dataset (parquet) read by the cells below.
data_dir, data_file = (
    '/home/jko/synth-ros-data/tabular-data-v2/shuffled_small',
    'ros-tabular-data-stereo-default-2ds-shuffled-subset-700000.parquet',
)
data_path = os.path.join(data_dir, data_file)

### Regression

In [7]:
# Load the stereo table referenced by data_path.
df = pd.read_parquet(data_path)
# Stereo feature matrix and regression targets; X / y alias the full (unsplit) frames.
X = df_features = df[features_stereo]
y = df_targets = df[targets_reg]
# Positional train/val/test split driven by the pre-loaded index arrays.
X_train, y_train = df_features.iloc[train_idx], df_targets.iloc[train_idx]
X_val, y_val = df_features.iloc[val_idx], df_targets.iloc[val_idx]
X_test, y_test = df_features.iloc[test_idx], df_targets.iloc[test_idx]
# Fit ordinary least squares on the training split (fit() returns the estimator).
reg = LinearRegression().fit(X_train, y_train)
# Persist the fitted model. joblib.dump is the cell's last expression, so the
# list of written file paths it returns is shown as the cell output.
out_filename = 'lin_reg_stereo_2ds.pkl'
out_path = os.path.join(out_dir, out_filename)
joblib.dump(reg, out_path)

['/home/jko/temp/skl-models/lin_reg_stereo_2ds.pkl']

### Classification

In [8]:
# Load the stereo table referenced by data_path.
df = pd.read_parquet(data_path)
# Stereo feature matrix and class target; X / y alias the full (unsplit) frames.
X = df_features = df[features_stereo]
y = df_targets = df[targets_cls]
# Positional train/val/test split driven by the pre-loaded index arrays.
X_train, y_train = df_features.iloc[train_idx], df_targets.iloc[train_idx]
X_val, y_val = df_features.iloc[val_idx], df_targets.iloc[val_idx]
X_test, y_test = df_features.iloc[test_idx], df_targets.iloc[test_idx]
# Standardize features: the scaler is fit on the training split only and then
# applied unchanged to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = scaler.transform(X_val), scaler.transform(X_test)
# Logistic regression trained on the standardized features.
# The one-column target frame is flattened to a 1-D label array for fit().
linear_classifier = LogisticRegression(
    solver='saga',
    max_iter=1000,
    random_state=n_rand,
    n_jobs=ncpus,
)
linear_classifier.fit(X_train_scaled, y_train.values.ravel())
# Persist the fitted classifier (the returned path list is the cell output).
# NOTE(review): only the classifier is written to disk; the fitted scaler is not,
# so inputs must be standardized the same way before using the saved model.
out_filename = 'lin_cls_stereo_2ds.pkl'
out_path = os.path.join(out_dir, out_filename)
joblib.dump(linear_classifier, out_path)

['/home/jko/temp/skl-models/lin_cls_stereo_2ds.pkl']

## Stereo View (PHIPS)

In [9]:
# Location of the PHIPS stereo tabular dataset (parquet) read by the cells below.
# The original cell pointed at the 2DS file, so the models saved as *_stereo_phips.pkl
# were trained on exactly the same data as the *_stereo_2ds.pkl models.
# NOTE(review): the PHIPS filename below is inferred from the 2DS naming pattern;
# verify it against the contents of tabular-data.tar before running.
data_dir = '/home/jko/synth-ros-data/tabular-data-v2/shuffled_small'
data_file = 'ros-tabular-data-stereo-default-phips-shuffled-subset-700000.parquet'
data_path = os.path.join(data_dir, data_file)

### Regression

In [10]:
# Load the stereo table referenced by data_path.
df = pd.read_parquet(data_path)
# Stereo feature matrix and regression targets; X / y alias the full (unsplit) frames.
X = df_features = df[features_stereo]
y = df_targets = df[targets_reg]
# Positional train/val/test split driven by the pre-loaded index arrays.
X_train, y_train = df_features.iloc[train_idx], df_targets.iloc[train_idx]
X_val, y_val = df_features.iloc[val_idx], df_targets.iloc[val_idx]
X_test, y_test = df_features.iloc[test_idx], df_targets.iloc[test_idx]
# Fit ordinary least squares on the training split (fit() returns the estimator).
reg = LinearRegression().fit(X_train, y_train)
# Persist the fitted model. joblib.dump is the cell's last expression, so the
# list of written file paths it returns is shown as the cell output.
out_filename = 'lin_reg_stereo_phips.pkl'
out_path = os.path.join(out_dir, out_filename)
joblib.dump(reg, out_path)

['/home/jko/temp/skl-models/lin_reg_stereo_phips.pkl']

### Classification

In [11]:
# Load the stereo table referenced by data_path.
df = pd.read_parquet(data_path)
# Stereo feature matrix and class target; X / y alias the full (unsplit) frames.
X = df_features = df[features_stereo]
y = df_targets = df[targets_cls]
# Positional train/val/test split driven by the pre-loaded index arrays.
X_train, y_train = df_features.iloc[train_idx], df_targets.iloc[train_idx]
X_val, y_val = df_features.iloc[val_idx], df_targets.iloc[val_idx]
X_test, y_test = df_features.iloc[test_idx], df_targets.iloc[test_idx]
# Standardize features: the scaler is fit on the training split only and then
# applied unchanged to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = scaler.transform(X_val), scaler.transform(X_test)
# Logistic regression trained on the standardized features.
# The one-column target frame is flattened to a 1-D label array for fit().
linear_classifier = LogisticRegression(
    solver='saga',
    max_iter=1000,
    random_state=n_rand,
    n_jobs=ncpus,
)
linear_classifier.fit(X_train_scaled, y_train.values.ravel())
# Persist the fitted classifier (the returned path list is the cell output).
# NOTE(review): only the classifier is written to disk; the fitted scaler is not,
# so inputs must be standardized the same way before using the saved model.
out_filename = 'lin_cls_stereo_phips.pkl'
out_path = os.path.join(out_dir, out_filename)
joblib.dump(linear_classifier, out_path)

['/home/jko/temp/skl-models/lin_cls_stereo_phips.pkl']

# Random Forest

In [12]:
# Location of the single-view tabular dataset (parquet) used by the random forest cells.
data_dir, data_file = (
    '/home/jko/synth-ros-data/tabular-data-v2/shuffled_small',
    'ros-tabular-data-shuffled-default-subset-700000.parquet',
)
data_path = os.path.join(data_dir, data_file)

## Regression

In [13]:
# Hyperparameters (tuned with grid search).
max_depth, min_samples_leaf, n_estimators = 20, 10, 200
# Load the single-view table and keep only rows whose 'view' column equals 'default'.
df = pd.read_parquet(data_path)
df = df.loc[df['view'] == 'default']
# Feature matrix and regression targets; X / y alias the full (unsplit) frames.
X = df_features = df[features_single]
y = df_targets = df[targets_reg]
# Positional train/val/test split driven by the pre-loaded index arrays.
X_train, y_train = df_features.iloc[train_idx], df_targets.iloc[train_idx]
X_val, y_val = df_features.iloc[val_idx], df_targets.iloc[val_idx]
X_test, y_test = df_features.iloc[test_idx], df_targets.iloc[test_idx]
# Random forest regressor, seeded with n_rand and parallelized over ncpus jobs.
reg = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    n_jobs=ncpus,
    random_state=n_rand,
)
reg.fit(X_train, y_train)
# Persist the fitted model. joblib.dump is the cell's last expression, so the
# list of written file paths it returns is shown as the cell output.
out_filename = 'rf_reg.pkl'
out_path = os.path.join(out_dir, out_filename)
joblib.dump(reg, out_path)

['/home/jko/temp/skl-models/rf_reg.pkl']

## Classification

In [14]:
# set hyperparameters (tuned with grid search)
max_depth = 10
min_samples_split = 5
n_estimators = 200
# Load the single-view table and keep only rows whose 'view' column equals 'default'.
df = pd.read_parquet(data_path)
df = df[df['view'] == 'default']
# Feature matrix and class target; X / y alias the full (unsplit) frames.
df_features = df[features_single]
df_targets = df[targets_cls]
X = df_features
y = df_targets
# Positional train/val/test split driven by the pre-loaded index arrays.
X_train = df_features.iloc[train_idx]
X_val = df_features.iloc[val_idx]
X_test = df_features.iloc[test_idx]
y_train = df_targets.iloc[train_idx]
y_val = df_targets.iloc[val_idx]
y_test = df_targets.iloc[test_idx]
# Build the classifier from the hyperparameter variables defined at the top of
# this cell. The original repeated the literals 10 / 5 / 200 here, so editing
# the variables had no effect on the trained model.
classifier = RandomForestClassifier(
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    n_estimators=n_estimators,
    n_jobs=ncpus,
    random_state=n_rand,
)
# Standardize features: the scaler is fit on the training split only and then
# applied unchanged to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
# The one-column target frame is flattened to a 1-D label array for fit().
classifier.fit(X_train_scaled, y_train.values.ravel())
# Persist the fitted classifier (the returned path list is the cell output).
# NOTE(review): only the classifier is written to disk; the fitted scaler is not,
# so inputs must be standardized the same way before using the saved model.
out_filename = 'rf_cls.pkl'
out_path = os.path.join(out_dir, out_filename)
joblib.dump(classifier, out_path)

['/home/jko/temp/skl-models/rf_cls.pkl']