# Train machine learning model to identify autophagy positive cells

Now that we have extracted features for all single-cell images, we can train a machine learning model to identify autophagy-positive cells.

In [21]:
import lamindb as ln
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Declare every RandomForest hyperparameter we want to track, as name -> dtype.
# Re-running this is harmless: as the log below shows, lamindb returns the
# existing Param record when one with the same name is already registered.
rfc_param_dtypes = {
    "random_state": "int",
    "n_estimators": "int",
    "max_depth": "int",
    "min_samples_split": "int",
    "min_samples_leaf": "int",
    "max_features": "str",
    "criterion": "str",
    "bootstrap": "bool",
}
for param_name, param_dtype in rfc_param_dtypes.items():
    ln.Param(name=param_name, dtype=param_dtype).save()

# Concrete hyperparameter values; the same dict is later unpacked into
# RandomForestClassifier(**rfc_params).
rfc_params = dict(
    random_state=42,
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    criterion="gini",
    bootstrap=True,
)

# Start tracking this notebook run and attach the hyperparameter values to it.
ln.track(params=rfc_params)

[92m→[0m returning existing Param record with same name: 'random_state'
[92m→[0m returning existing Param record with same name: 'n_estimators'
[92m→[0m returning existing Param record with same name: 'max_depth'
[92m→[0m returning existing Param record with same name: 'min_samples_split'
[92m→[0m returning existing Param record with same name: 'min_samples_leaf'
[92m→[0m returning existing Param record with same name: 'max_features'
[92m→[0m returning existing Param record with same name: 'criterion'
[92m→[0m returning existing Param record with same name: 'bootstrap'
[92m→[0m loaded Transform('siQQy6o49VMq0000'), re-started Run('R84WQb7D...') at 2025-02-24 17:33:40 UTC
→ params: random_state=42, n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, criterion=gini, bootstrap=True
[92m→[0m notebook imports: lamindb==1.1.0 pandas==2.2.3 scikit-learn==1.6.1


Get the features for the wildtype (`WT`) and `EI24KO` knockout cells:

In [22]:
# Every artifact used below is annotated with the "autophagy imaging" study label.
study = ln.ULabel.get(name="autophagy imaging")
study_artifacts = ln.Artifact.filter(ulabels=study)

# Narrow the study's artifacts down by their second label.
sc_datasets = study_artifacts.filter(
    ulabels__name="scportrait single-cell images"
)
featurized_datasets = study_artifacts.filter(
    ulabels__name="single-cell image featurization results"
)

In [23]:
# load data
def _featurized_artifact(condition_label):
    """Return the single featurization artifact annotated with `condition_label`.

    `.one()` is expected to fail unless exactly one artifact matches.
    """
    matches = featurized_datasets.filter(ulabels__name=condition_label)
    return matches.distinct().one()


# Wildtype cells
wt_cells_afs = _featurized_artifact("WT")
features_wt = wt_cells_afs.load()

# EI24 knockout cells
ko_cells_afs = _featurized_artifact("EI24KO")
features_ko = ko_cells_afs.load()

In [24]:
# Remove columns we don't want to train on: `label`, `cell_id`, and every
# column whose name contains "mCherry".
non_training_columns = ["label", "cell_id"]
non_training_columns += [name for name in features_wt.columns if "mCherry" in name]
data = features_wt.drop(columns=non_training_columns)

# Separate features and target (`class` is the column the model predicts)
y = data["class"]
X = data.drop(columns="class")

# Split data into training and testing sets (20 % held out, fixed seed).
# NOTE(review): the split is not stratified by class — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TODO can or should we use https://docs.lamin.ai/lamindb.core.mappedcollection (https://docs.lamin.ai/scrna-mappedcollection)

# Train model with the tracked hyperparameters (`fit` returns the estimator itself)
clf = RandomForestClassifier(**rfc_params).fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = clf.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

Accuracy: 0.7
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.95      0.81        20
           1       0.67      0.20      0.31        10

    accuracy                           0.70        30
   macro avg       0.69      0.57      0.56        30
weighted avg       0.69      0.70      0.64        30



TODO add 1-3 takeaway sentences

In [25]:
# The ML parameters are already tracked via `ln.track(params=rfc_params)` in the setup cell above.
# Reference: https://docs.lamin.ai/track#track-parameters

In [26]:
# Apply the same column filtering to the knockout features that was applied to
# the wildtype training data: drop `label`, `cell_id` and all "mCherry" columns.
ko_non_training_columns = ["label", "cell_id"]
ko_non_training_columns += [name for name in features_ko.columns if "mCherry" in name]
data_ko = features_ko.drop(columns=ko_non_training_columns)

# Split into model inputs and the recorded class, then predict with the
# classifier trained on the wildtype data.
y_true = data_ko["class"]
X_ko = data_ko.drop(columns="class")
predictions_ko = clf.predict(X_ko)

In [27]:
# Put the model's KO predictions next to the recorded class of each cell.
pred_columns = {
    "prediction": predictions_ko,
    "label": y_true,
}
pred_results = pd.DataFrame(pred_columns)

In [28]:
# Restrict to cells whose recorded label is 0, then divide each column's sum by
# its non-null count. With 0/1 predictions, the `prediction` entry is the share
# of these cells that the model assigned to class 1.
label0_rows = pred_results.loc[pred_results["label"] == 0]
label0_rows.sum() / label0_rows.count()

prediction    0.133758
label         0.000000
dtype: float64

TODO add 1-3 takeaway sentences

In [29]:
# Close the run that `ln.track(...)` started at the top of the notebook.
# As the log below shows, lamindb waits for the notebook to be saved first.
ln.finish()

[94m•[0m please hit CTRL + s to save the notebook in your editor .... still waiting .... [92m✓[0m
[93m![0m cells [(0, 21)] were not run consecutively
[92m→[0m finished Run('R84WQb7D') after 22s at 2025-02-24 17:34:03 UTC
[92m→[0m go to: https://lamin.ai/scportrait/examples/transform/siQQy6o49VMq0000
[92m→[0m to update your notebook from the CLI, run: lamin save /home/lukas/code/lamin-usecases/docs/imaging4.ipynb
