# Train a Random Forest classifier to identify autophagy-positive cells

In [None]:
import lamindb as ln
import bionty
# Start lamindb run tracking for this notebook (the run is closed by ln.finish() in the last cell).
ln.track()

In [None]:
from scportrait.pipeline.featurization import CellFeaturizer, MLClusterClassifier
import pandas as pd
import numpy as np

In [None]:
# Labels used to pick out the artifacts that belong to this analysis.
study = ln.ULabel.get(name="autophagy imaging")
sc_image_dataset = ln.ULabel.get(name="scportrait single-cell images")
config_file = ln.ULabel.get(name="scportrait config")

# All single-cell image datasets registered under the study label.
sc_datasets = ln.Artifact.filter(ulabels=study).filter(ulabels=sc_image_dataset)

# The single config artifact of the study; .one() raises unless exactly one matches.
config = ln.Artifact.filter(ulabels=study).filter(ulabels=config_file).one()

In [None]:
# Featurizer that converts the single-cell images into the features our
# RandomForest classifier is trained on.
config_path = config.load().path
featurizer = CellFeaturizer(
    directory=".",
    config=config_path,
    project_location=None,
)

# We are going to train on wildtype (WT) cells.
genotype = ln.ULabel.get(name="WT")
wt_cells = sc_datasets.filter(ulabels=genotype)

# The stimulation conditions are the classes our classifier should be able to
# tell apart. The names are sorted so that the condition -> class index mapping
# is reproducible: iteration order of a set of strings can change between
# interpreter runs because of string hash randomization.
condition_names = sorted({a.features.get_values()["stimulation"] for a in wt_cells})
conditions = [ln.ULabel.get(name=name) for name in condition_names]

# condition_lookup maps each condition name to the integer stored in the
# "class" column of the feature table.
condition_lookup = {}
per_condition_results = []
for class_index, condition in enumerate(conditions):
    cells = wt_cells.filter(ulabels=condition)
    paths = [dataset.load().path for dataset in cells]
    dataset_lookup = {dataset.uid: idx for idx, dataset in enumerate(cells)}
    labels = list(dataset_lookup.values())
    # NOTE(review): only the first dataset of each condition is featurized
    # (paths[0] / labels[0]); confirm that one dataset per condition is expected.
    results = featurizer.process(
        extraction_dir=paths[0],
        labels=labels[0],
        return_results=True,
    )
    results["class"] = class_index
    # Record the actual class index (previously the constant 1 was stored for
    # every condition, which made the lookup useless).
    condition_lookup[condition.name] = class_index
    per_condition_results.append(results)

# Concatenate once instead of growing the frame inside the loop; stays None
# when no condition was found, as before.
features = pd.concat(per_condition_results) if per_condition_results else None

In [None]:
# Register the WT feature table as an artifact.
artifact = ln.Artifact.from_df(features, description="featurized single-cell images")
artifact = artifact.save()

# Link the artifact to the U2OS cell line record.
u2os = bionty.CellLine.filter(name="U2OS").one()
artifact.cell_lines.add(u2os)

# Annotate with the required metadata.
wt_metadata = {
    "study": "autophagy imaging",
    "artefact type": "single-cell image featurization results",
    "genotype": "WT",
}
artifact.features.add_values(wt_metadata)

Now let's do the same for the KO cells.

In [None]:
# Now featurize the EI24KO (knockout) cells.
genotype = ln.ULabel.get(name="EI24KO")
ko_cells = sc_datasets.filter(ulabels=genotype)

# The stimulation conditions are the classes our classifier should be able to
# tell apart. The names are sorted so that the condition -> class index mapping
# is reproducible between runs (iteration order of a set of strings can change
# because of string hash randomization) and does not depend on which genotype
# is being processed.
condition_names = sorted({a.features.get_values()["stimulation"] for a in ko_cells})
conditions = [ln.ULabel.get(name=name) for name in condition_names]

# condition_lookup maps each condition name to the integer stored in the
# "class" column of the feature table.
condition_lookup = {}
per_condition_results = []
for class_index, condition in enumerate(conditions):
    cells = ko_cells.filter(ulabels=condition)
    paths = [dataset.load().path for dataset in cells]
    dataset_lookup = {dataset.uid: idx for idx, dataset in enumerate(cells)}
    labels = list(dataset_lookup.values())
    # NOTE(review): only the first dataset of each condition is featurized
    # (paths[0] / labels[0]); confirm that one dataset per condition is expected.
    results = featurizer.process(
        extraction_dir=paths[0],
        labels=labels[0],
        return_results=True,
    )
    results["class"] = class_index
    # Record the actual class index (previously the constant 1 was stored for
    # every condition, which made the lookup useless).
    condition_lookup[condition.name] = class_index
    per_condition_results.append(results)

# Concatenate once instead of growing the frame inside the loop; stays None
# when no condition was found, as before.
features_ko = pd.concat(per_condition_results) if per_condition_results else None

In [None]:
# Register the KO feature table as an artifact. This must use features_ko:
# the previous code passed the WT table `features`, so the artifact annotated
# as genotype EI24KO actually contained the wildtype features.
artifact = ln.Artifact.from_df(features_ko, description="featurized single-cell images").save()

# Link the artifact to the U2OS cell line record.
artifact.cell_lines.add(bionty.CellLine.filter(name="U2OS").one())

# Annotate with the required metadata.
artifact.features.add_values(
    {
        "study": "autophagy imaging",
        "artefact type": "single-cell image featurization results",
        "genotype": "EI24KO",
    }
)

In [None]:
# Finish the lamindb run that ln.track() started at the top of the notebook.
ln.finish()