# Train a Random Forest classifier to identify autophagy-positive cells

In [None]:
import lamindb as ln
# Start lamindb run tracking for this notebook; the matching ln.finish() call
# at the end of the notebook closes the run.
ln.track()

In [None]:
from scportrait.pipeline.featurization import CellFeaturizer, MLClusterClassifier
import pandas as pd
import numpy as np

In [None]:
# Look up the labels that are used below to select artifacts.
study = ln.ULabel.get(name="autophagy imaging")
sc_image_dataset = ln.ULabel.get(name="scportrait single-cell images")
featurized_cells = ln.ULabel.get(name="single-cell image featurization results")
WT = ln.ULabel.get(name="WT")
KO = ln.ULabel.get(name="EI24KO")

# Start from all artifacts tagged with the study label, then narrow that
# query once per dataset type.
study_artifacts = ln.Artifact.filter(ulabels=study)
sc_datasets = study_artifacts.filter(ulabels=sc_image_dataset)
featurized_datasets = study_artifacts.filter(ulabels=featurized_cells)

In [None]:
# WT: pick the single featurization artifact carrying the WT label and load it.
# `.one()` is expected to fail unless exactly one artifact matches.
wt_cells = featurized_datasets.filter(ulabels=WT).one()
features = wt_cells.load()

# EI24KO: same lookup, using the knockout label instead.
ko_cells = featurized_datasets.filter(ulabels=KO).one()
features_ko = ko_cells.load()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Remove the columns we do not want to train on: `label`, `cell_id`, and
# every column whose name contains "mCherry".
data = features.drop(columns=["label", "cell_id"])
mcherry_columns = [column for column in data.columns if "mCherry" in column]
data = data.drop(columns=mcherry_columns)

# The `class` column is the prediction target; everything else is a feature.
y = data["class"]
X = data.drop(columns="class")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest with default hyperparameters and a fixed seed.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = clf.predict(X_test)

# Report accuracy and the per-class metrics on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

In [None]:
# TODO: record the ML parameters of this run with lamindb, see
# https://docs.lamin.ai/track#track-parameters

In [None]:
# Apply the same column filtering to the EI24KO table that was applied to the
# training data: drop `label`, `cell_id`, and every "mCherry" column.
data_ko = features_ko.drop(columns=["label", "cell_id"])
mcherry_columns_ko = [column for column in data_ko.columns if "mCherry" in column]
data_ko = data_ko.drop(columns=mcherry_columns_ko)

# Split off the `class` column, then classify every EI24KO row with the
# trained classifier.
y_true = data_ko["class"]
X_ko = data_ko.drop(columns="class")
predictions_ko = clf.predict(X_ko)

In [None]:
# Put the predicted class next to the `class` value stored in the EI24KO table.
pred_results = pd.DataFrame(
    {
        "prediction": predictions_ko,
        "label": y_true,
    }
)

In [None]:
# Restrict to rows whose stored label is 0, then divide the column sums by the
# column counts. For the `prediction` column this is the mean predicted value
# among those rows.
label0_results = pred_results[pred_results["label"] == 0]
label0_results.sum() / label0_results.count()

In [None]:
# Finish the lamindb run that ln.track() started at the top of the notebook.
ln.finish()