In [2]:
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, classification_report


# Checking CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [4]:
data_root = "/home/kgulbarg/thesis/INR_SCHIC/Nagano_Data"
LABEL_TO_ID = {"G1": 0, "early_S": 1, "mid_S": 2, "late_S": 3}
emb_dim = 256

In [5]:
X, y = [], []

for label in sorted(os.listdir(data_root)):
    label_dir = os.path.join(data_root, label)

    # only real label folders
    if not os.path.isdir(label_dir) or label not in LABEL_TO_ID:
        continue

    # iterate files inside the label folder
    for name in sorted(os.listdir(label_dir)):
        if not (name.endswith("_latent.npy") and
                os.path.isfile(os.path.join(label_dir, name))):
            continue

        latent_path = os.path.join(label_dir, name)
        vec = np.load(latent_path).ravel()
        X.append(vec)
        y.append(LABEL_TO_ID[label])

    print(f"Collected latent representations for {label} cells.")


X = np.vstack(X)
y = np.array(y)     

print(X.shape)
print(y.shape)

Collected latent representations for G1 cells.
Collected latent representations for early_S cells.
Collected latent representations for late_S cells.
Collected latent representations for mid_S cells.
(1171, 256)
(1171,)


### Classifier

In [8]:
def train_classifier(latent_vecs, labels, epochs=500, lr=5e-3, val_split=0.2, seed=0):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    X = torch.as_tensor(latent_vecs, dtype=torch.float32, device=device)
    y = torch.as_tensor(labels,     dtype=torch.long,   device=device).view(-1)

    class Classifier(nn.Module):
        def __init__(self, emb_dim=256):
            super().__init__()
            self.norm = nn.LayerNorm(emb_dim)
            self.fc1  = nn.Linear(emb_dim, 128)
            self.fc2  = nn.Linear(128, 64)
            self.fc3  = nn.Linear(64, 4)
            self.drop = nn.Dropout(0.1)    # avoid overfitting
        def forward(self, x):
            x = self.norm(x)
            x = F.relu(self.fc1(x)); x = self.drop(x)
            x = F.relu(self.fc2(x))
            return self.fc3(x)   # logits

    # stratified split
    idx_tr_list, idx_val_list = [], []
    cpu_y = y.cpu().numpy()
    for c in range(4):
        idxs = np.where(cpu_y == c)[0]
        rng = np.random.default_rng(seed)
        rng.shuffle(idxs)
        cut_c = int((1 - val_split) * len(idxs))
        idx_tr_list.append(torch.as_tensor(idxs[:cut_c], device=device))
        idx_val_list.append(torch.as_tensor(idxs[cut_c:], device=device))
    idx_tr = torch.cat(idx_tr_list); idx_val = torch.cat(idx_val_list)
    Xtr, Xval = X[idx_tr], X[idx_val]; ytr, yval = y[idx_tr], y[idx_val]

    # train
    model = Classifier().to(device)
    opt = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    loss_fn = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        opt.zero_grad()
        loss = loss_fn(model(Xtr), ytr)
        loss.backward(); opt.step()
        # print(f"epoch {epoch}, loss {loss.item():.4f}")

    # evaluate
    with torch.no_grad():
        train_acc = (model(Xtr).argmax(1) == ytr).float().mean().item()
        val_logits = model(Xval)
        val_acc = (val_logits.argmax(1) == yval).float().mean().item()
        yp = val_logits.argmax(1).cpu().numpy()

    print(confusion_matrix(yval.cpu().numpy(), yp))
    print(classification_report(yval.cpu().numpy(), yp, digits=3))

    return model, train_acc, val_acc


In [9]:
model, train_acc, val_acc = train_classifier(X, y)
print(f"train acc = {train_acc:.3f}, val acc = {val_acc:.3f}")

[[13 14 10 19]
 [14 19 12 16]
 [14 10  9 20]
 [16 21 15 14]]
              precision    recall  f1-score   support

           0      0.228     0.232     0.230        56
           1      0.297     0.311     0.304        61
           2      0.196     0.170     0.182        53
           3      0.203     0.212     0.207        66

    accuracy                          0.233       236
   macro avg      0.231     0.231     0.231       236
weighted avg      0.232     0.233     0.232       236

train acc = 1.000, val acc = 0.233


### LGBM Classifier

In [6]:
print(y)

[0 0 0 ... 2 2 2]


In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb

# Ensure numeric X
X = X.astype(np.float32)

# Encode y if strings/categorical
if y.dtype.kind in {"U", "S", "O"}:
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    target_names = le.classes_
else:
    y_enc = y
    target_names = None

# Split
X_tr, X_va, y_tr, y_va = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# Model
num_classes = len(np.unique(y_enc))
clf = lgb.LGBMClassifier(
    objective="multiclass" if num_classes > 2 else "binary",
    n_estimators=2000,
    learning_rate=0.05,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    class_weight="balanced",
)

clf.fit(
    X_tr, y_tr,
    eval_set=[(X_va, y_va)],
    eval_metric="multi_logloss" if num_classes > 2 else "logloss",
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)]
)

# Evaluate
y_pred = clf.predict(X_va)
print("Val accuracy:", accuracy_score(y_va, y_pred))
print(classification_report(y_va, y_pred, target_names=target_names))
# Optional: feature importance
# print(clf.feature_importances_)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33013
[LightGBM] [Info] Number of data points in the train set: 936, number of used features: 256
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_logloss: 1.35143
[100]	valid_0's multi_logloss: 1.39526
Early stopping, best iteration is:
[26]	valid_0's multi_logloss: 1.34075
Val accuracy: 0.3872340425531915
              precision    recall  f1-score   support

           0       0.50      0.59      0.54        56
           1       0.39      0.36      0.38        61
           2       0.28      0.25      0.26        53
           3       0.34      0.35

