In [2]:
import lightgbm
print(lightgbm.LGBMClassifier)

<class 'lightgbm.sklearn.LGBMClassifier'>


In [3]:
import xgboost
print(xgboost.XGBClassifier)

<class 'xgboost.sklearn.XGBClassifier'>


In [4]:
import sklearn

In [5]:
print(sklearn.ensemble.RandomForestClassifier)

<class 'sklearn.ensemble._forest.RandomForestClassifier'>


In [20]:
# === Imports ===
import os, time
import math
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

from lightgbm import LGBMClassifier

pd.set_option("display.max_columns",None)

In [7]:
# === Parameters ===
CSV_PATH = 'Nagano_Aggregated.csv'

RANDOM_STATE = 42
N_JOBS = -1  # used where supported

In [8]:
# === Load ===
df = pd.read_csv(CSV_PATH)

# Feature Engineering

In [9]:
def add_row_level_features(data: pd.DataFrame) -> pd.DataFrame:
    df2 = data.copy()

    # 1) Distance-based
    df2['Genomic_distance'] = (df2['Bin_2'] - df2['Bin_1']).abs()
    df2['Log_distance'] = np.log1p(df2['Genomic_distance'])

    # 2) Transformations of Count
    df2['Log_count'] = np.log1p(df2['Count'])

    # 3) Symmetry / order
    df2['Bin_min'] = df2[['Bin_1','Bin_2']].min(axis=1)
    df2['Bin_max'] = df2[['Bin_1','Bin_2']].max(axis=1)
    df2['Is_forward'] = (df2['Bin_1'] <= df2['Bin_2']).astype(int)
    df2['Midpoint'] = (df2['Bin_1'] + df2['Bin_2']) / 2.0

    # 4) Neighborhood / marginal features
    # Compute marginal sums per (Chr_name, Bin)
    # We'll do this efficiently by stacking
    tmp1 = df2[['Chr_name','Bin_1','Count']].rename(columns={'Bin_1':'Bin'})
    tmp2 = df2[['Chr_name','Bin_2','Count']].rename(columns={'Bin_2':'Bin'})
    # Counts #contacts for a bin with any other bin
    # Cell agnostic
    tmp = pd.concat([tmp1, tmp2], axis=0, ignore_index=True)
    bin_marginal = tmp.groupby(['Chr_name','Bin'])['Count'].sum().rename('Bin_total').reset_index()
    df2 = df2.merge(bin_marginal.rename(columns={'Bin':'Bin_1','Bin_total':'Bin1_total'}),
                    on=['Chr_name','Bin_1'], how='left')
    df2 = df2.merge(bin_marginal.rename(columns={'Bin':'Bin_2','Bin_total':'Bin2_total'}),
                    on=['Chr_name','Bin_2'], how='left')

    df2['Bin_total_min'] = df2[['Bin1_total','Bin2_total']].min(axis=1)
    df2['Bin_total_max'] = df2[['Bin1_total','Bin2_total']].max(axis=1)
    df2['Bin_total_sum'] = df2['Bin1_total'] + df2['Bin2_total']
    df2['Bin_total_diff_abs'] = (df2['Bin1_total'] - df2['Bin2_total']).abs()

    # 5) Observed/expected by distance (chromosome-specific)
    # Expected = mean Count at similar distance bucket within chromosome
    dist_bin = (df2['Genomic_distance'] / max(1, df2['Genomic_distance'].quantile(0.95)/50)).astype(int)
    df2['_dist_bin'] = dist_bin
    exp = df2.groupby(['Chr_name','_dist_bin'])['Count'].mean().rename('Expected_by_dist').reset_index()
    df2 = df2.merge(exp, on=['Chr_name','_dist_bin'], how='left')
    df2['Obs_over_exp_dist'] = df2['Count'] / (df2['Expected_by_dist'] + 1e-9)

    # 6) Percentile rank of Count within distance bin
    def pct_rank(group):
        return group.rank(pct=True)
    df2['Count_pct_in_dist_bin'] = df2.groupby(['Chr_name','_dist_bin'])['Count'].transform(pct_rank)

    # 7) Relative positions on chromosome (if you later add chr length, you can normalize)
    # For now, normalize by within-chr max bin seen in data
    chr_max_bin = pd.concat([
        df2.groupby('Chr_name')['Bin_1'].max(),
        df2.groupby('Chr_name')['Bin_2'].max()
    ], axis=1).max(axis=1).rename('Chr_max_bin')
    df2 = df2.merge(chr_max_bin, left_on='Chr_name', right_index=True, how='left')
    df2['Rel_pos_bin1'] = df2['Bin_1'] / (df2['Chr_max_bin'] + 1e-9)
    df2['Rel_pos_bin2'] = df2['Bin_2'] / (df2['Chr_max_bin'] + 1e-9)
    df2['Rel_pos_mid'] = df2['Midpoint'] / (df2['Chr_max_bin'] + 1e-9)

    # Clean up helper columns
    df2 = df2.drop(columns=['_dist_bin'], errors='ignore')

    return df2

# We aggregate row-level engineered features to per-Cell_name features.
# This is often more aligned with the cell-cycle label being per-cell rather than per-contact.

def aggregate_cell_level(df_rowfeat: pd.DataFrame) -> pd.DataFrame:
    # Choose subset of numeric engineered features to aggregate
    feature_cols = [
        'Genomic_distance','Log_distance','Log_count',
        'Bin_total_min','Bin_total_max','Bin_total_sum','Bin_total_diff_abs',
        'Obs_over_exp_dist','Count_pct_in_dist_bin',
        'Rel_pos_bin1','Rel_pos_bin2','Rel_pos_mid',
        'Chr_leading_eig','Chr_entropy'
    ]
    feature_cols = [c for c in feature_cols if c in df_rowfeat.columns]

    # Aggregations
    agg_funcs = {
        'mean':'mean','std':'std','min':'min','max':'max','median':'median'
    }

    grp = df_rowfeat.groupby(['Cell_name','Target_label'])
    pieces = []
    for fname in feature_cols:
        piece = grp[fname].agg(['mean','std','min','max','median'])
        piece.columns = [f"{fname}_{k}" for k in piece.columns]
        pieces.append(piece)
    agg_df = pd.concat(pieces, axis=1)

    # Also add distance-based proportions (short/medium/long)
    tmp = df_rowfeat.copy()
    q1 = tmp['Genomic_distance'].quantile(0.33)
    q2 = tmp['Genomic_distance'].quantile(0.66)
    tmp['dist_bucket'] = np.where(tmp['Genomic_distance']<=q1, 'short',
                           np.where(tmp['Genomic_distance']<=q2, 'medium','long'))
    prop = tmp.pivot_table(index=['Cell_name','Target_label'],
                           columns='dist_bucket', values='Count',
                           aggfunc='sum', fill_value=0)
    prop = prop.div(prop.sum(axis=1)+1e-9, axis=0)
    prop.columns = [f"prop_{c}" for c in prop.columns]
    agg_df = agg_df.join(prop, how='left')

    agg_df = agg_df.reset_index()
    return agg_df


In [10]:
# Provide your input CSV path here:
CSV_PATH = 'Nagano_Aggregated.csv'  # <-- CHANGE THIS to your file path

# Optional: sampling (set to None to use all rows)
ROW_SAMPLE_N = None  # e.g., 1_000_000 for speed during dev

# Memory guardrails for spectral/matrix features
MAX_BINS_PER_CHR_FOR_SPECTRAL = 2000   # build matrix only if unique bins per chr <= this
MAX_CONTACTS_PER_CHR_FOR_SPECTRAL = 400000  # skip spectral if too many edges
RANDOM_STATE = 42
N_JOBS = -1  # used where supported

df = pd.read_csv(CSV_PATH)

# Basic types
df['Bin_1'] = pd.to_numeric(df['Bin_1'], errors='coerce')
df['Bin_2'] = pd.to_numeric(df['Bin_2'], errors='coerce')
df['Count'] = pd.to_numeric(df['Count'], errors='coerce')

# Add row level features
df_feat = add_row_level_features(df)
print('Row-level features shape:', df_feat.shape)
df_feat.head()

Row-level features shape: (20214896, 26)


Unnamed: 0,Target_label,Cell_name,Chr_name,Bin_1,Bin_2,Count,Genomic_distance,Log_distance,Log_count,Bin_min,Bin_max,Is_forward,Midpoint,Bin1_total,Bin2_total,Bin_total_min,Bin_total_max,Bin_total_sum,Bin_total_diff_abs,Expected_by_dist,Obs_over_exp_dist,Count_pct_in_dist_bin,Chr_max_bin,Rel_pos_bin1,Rel_pos_bin2,Rel_pos_mid
0,late_S,late_S_273,chr1,95,95,72,0,0.0,4.290459,95,95,1,95.0,279412,279412,279412,279412,558824,0,47.049672,1.530298,0.722324,197,0.482234,0.482234,0.482234
1,late_S,late_S_273,chr1,14,14,42,0,0.0,3.7612,14,14,1,14.0,235870,235870,235870,235870,471740,0,47.049672,0.892674,0.589925,197,0.071066,0.071066,0.071066
2,late_S,late_S_273,chr1,45,84,2,39,3.688879,1.098612,45,84,1,64.5,215029,278594,215029,278594,493623,63565,2.601316,0.768842,0.537894,197,0.228426,0.426396,0.327411
3,late_S,late_S_273,chr1,52,53,6,1,0.693147,1.94591,52,53,1,52.5,271080,187824,187824,271080,458904,83256,47.049672,0.127525,0.173187,197,0.263959,0.269036,0.266497
4,late_S,late_S_273,chr1,159,159,53,0,0.0,3.988984,159,159,1,159.0,285184,285184,285184,285184,570368,0,47.049672,1.126469,0.632196,197,0.807107,0.807107,0.807107


In [11]:
# Add Cell level aggregate features
cell_df = aggregate_cell_level(df_feat)
print('Cell-level feature shape:', cell_df.shape)
cell_df.head()

Cell-level feature shape: (1171, 65)


Unnamed: 0,Cell_name,Target_label,Genomic_distance_mean,Genomic_distance_std,Genomic_distance_min,Genomic_distance_max,Genomic_distance_median,Log_distance_mean,Log_distance_std,Log_distance_min,Log_distance_max,Log_distance_median,Log_count_mean,Log_count_std,Log_count_min,Log_count_max,Log_count_median,Bin_total_min_mean,Bin_total_min_std,Bin_total_min_min,Bin_total_min_max,Bin_total_min_median,Bin_total_max_mean,Bin_total_max_std,Bin_total_max_min,Bin_total_max_max,Bin_total_max_median,Bin_total_sum_mean,Bin_total_sum_std,Bin_total_sum_min,Bin_total_sum_max,Bin_total_sum_median,Bin_total_diff_abs_mean,Bin_total_diff_abs_std,Bin_total_diff_abs_min,Bin_total_diff_abs_max,Bin_total_diff_abs_median,Obs_over_exp_dist_mean,Obs_over_exp_dist_std,Obs_over_exp_dist_min,Obs_over_exp_dist_max,Obs_over_exp_dist_median,Count_pct_in_dist_bin_mean,Count_pct_in_dist_bin_std,Count_pct_in_dist_bin_min,Count_pct_in_dist_bin_max,Count_pct_in_dist_bin_median,Rel_pos_bin1_mean,Rel_pos_bin1_std,Rel_pos_bin1_min,Rel_pos_bin1_max,Rel_pos_bin1_median,Rel_pos_bin2_mean,Rel_pos_bin2_std,Rel_pos_bin2_min,Rel_pos_bin2_max,Rel_pos_bin2_median,Rel_pos_mid_mean,Rel_pos_mid_std,Rel_pos_mid_min,Rel_pos_mid_max,Rel_pos_mid_median,prop_long,prop_medium,prop_short
0,G1_1,G1,13.573416,19.613125,0,152,3.0,1.733732,1.434151,0.0,5.030438,1.386294,1.447007,0.910755,0.693147,4.60517,1.098612,242084.917076,54971.445031,202,866408,242555.0,275830.012893,51198.188958,695,866408,278967.0,517914.929969,97894.553207,1390,1732816,521317.0,33745.095817,41266.028343,0,644563,21461.0,0.442431,0.339843,0.017152,3.651586,0.372153,0.321826,0.20443,0.013574,0.977653,0.226512,0.476313,0.270966,0.015228,1.0,0.465116,0.57534,0.271124,0.015228,1.0,0.590164,0.525826,0.26138,0.015228,1.0,0.527778,0.080471,0.064133,0.855396
1,G1_10,G1,14.334655,14.859562,0,138,10.0,2.153037,1.206297,0.0,4.934474,2.397895,1.57171,1.050115,0.693147,5.57973,1.098612,237703.98832,53912.039855,16,866408,237927.0,281042.945045,50360.343149,658,866408,284293.0,518746.933365,93799.776315,1316,1732816,520250.0,43338.956724,45683.108151,0,676341,31768.0,0.879588,0.724853,0.017152,6.729479,0.703813,0.477453,0.267246,0.013574,0.998679,0.504495,0.475761,0.261822,0.015228,1.0,0.475904,0.581675,0.265884,0.015228,1.0,0.593548,0.528718,0.257791,0.015228,1.0,0.538071,0.095503,0.096534,0.807964
2,G1_100,G1,14.258807,12.846144,0,119,12.0,2.240374,1.126506,0.0,4.787492,2.564949,1.432659,0.950612,0.693147,5.736572,1.098612,237269.247565,52762.832294,202,866408,237139.0,285076.002793,47915.722861,3271,866408,289395.0,522345.250358,89648.355457,6542,1732816,524342.0,47806.755228,46074.269412,0,676341,36528.0,0.811166,0.65321,0.017152,7.057525,0.674723,0.454343,0.26087,0.013574,0.999295,0.490453,0.472324,0.261037,0.015228,1.0,0.473684,0.57922,0.26196,0.015228,1.0,0.59542,0.525772,0.256526,0.015228,1.0,0.536184,0.125605,0.116094,0.7583
3,G1_101,G1,15.851466,15.682642,0,143,12.0,2.272627,1.190906,0.0,4.969813,2.564949,1.608215,1.080224,0.693147,5.988961,1.386294,235970.39358,53398.626857,237,866408,235944.0,281737.187189,48251.797399,658,866408,285542.0,517707.58077,91228.94329,1316,1732816,519277.5,45766.793609,45128.466751,0,674033,34591.5,1.03074,0.87969,0.017152,10.911307,0.737836,0.518441,0.280159,0.013574,0.999977,0.52287,0.467115,0.262684,0.015228,1.0,0.465409,0.583349,0.263072,0.015228,1.0,0.592,0.525232,0.256384,0.015228,1.0,0.530612,0.101818,0.089761,0.808421
4,G1_102,G1,13.594406,20.913794,0,157,2.0,1.653106,1.468138,0.0,5.062595,1.098612,1.372676,0.835581,0.693147,4.343805,1.098612,243078.080682,55100.264399,646,866408,243314.0,277515.49397,51708.792684,6545,866408,280954.0,520593.574652,97900.114417,13090,1732816,522968.5,34437.413288,42839.752012,0,674033,21381.0,0.364093,0.282027,0.017152,3.217198,0.340168,0.275471,0.180393,0.013574,0.965783,0.215409,0.475513,0.271072,0.015228,1.0,0.464302,0.574079,0.272932,0.015228,1.0,0.6,0.524796,0.261418,0.015228,1.0,0.53081,0.08305,0.05939,0.857561


# Load and add INR embeddings

In [14]:
import os, re
import numpy as np
import pandas as pd

data_root = data_root = "/home/kgulbarg/thesis/INR_SCHIC/Nagano_Data"

def build_latent_df(data_root: str) -> pd.DataFrame:
    """
    Walks data_root/<Target_label>/ and loads <Cell_name>_latent.npy.
    Returns a DataFrame with columns: ['Cell_name','Target_label','latent_0',...].
    """
    latents = {}
    dim = None
    pat = re.compile(r'^(?P<cell>.+)_latent\.npy$')

    for label in sorted(os.listdir(data_root)):
        label_dir = os.path.join(data_root, label)
        if not os.path.isdir(label_dir):
            continue

        for fname in sorted(os.listdir(label_dir)):
            m = pat.match(fname)
            if not m:
                continue
            cell = m.group('cell')
            path = os.path.join(label_dir, fname)

            vec = np.load(path).ravel()
            if dim is None:
                dim = vec.size
            elif vec.size != dim:
                raise ValueError(f"Inconsistent latent size for {path}: {vec.size} vs {dim}")

            # key by (Cell_name, Target_label)
            latents[(cell, label)] = vec.astype(np.float32)

    if not latents:
        raise RuntimeError("No *_latent.npy files found.")

    idx = pd.MultiIndex.from_tuples(list(latents.keys()), names=['Cell_name','Target_label'])    
    mat = np.vstack(list(latents.values()))
    cols = [f'latent_{i}' for i in range(mat.shape[1])]
    latent_df = pd.DataFrame(mat, index=idx, columns=cols).reset_index()
    return latent_df

# 1) Build latent dataframe from your folder tree
latent_df = build_latent_df(data_root)   # data_root points to the folder that contains G1/, early_S/, ...

# 2) Merge with your engineered features
cell_df_aug = cell_df.merge(latent_df, on=['Cell_name','Target_label'], how='left')

# 3) (Optional) sanity checks: missing latents?
missing = cell_df_aug[cell_df_aug.filter(like='latent_').isna().all(axis=1)][['Cell_name','Target_label']]
if len(missing):
    print(f"Rows with missing latents: {len(missing)}")
    # print(missing.head())

print(cell_df_aug.shape)
cell_df_aug.head()

(1171, 321)


Unnamed: 0,Cell_name,Target_label,Genomic_distance_mean,Genomic_distance_std,Genomic_distance_min,Genomic_distance_max,Genomic_distance_median,Log_distance_mean,Log_distance_std,Log_distance_min,Log_distance_max,Log_distance_median,Log_count_mean,Log_count_std,Log_count_min,Log_count_max,Log_count_median,Bin_total_min_mean,Bin_total_min_std,Bin_total_min_min,Bin_total_min_max,Bin_total_min_median,Bin_total_max_mean,Bin_total_max_std,Bin_total_max_min,Bin_total_max_max,Bin_total_max_median,Bin_total_sum_mean,Bin_total_sum_std,Bin_total_sum_min,Bin_total_sum_max,Bin_total_sum_median,Bin_total_diff_abs_mean,Bin_total_diff_abs_std,Bin_total_diff_abs_min,Bin_total_diff_abs_max,Bin_total_diff_abs_median,Obs_over_exp_dist_mean,Obs_over_exp_dist_std,Obs_over_exp_dist_min,Obs_over_exp_dist_max,Obs_over_exp_dist_median,Count_pct_in_dist_bin_mean,Count_pct_in_dist_bin_std,Count_pct_in_dist_bin_min,Count_pct_in_dist_bin_max,Count_pct_in_dist_bin_median,Rel_pos_bin1_mean,Rel_pos_bin1_std,Rel_pos_bin1_min,Rel_pos_bin1_max,Rel_pos_bin1_median,Rel_pos_bin2_mean,Rel_pos_bin2_std,Rel_pos_bin2_min,Rel_pos_bin2_max,Rel_pos_bin2_median,Rel_pos_mid_mean,Rel_pos_mid_std,Rel_pos_mid_min,Rel_pos_mid_max,Rel_pos_mid_median,prop_long,prop_medium,prop_short,latent_0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,latent_10,latent_11,latent_12,latent_13,latent_14,latent_15,latent_16,latent_17,latent_18,latent_19,latent_20,latent_21,latent_22,latent_23,latent_24,latent_25,latent_26,latent_27,latent_28,latent_29,latent_30,latent_31,latent_32,latent_33,latent_34,latent_35,latent_36,latent_37,latent_38,latent_39,latent_40,latent_41,latent_42,latent_43,latent_44,latent_45,latent_46,latent_47,latent_48,latent_49,latent_50,latent_51,latent_52,latent_53,latent_54,latent_55,latent_56,latent_57,latent_58,latent_59,latent_60,latent_61,latent_62,latent_63,latent_64,latent_65,latent_66,latent_67,latent_68,latent_69,latent_70,latent_71,latent_72,latent_73,latent_74,latent_75,latent_76,latent_77,latent_78,latent_79,latent_80,latent_81,latent_82,latent_83,latent_84,latent_85,latent_86,latent_87,latent_88,latent_89,latent_90,latent_91,latent_92,latent_93,latent_94,latent_95,latent_96,latent_97,latent_98,latent_99,latent_100,latent_101,latent_102,latent_103,latent_104,latent_105,latent_106,latent_107,latent_108,latent_109,latent_110,latent_111,latent_112,latent_113,latent_114,latent_115,latent_116,latent_117,latent_118,latent_119,latent_120,latent_121,latent_122,latent_123,latent_124,latent_125,latent_126,latent_127,latent_128,latent_129,latent_130,latent_131,latent_132,latent_133,latent_134,latent_135,latent_136,latent_137,latent_138,latent_139,latent_140,latent_141,latent_142,latent_143,latent_144,latent_145,latent_146,latent_147,latent_148,latent_149,latent_150,latent_151,latent_152,latent_153,latent_154,latent_155,latent_156,latent_157,latent_158,latent_159,latent_160,latent_161,latent_162,latent_163,latent_164,latent_165,latent_166,latent_167,latent_168,latent_169,latent_170,latent_171,latent_172,latent_173,latent_174,latent_175,latent_176,latent_177,latent_178,latent_179,latent_180,latent_181,latent_182,latent_183,latent_184,latent_185,latent_186,latent_187,latent_188,latent_189,latent_190,latent_191,latent_192,latent_193,latent_194,latent_195,latent_196,latent_197,latent_198,latent_199,latent_200,latent_201,latent_202,latent_203,latent_204,latent_205,latent_206,latent_207,latent_208,latent_209,latent_210,latent_211,latent_212,latent_213,latent_214,latent_215,latent_216,latent_217,latent_218,latent_219,latent_220,latent_221,latent_222,latent_223,latent_224,latent_225,latent_226,latent_227,latent_228,latent_229,latent_230,latent_231,latent_232,latent_233,latent_234,latent_235,latent_236,latent_237,latent_238,latent_239,latent_240,latent_241,latent_242,latent_243,latent_244,latent_245,latent_246,latent_247,latent_248,latent_249,latent_250,latent_251,latent_252,latent_253,latent_254,latent_255
0,G1_1,G1,13.573416,19.613125,0,152,3.0,1.733732,1.434151,0.0,5.030438,1.386294,1.447007,0.910755,0.693147,4.60517,1.098612,242084.917076,54971.445031,202,866408,242555.0,275830.012893,51198.188958,695,866408,278967.0,517914.929969,97894.553207,1390,1732816,521317.0,33745.095817,41266.028343,0,644563,21461.0,0.442431,0.339843,0.017152,3.651586,0.372153,0.321826,0.20443,0.013574,0.977653,0.226512,0.476313,0.270966,0.015228,1.0,0.465116,0.57534,0.271124,0.015228,1.0,0.590164,0.525826,0.26138,0.015228,1.0,0.527778,0.080471,0.064133,0.855396,0.642443,0.0,0.401247,0.42939,0.0,0.352586,0.389757,0.549973,0.0,0.452508,0.236006,0.0,0.399444,0.0,0.0,0.0,0.0,0.31367,0.469898,0.396538,0.0,0.0,0.234453,0.0,0.0,0.0,0.0,0.0,0.147327,0.0,0.426035,0.0,0.401745,0.41692,0.41365,0.0,0.0,0.474021,0.0,0.0,0.0,0.0,0.0,0.34748,0.0,0.258347,0.0,0.304485,0.0,0.0,0.0,0.436573,0.0,0.0,0.251139,0.0,0.0,0.483704,0.380901,0.5317,0.358215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.270945,0.478178,0.0,0.0,0.543027,0.0,0.0,0.506631,0.0,0.387376,0.0,0.327745,0.0,0.0,0.572741,0.0,0.0,0.293408,0.15358,0.0,0.0,0.423925,0.437509,0.461633,0.0,0.0,0.0,0.0,0.0,0.430555,0.0,0.487585,0.0,0.0,0.0,0.0,0.53268,0.408922,0.413503,0.0,0.0,0.0,0.0,0.0,0.337109,0.0,0.0,0.0,0.452588,0.0,0.0,0.381829,0.0,0.353133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.296,0.413722,0.0,0.0,0.479518,0.430811,0.075826,0.0,0.0,0.0,0.393451,0.0,0.0,0.404621,0.469484,0.257202,0.0,0.0,0.0,0.410628,0.0,0.0,0.391851,0.383968,0.497917,0.476233,0.540704,0.357756,0.0,0.0,0.0,0.0,0.430979,0.0,0.40047,0.0,0.435026,0.0,0.0,0.0,0.227962,0.502107,0.0,0.0,0.0,0.0,0.0,0.0,0.247229,0.0,0.0,0.563266,0.0,0.445383,0.0,0.283605,0.0,0.109649,0.357684,0.518322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408928,0.0,0.0,0.0,0.0,0.0,0.0,0.47439,0.373156,0.57044,0.0,0.0,0.0,0.0,0.0,0.320076,0.108793,0.0,0.0,0.349661,0.404013,0.392637,0.436641,0.399356,0.324703,0.0,0.434408,0.0,0.47029,0.414034,0.0,0.280679,0.0,0.381245,0.238449,0.133715,0.0,0.0,0.0,0.441153,0.580427,0.0,0.0,0.0,0.448834,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,G1_10,G1,14.334655,14.859562,0,138,10.0,2.153037,1.206297,0.0,4.934474,2.397895,1.57171,1.050115,0.693147,5.57973,1.098612,237703.98832,53912.039855,16,866408,237927.0,281042.945045,50360.343149,658,866408,284293.0,518746.933365,93799.776315,1316,1732816,520250.0,43338.956724,45683.108151,0,676341,31768.0,0.879588,0.724853,0.017152,6.729479,0.703813,0.477453,0.267246,0.013574,0.998679,0.504495,0.475761,0.261822,0.015228,1.0,0.475904,0.581675,0.265884,0.015228,1.0,0.593548,0.528718,0.257791,0.015228,1.0,0.538071,0.095503,0.096534,0.807964,0.0,0.0,0.481217,0.0,0.0,0.291385,0.0,0.145023,0.0,0.0,0.0,0.382594,0.0,0.240666,0.0,0.404465,0.0,0.0,0.456903,0.0,0.0,0.0,0.0,0.491596,0.0,0.0,0.500439,0.202437,0.0,0.0,0.434649,0.310721,0.337824,0.0,0.425194,0.0,0.0,0.0,0.0,0.0,0.469815,0.292756,0.380586,0.0,0.0,0.0,0.463346,0.0,0.367286,0.0,0.299047,0.0,0.337063,0.0,0.0,0.535068,0.354222,0.0,0.310702,0.183038,0.337254,0.0,0.337668,0.377522,0.0,0.0,0.0,0.440194,0.483119,0.0,0.447779,0.0,0.433084,0.330356,0.0,0.427571,0.0,0.0,0.453537,0.374587,0.0,0.427344,0.0,0.397394,0.117931,0.0,0.0,0.0,0.0,0.0,0.370681,0.0,0.274212,0.0,0.0,0.49357,0.0,0.0,0.0,0.0,0.365468,0.0,0.377389,0.0,0.0,0.0,0.509017,0.268736,0.432762,0.0,0.0,0.285496,0.497481,0.037232,0.0,0.482322,0.0,0.294735,0.460971,0.0,0.406051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.30991,0.0,0.0,0.446105,0.0,0.0,0.0,0.491707,0.494447,0.0,0.0,0.399527,0.45885,0.0,0.0,0.456039,0.434679,0.0,0.484965,0.0,0.431326,0.0,0.0,0.0,0.0,0.0,0.0,0.34333,0.0,0.197661,0.342192,0.0,0.0,0.0,0.0,0.0,0.442835,0.0,0.0,0.0,0.303155,0.49818,0.402737,0.0,0.0,0.0,0.0,0.360475,0.235497,0.033281,0.0,0.0,0.354302,0.158305,0.0,0.0,0.4571,0.370281,0.0,0.0,0.445662,0.0,0.205396,0.215876,0.299694,0.0,0.40906,0.0,0.0,0.385694,0.0,0.0,0.0,0.489995,0.397044,0.0,0.388142,0.0,0.504503,0.0,0.0,0.0,0.504907,0.0,0.397604,0.309168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143978,0.370459,0.396621,0.364284,0.0,0.0,0.277223,0.336845,0.0,0.535434,0.40105,0.0,0.425981,0.342679,0.350609,0.0,0.526145,0.0,0.44044,0.0,0.0,0.0,0.271208,0.0,0.403849,0.447276,0.0,0.0,0.380577,0.384111,0.0,0.0
2,G1_100,G1,14.258807,12.846144,0,119,12.0,2.240374,1.126506,0.0,4.787492,2.564949,1.432659,0.950612,0.693147,5.736572,1.098612,237269.247565,52762.832294,202,866408,237139.0,285076.002793,47915.722861,3271,866408,289395.0,522345.250358,89648.355457,6542,1732816,524342.0,47806.755228,46074.269412,0,676341,36528.0,0.811166,0.65321,0.017152,7.057525,0.674723,0.454343,0.26087,0.013574,0.999295,0.490453,0.472324,0.261037,0.015228,1.0,0.473684,0.57922,0.26196,0.015228,1.0,0.59542,0.525772,0.256526,0.015228,1.0,0.536184,0.125605,0.116094,0.7583,0.0,0.332344,0.0,0.199447,0.407456,0.0,0.309861,0.128031,0.233504,0.196954,0.0,0.0,0.376688,0.148077,0.0,0.072547,0.0,0.0,0.0,0.0,0.0,0.265051,0.362549,0.0,0.0,0.318551,0.0,0.420586,0.0,0.0,0.0,0.455901,0.323006,0.0,0.330962,0.0,0.0,0.255534,0.0,0.272242,0.167816,0.0,0.0,0.0,0.0,0.079662,0.0,0.473346,0.093895,0.0,0.0,0.0,0.204034,0.0,0.254399,0.499035,0.0,0.0,0.0,0.135348,0.0,0.0,0.443524,0.220287,0.417176,0.402235,0.0,0.085818,0.0,0.0,0.0,0.278493,0.0,0.0,0.53363,0.414705,0.0,0.138497,0.0,0.0,0.387786,0.458726,0.0,0.104628,0.281042,0.0,0.0,0.179386,0.526493,0.296985,0.063769,0.389172,0.0,0.490626,0.0,0.391073,0.0,0.0,0.429909,0.0,0.0,0.122614,0.359332,0.099789,0.045935,0.0,0.0,0.0,0.0,0.338982,0.0,0.0,0.0,0.0,0.177963,0.0,0.0,0.0,0.0,0.276869,0.08818,0.0,0.461212,0.0,0.374938,0.0,0.0,0.0,0.406394,0.0,0.278941,0.381481,0.362549,0.218982,0.1261,0.368163,0.0,0.0,0.0,0.383694,0.0,0.0,0.15749,0.0,0.0,0.435136,0.293822,0.0,0.0,0.0,0.266977,0.0,0.283989,0.0,0.0,0.233984,0.354076,0.0,0.0,0.44914,0.0,0.0,0.0,0.0,0.464036,0.0,0.487259,0.0,0.0,0.239453,0.0,0.570879,0.337612,0.393504,0.0,0.0,0.28996,0.0,0.321482,0.0,0.0,0.0,0.0,0.0,0.400766,0.389276,0.0,0.0,0.223073,0.0,0.509601,0.37554,0.486603,0.003329,0.0,0.292191,0.390034,0.367545,0.338617,0.097986,0.002321,0.0,0.26318,0.0,0.42039,0.336951,0.232074,0.0,0.396692,0.055592,0.0,0.0,0.473306,0.0,0.279232,0.350634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500156,0.482952,0.0,0.0,0.0,0.0,0.50415,0.0,0.0,0.0,0.185413,0.4546,0.0,0.37228,0.0,0.257173,0.0,0.268543,0.321272,0.137866,0.389266,0.0,0.0,0.0,0.355219,0.0,0.0,0.0
3,G1_101,G1,15.851466,15.682642,0,143,12.0,2.272627,1.190906,0.0,4.969813,2.564949,1.608215,1.080224,0.693147,5.988961,1.386294,235970.39358,53398.626857,237,866408,235944.0,281737.187189,48251.797399,658,866408,285542.0,517707.58077,91228.94329,1316,1732816,519277.5,45766.793609,45128.466751,0,674033,34591.5,1.03074,0.87969,0.017152,10.911307,0.737836,0.518441,0.280159,0.013574,0.999977,0.52287,0.467115,0.262684,0.015228,1.0,0.465409,0.583349,0.263072,0.015228,1.0,0.592,0.525232,0.256384,0.015228,1.0,0.530612,0.101818,0.089761,0.808421,0.359912,0.348091,0.51939,0.0,0.0,0.0,0.488282,0.0,0.462098,0.0,0.490981,0.0,0.0,0.0,0.0,0.0,0.070456,0.0,0.0,0.108573,0.0,0.0,0.0,0.296608,0.0,0.0,0.0,0.0,0.0,0.0,0.352677,0.0,0.357697,0.0,0.0,0.505649,0.0,0.0,0.0,0.0,0.371532,0.421739,0.0,0.0,0.0,0.553732,0.0,0.42711,0.0,0.485084,0.38498,0.0,0.0,0.0,0.35417,0.0,0.42779,0.0,0.0,0.526326,0.461985,0.0,0.116026,0.333104,0.0,0.508769,0.0,0.44955,0.323798,0.0,0.0,0.0,0.0,0.0,0.349028,0.0,0.398813,0.0,0.35251,0.0,0.0,0.0,0.0,0.283149,0.0,0.0,0.295386,0.0,0.180093,0.0,0.0,0.465389,0.420284,0.0,0.434621,0.394717,0.300574,0.359889,0.0,0.446136,0.455734,0.0,0.0,0.0,0.0,0.447945,0.0,0.0,0.0,0.352302,0.0,0.0,0.196658,0.550938,0.0,0.459782,0.0,0.0,0.366914,0.294922,0.0,0.389227,0.0,0.404676,0.0,0.219122,0.337489,0.0,0.0,0.0,0.225973,0.0,0.0,0.0,0.0,0.0,0.417649,0.293843,0.0,0.0,0.0,0.501743,0.410536,0.484544,0.0,0.41493,0.381699,0.598127,0.225539,0.3525,0.0,0.0,0.0,0.0,0.460133,0.484158,0.0,0.474826,0.0,0.365553,0.383614,0.0,0.0,0.0,0.0,0.0,0.090602,0.0,0.375971,0.0,0.0,0.0,0.0,0.303706,0.319213,0.0,0.452387,0.0,0.493528,0.0,0.262352,0.48856,0.0,0.0,0.0,0.571775,0.414953,0.0,0.0,0.0,0.456136,0.22832,0.428273,0.0,0.0,0.0,0.0,0.345484,0.0,0.336212,0.0,0.0,0.38515,0.410948,0.0,0.0,0.0,0.550399,0.0,0.0,0.0,0.0,0.553241,0.0,0.080653,0.427259,0.41066,0.0,0.0,0.333987,0.0,0.0,0.325174,0.0,0.0,0.142068,0.0,0.0,0.0,0.0,0.486093,0.450749,0.0,0.0,0.249384,0.257478,0.0,0.459038,0.366584,0.0,0.384362,0.0,0.293492,0.0,0.0,0.0,0.411403,0.0,0.0,0.0,0.0,0.491165,0.22918,0.0,0.0,0.0
4,G1_102,G1,13.594406,20.913794,0,157,2.0,1.653106,1.468138,0.0,5.062595,1.098612,1.372676,0.835581,0.693147,4.343805,1.098612,243078.080682,55100.264399,646,866408,243314.0,277515.49397,51708.792684,6545,866408,280954.0,520593.574652,97900.114417,13090,1732816,522968.5,34437.413288,42839.752012,0,674033,21381.0,0.364093,0.282027,0.017152,3.217198,0.340168,0.275471,0.180393,0.013574,0.965783,0.215409,0.475513,0.271072,0.015228,1.0,0.464302,0.574079,0.272932,0.015228,1.0,0.6,0.524796,0.261418,0.015228,1.0,0.53081,0.08305,0.05939,0.857561,0.097069,0.0,0.080176,0.537194,0.373468,0.0,0.324674,0.388925,0.0,0.124439,0.0,0.0,0.36112,0.0,0.0,0.493789,0.0,0.342109,0.337404,0.208659,0.390767,0.0,0.0,0.0,0.219944,0.0,0.204095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.160482,0.0,0.147513,0.0,0.0,0.0,0.134954,0.0,0.0,0.0,0.0,0.0,0.343779,0.0,0.0,0.137074,0.0,0.0,0.0,0.13899,0.415173,0.0,0.031907,0.0,0.0,0.0,0.0,0.0,0.0,0.337465,0.0,0.0,0.0,0.113737,0.0,0.0,0.0,0.192459,0.17064,0.34934,0.0,0.0,0.463782,0.333583,0.256572,0.331499,0.0,0.0,0.0,0.0,0.0,0.571785,0.368306,0.0,0.0,0.486901,0.0,0.0,0.0,0.371714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.504184,0.0,0.401909,0.0,0.329907,0.290699,0.0,0.469517,0.348091,0.241481,0.0,0.325639,0.493663,0.426238,0.0,0.453888,0.0,0.457614,0.432032,0.0,0.0,0.441383,0.0,0.0,0.110972,0.415722,0.106508,0.0,0.146612,0.313192,0.0,0.0,0.324854,0.0,0.0,0.0,0.302565,0.459342,0.0,0.0,0.494116,0.180444,0.0,0.0,0.0,0.0,0.3911,0.0,0.0,0.465105,0.0,0.0,0.0,0.0,0.132297,0.294074,0.251794,0.0,0.219626,0.0,0.0,0.384064,0.0,0.0,0.0,0.473428,0.0,0.351928,0.0,0.0,0.552943,0.487375,0.0,0.498328,0.0,0.414123,0.403258,0.503771,0.376101,0.0,0.193117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.44326,0.0,0.387145,0.508782,0.211237,0.477418,0.354421,0.287216,0.0,0.341158,0.0,0.421311,0.0,0.0,0.0,0.0,0.0,0.308887,0.0,0.0,0.191636,0.0,0.142158,0.0,0.0,0.261524,0.0,0.0,0.0,0.0,0.412112,0.0,0.0,0.207995,0.0,0.467254,0.0,0.195718,0.0,0.387284,0.415184,0.0,0.391564,0.0,0.225718,0.422135,0.0,0.0,0.26659,0.0,0.353857,0.46504,0.365113,0.303869,0.0,0.354298,0.0,0.300523,0.126554,0.0,0.0


In [22]:
# Define Train Test Split
X_cell = cell_df_aug.drop(columns=['Cell_name','Target_label'])
y_cell = cell_df_aug['Target_label'].copy()

# Train/test split with stratification (note small N risk)
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_cell, y_cell, test_size=0.25, random_state=RANDOM_STATE, stratify=y_cell
)

# 1) Encode labels (fit on train only)
le = LabelEncoder()
yc_train_enc = le.fit_transform(yc_train)
yc_test_enc  = le.transform(yc_test)

# 2) Optional: class imbalance → per-class weights
w = compute_class_weight(class_weight='balanced',
                         classes=np.unique(yc_train_enc),
                         y=yc_train_enc)
class_weight = {int(c): float(wt) for c, wt in zip(np.unique(yc_train_enc), w)}
# LightGBM accepts dict for multiclass (keys = class indices when y is encoded)

# 3) LGBM multiclass classifier
lgbm = LGBMClassifier(
    objective='multiclass',        # multiclass softmax
    num_class=len(le.classes_),    # K
    n_estimators=400,
    learning_rate=0.05,
    num_leaves=63,                 # ~controls tree complexity
    max_depth=-1,                  # no hard cap; you can set 8–12 if overfitting
    min_child_samples=50,
    subsample=0.8,                 # row sampling
    colsample_bytree=0.8,          # feature sampling
    reg_alpha=0.0,
    reg_lambda=0.0,
    random_state=RANDOM_STATE,
    n_jobs=N_JOBS,
    class_weight=class_weight,     # or set to None to disable weighting
    metric='multi_logloss'
)

# 4) Fit (with early stopping if you like)
lgbm.fit(
    Xc_train, yc_train_enc,
    eval_set=[(Xc_test, yc_test_enc)],
    eval_metric='multi_logloss',
    callbacks=[]
    # for early stopping (LightGBM>=4): callbacks=[lgb.early_stopping(50, verbose=False)]
)

# 5) Predict and map back
yc_pred_enc = lgbm.predict(Xc_test)           # returns class indices
yc_pred = le.inverse_transform(yc_pred_enc)

print('=== Baseline: LightGBM (Cell-Level) ===')
print(classification_report(yc_test, yc_pred, target_names=le.classes_))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005108 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40810
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 306
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
=== Baseline: LightGBM (Cell-Level) ===
              precision    recall  f1-score   support

          G1       0.88      0.91      0.90        70
     early_S       0.73      0.88      0.80        76
      late_S       0.76      0.68      0.72        82
       mid_S       0.59      0.49      0.54        65

    accuracy                           0.75       293
   macro avg       0.74      0.74      0.74       293
weighted avg       0.74      0.75      0.74       293

