Cell 1 — Paths & basic checks

In [57]:
import sqlite3
import pandas as pd
from pathlib import Path

# Paths
DB_PATH = Path("data/UHCSDB/microstructures.sqlite")
IMG_DIR = Path("data/UHCSDB/micrographs")

print("DB exists:", DB_PATH.exists())
print("Images folder exists:", IMG_DIR.exists())


DB exists: True
Images folder exists: True


Cell 2 — Inspect DB tables (optional sanity)

In [58]:
conn = sqlite3.connect(DB_PATH)
tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
tables

Unnamed: 0,name
0,user
1,collection
2,sample
3,micrograph


Cell 3 — Peek columns (optional sanity)

In [59]:
tables_list = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)['name'].tolist()
print("Tables:", tables_list)

for t in tables_list:
    try:
        df_ = pd.read_sql(f"SELECT * FROM '{t}' LIMIT 5;", conn)
        print(f"\n=== {t} ===")
        print(df_.columns.tolist())
        display(df_)
    except Exception as e:
        print(f"\n=== {t} === (could not preview) -> {e}")


Tables: ['user', 'collection', 'sample', 'micrograph']

=== user ===
['user_id', 'username', 'givenname', 'familyname', 'email', 'orcid']


Unnamed: 0,user_id,username,givenname,familyname,email,orcid
0,1,bdecost,Brian,DeCost,bdecost@andrew.cmu.edu,0000-0002-3459-5888
1,2,mhecht,Matthew,Hecht,mhecht@andrew.cmu.edu,



=== collection ===
['collection_id', 'name', 'doi']


Unnamed: 0,collection_id,name,doi



=== sample ===
['sample_id', 'label', 'anneal_time', 'anneal_time_unit', 'anneal_temperature', 'anneal_temp_unit', 'cool_method']


Unnamed: 0,sample_id,label,anneal_time,anneal_time_unit,anneal_temperature,anneal_temp_unit,cool_method
0,1,3.1 IN1 Etched Sample 1,,,,,
1,2,AC 800C 8H WQ,8.0,H,800.0,C,WQ
2,3,AC1 + vickers mark,,,,,
3,4,AC1 1000C 5M WQ,5.0,M,1000.0,C,WQ
4,5,AC1 1100C 1H 650C 1H,1.0,H,1100.0,C,650-1H



=== micrograph ===
['micrograph_id', 'path', 'micron_bar', 'micron_bar_units', 'micron_bar_px', 'magnification', 'detector', 'sample_key', 'contributor_key', 'primary_microconstituent']


Unnamed: 0,micrograph_id,path,micron_bar,micron_bar_units,micron_bar_px,magnification,detector,sample_key,contributor_key,primary_microconstituent
0,1,micrograph1.tif,5.0,um,129,4910x,SE,42,2,pearlite
1,2,micrograph2.tif,10.0,um,103,1964X,SE,18,2,spheroidite
2,4,micrograph4.tif,10.0,um,129,,SE,35,2,pearlite+spheroidite
3,5,micrograph5.tif,5.0,um,129,4910X,SE,10,2,pearlite
4,6,micrograph6.tif,20.0,um,124,1178X,SE,29,2,spheroidite


Cell 4 — Pull filenames + microconstituent + scale info

In [60]:
# We target the MICROCONSTITUENT label (not WQ/FC/etc.)

query = """
SELECT 
  m.path                       AS filename,
  m.primary_microconstituent   AS microconstituent,
  s.sample_id                  AS sample_key,
  m.micron_bar,
  m.micron_bar_px
FROM micrograph m
JOIN sample s
  ON m.sample_key = s.sample_id;
"""

df_map = pd.read_sql(query, conn)
df_map.head()


Unnamed: 0,filename,microconstituent,sample_key,micron_bar,micron_bar_px
0,micrograph1.tif,pearlite,42,5.0,129
1,micrograph2.tif,spheroidite,18,10.0,103
2,micrograph4.tif,pearlite+spheroidite,35,10.0,129
3,micrograph5.tif,pearlite,10,5.0,129
4,micrograph6.tif,spheroidite,29,20.0,124


Cell 5 — Resolve paths, compute µm/px, build base dataframe

In [61]:
from typing import Optional

IM_EXTS = [".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp"]

def match_path(name: str) -> Optional[Path]:
    base = Path(str(name)).name
    p = IMG_DIR / base
    if p.exists():
        return p
    stem = Path(base).stem
    for ext in IM_EXTS:
        q = IMG_DIR / f"{stem}{ext}"
        if q.exists():
            return q
    hits = list(IMG_DIR.glob(stem + ".*"))
    return hits[0] if hits else None

df_map["filepath"] = df_map["filename"].map(match_path)
missing = df_map["filepath"].isna().sum()
print("Missing (no image found):", missing)

# micrometers-per-pixel
df_map["um_per_px"] = df_map["micron_bar"] / df_map["micron_bar_px"]

# Keep only rows with image + scale info; de-dup by filepath
df_base = (df_map
           .dropna(subset=["filepath","um_per_px"])
           .drop_duplicates(subset=["filepath"])
           [["filepath","microconstituent","sample_key","um_per_px"]]
           .reset_index(drop=True))

df_base.head(), df_base.shape


Missing (no image found): 0


(                                  filepath      microconstituent  sample_key  \
 0  data/UHCSDB/micrographs/micrograph1.tif              pearlite          42   
 1  data/UHCSDB/micrographs/micrograph2.tif           spheroidite          18   
 2  data/UHCSDB/micrographs/micrograph4.tif  pearlite+spheroidite          35   
 3  data/UHCSDB/micrographs/micrograph5.tif              pearlite          10   
 4  data/UHCSDB/micrographs/micrograph6.tif           spheroidite          29   
 
    um_per_px  
 0   0.038760  
 1   0.097087  
 2   0.077519  
 3   0.038760  
 4   0.161290  ,
 (803, 4))

Cell 6 — Clean/standardize microconstituent labels

In [62]:
def clean_microconst(s: str) -> str:
    t = str(s).lower().strip()
    # Mixed classes first
    if "spheroidite" in t and "pearlite" in t:
        return "Pearlite+Spheroidite"
    if "pearlite" in t and "widman" in t:
        return "Pearlite+Widmanstätten"
    if "spheroidite" in t and "widman" in t:
        return "Spheroidite+Widmanstätten"
    # Single classes
    if "widman" in t:
        return "Widmanstätten cementite"
    if "spheroidite" in t:
        return "Spheroidite"
    if "pearlite" in t:
        return "Pearlite"
    if "network" in t or "proeutectoid" in t:
        return "Carbide network"
    if "martensite" in t or "bainite" in t:
        return "Martensite/Bainite"
    # Fallback: capitalize what we got
    return t.capitalize()

dfC = df_base.copy()
dfC["label"] = dfC["microconstituent"].map(clean_microconst)

print("Class counts (raw cleaned):")
print(dfC["label"].value_counts())


Class counts (raw cleaned):
label
Spheroidite                  374
Carbide network              174
Pearlite                      93
Spheroidite+Widmanstätten     80
Pearlite+Spheroidite          38
Martensite/Bainite            36
Pearlite+Widmanstätten         8
Name: count, dtype: int64


Cell 7 — (Optional) Drop very tiny classes

In [63]:
# Adjust or comment out if you want to keep all classes
MIN_SAMPLES = 20
vc = dfC["label"].value_counts()
dfC = dfC[dfC["label"].isin(vc[vc >= MIN_SAMPLES].index)].reset_index(drop=True)

print("Class counts (after MIN_SAMPLES filter):")
print(dfC["label"].value_counts())


Class counts (after MIN_SAMPLES filter):
label
Spheroidite                  374
Carbide network              174
Pearlite                      93
Spheroidite+Widmanstätten     80
Pearlite+Spheroidite          38
Martensite/Bainite            36
Name: count, dtype: int64


Cell 8 — Image I/O & feature extraction (physical GLCM)

In [64]:
import numpy as np, cv2
from tqdm import tqdm
from skimage.feature import graycomatrix, graycoprops

# --- robust image loading/cleanup ---

def to_8bit(img):
    if img.dtype == np.uint16:
        img = (img / 256).astype(np.uint8)
    elif img.dtype != np.uint8:
        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    return img

def load_gray_clean(p):
    img = cv2.imread(str(p), cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise FileNotFoundError(p)
    img = to_8bit(img)
    # crop bottom to remove scale bar/annotations (heuristic)
    h = img.shape[0]
    img = img[: int(h*0.88), :]
    return img

def normalize_for_glcm(img):
    # CLAHE normalization stabilizes contrast for GLCM
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    return clahe.apply(img)

# --- physical-distance GLCM (fixed distance list length) ---

PHYS_D_UM = np.array([0.2, 0.5, 1.0])  # tweak if you like

def glcm_features_phys(img_gray, um_per_px):
    img = cv2.resize(img_gray, (256,256))
    img = normalize_for_glcm(img)
    img_q = (img / 8).astype(np.uint8)  # 32 gray levels

    dists_px = np.rint(PHYS_D_UM / um_per_px).astype(int)
    dists_px = np.clip(dists_px, 1, 128)            # keep in a reasonable range
    dists_list = dists_px.tolist()                  # fixed length == len(PHYS_D_UM)

    glcm = graycomatrix(
        img_q,
        distances=dists_list,
        angles=[0, np.pi/4, np.pi/2, 3*np.pi/4],
        levels=32, symmetric=True, normed=True
    )

    feats = []
    for prop in ["contrast","dissimilarity","homogeneity","ASM","energy","correlation"]:
        feats.extend(graycoprops(glcm, prop).ravel())
    feats = np.asarray(feats, dtype=np.float32)

    expected = len(dists_list) * 4 * 6
    if feats.size != expected:
        raise ValueError(f"Feature length {feats.size} != expected {expected} for dists={dists_list}")
    return feats

def featurize_phys(paths, um_per_px):
    X=[]
    for i, (p, u) in enumerate(tqdm(list(zip(paths, um_per_px)), total=len(paths), desc="Featurizing")):
        try:
            img = load_gray_clean(p)
            X.append(glcm_features_phys(img, u))
        except Exception as e:
            print(f"\n[featurize_phys] Problem at index {i}, path={p}, um_per_px={u}: {e}")
            raise
    return np.vstack(X)


Cell 9 — Stratified split at SAMPLE level

In [67]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# one label per sample_key (group == sample)
df_samples = (dfC.groupby("sample_key")["label"].first()
              .to_frame().reset_index())

# find classes with < 2 samples (cannot be stratified)
cls_counts = df_samples["label"].value_counts()
rare_classes = cls_counts[cls_counts < 2].index.tolist()
rare_samples = df_samples.loc[df_samples["label"].isin(rare_classes), "sample_key"].values

# core set that can be stratified
core = df_samples.loc[~df_samples["label"].isin(rare_classes)]
core_ids = core["sample_key"].values
core_labels = core["label"].values

if len(core["label"].unique()) < 2:
    raise ValueError("After removing singleton classes, fewer than 2 classes remain for test split.")

# stratified split on the core set
sk_tr_core, sk_te = train_test_split(
    core_ids, test_size=0.20, random_state=42, stratify=core_labels
)

# force singleton-class samples into TRAIN
sk_tr = np.concatenate([sk_tr_core, rare_samples])

# expand to image rows
tr_mask = dfC["sample_key"].isin(sk_tr)
te_mask = dfC["sample_key"].isin(sk_te)

X_tr_paths = dfC.loc[tr_mask, "filepath"].astype(str).values
X_te_paths = dfC.loc[te_mask, "filepath"].astype(str).values
y_tr       = dfC.loc[tr_mask, "label"].values
y_te       = dfC.loc[te_mask, "label"].values
um_tr      = dfC.loc[tr_mask, "um_per_px"].values
um_te      = dfC.loc[te_mask, "um_per_px"].values

print("Classes with only 1 sample_key (kept in TRAIN only):", rare_classes)
print("\nTrain counts:\n", pd.Series(y_tr).value_counts())
print("\nTest counts:\n",  pd.Series(y_te).value_counts())


Classes with only 1 sample_key (kept in TRAIN only): ['Martensite/Bainite']

Train counts:
 Spheroidite                  299
Carbide network              141
Pearlite                      73
Spheroidite+Widmanstätten     72
Martensite/Bainite            36
Pearlite+Spheroidite          16
Name: count, dtype: int64

Test counts:
 Spheroidite                  75
Carbide network              33
Pearlite+Spheroidite         22
Pearlite                     20
Spheroidite+Widmanstätten     8
Name: count, dtype: int64


Cell 10 — Extract features

In [68]:
Xtr = featurize_phys(X_tr_paths, um_tr)
Xte = featurize_phys(X_te_paths, um_te)

Xtr.shape, Xte.shape


Featurizing: 100%|██████████| 637/637 [00:01<00:00, 370.18it/s]
Featurizing: 100%|██████████| 158/158 [00:00<00:00, 325.48it/s]


((637, 72), (158, 72))

Cell 11 — Train balanced RBF‑SVM with a tiny grid; evaluate

In [69]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

pipe = make_pipeline(StandardScaler(),
                     SVC(kernel="rbf", class_weight="balanced", random_state=42))

param_grid = {
    "svc__C":     [0.5, 1, 2, 5, 10],
    "svc__gamma": ["scale", 0.03, 0.1, 0.3]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gs = GridSearchCV(pipe, param_grid, cv=cv, scoring="f1_macro", n_jobs=-1)
gs.fit(Xtr, y_tr)

clf = gs.best_estimator_
pred = clf.predict(Xte)

print("Best params:", gs.best_params_, "\n")
print(classification_report(y_te, pred, zero_division=0))
print("Confusion matrix:\n", confusion_matrix(y_te, pred))


Best params: {'svc__C': 10, 'svc__gamma': 0.03} 

                           precision    recall  f1-score   support

          Carbide network       0.96      0.73      0.83        33
       Martensite/Bainite       0.00      0.00      0.00         0
                 Pearlite       0.33      0.35      0.34        20
     Pearlite+Spheroidite       0.75      0.14      0.23        22
              Spheroidite       0.78      0.52      0.62        75
Spheroidite+Widmanstätten       0.16      0.88      0.27         8

                 accuracy                           0.51       158
                macro avg       0.50      0.43      0.38       158
             weighted avg       0.73      0.51      0.56       158

Confusion matrix:
 [[24  9  0  0  0  0]
 [ 0  0  0  0  0  0]
 [ 1  1  7  0  4  7]
 [ 0  2  4  3  6  7]
 [ 0  2 10  1 39 23]
 [ 0  0  0  0  1  7]]


(Optional) Cell 12 — Save a CSV of predictions

In [70]:
out = pd.DataFrame({
    "filepath": X_te_paths,
    "true": y_te,
    "pred": pred
})
out.to_csv("uhcs_microconstituent_predictions.csv", index=False)
out.head()


Unnamed: 0,filepath,true,pred
0,data/UHCSDB/micrographs/micrograph1.tif,Pearlite,Spheroidite+Widmanstätten
1,data/UHCSDB/micrographs/micrograph8.tif,Carbide network,Martensite/Bainite
2,data/UHCSDB/micrographs/micrograph9.tif,Carbide network,Martensite/Bainite
3,data/UHCSDB/micrographs/micrograph10.png,Spheroidite,Spheroidite
4,data/UHCSDB/micrographs/micrograph18.png,Carbide network,Carbide network
