In [None]:
import sqlite3
import pandas as pd
from pathlib import Path

# Sanity check: report whether the dataset files are where the notebook expects them.

# Locations of the SQLite metadata database and the micrograph image folder
DB_PATH = Path("data/UHCSDB/microstructures.sqlite")
IMG_DIR = Path("data/UHCSDB/micrographs")

# One status line per location, printed in a fixed order
for description, location in (
    ("DB exists:", DB_PATH),
    ("Images folder exists:", IMG_DIR),
):
    print(description, location.exists())


DB exists: True
Images folder exists: True


In [None]:
# List the tables defined in the SQLite database.
# sqlite_master is SQLite's schema catalogue; filtering on type='table'
# leaves out other schema objects.
conn = sqlite3.connect(DB_PATH)
tables = pd.read_sql(
    sql="SELECT name FROM sqlite_master WHERE type='table';",
    con=conn,
)
tables

Unnamed: 0,name
0,user
1,collection
2,sample
3,micrograph


In [None]:
import sqlite3, pandas as pd
from pathlib import Path

# Preview every table in the database: column names plus the first five rows.

DB_PATH = Path("data/UHCSDB/microstructures.sqlite")
conn = sqlite3.connect(DB_PATH)

# Table names as a plain Python list (the schema query returns a one-column frame)
tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)['name'].tolist()
print("Tables:", tables)

# Peek a few rows from each table to see names/columns
for t in tables:
    try:
        # Quote the table name as an SQL *identifier*: double quotes, with any
        # embedded double quote doubled. Single quotes denote a string literal
        # and are only accepted here through a SQLite compatibility fallback;
        # they also break on names that contain an apostrophe.
        quoted = '"' + str(t).replace('"', '""') + '"'
        df_ = pd.read_sql(f"SELECT * FROM {quoted} LIMIT 5;", conn)
        print(f"\n=== {t} ===")
        print(df_.columns.tolist())
        display(df_)
    except Exception as e:
        # Best-effort preview: report the failure and continue with the next table
        print(f"\n=== {t} === (could not preview) -> {e}")


Tables: ['user', 'collection', 'sample', 'micrograph']

=== user ===
['user_id', 'username', 'givenname', 'familyname', 'email', 'orcid']


Unnamed: 0,user_id,username,givenname,familyname,email,orcid
0,1,bdecost,Brian,DeCost,bdecost@andrew.cmu.edu,0000-0002-3459-5888
1,2,mhecht,Matthew,Hecht,mhecht@andrew.cmu.edu,



=== collection ===
['collection_id', 'name', 'doi']


Unnamed: 0,collection_id,name,doi



=== sample ===
['sample_id', 'label', 'anneal_time', 'anneal_time_unit', 'anneal_temperature', 'anneal_temp_unit', 'cool_method']


Unnamed: 0,sample_id,label,anneal_time,anneal_time_unit,anneal_temperature,anneal_temp_unit,cool_method
0,1,3.1 IN1 Etched Sample 1,,,,,
1,2,AC 800C 8H WQ,8.0,H,800.0,C,WQ
2,3,AC1 + vickers mark,,,,,
3,4,AC1 1000C 5M WQ,5.0,M,1000.0,C,WQ
4,5,AC1 1100C 1H 650C 1H,1.0,H,1100.0,C,650-1H



=== micrograph ===
['micrograph_id', 'path', 'micron_bar', 'micron_bar_units', 'micron_bar_px', 'magnification', 'detector', 'sample_key', 'contributor_key', 'primary_microconstituent']


Unnamed: 0,micrograph_id,path,micron_bar,micron_bar_units,micron_bar_px,magnification,detector,sample_key,contributor_key,primary_microconstituent
0,1,micrograph1.tif,5.0,um,129,4910x,SE,42,2,pearlite
1,2,micrograph2.tif,10.0,um,103,1964X,SE,18,2,spheroidite
2,4,micrograph4.tif,10.0,um,129,,SE,35,2,pearlite+spheroidite
3,5,micrograph5.tif,5.0,um,129,4910X,SE,10,2,pearlite
4,6,micrograph6.tif,20.0,um,124,1178X,SE,29,2,spheroidite


In [None]:
# Build the filename -> label mapping.
# Each micrograph row is joined to its sample through micrograph.sample_key,
# so every image filename is paired with the label of the sample it belongs to.
# (Inner join: micrographs without a matching sample row are not returned.)

query = """
SELECT micrograph.path AS filename,
       sample.label AS label
FROM micrograph
JOIN sample
  ON micrograph.sample_key = sample.sample_id;
"""

df_map = pd.read_sql(sql=query, con=conn)
df_map.head()


Unnamed: 0,filename,label
0,micrograph1.tif,ET Gyro
1,micrograph2.tif,AC1 800C 85H WQ
2,micrograph4.tif,AC1 970C 90M FC
3,micrograph5.tif,AC1 750C 5M WQ
4,micrograph6.tif,AC1 970C 90M AR


In [None]:
from pathlib import Path
from typing import Optional
import pandas as pd

# Resolve filenames → filepaths

# Folder that holds the micrograph image files
IMG_DIR = Path("data/UHCSDB/micrographs")
# Extensions tried, in this order, when the recorded filename is not found as-is
IM_EXTS = [".png",".jpg",".jpeg",".tif",".tiff",".bmp"]

def match_path(name: str) -> Optional[Path]:
    """Locate the image file in ``IMG_DIR`` that corresponds to ``name``.

    Only the final path component of ``name`` is used. Lookup order:

    1. the exact filename;
    2. the same stem with each extension in ``IM_EXTS``, in list order;
    3. any file whose name starts with ``"<stem>."``; if several exist, the
       one that sorts first is returned so the result is deterministic.

    Args:
        name: Filename (or path) recorded for the image. ``None``, NaN and
            names with an empty final component (e.g. ``""``) are treated as
            "no file".

    Returns:
        The path of the matching regular file, or ``None`` if nothing matches.
    """
    # Missing values must not be turned into a lookup for a file called "None"/"nan".
    if name is None or (isinstance(name, float) and name != name):
        return None

    base = Path(str(name)).name
    if not base:
        # IMG_DIR / "" is IMG_DIR itself, which would wrongly count as a match.
        return None

    # is_file() rather than exists(): a directory is never a usable image.
    exact = IMG_DIR / base
    if exact.is_file():
        return exact

    stem = Path(base).stem
    for ext in IM_EXTS:
        candidate = IMG_DIR / f"{stem}{ext}"
        if candidate.is_file():
            return candidate

    if not IMG_DIR.is_dir():
        return None

    # Literal prefix comparison instead of a glob pattern, so characters such
    # as "[" or "*" in the stem are not interpreted as wildcards.
    prefix = stem + "."
    hits = sorted(
        entry for entry in IMG_DIR.iterdir()
        if entry.is_file() and entry.name.startswith(prefix)
    )
    return hits[0] if hits else None

# Attach the resolved on-disk path to every row of the filename/label table
df_map["filepath"] = df_map["filename"].map(match_path)

# Rows whose image could not be located carry a null filepath
missing = df_map["filepath"].isna().sum()
print("Missing (no image found):", missing)

# Keep rows that have an image, one row per file, and only the columns used later
has_image = df_map["filepath"].notna()
unique_rows = df_map[has_image].drop_duplicates(subset=["filepath"])
labels_df = unique_rows[["filepath","label"]]
labels_df.head(), labels_df.shape




Missing (no image found): 0


(                                  filepath            label
 0  data/UHCSDB/micrographs/micrograph1.tif          ET Gyro
 1  data/UHCSDB/micrographs/micrograph2.tif  AC1 800C 85H WQ
 2  data/UHCSDB/micrographs/micrograph4.tif  AC1 970C 90M FC
 3  data/UHCSDB/micrographs/micrograph5.tif   AC1 750C 5M WQ
 4  data/UHCSDB/micrographs/micrograph6.tif  AC1 970C 90M AR,
 (803, 2))

In [None]:
# Write the filepath/label table to disk (the row index is not written)

labels_df.to_csv(path_or_buf="data/UHCSDB/labels.csv", index=False)
print("saved labels.csv")


saved labels.csv


In [7]:
import numpy as np, cv2
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from skimage.feature import graycomatrix, graycoprops

# Drop rare classes: only labels with at least MIN_SAMPLES images are kept
# (stabilizes training).
MIN_SAMPLES = 10

# Number of images per label
vc = labels_df["label"].value_counts()
frequent_labels = vc[vc >= MIN_SAMPLES].index

# Filter on a fresh frame and renumber rows from 0
keep_mask = labels_df["label"].isin(frequent_labels)
df = labels_df[keep_mask].reset_index(drop=True)

def load_gray(p):
    """Read the image at ``p`` in grayscale mode.

    Args:
        p: Path of the image file; converted to ``str`` before being handed
            to OpenCV.

    Returns:
        The image array returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: if ``cv2.imread`` returns ``None`` for ``p``.
    """
    gray = cv2.imread(str(p), cv2.IMREAD_GRAYSCALE)
    if gray is not None:
        return gray
    raise FileNotFoundError(p)

def glcm_features(img_gray):
    """Compute gray-level co-occurrence (GLCM) texture features for one image.

    The image is resized to 256x256, its intensities are divided by 8 and cast
    to ``uint8`` (32 gray levels — assumes 8-bit input in [0, 255]), and a
    symmetric, normalized GLCM is built for pixel distances 1, 2 and 4 at the
    angles 0, pi/4, pi/2 and 3*pi/4.

    Args:
        img_gray: Single-channel grayscale image array.

    Returns:
        1-D ``float32`` array holding, for each of contrast, dissimilarity,
        homogeneity, ASM, energy and correlation (in that order), the
        flattened ``graycoprops`` result.
    """
    resized = cv2.resize(img_gray, (256, 256))
    quantized = (resized / 8).astype(np.uint8)  # 32 levels

    distances = [1, 2, 4]
    angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]
    glcm = graycomatrix(quantized, distances, angles,
                        levels=32, symmetric=True, normed=True)

    prop_names = ("contrast", "dissimilarity", "homogeneity", "ASM", "energy", "correlation")
    per_prop = [graycoprops(glcm, prop_name).ravel() for prop_name in prop_names]
    return np.concatenate(per_prop).astype(np.float32)

def featurize(paths):
    """Load every image in ``paths`` and stack its GLCM features row-wise.

    Args:
        paths: Iterable of image file paths; a progress bar is shown while
            iterating.

    Returns:
        2-D array with one row of ``glcm_features`` output per path, in the
        same order as ``paths``.
    """
    rows = [
        glcm_features(load_gray(path))
        for path in tqdm(paths, desc="Featurizing")
    ]
    return np.vstack(rows)

# Train an RBF-kernel SVM on GLCM texture features and evaluate it on a held-out split.

# Inputs are image paths (features are computed after splitting); targets are label strings
X_paths = df["filepath"].astype(str).values
y       = df["label"].astype(str).values

# 80/20 split, stratified so each label keeps its proportion in both parts; seeded for repeatability
X_tr, X_te, y_tr, y_te = train_test_split(X_paths, y, test_size=0.2, stratify=y, random_state=42)
Xtr = featurize(X_tr)
Xte = featurize(X_te)

# The scaler lives inside the pipeline, so it is fitted on the training features only.
# NOTE(review): probability=True is kept in case later cells call predict_proba — confirm, drop otherwise.
clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", probability=True, random_state=42))
clf.fit(Xtr, y_tr)
pred = clf.predict(Xte)

# zero_division=0 reports 0.00 for labels that were never predicted without
# emitting a warning for each such label; the reported numbers are unchanged.
print(classification_report(y_te, pred, zero_division=0))
print("Confusion matrix:\n", confusion_matrix(y_te, pred))


Featurizing: 100%|██████████| 609/609 [00:01<00:00, 367.44it/s]
Featurizing: 100%|██████████| 153/153 [00:00<00:00, 330.28it/s]


                        precision    recall  f1-score   support

         AC 800C 8H WQ       0.00      0.00      0.00         4
    AC1 + vickers mark       0.00      0.00      0.00         3
       AC1 1000C 5M WQ       0.00      0.00      0.00         3
  AC1 1100C 1H 650C 1H       0.00      0.00      0.00         3
        AC1 700C 5M WQ       0.00      0.00      0.00         2
        AC1 800C 24H Q       0.00      0.00      0.00         2
       AC1 800C 24H WQ       0.00      0.00      0.00         4
         AC1 800C 3H Q       0.00      0.00      0.00         2
        AC1 800C 3H WQ       0.17      0.33      0.22         3
        AC1 800C 85H Q       0.00      0.00      0.00         2
       AC1 800C 85H WQ       0.50      0.25      0.33         4
         AC1 800C 8H Q       0.00      0.00      0.00         3
        AC1 900C 24H Q       0.00      0.00      0.00         2
         AC1 900C 3H Q       0.00      0.00      0.00         3
        AC1 900C 90M Q       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
