In [None]:
# Installs (run once per session)
!pip install --upgrade pandas scikit-learn tqdm scipy

In [None]:
# 1) (If using Drive) mount and set your CSV path
# Colab-only import: `google.colab` is not available outside a Colab runtime.
from google.colab import drive
# Mount Google Drive at /content/drive so the paths below resolve.
drive.mount('/content/drive')
# Token-level input CSV; the load cell asserts it has sample_id, token, is_word and is_CIU columns.
CSV_PATH = '/content/drive/MyDrive/aphasia/aphasia_tokens.csv'
# Directory that receives the fitted pipelines (.joblib) and the metrics CSV.
SAVE_DIR = '/content/drive/MyDrive/aphasia'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ---- Imports ----
import re, numpy as np, pandas as pd
from tqdm import tqdm
from scipy import sparse
import os, datetime

from sklearn.model_selection import GroupShuffleSplit, StratifiedGroupKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from joblib import dump, load

In [None]:
# ---- Load data ----
# Read the `token` column through a plain `str` converter so every token keeps the exact
# text found in the CSV. With default parsing, pandas treats cells such as "NA", "null",
# "nan" or "None" as missing values, which would silently replace real transcript tokens
# with NaN. Empty cells become '' instead of NaN.
df = pd.read_csv(CSV_PATH, converters={'token': str})
assert {'sample_id','token','is_word','is_CIU'}.issubset(df.columns), "CSV must have columns: sample_id, token, is_word, is_CIU"

# Timestamp shared by every artifact written during this run (model files and metrics CSV).
STAMP = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [None]:
# ---- Build context window (prev/next) per token ----
def add_context(df):
    """Expand a token table into one row per token with its neighbouring tokens.

    Tokens are grouped by ``sample_id`` (groups kept in order of first appearance) so
    that context never crosses a sample boundary. Positions outside the sample are
    filled with the sentinels ``<BOS>``/``<BOS2>`` (before the start) and
    ``<EOS>``/``<EOS2>`` (after the end).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``sample_id``, ``token``, ``is_word`` and ``is_CIU``.
        ``is_word`` / ``is_CIU`` must be castable to ``int``.

    Returns
    -------
    pandas.DataFrame
        Columns ``sample_id, token, prev1, next1, prev2, next2, is_word, is_CIU``.
        Every token/context cell is a ``str``. An empty input yields an empty frame
        that still carries these columns.
    """
    columns = ['sample_id', 'token', 'prev1', 'next1', 'prev2', 'next2', 'is_word', 'is_CIU']
    rows = []
    for sid, g in df.groupby('sample_id', sort=False):
        # Stringify once, up front, so the context columns get exactly the same
        # treatment as `token`. Otherwise a NaN or numeric token would reach the text
        # vectorizers as a non-string through prev1/next1.
        toks = [t if isinstance(t, str) else str(t) for t in g['token'].tolist()]
        isw  = g['is_word'].astype(int).tolist()
        iciu = g['is_CIU' ].astype(int).tolist()
        n = len(toks)
        for i, tok in enumerate(toks):
            rows.append({
                'sample_id': sid,
                'token': tok,
                'prev1': toks[i-1] if i-1 >= 0 else '<BOS>',
                'next1': toks[i+1] if i+1 < n else '<EOS>',
                'prev2': toks[i-2] if i-2 >= 0 else '<BOS2>',
                'next2': toks[i+2] if i+2 < n else '<EOS2>',
                'is_word': isw[i],
                'is_CIU' : iciu[i],
            })
    # Passing `columns` keeps the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=columns)

# Build the per-token table (token plus prev/next neighbours and both labels) for the whole corpus.
X_all = add_context(df)

In [None]:
# Lower-case function words. HandcraftedFeats tests the lower-cased token and its
# immediate neighbours for membership in this set.
STOPWORDS = {
    'the','a','an','and','but','or','so','to','of','in','on','at','for','from','with','by','as',
    'that','this','these','those','it','its','is','was','are','were','be','been','being','do','does','did',
    'have','has','had','he','she','they','we','you','i','me','him','her','them','us','my','your','our',
    'their','his','hers','theirs','mine','yours','ours','not','no','if','then','because','about','over',
    'under','up','down','out','into','off','just','very','really','there','here','now','also','too','again'
}
# Lower-case filler / backchannel tokens, used the same way as STOPWORDS (membership flags).
FILLERS = {'uh','um','er','ah','oh','mm','hmm','yeah','yep','nope','okay','ok','alright'}
def safe_str(x):
    """Return ``x`` untouched when it is already a ``str``; otherwise return ``str(x)``."""
    if isinstance(x, str):
        return x
    return str(x)

In [None]:
class HandcraftedFeats(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns (token, prev1, next1) into numeric surface features.

    Per token it emits 16 features (length, character-class counts, casing flags,
    punctuation flags, stop-word / filler membership and a few transcript-marker flags),
    plus 4 neighbour flags when ``use_context`` is true. The output is a sparse
    ``float32`` CSR matrix so it can be stacked with TF-IDF blocks.

    Parameters
    ----------
    use_context : bool, default=True
        When true, append stop-word / filler membership flags for ``prev1`` and ``next1``.
    """

    def __init__(self, use_context=True):
        self.use_context = use_context

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def _feat_row(self, tok, prev1, next1):
        """Return the feature list for a single token and its immediate neighbours."""
        t = safe_str(tok)
        p = safe_str(prev1)
        n = safe_str(next1)
        t_low = t.lower()

        # Character-class counts. Anything that is not alphanumeric (including
        # whitespace) is counted in the third bucket.
        alpha_t = sum(ch.isalpha() for ch in t)
        digit_t = sum(ch.isdigit() for ch in t)
        punct_t = sum((not ch.isalnum()) for ch in t)

        feats = [
            len(t),
            alpha_t,
            digit_t,
            punct_t,
            1.0 if t.isalpha() else 0.0,
            1.0 if t.islower() else 0.0,
            1.0 if t.isupper() else 0.0,
            1.0 if "'" in t else 0.0,
            # Hyphen or en dash. The en dash is written as an escape because the
            # previous literal was a mis-decoded (mojibake) sequence that never
            # matched a real en dash.
            1.0 if "-" in t or "\u2013" in t else 0.0,
            1.0 if digit_t > 0 else 0.0,
            1.0 if t_low in STOPWORDS else 0.0,
            1.0 if t_low in FILLERS else 0.0,
            1.0 if t_low == 'and' else 0.0,
            # NOTE(review): '&=' prefix and 'xxx'/'xx' look like transcription
            # markers (event codes / unintelligible speech) -- confirm with the
            # corpus conventions.
            1.0 if t_low.startswith('&=') else 0.0,
            1.0 if t_low in {'xxx','xx'} else 0.0,
            1.0 if t.endswith('-') else 0.0,
        ]

        if self.use_context:
            p_low = p.lower()
            n_low = n.lower()
            feats += [
                1.0 if p_low in STOPWORDS else 0.0,
                1.0 if n_low in STOPWORDS else 0.0,
                1.0 if p_low in FILLERS else 0.0,
                1.0 if n_low in FILLERS else 0.0,
            ]
        return feats

    def transform(self, X):
        """Compute features for every row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D array-like
            A DataFrame with ``token``, ``prev1`` and ``next1`` columns, or an
            array whose rows are ``(token, prev1, next1)`` in that order.

        Returns
        -------
        scipy.sparse.csr_matrix
            ``float32`` matrix of shape ``(n_rows, n_features)``.
        """
        if isinstance(X, pd.DataFrame):
            arr = X[['token','prev1','next1']].values
        else:
            arr = X
        rows = [self._feat_row(tok, prev1, next1) for tok, prev1, next1 in arr]
        if not rows:
            # With zero rows numpy would build a 1-D array and the result would be
            # (1, 0) instead of (0, n_features); build the empty matrix explicitly.
            width = len(self._feat_row('', '', ''))
            return sparse.csr_matrix((0, width), dtype=np.float32)
        return sparse.csr_matrix(np.asarray(rows, dtype=np.float32))

# ---- Preprocessor: char n-grams + handcrafted features ----
# The three text columns use identically configured (but independent) character
# n-gram TF-IDF vectorizers, so the shared settings are spelled out once.
_CHAR_TFIDF_KW = dict(analyzer='char', ngram_range=(2,5), lowercase=True, min_df=1)
tok_char = TfidfVectorizer(**_CHAR_TFIDF_KW)
ctx_prev = TfidfVectorizer(**_CHAR_TFIDF_KW)
ctx_next = TfidfVectorizer(**_CHAR_TFIDF_KW)

# One TF-IDF block per text column plus the handcrafted numeric block; every other
# column is dropped and the stacked output is always kept sparse.
preprocessor = ColumnTransformer(
    transformers=[
        ('tok_char', tok_char, 'token'),
        ('prev_char', ctx_prev, 'prev1'),
        ('next_char', ctx_next, 'next1'),
        ('hand', HandcraftedFeats(), ['token','prev1','next1']),
    ],
    remainder='drop',
    sparse_threshold=1.0,
)

# ---- Build the DataFrame of inputs and targets ----
X_df = X_all.loc[:, ['sample_id','token','prev1','next1']].copy()
groups = X_all['sample_id'].values
y_word = X_all['is_word'].astype(int).values
y_ciu  = X_all['is_CIU' ].astype(int).values

# ---- Grouped train/test split by sample_id (80/20) ----
# Splitting on sample_id keeps all tokens of one sample on the same side of the split.
SEED = 42
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
train_idx, test_idx = next(gss.split(X_df, y_word, groups=groups))

X_train = X_df.iloc[train_idx]
X_test  = X_df.iloc[test_idx]
y_word_tr, y_word_te = y_word[train_idx], y_word[test_idx]
y_ciu_tr,  y_ciu_te  = y_ciu [train_idx], y_ciu [test_idx]
groups_tr, groups_te = groups[train_idx], groups[test_idx]

print(f"Train tokens: {len(X_train)} | Test tokens: {len(X_test)}")
print(f"Train samples: {X_train['sample_id'].nunique()} | Test samples: {X_test['sample_id'].nunique()}")
# ---- Models to train ----
def model_zoo():
    """Return a fresh ``{name: unfitted classifier}`` mapping for one training run.

    New estimator objects are created on every call, so callers never share fitted
    state. Insertion order (and therefore iteration order) is: SVM-linear, SVM-rbf,
    RandomForest, DecisionTree, KNN. Estimators that accept ``random_state`` are
    seeded with the module-level ``SEED``.
    """
    zoo = {}
    zoo['SVM-linear'] = SVC(
        kernel='linear', probability=True, class_weight='balanced', random_state=SEED
    )
    zoo['SVM-rbf'] = SVC(
        kernel='rbf', probability=True, class_weight='balanced',
        C=2.0, gamma='scale', random_state=SEED
    )
    zoo['RandomForest'] = RandomForestClassifier(
        n_estimators=300, max_depth=None, class_weight='balanced_subsample',
        n_jobs=-1, random_state=SEED
    )
    zoo['DecisionTree'] = DecisionTreeClassifier(
        max_depth=None, class_weight='balanced', random_state=SEED
    )
    zoo['KNN'] = KNeighborsClassifier(n_neighbors=15, weights='distance')
    return zoo

# ---- Training/Evaluation helper ----
def train_eval(task_name, y_tr, y_te):
    """Fit, save and evaluate every model from ``model_zoo()`` for one binary task.

    Reads the module-level ``X_train`` / ``X_test`` frames, ``preprocessor``,
    ``SAVE_DIR`` and ``STAMP``. For each model a pipeline (preprocessor + classifier)
    is fitted on the training tokens, written to
    ``{SAVE_DIR}/{task_name}_{model}_{STAMP}.joblib`` and scored on the test tokens.

    Parameters
    ----------
    task_name : str
        Label used in file names, printed output and the ``task`` column.
    y_tr, y_te : array-like of int
        Binary targets (1 = positive class) aligned with ``X_train`` / ``X_test``.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``task, model, acc, prec, rec, f1, auc``.
        ``auc`` is NaN when it cannot be computed (e.g. a single class in ``y_te``).
    """
    # Local import keeps this block self-contained; sklearn.base is already a
    # dependency of this notebook.
    from sklearn.base import clone

    feature_cols = ['token','prev1','next1']
    X_tr = X_train[feature_cols]
    X_te = X_test[feature_cols]

    # Create the output directory before any (slow) fit so a missing folder cannot
    # discard a finished model.
    os.makedirs(SAVE_DIR, exist_ok=True)

    results = []
    for name, clf in model_zoo().items():
        pipe = Pipeline(steps=[
            # Clone so every saved pipeline owns its own fitted preprocessor instead
            # of all pipelines refitting the same shared global object in place.
            ('prep', clone(preprocessor)),
            # Note: TF-IDF gives sparse normalized features; numeric feats remain sparse via HandcraftedFeats
            ('clf', clf)
        ])
        pipe.fit(X_tr, y_tr)

        # --- SAVE the fitted pipeline ---
        model_path = os.path.join(SAVE_DIR, f"{task_name}_{name}_{STAMP}.joblib")
        dump(pipe, model_path, compress=("xz", 3))
        print("Model saved to:", model_path)

        # Evaluate
        yhat = pipe.predict(X_te)

        # Scores for ROC-AUC: prefer the probability of label 1, located through
        # classes_ rather than assuming it sits in column 1.
        fitted = pipe.named_steps['clf']
        classes = list(getattr(fitted, 'classes_', []))
        if hasattr(fitted, 'predict_proba') and 1 in classes:
            yscore = pipe.predict_proba(X_te)[:, classes.index(1)]
        elif hasattr(fitted, 'decision_function'):
            # ROC-AUC depends only on the ranking of the scores, so the raw margins
            # can be used without rescaling.
            yscore = pipe.decision_function(X_te)
        else:
            yscore = (yhat == 1).astype(float)

        acc = accuracy_score(y_te, yhat)
        prec, rec, f1, _ = precision_recall_fscore_support(y_te, yhat, average='binary', zero_division=0)
        try:
            auc = roc_auc_score(y_te, yscore)
        except ValueError:
            # Undefined AUC (e.g. only one class present in y_te): report NaN,
            # but let unexpected error types surface.
            auc = float("nan")
        # Fix the label order so the matrix is always 2x2 and matches the legend below.
        cm = confusion_matrix(y_te, yhat, labels=[0, 1])

        print(f"\n[{task_name} | {name}]  Acc={acc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  F1={f1:.4f}  AUC={auc:.4f}")
        print("Confusion matrix [[TN FP],[FN TP]]:\n", cm)

        results.append({
            'task': task_name, 'model': name,
            'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'auc': auc
        })
    return pd.DataFrame(results)

print("\n=== WORD vs NOT_WORD ===")
res_word = train_eval('WORD', y_word_tr, y_word_te)

print("\n=== CIU vs NOT_CIU ===")
res_ciu  = train_eval('CIU',  y_ciu_tr,  y_ciu_te)

print("\n=== Summary ===")
# Best F1 first within each task. Built as one chain so re-running the cell never
# depends on an earlier in-place mutation.
summary = (
    pd.concat([res_word, res_ciu], ignore_index=True)
      .sort_values(['task','f1'], ascending=[True, False])
      .reset_index(drop=True)
)
# A bare `summary` expression is only rendered when it is the last statement of a
# cell, so print the table explicitly.
print(summary.to_string(index=False))

metrics_path = os.path.join(SAVE_DIR, f"classifier_metrics_{STAMP}.csv")
summary.to_csv(metrics_path, index=False)
print("Metrics saved to:", metrics_path)

Train tokens: 1755 | Test tokens: 222
Train samples: 28 | Test samples: 7

=== WORD vs NOT_WORD ===

[WORD | SVM-linear]  Acc=0.9955  Prec=0.9954  Rec=1.0000  F1=0.9977  AUC=0.9946
Confusion matrix [[TN FP],[FN TP]]:
 [[  5   1]
 [  0 216]]

[WORD | SVM-rbf]  Acc=0.9955  Prec=0.9954  Rec=1.0000  F1=0.9977  AUC=0.9969
Confusion matrix [[TN FP],[FN TP]]:
 [[  5   1]
 [  0 216]]

[WORD | RandomForest]  Acc=0.9955  Prec=0.9954  Rec=1.0000  F1=0.9977  AUC=0.9892
Confusion matrix [[TN FP],[FN TP]]:
 [[  5   1]
 [  0 216]]

[WORD | DecisionTree]  Acc=0.9955  Prec=0.9954  Rec=1.0000  F1=0.9977  AUC=0.9167
Confusion matrix [[TN FP],[FN TP]]:
 [[  5   1]
 [  0 216]]

[WORD | KNN]  Acc=0.9955  Prec=0.9954  Rec=1.0000  F1=0.9977  AUC=0.9136
Confusion matrix [[TN FP],[FN TP]]:
 [[  5   1]
 [  0 216]]

=== CIU vs NOT_CIU ===

[CIU | SVM-linear]  Acc=0.7883  Prec=0.8678  Rec=0.8629  F1=0.8653  AUC=0.7463
Confusion matrix [[TN FP],[FN TP]]:
 [[ 24  23]
 [ 24 151]]

[CIU | SVM-rbf]  Acc=0.8063  Prec=0.