In [1]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, BertConfig

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
custom_config = BertConfig(
    vocab_size=30522,               
    num_attention_heads=12,        
    num_hidden_layers=12,          
    attention_probs_dropout_prob=0.1,  
    hidden_size=768,               
    intermediate_size=3072,        
    hidden_dropout_prob=0.1,       
    hidden_act="relu",             
    max_position_embeddings=512    
)
bert_model = TFBertModel(custom_config)

data = pd.read_csv("C:\\Users\\lclai\\Desktop\\research LDIG\\data_dementia\\clean\\pitt_clean.csv")
data = data[["label", "clean_transcripts"]]

data = data[(data['label'] == 0) | (data['label'] == 1)]

def preprocess_and_get_bert_embeddings(text, tokenizer, model, max_length=512):
    inputs = tokenizer(
        text,
        return_tensors="tf",
        padding='max_length',  
        truncation=True,
        max_length=max_length
    )
    
    bert_outputs = model(**inputs)
    return bert_outputs.last_hidden_state, bert_outputs.pooler_output 

input_ids = []
attention_masks = []
last_hidden_states = []
pooler_outputs = []

for text in data['clean_transcripts']:
    last_hidden_state, pooler_output = preprocess_and_get_bert_embeddings(text, tokenizer, bert_model)
    last_hidden_states.append(last_hidden_state.numpy())  
    pooler_outputs.append(pooler_output.numpy())

last_hidden_states_tensor = tf.convert_to_tensor(last_hidden_states, dtype=tf.float32)
pooler_outputs_tensor = tf.convert_to_tensor(pooler_outputs, dtype=tf.float32)

print(f"Last Hidden States Tensor Shape: {last_hidden_states_tensor.shape}")
print(f"Pooler Outputs Tensor Shape: {pooler_outputs_tensor.shape}")


  from .autonotebook import tqdm as notebook_tqdm




Last Hidden States Tensor Shape: (496, 1, 512, 768)
Pooler Outputs Tensor Shape: (496, 1, 768)


In [15]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

df = pd.read_csv("C:\\Users\\lclai\\Desktop\\research LDIG\\data_dementia\\clean\\pitt_clean.csv")
df = df[(df['label'] == 0) | (df['label'] == 1)]

X_bert = last_hidden_states_tensor[:, 0, :].numpy()

bert_pooled = np.mean(X_bert, axis=1) 

numeric = np.hstack((df[['age']].to_numpy(), bert_pooled))  
    
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(numeric)
    
df["sex"] = df["sex"].map({"M": 0, "F": 1})
sex = df[['sex']].to_numpy()  

X_train = np.hstack((sex, scaled_numeric))

In [16]:
X_train

array([[ 1.        , -1.18781324, -0.65208057, ..., -0.14898256,
         1.00249698,  0.18825872],
       [ 1.        , -1.07372429, -0.19957252, ...,  0.40940099,
         1.06490486, -0.52115697],
       [ 1.        , -0.95963533, -1.19929472, ..., -0.13240364,
         0.07498048,  0.16304884],
       ...,
       [ 1.        ,  0.63761005,  0.33020389, ...,  0.09378595,
         0.1386087 , -0.55675863],
       [ 0.        ,  0.97987692, -1.3302383 , ..., -1.53522748,
         0.11022139,  1.17908664],
       [ 0.        ,  1.20805483,  0.41988061, ...,  1.64007031,
         0.0579432 , -0.72321097]])

In [3]:
y = df['label']

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score

models = {
    'Random Forest': RandomForestClassifier(random_state=42),
  #  'XGBoost': xgb.XGBClassifier(random_state=42)
}

param_grids = {
    'Random Forest': {
        'n_estimators': [200, 500, 1000], 
        'max_depth': [None, 10, 20], 
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],  
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200],  
        'learning_rate': [0.01, 0.1, 0.2], 
        'max_depth': [3, 6, 10],
        'subsample': [0.8, 1.0], 
    }
}

results = {}

for model_name in models:
    model = models[model_name]
    param_grid = param_grids[model_name]
    
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
    
    grid_search.fit(X_train, y)
    
    best_model = grid_search.best_estimator_

    auc = grid_search.best_score_
    
    results[model_name] = {
        'best_params': grid_search.best_params_,
        'mean_auc': auc
    }

for model_name, result in results.items():
    print(f"Model: {model_name}")
    print(f"Best Params: {result['best_params']}")
    print(f"Mean AUC: {result['mean_auc']:.4f}\n")

Model: Random Forest
Best Params: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
Mean AUC: 0.8434



In [28]:
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

y = np.array(y)

n_splits = 5 
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

auc_scores = []

for train_idx, val_idx in kf.split(X_train, y):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]
    
    xgb_clf = xgb.XGBClassifier(
        n_estimators=200, max_depth=10, learning_rate=0.1, subsample=0.8,
        random_state=42
    )
    xgb_clf.fit(X_train_fold, y_train_fold)
    pred_xgb = xgb_clf.predict_proba(X_val_fold)[:, 1]
    
    rf_clf = RandomForestClassifier(
        n_estimators=200, max_depth=10, min_samples_split=2, min_samples_leaf=4,
        max_features="sqrt", random_state=42
    )
    rf_clf.fit(X_train_fold, y_train_fold)
    pred_rf = rf_clf.predict_proba(X_val_fold)[:, 1]
    
    y_pred_proba_avg = (pred_xgb + pred_rf) / 2
    auc_score = roc_auc_score(y_val_fold, y_pred_proba_avg)
    
    auc_scores.append(auc_score)
    print(f"Fold AUC Score: {auc_score:.4f}")

print(f"\nMean AUC Score (XGBoost + RF, {n_splits}-fold CV): {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")


Fold AUC Score: 0.8383
Fold AUC Score: 0.9535
Fold AUC Score: 0.7955
Fold AUC Score: 0.9020
Fold AUC Score: 0.8170

Mean AUC Score (XGBoost + RF, 5-fold CV): 0.8613 ± 0.0583


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

auc_scores = []

for train_idx, val_idx in kf.split(X_train, y):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]
    
    softmax = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1, max_iter=200, random_state=42)
    softmax.fit(X_train_fold, y_train_fold)
    
    y_val_proba = softmax.predict_proba(X_val_fold)[:, 1]


    auc = roc_auc_score(y_val_fold, y_val_proba)
    auc_scores.append(auc)

mean_auc = np.mean(auc_scores)

print(f"Mean AUC Score (Softmax CV): {mean_auc:.4f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mean AUC Score (Softmax CV): 0.8989


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
class TextCNN(tf.keras.Model):
    def __init__(self, dropout_prob=0.5):
        super(TextCNN, self).__init__()
        
        # convolution kernels
        self.conv1 = tf.keras.layers.Conv2D(130, (5, 768), activation='relu')
        self.conv2 = tf.keras.layers.Conv2D(130, (10, 768), activation='relu')
        self.conv3 = tf.keras.layers.Conv2D(130, (15, 768), activation='relu')
        self.conv4 = tf.keras.layers.Conv2D(130, (20, 768), activation='relu')

        # max pooling
        self.pool = tf.keras.layers.GlobalMaxPooling2D()

        # fusion layer
        self.fc = tf.keras.layers.Dense(260, activation='relu')
        self.dropout = tf.keras.layers.Dropout(dropout_prob)

    def call(self, inputs):
        # input the output of the bert model
        x = tf.expand_dims(inputs, -1) 
    
        x1 = self.conv1(x)
        x2 = self.conv2(x)
        x3 = self.conv3(x)
        x4 = self.conv4(x)
        
        # pool the outputs of the convolution layers 
        pooled_1 = self.pool(x1)
        pooled_2 = self.pool(x2)
        pooled_3 = self.pool(x3)
        pooled_4 = self.pool(x4)
        
        # fusion of all the features
        fused_features = tf.concat([pooled_1, pooled_2, pooled_3, pooled_4], axis=-1)

        feature_vector = self.fc(fused_features)
        feature_vector = self.dropout(feature_vector)
        
        return feature_vector

textcnn_model = TextCNN()

In [30]:
import numpy as np
from sklearn.model_selection import train_test_split

X = last_hidden_states_tensor[:, 0, :].numpy()
y = data['label'].values

def obtain_features(sample):
    sample_with_batch_dim = tf.expand_dims(sample, axis=0)  # (1, 512, 768)
    sample_cnn = textcnn_model(sample_with_batch_dim)  # (1, features_cnn)
    return sample_cnn.numpy().squeeze()  

In [31]:
import numpy as np

X_features = []

for i in range(X.shape[0]):  
    X_features.append(obtain_features(X[i])) 

X_def = np.array(X_features)

In [None]:
X_def.shape

(496, 260)

In [32]:
X_final = np.hstack((X_numeric, X_def))

In [None]:
y.shape

(496,)

In [None]:
X_final.shape

(496, 262)

In [33]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
import numpy as np

clf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [500, 1000],  
    'max_depth': [10, 30],  
    'min_samples_split': [2, 5, 10],  
    'min_samples_leaf': [1, 5, 8],  
   
}

grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_final, y)

best_clf = grid_search.best_estimator_
y_pred_proba = best_clf.predict_proba(X_final)[:, 1]
auc_score = roc_auc_score(y, y_pred_proba)

print(f"Best parameters: {grid_search.best_params_}")
print(f"cross-validation AUC: {grid_search.best_score_:.4f}")


Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}
cross-validation AUC: 0.7324


In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

clf = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

param_grid = {
    'n_estimators': [300, 500], 
    'max_depth': [3, 5], 
    'learning_rate': [0.01, 0.1, 0.3],  
    'subsample': [0.8, 1.0],  
    'gamma': [0.1, 0.3] 
}

grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_final, y)

best_clf = grid_search.best_estimator_

y_pred = cross_val_predict(best_clf, X_final, y, cv=5)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation AUC: {grid_search.best_score_:.4f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Best parameters: {'gamma': 0.3, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'subsample': 1.0}
Best cross-validation AUC: 0.7384


In [35]:
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

n_splits = 5 
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

auc_scores = []

for train_idx, val_idx in kf.split(X_final, y):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]
    
    xgb_clf = xgb.XGBClassifier(
        n_estimators=300, max_depth=3, learning_rate=0.01, subsample=1,
        random_state=42
    )
    xgb_clf.fit(X_train_fold, y_train_fold)
    pred_xgb = xgb_clf.predict_proba(X_val_fold)[:, 1]
    
    rf_clf = RandomForestClassifier(
        n_estimators=500, max_depth=10, min_samples_split=10, min_samples_leaf=1,
        max_features="sqrt", random_state=42
    )
    rf_clf.fit(X_train_fold, y_train_fold)
    pred_rf = rf_clf.predict_proba(X_val_fold)[:, 1]
    
    y_pred_proba_avg = (pred_xgb + pred_rf) / 2
    auc_score = roc_auc_score(y_val_fold, y_pred_proba_avg)
    
    auc_scores.append(auc_score)
    print(f"Fold AUC Score: {auc_score:.4f}")

print(f"\nMean AUC Score (XGBoost + RF, {n_splits}-fold CV): {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")


Fold AUC Score: 0.7995
Fold AUC Score: 0.9233
Fold AUC Score: 0.7882
Fold AUC Score: 0.9044
Fold AUC Score: 0.8382

Mean AUC Score (XGBoost + RF, 5-fold CV): 0.8507 ± 0.0545
