## Import Libraries

In [None]:
import os
import glob
import re
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.optim import AdamW

# Compatibility shim: when this torch build's `torch.utils._pytree` module has
# no `register_pytree_node` attribute, install a do-nothing callable under that
# name. NOTE(review): presumably needed by the `transformers` import that
# follows -- confirm against the installed torch/transformers versions.
_pytree_module = torch.utils._pytree
if not hasattr(_pytree_module, 'register_pytree_node'):
    setattr(_pytree_module, 'register_pytree_node', lambda *args, **kwargs: None)

from transformers import (
    BertTokenizer, 
    BertForSequenceClassification, 
    get_linear_schedule_with_warmup
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed the torch and NumPy random number generators so runs are repeatable
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Seed the CUDA generator as well when running on a GPU
if torch.cuda.is_available():
    torch.cuda.manual_seed(RANDOM_SEED)

## Configuration

In [None]:
# Data configuration
# Root folder that contains the 'literate' and 'illiterate' sub-folders.
DATA_PATH = "Organized Normalized Tumbling Window DGMs (3s)/all"
# Not referenced elsewhere in the visible notebook; kept for compatibility.
FEATURE_COLUMNS = None

# BERT configuration
MODEL_NAME = 'bert-base-uncased'  # pretrained checkpoint for tokenizer and model
MAX_LENGTH = 128                  # tokens per sample; longer inputs are truncated
BATCH_SIZE = 4                    # samples per DataLoader batch
LEARNING_RATE = 2e-5              # AdamW learning rate
NUM_EPOCHS = 3                    # training epochs per cross-validation fold
WARMUP_STEPS = 50                 # warm-up steps of the linear LR schedule
MAX_GRAD_NORM = 1.0               # gradient-norm clipping threshold
# Micro-batches accumulated per optimizer step (1 = no accumulation).
# The cross-validation loop reads this constant; it was previously undefined,
# which raised a NameError as soon as training started.
GRADIENT_ACCUMULATION_STEPS = 1


## Data Loading and Preprocessing with User ID Extraction

The DGM data is organized as:
```
all/
├── literate/
│   ├── user_1/
│   │   ├── user_1_question_10_ID_29_tumbling_all_window_DGMs.csv
│   │   ├── user_1_question_12_ID_15_tumbling_all_window_DGMs.csv
│   │   └── ...
│   ├── user_2/
│   └── ...
└── illiterate/
    ├── user_1/
    └── ...
```

In [None]:
class DGMDataset(Dataset):
    """Dataset of per-file DGM feature vectors rendered as text for BERT.

    Every CSV file found under ``<data_path>/<label>/user_<id>/`` becomes one
    sample: its columns are coerced to numeric (non-numeric cells become 0)
    and averaged column-wise into a single feature vector. The vectors are
    optionally min-max normalized and then rendered as a space-separated
    string of 3-decimal numbers, which is tokenized on access.

    Samples are loaded in a deterministic order (label folder, then numeric
    user id, then CSV path) so that seeded runs see the same sample indices
    regardless of the order in which the filesystem lists entries.

    Args:
        data_path: Root folder containing the ``literate`` and ``illiterate``
            sub-folders.
        tokenizer: Callable used in ``__getitem__`` to tokenize the text; it
            is called with the Hugging Face tokenizer keyword arguments.
        max_length: Token length every sample is padded/truncated to.
        normalize: When True, min-max normalize each feature column.

    NOTE(review): normalization statistics are computed over every loaded
    sample, so any train/test split made afterwards shares those statistics.
    NOTE(review): the user id is taken from the folder name only; if the two
    label folders both contain e.g. ``user_1`` they share one id -- confirm
    that ids are unique across labels.
    """

    def __init__(self, data_path: str, tokenizer, max_length: int = 512,
                 normalize: bool = True):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.normalize = normalize

        # Parallel lists, one entry per loaded CSV file
        self.samples = []    # CSV file paths
        self.labels = []     # class index (0 = literate, 1 = illiterate)
        self.features = []   # aggregated feature vectors (1-D numpy arrays)
        self.user_ids = []   # integer id parsed from the user folder name

        self._load_data()
        self._preprocess_features()

    @staticmethod
    def _aggregate_csv(csv_file):
        """Return the column-wise mean of one CSV file as a 1-D array."""
        df = pd.read_csv(csv_file)

        # Coerce every column to numeric; anything unparsable becomes 0.
        numeric_df = df.apply(pd.to_numeric, errors='coerce').fillna(0.0)

        # A file without columns yields an empty feature vector
        if numeric_df.shape[1] == 0:
            return np.array([])
        return numeric_df.mean(axis=0).values

    def _load_data(self):
        """Load CSV files from organized folder structure and extract user IDs
        Treat each CSV file as one sample by aggregating its rows (column-wise mean).
        """
        print("Loading DGM data files...")

        # Define label mapping
        label_map = {'literate': 0, 'illiterate': 1}
        user_pattern = re.compile(r'user_(\d+)')

        for label_name, label_value in label_map.items():
            label_path = os.path.join(self.data_path, label_name)

            if not os.path.exists(label_path):
                print(f"Warning: {label_path} does not exist")
                continue

            # Collect (user_id, folder) pairs for every 'user_<n>' directory
            users = []
            for entry in os.listdir(label_path):
                if not entry.startswith('user_'):
                    continue
                if not os.path.isdir(os.path.join(label_path, entry)):
                    continue
                match = user_pattern.search(entry)
                if match:
                    users.append((int(match.group(1)), entry))

            # os.listdir order is arbitrary; sort by numeric id (then name)
            # so sample indices are reproducible.
            users.sort()

            for user_id, user_folder in users:
                user_path = os.path.join(label_path, user_folder)
                # Escape the directory so characters such as '[' in the path
                # are not treated as glob wildcards; sort for a stable order.
                pattern = os.path.join(glob.escape(user_path), '*.csv')

                for csv_file in sorted(glob.glob(pattern)):
                    try:
                        agg_features = self._aggregate_csv(csv_file)
                    except Exception as e:
                        # Best effort: report the unreadable file and move on
                        print(f"Error loading {csv_file}: {e}")
                        continue

                    # Append one sample per file
                    self.samples.append(csv_file)
                    self.labels.append(label_value)
                    self.features.append(agg_features)
                    self.user_ids.append(user_id)

        print(f"Loaded {len(self.samples)} samples from {len(set(self.user_ids))} users")
        print(f"Label distribution: {pd.Series(self.labels).value_counts().to_dict()}")

    def _preprocess_features(self):
        """Normalize features and convert to text"""
        print("Preprocessing features...")

        # Stack the vectors into one 2-D array, zero-padding shorter vectors
        # on the right up to the longest vector's length.
        width = max((len(vec) for vec in self.features), default=0)
        features_array = np.zeros((len(self.features), width), dtype=float)
        for row, vec in zip(features_array, self.features):
            row[:len(vec)] = vec

        if self.normalize and features_array.size:
            # Replace NaN / infinities before computing the column ranges
            features_array = np.nan_to_num(features_array, nan=0.0, posinf=1.0, neginf=0.0)

            # Min-max normalization per feature column
            min_vals = features_array.min(axis=0)
            range_vals = features_array.max(axis=0) - min_vals
            # Constant columns have zero range; divide by 1 instead
            range_vals[range_vals == 0] = 1.0
            features_array = (features_array - min_vals) / range_vals

        # Convert features to text representation (rounded to 3 decimals)
        self.text_features = [
            ' '.join(f"{val:.3f}" for val in row) for row in features_array
        ]

        print("Feature preprocessing complete")

    def get_user_ids(self):
        """Return list of all user IDs"""
        return self.user_ids

    def get_unique_users(self):
        """Return sorted list of unique user IDs"""
        return sorted(set(self.user_ids))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (flattened
            tokenizer outputs), ``labels`` and ``user_id`` (long tensors).
        """
        text = self.text_features[idx]
        label = self.labels[idx]
        user_id = self.user_ids[idx]

        # Tokenize text, padding/truncating to exactly max_length tokens
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer returns a leading batch dimension; drop it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
            'user_id': torch.tensor(user_id, dtype=torch.long)
        }

## Initialize Dataset and Tokenizer

In [None]:
# Load the pretrained tokenizer that matches the configured BERT checkpoint
print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Build the dataset: reads every CSV under DATA_PATH and prepares the text
print("\nLoading dataset...")
dataset = DGMDataset(DATA_PATH, tokenizer, max_length=MAX_LENGTH, normalize=True)

# Report what was loaded
print("\nDataset loaded successfully!")
print(f"Total samples: {len(dataset)}")
print(
    f"Number of unique users: {len(dataset.get_unique_users())}",
    f"User IDs: {dataset.get_unique_users()}",
    sep="\n",
)

## Training and Evaluation Functions

In [None]:
def train_epoch(model, dataloader, optimizer, scheduler, device, gradient_accumulation_steps=1):
    """Train for one epoch.

    Runs one pass over ``dataloader``, accumulating gradients over
    ``gradient_accumulation_steps`` batches before each optimizer/scheduler
    step. Gradients are clipped to the module-level ``MAX_GRAD_NORM`` before
    every optimizer step.

    The last batch of the epoch always triggers an optimizer step, so the
    gradients of a trailing, shorter accumulation window are applied instead
    of being discarded by the next epoch's ``zero_grad``. (Those gradients
    are still scaled by ``1 / gradient_accumulation_steps``, so that final
    update is proportionally smaller.)

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        dataloader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors) that supports ``len``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per optimizer step.
        device: Device the batch tensors are moved to.
        gradient_accumulation_steps: Batches per optimizer step (>= 1).

    Returns:
        ``(avg_loss, accuracy)``: mean unscaled batch loss and the training
        accuracy over all batches seen in this epoch.

    Raises:
        ValueError: If ``gradient_accumulation_steps`` is smaller than 1.
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    if gradient_accumulation_steps < 1:
        raise ValueError("gradient_accumulation_steps must be >= 1")

    model.train()
    total_loss = 0.0
    predictions = []
    true_labels = []
    num_batches = len(dataloader)

    optimizer.zero_grad()

    progress_bar = tqdm(dataloader, desc="Training")
    for step, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        # Report the unscaled loss; back-propagate the scaled one so the
        # accumulated gradient is an average over the window.
        batch_loss = outputs.loss.item()
        (outputs.loss / gradient_accumulation_steps).backward()

        window_complete = (step + 1) % gradient_accumulation_steps == 0
        last_batch = (step + 1) == num_batches
        if window_complete or last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        total_loss += batch_loss

        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

        progress_bar.set_postfix({'loss': batch_loss})

    avg_loss = total_loss / num_batches
    accuracy = accuracy_score(true_labels, predictions)

    return avg_loss, accuracy

def evaluate(model, dataloader, device):
    """Run the model over ``dataloader`` without gradients and score it.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        dataloader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors).
        device: Device the batch tensors are moved to.

    Returns:
        dict with ``accuracy``, macro-averaged ``precision``, ``recall`` and
        ``f1`` (zero_division=0), plus the flat ``predictions`` and
        ``true_labels`` lists in dataloader order.
    """
    model.eval()
    y_pred, y_true = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            # Predicted class = index of the largest logit per sample
            batch_logits = model(input_ids=ids, attention_mask=mask).logits
            batch_preds = torch.argmax(batch_logits, dim=1)

            y_pred.extend(batch_preds.cpu().numpy())
            y_true.extend(targets.cpu().numpy())

    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1_score, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )

    return dict(
        accuracy=acc,
        precision=prec,
        recall=rec,
        f1=f1_score,
        predictions=y_pred,
        true_labels=y_true,
    )

## Leave-One-User-Out Cross-Validation

For each user:
1. Use that user's data as test set
2. Use all other users' data as training set
3. Fine-tune a fresh copy of the pretrained BERT model (weights are reloaded for every fold)
4. Evaluate on held-out user
5. Store results

In [None]:
# Get unique users
unique_users = dataset.get_unique_users()
all_user_ids = dataset.get_user_ids()

# Micro-batches accumulated per optimizer step. Falls back to 1 (no
# accumulation) when the configuration cell does not define
# GRADIENT_ACCUMULATION_STEPS, instead of failing with a NameError.
grad_accum_steps = max(1, int(globals().get('GRADIENT_ACCUMULATION_STEPS', 1)))

print(f"Starting Leave-One-User-Out Cross-Validation")
print(f"Number of folds: {len(unique_users)}")
print(f"Unique users: {unique_users}")

# Store results
louo_results = []      # one metrics dict per evaluated fold
all_predictions = []   # one dict per test sample across all folds

# Perform LOUO CV: each user in turn is held out as the test set
for fold_idx, test_user in enumerate(unique_users):
    print(f"\n{'='*80}")
    print(f"Fold {fold_idx + 1}/{len(unique_users)} - Testing on User {test_user}")
    print(f"{'='*80}")

    # Split data by user id
    train_indices = [i for i, uid in enumerate(all_user_ids) if uid != test_user]
    test_indices = [i for i, uid in enumerate(all_user_ids) if uid == test_user]

    print(f"Training samples: {len(train_indices)}")
    print(f"Test samples: {len(test_indices)}")

    # With a single user there is nothing left to train on; skip the fold
    # rather than crash on an empty training loader.
    if not train_indices:
        print("Skipping fold: no training samples remain when this user is held out")
        continue

    # Create subsets
    train_dataset = Subset(dataset, train_indices)
    test_dataset = Subset(dataset, test_indices)

    # Create dataloaders (the test loader is not shuffled so predictions
    # line up with test_indices)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Initialize model: reload the pretrained weights for every fold
    model = BertForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=2
    ).to(device)

    # Initialize optimizer and scheduler. The scheduler is stepped once per
    # optimizer step, so count optimizer steps (not batches) when gradient
    # accumulation is in use.
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    optimizer_steps_per_epoch = (len(train_loader) + grad_accum_steps - 1) // grad_accum_steps
    total_steps = optimizer_steps_per_epoch * NUM_EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_STEPS,
        num_training_steps=total_steps
    )

    # Training loop
    print(f"\nTraining for {NUM_EPOCHS} epochs...")
    for epoch in range(NUM_EPOCHS):
        print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")
        train_loss, train_acc = train_epoch(
            model, train_loader, optimizer, scheduler, device,
            gradient_accumulation_steps=grad_accum_steps
        )
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")

        # Clear cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Evaluate on test set
    print(f"\nEvaluating on User {test_user}...")
    test_results = evaluate(model, test_loader, device)

    print(f"\nUser {test_user} Results:")
    print(f"  Accuracy:  {test_results['accuracy']:.4f}")
    print(f"  Precision: {test_results['precision']:.4f}")
    print(f"  Recall:    {test_results['recall']:.4f}")
    print(f"  F1-Score:  {test_results['f1']:.4f}")

    # Store results
    louo_results.append({
        'user_id': test_user,
        'accuracy': test_results['accuracy'],
        'precision': test_results['precision'],
        'recall': test_results['recall'],
        'f1': test_results['f1'],
        'n_samples': len(test_indices)
    })

    # Store predictions together with the dataset index they belong to
    for sample_idx, pred, true in zip(
        test_indices, test_results['predictions'], test_results['true_labels']
    ):
        all_predictions.append({
            'user_id': test_user,
            'sample_idx': sample_idx,
            'true_label': int(true),
            'predicted_label': int(pred)
        })

    # Clean up so the next fold starts with free GPU memory
    del model, optimizer, scheduler
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print(f"\n{'='*80}")
print("Leave-One-User-Out Cross-Validation Complete!")
print(f"{'='*80}")

## Aggregate Results and Analysis

In [None]:
# Tabulate the per-fold metrics and the per-sample predictions
results_df = pd.DataFrame(louo_results)
predictions_df = pd.DataFrame(all_predictions)

# Mean and standard deviation of each metric across folds
mean_accuracy, std_accuracy = results_df['accuracy'].mean(), results_df['accuracy'].std()
mean_precision, std_precision = results_df['precision'].mean(), results_df['precision'].std()
mean_recall, std_recall = results_df['recall'].mean(), results_df['recall'].std()
mean_f1, std_f1 = results_df['f1'].mean(), results_df['f1'].std()

print("\nAggregate Results (Mean ± Std):")
for metric_label, metric_mean, metric_std in (
    ("Accuracy", mean_accuracy, std_accuracy),
    ("Precision", mean_precision, std_precision),
    ("Recall", mean_recall, std_recall),
    ("F1-Score", mean_f1, std_f1),
):
    # Left-align "<label>:" in 10 columns so the numbers line up
    print(f"{metric_label + ':':<10} {metric_mean:.4f} ± {metric_std:.4f}")

# Per-user table, best accuracy first
print("\nPer-User Results:")
per_user_table = results_df.sort_values('accuracy', ascending=False)
print(per_user_table.to_string(index=False))

# Write both tables to CSV in the working directory
for frame, out_path in (
    (results_df, 'bert_dgms_3s_louo_results.csv'),
    (predictions_df, 'bert_dgms_3s_louo_predictions.csv'),
):
    frame.to_csv(out_path, index=False)
print("\nResults saved to:")
print("  - bert_dgms_3s_louo_results.csv")
print("  - bert_dgms_3s_louo_predictions.csv")

## Visualization

In [None]:
# 2x2 grid of histograms: one panel per metric, with the mean marked in red
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('BERT DGMs (3s) - Leave-One-User-Out Cross-Validation Results', fontsize=16)

# (axis, results_df column, display label, bar color, mean across folds)
histogram_panels = [
    (axes[0, 0], 'accuracy', 'Accuracy', 'skyblue', mean_accuracy),
    (axes[0, 1], 'precision', 'Precision', 'lightgreen', mean_precision),
    (axes[1, 0], 'recall', 'Recall', 'lightcoral', mean_recall),
    (axes[1, 1], 'f1', 'F1-Score', 'plum', mean_f1),
]

for panel_ax, column, display_name, bar_color, mean_value in histogram_panels:
    panel_ax.hist(results_df[column], bins=20, color=bar_color, edgecolor='black')
    panel_ax.axvline(
        mean_value,
        color='red',
        linestyle='--',
        linewidth=2,
        label=f'Mean: {mean_value:.3f}',
    )
    panel_ax.set_xlabel(display_name)
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(f'{display_name} Distribution')
    panel_ax.legend()

plt.tight_layout()
plt.savefig('bert_dgms_3s_louo_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("Visualization saved to: bert_dgms_3s_louo_distribution.png")

## Box Plot of Metrics

In [None]:
# One box per metric, showing its spread across the held-out users
plt.figure(figsize=(10, 6))
metrics_data = results_df[['accuracy', 'precision', 'recall', 'f1']]

bp = plt.boxplot(
    [metrics_data[column] for column in ('accuracy', 'precision', 'recall', 'f1')],
    labels=['Accuracy', 'Precision', 'Recall', 'F1-Score'],
    patch_artist=True,  # boxes are drawn as patches so they can be filled
)

# Fill each box with the color used for that metric's histogram
colors = ['skyblue', 'lightgreen', 'lightcoral', 'plum']
for box_patch, fill_color in zip(bp['boxes'], colors):
    box_patch.set_facecolor(fill_color)

plt.title('BERT DGMs (3s) - Performance Metrics Distribution (LOUO)')
plt.ylabel('Score')
plt.ylim(0, 1.05)
plt.grid(axis='y', alpha=0.3)
plt.savefig('bert_dgms_3s_louo_boxplot.png', dpi=300, bbox_inches='tight')
plt.show()

print("Box plot saved to: bert_dgms_3s_louo_boxplot.png")

## Statistical Summary

In [None]:
# Descriptive statistics of the four metrics across folds
print("\nDetailed Statistical Summary:")
print("=" * 60)
summary_stats = results_df[['accuracy', 'precision', 'recall', 'f1']].describe()
print(summary_stats)

print(f"\n{'=' * 60}")
# Best and worst five held-out users by accuracy
for heading, select_rows in (
    ("Top 5 Users by Accuracy:", results_df.nlargest),
    ("\nBottom 5 Users by Accuracy:", results_df.nsmallest),
):
    print(heading)
    print(select_rows(5, 'accuracy')[['user_id', 'accuracy', 'f1', 'n_samples']])