## Import Libraries

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import (
    BertTokenizer, 
    BertForSequenceClassification, 
    AdamW, 
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Choose the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Seed NumPy and PyTorch (CPU, and CUDA when it is in use) with one shared
# value so data shuffling and weight initialisation are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if device.type == 'cuda':
    torch.cuda.manual_seed(RANDOM_SEED)

## Configuration

In [None]:
# Data configuration
# Root folder handed to EyeTrackingDataset as data_path; the loader looks for
# 'literate' and 'illiterate' subfolders of CSV files underneath it.
DATA_PATH = "DGMs v3/3s"
# None makes the dataset auto-detect every numeric column of the first CSV it finds.
FEATURE_COLUMNS = None

# BERT configuration
# Checkpoint name passed to both the tokenizer and the classification model.
MODEL_NAME = 'bert-base-uncased'
# max_length given to the tokenizer, which pads/truncates every sample to it.
MAX_LENGTH = 512

# Training configuration
BATCH_SIZE = 16  # samples per DataLoader batch
LEARNING_RATE = 2e-5  # learning rate given to the AdamW optimizer
NUM_EPOCHS = 5  # training epochs run inside each fold
WARMUP_STEPS = 100  # warm-up steps requested from the linear LR schedule
MAX_GRAD_NORM = 1.0  # gradient-norm clipping threshold used in train_epoch

# Cross-validation
N_SPLITS = 5  # number of StratifiedKFold folds


## Data Loading and Preprocessing

In [None]:
class EyeTrackingDataset(Dataset):
    """Eye-tracking CSV recordings rendered as text so BERT can classify them.

    ``data_path`` is expected to contain ``literate`` and ``illiterate``
    subfolders of CSV files (labels 1 and 0). Every CSV becomes one sample:
    its feature rows are optionally standardised and clipped to [-5, 5], then
    written out as space-separated, bracketed groups of up to five numbers.

    Args:
        data_path: Root folder holding the two label subfolders.
        feature_columns: Column names to read from each CSV, or ``None`` to use
            every numeric column of the first CSV found.
        tokenizer: Callable used by ``__getitem__`` to encode the text; it is
            called with ``truncation``, ``padding``, ``max_length`` and
            ``return_tensors`` keyword arguments.
        max_length: Pad/truncate length forwarded to the tokenizer.
        normalize: Whether to standardise and clip values before rendering.

    Raises:
        ValueError: If ``normalize`` is true and no sample could be loaded.

    NOTE(review): the normalisation statistics are a single mean/std pooled
    over every feature of every sample, computed before any train/validation
    split. Features on very different scales share one scale, and validation
    data influences the statistics. Behaviour is kept as-is here; consider
    per-feature, per-fold statistics.
    """

    def __init__(self, data_path, feature_columns, tokenizer, max_length=512, normalize=True):
        self.data_path = data_path
        self.feature_columns = feature_columns
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.normalize = normalize

        self.samples = []    # CSV path of each loaded sample
        self.labels = []     # 1 = literate, 0 = illiterate
        self.sequences = []  # one (rows, n_features) value array per sample

        self._load_data()
        self._preprocess_sequences()

    @staticmethod
    def _fill_missing(feature_data):
        """Impute NaNs with the column mean, or 0 when a column has no values."""
        filled = feature_data.fillna(feature_data.mean())
        # An all-NaN column has a NaN mean, so the first pass leaves it
        # untouched; without this fallback "nan" would be written into the text.
        return filled.fillna(0.0)

    @staticmethod
    def _sequence_to_text(seq, chunk_size=5):
        """Render a 2-D value array as bracketed groups of ``chunk_size`` numbers."""
        groups = []
        for time_step in seq:
            for start in range(0, len(time_step), chunk_size):
                chunk = time_step[start:start + chunk_size]
                groups.append("[" + " ".join(f"{val:.3f}" for val in chunk) + "]")
        return " ".join(groups)

    def _load_data(self):
        """Load CSV files and extract sequences"""
        print("Loading data files...")

        label_map = {'literate': 1, 'illiterate': 0}

        # Auto-detect the feature set from the first CSV found in any label folder.
        if self.feature_columns is None:
            for label_name in label_map:
                folder_path = os.path.join(self.data_path, label_name)
                if not os.path.exists(folder_path):
                    continue
                csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
                if csv_files:
                    sample_df = pd.read_csv(csv_files[0])
                    self.feature_columns = sample_df.select_dtypes(include=[np.number]).columns.tolist()
                    print(f"Auto-detected {len(self.feature_columns)} numeric columns")
                    print(f"Feature columns (first 10): {self.feature_columns[:10]}..." if len(self.feature_columns) > 10 else f"Feature columns: {self.feature_columns}")
                    break

        for label_name, label_value in label_map.items():
            folder_path = os.path.join(self.data_path, label_name)
            if not os.path.exists(folder_path):
                print(f"Warning: {folder_path} does not exist")
                continue

            csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
            print(f"Found {len(csv_files)} files in {label_name} folder")

            for csv_file in tqdm(csv_files, desc=f"Loading {label_name} files"):
                # Best-effort loading: an unreadable file or one lacking a
                # feature column is reported and skipped, not fatal.
                try:
                    df = pd.read_csv(csv_file)
                    feature_data = self._fill_missing(df[self.feature_columns])

                    if len(feature_data) > 0:
                        self.samples.append(csv_file)
                        self.labels.append(label_value)
                        self.sequences.append(feature_data.values)

                except Exception as e:
                    print(f"Error loading {csv_file}: {e}")
                    continue

        print(f"Loaded {len(self.samples)} samples total")
        print(f"Label distribution: {pd.Series(self.labels).value_counts().to_dict()}")

    def _preprocess_sequences(self):
        """Preprocess sequences for BERT input"""
        print("Preprocessing sequences...")

        if self.normalize:
            if not self.sequences:
                # np.concatenate([]) would fail with an opaque message.
                raise ValueError(
                    f"No samples were loaded from {self.data_path!r}; "
                    "cannot compute normalization statistics"
                )

            all_values = np.concatenate([seq.flatten() for seq in self.sequences])
            all_values = all_values[np.isfinite(all_values)]

            self.global_mean = np.mean(all_values)
            self.global_std = np.std(all_values)

            print(f"Global mean: {self.global_mean:.4f}, Global std: {self.global_std:.4f}")

        # Skip scaling when disabled or when the std is zero/NaN.
        apply_scaling = self.normalize and self.global_std > 0

        self.text_sequences = []

        for seq in tqdm(self.sequences, desc="Converting to text"):
            if apply_scaling:
                seq = np.clip((seq - self.global_mean) / self.global_std, -5, 5)
            self.text_sequences.append(self._sequence_to_text(seq))

        if self.text_sequences:
            print(f"Sample text sequence (first 200 chars): {self.text_sequences[0][:200]}...")
            print(f"Total tokens in sample: {len(self.text_sequences[0].split())}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return input ids, attention mask and label."""
        text = self.text_sequences[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # The tokenizer returns a leading batch axis of 1; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


## Initialize Tokenizer and Load Data

In [None]:
# Load the pretrained tokenizer that belongs to the chosen BERT checkpoint
print("Initializing BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Build the dataset; construction reads the CSV files and renders them to text
print("Creating dataset...")
dataset = EyeTrackingDataset(
    DATA_PATH,
    FEATURE_COLUMNS,
    tokenizer,
    max_length=MAX_LENGTH,
    normalize=True,
)

print("\nDataset created with {} samples".format(len(dataset)))

## Model Training Functions

In [None]:
def create_data_loader(dataset, batch_size, shuffle=True):
    """Wrap ``dataset`` in a ``DataLoader`` that loads in the main process.

    Args:
        dataset: Any map-style dataset (or ``Subset``) to batch.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples every epoch.

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = {
        'batch_size': batch_size,
        'shuffle': shuffle,
        'num_workers': 0,  # no worker processes, for Windows compatibility
    }
    return DataLoader(dataset, **loader_options)

def train_epoch(model, data_loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``data_loader``.

    Each batch is moved to ``device``, the gradients are clipped to the
    module-level ``MAX_GRAD_NORM``, and both the optimizer and the learning-rate
    scheduler are stepped once.

    Returns:
        ``(mean_loss, accuracy)``: the loss averaged over batches and the
        fraction of samples predicted correctly during the pass.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch in tqdm(data_loader, desc="Training"):
        targets = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=targets,
        )
        batch_loss = outputs.loss

        # Backpropagate, clip the gradient norm, then advance optimizer and schedule.
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
        optimizer.step()
        scheduler.step()

        # Bookkeeping uses the logits from the forward pass above.
        running_loss += batch_loss.item()
        hits = outputs.logits.argmax(dim=-1) == targets
        n_correct += hits.sum().item()
        n_seen += targets.size(0)

    return running_loss / len(data_loader), n_correct / n_seen

def evaluate_model(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Returns:
        A dict with the mean batch ``loss``, ``accuracy``, binary ``precision``,
        ``recall`` and ``f1``, plus the raw ``predictions`` and ``labels`` lists
        in loader order.
    """
    model.eval()
    loss_sum = 0
    predicted = []
    expected = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            targets = batch['labels'].to(device)
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=targets,
            )

            loss_sum += outputs.loss.item()
            predicted.extend(outputs.logits.argmax(dim=-1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    # Precision/recall/F1 are reported for the positive class (label 1).
    precision, recall, f1, _ = precision_recall_fscore_support(
        expected, predicted, average='binary'
    )

    metrics = {
        'loss': loss_sum / len(data_loader),
        'accuracy': accuracy_score(expected, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'predictions': predicted,
        'labels': expected,
    }
    return metrics

## Cross-Validation Training

In [None]:
# Prepare for cross-validation
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
labels = np.array(dataset.labels)

# Store results
fold_results = []
all_predictions = []
all_true_labels = []

# NOTE(review): the best epoch of each fold is selected on the same validation
# fold that is then reported, so the fold metrics are optimistically biased.
for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(dataset)), labels)):
    print(f"\n{'='*50}")
    print(f"FOLD {fold + 1}/{N_SPLITS}")
    print(f"{'='*50}")
    
    # Create train and validation datasets
    train_dataset = torch.utils.data.Subset(dataset, train_idx)
    val_dataset = torch.utils.data.Subset(dataset, val_idx)
    
    # Create data loaders
    train_loader = create_data_loader(train_dataset, BATCH_SIZE, shuffle=True)
    val_loader = create_data_loader(val_dataset, BATCH_SIZE, shuffle=False)
    
    print(f"Train samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")
    
    # Initialize a fresh pretrained model for every fold
    model = BertForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False
    )
    model.to(device)
    
    # Initialize optimizer and scheduler.
    # transformers.AdamW is deprecated (and removed in recent releases);
    # torch.optim.AdamW with weight_decay=0.0 reproduces its defaults.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=LEARNING_RATE, eps=1e-8, weight_decay=0.0
    )
    total_steps = len(train_loader) * NUM_EPOCHS
    # Cap warm-up at 10% of the run: with few training steps a fixed
    # WARMUP_STEPS can exceed total_steps, so the peak LR is never reached.
    warmup_steps = min(WARMUP_STEPS, total_steps // 10)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )
    
    # Training loop
    best_val_accuracy = float('-inf')  # guarantees the first epoch is recorded
    best_model_state = None            # reset per fold; never reuse a stale state
    
    for epoch in range(NUM_EPOCHS):
        print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")
        
        # Train
        train_loss, train_accuracy = train_epoch(
            model, train_loader, optimizer, scheduler, device
        )
        
        # Evaluate
        val_results = evaluate_model(model, val_loader, device)
        
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}")
        print(f"Val Loss: {val_results['loss']:.4f}, Val Acc: {val_results['accuracy']:.4f}")
        print(f"Val Precision: {val_results['precision']:.4f}, Val Recall: {val_results['recall']:.4f}, Val F1: {val_results['f1']:.4f}")
        
        # Save best model. state_dict() returns references to the live
        # parameters, so a dict-level .copy() would keep changing as training
        # continues; clone each tensor (onto the CPU to spare GPU memory).
        if val_results['accuracy'] > best_val_accuracy:
            best_val_accuracy = val_results['accuracy']
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
    
    # Load best model and get final evaluation
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    final_results = evaluate_model(model, val_loader, device)
    
    # Store results
    fold_results.append({
        'fold': fold + 1,
        'accuracy': final_results['accuracy'],
        'precision': final_results['precision'],
        'recall': final_results['recall'],
        'f1': final_results['f1']
    })
    
    all_predictions.extend(final_results['predictions'])
    all_true_labels.extend(final_results['labels'])
    
    print(f"\nFold {fold + 1} Final Results:")
    print(f"Accuracy: {final_results['accuracy']:.4f}")
    print(f"Precision: {final_results['precision']:.4f}")
    print(f"Recall: {final_results['recall']:.4f}")
    print(f"F1-Score: {final_results['f1']:.4f}")

## Results Analysis

In [None]:
# Collect the per-fold metric dicts into one table
results_df = pd.DataFrame(fold_results)

print(f"\n{'=' * 60}")
print("CROSS-VALIDATION RESULTS SUMMARY")
print(f"{'=' * 60}")

print("\nPer-Fold Results:")
print(results_df.to_string(index=False, float_format='%.4f'))

# Mean and sample standard deviation of each metric across folds
print("\nOverall Statistics:")
for metric in ('accuracy', 'precision', 'recall', 'f1'):
    scores = results_df[metric]
    print(f"{metric.capitalize()}: {scores.mean():.4f} ± {scores.std():.4f}")

# Pool the validation predictions of all folds into one confusion matrix
overall_accuracy = accuracy_score(all_true_labels, all_predictions)
cm = confusion_matrix(all_true_labels, all_predictions)

print(f"\nOverall Accuracy (all folds): {overall_accuracy:.4f}")
print("\nConfusion Matrix:")
print(cm)

## Visualization

In [None]:
# Two side-by-side panels: per-fold metric bars and the pooled confusion matrix
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Panel 1: four bars per fold, centred on the fold's tick position
x = np.arange(len(results_df))
width = 0.2
bar_specs = [
    (-1.5, 'accuracy', 'Accuracy'),
    (-0.5, 'precision', 'Precision'),
    (0.5, 'recall', 'Recall'),
    (1.5, 'f1', 'F1-Score'),
]
for shift, column, legend_name in bar_specs:
    ax1.bar(x + shift * width, results_df[column], width, label=legend_name, alpha=0.8)

ax1.set_xlabel('Fold')
ax1.set_ylabel('Score')
ax1.set_title('Performance Metrics by Fold (DGMs v3 Data)')
ax1.set_xticks(x)
ax1.set_xticklabels([f'Fold {i+1}' for i in range(len(results_df))])
ax1.legend()
ax1.grid(True, alpha=0.3)

# Panel 2: confusion matrix; index 0 is Illiterate and index 1 is Literate
class_names = ['Illiterate', 'Literate']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax2,
    xticklabels=class_names,
    yticklabels=class_names,
)
ax2.set_title('Overall Confusion Matrix')
ax2.set_xlabel('Predicted')
ax2.set_ylabel('Actual')

plt.tight_layout()
plt.show()

# Separate figure: spread of each metric across the folds
plt.figure(figsize=(10, 6))
metrics_data = [results_df[column] for column in ('accuracy', 'precision', 'recall', 'f1')]
plt.boxplot(metrics_data, labels=['Accuracy', 'Precision', 'Recall', 'F1-Score'])
plt.title('Distribution of Metrics Across Folds (DGMs v3 Data)')
plt.ylabel('Score')
plt.grid(True, alpha=0.3)
plt.show()

## Sample Prediction Analysis

In [None]:
# Spot-check a handful of individual predictions
print("\nSample Predictions Analysis:")
print("-" * 40)

label_names = {0: 'Illiterate', 1: 'Literate'}

# Draw up to five distinct sample indices from the whole dataset
sample_indices = np.random.choice(len(dataset), size=min(5, len(dataset)), replace=False)

# Predictions come from `model`, i.e. the model left over from the last fold.
# NOTE(review): most drawn samples were in that model's training split, so
# these are illustrations rather than held-out results.
model.eval()
for idx in sample_indices:
    sample = dataset[idx]
    true_label = sample['labels'].item()

    with torch.no_grad():
        # Add a batch axis of size 1 before running the single sample
        logits = model(
            input_ids=sample['input_ids'].unsqueeze(0).to(device),
            attention_mask=sample['attention_mask'].unsqueeze(0).to(device),
        ).logits
        probabilities = torch.softmax(logits, dim=-1)
        predicted_label = logits.argmax(dim=-1).item()
        confidence = probabilities[0][predicted_label].item()

    verdict = '✓' if true_label == predicted_label else '✗'

    print(f"Sample {idx}:")
    print(f"  True Label: {label_names[true_label]}")
    print(f"  Predicted: {label_names[predicted_label]} (confidence: {confidence:.3f})")
    print(f"  Correct: {verdict}")
    print(f"  File: {dataset.samples[idx]}")
    print()

## Data Statistics and Insights

In [None]:
# Analyze sequence lengths and feature statistics
sequence_lengths = [len(seq) for seq in dataset.sequences]
text_lengths = [len(text.split()) for text in dataset.text_sequences]
# Whitespace-separated words are not what MAX_LENGTH limits: the tokenizer
# splits each number into several WordPiece tokens and adds [CLS]/[SEP].
# Count real tokens so the truncation figures below are meaningful.
wordpiece_lengths = [
    len(dataset.tokenizer.tokenize(text)) + 2 for text in dataset.text_sequences
]

print("Dataset Statistics:")
print(f"Number of samples: {len(dataset)}")
print(f"Number of features: {len(dataset.feature_columns)}")
print(f"Feature columns (first 10): {dataset.feature_columns[:10]}..." if len(dataset.feature_columns) > 10 else f"Feature columns: {dataset.feature_columns}")

print(f"\nSequence Length Statistics:")
print(f"Mean length: {np.mean(sequence_lengths):.1f}")
print(f"Median length: {np.median(sequence_lengths):.1f}")
print(f"Min length: {np.min(sequence_lengths)}")
print(f"Max length: {np.max(sequence_lengths)}")
print(f"Std deviation: {np.std(sequence_lengths):.1f}")

print(f"\nText Sequence Statistics:")
print(f"Mean text length (tokens): {np.mean(text_lengths):.1f}")
print(f"Max text length (tokens): {np.max(text_lengths)}")
print(f"Mean WordPiece length (incl. special tokens): {np.mean(wordpiece_lengths):.1f}")
print(f"Max WordPiece length (incl. special tokens): {np.max(wordpiece_lengths)}")
# A sample is truncated when its WordPiece length exceeds MAX_LENGTH.
print(f"Sequences exceeding BERT max length ({MAX_LENGTH}): {sum(1 for length in wordpiece_lengths if length > MAX_LENGTH)}")

# Display all feature names
print(f"\nAll Feature Columns ({len(dataset.feature_columns)} total):")
for i, col in enumerate(dataset.feature_columns):
    print(f"{i+1:2d}. {col}")

# Plot sequence length distribution
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.hist(sequence_lengths, bins=30, alpha=0.7, edgecolor='black')
plt.xlabel('Sequence Length (time steps)')
plt.ylabel('Frequency')
plt.title('Distribution of Sequence Lengths (DGMs v3)')
plt.grid(True, alpha=0.3)

# Plot WordPiece lengths so the MAX_LENGTH marker is on the same scale
plt.subplot(1, 2, 2)
plt.hist(wordpiece_lengths, bins=30, alpha=0.7, edgecolor='black')
plt.axvline(x=MAX_LENGTH, color='red', linestyle='--', label=f'BERT Max Length ({MAX_LENGTH})')
plt.xlabel('Text Length (WordPiece tokens)')
plt.ylabel('Frequency')
plt.title('Distribution of Text Lengths')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Save Results

In [None]:
# Per-fold metrics table
results_df.to_csv('dgms_bert_cv_results.csv', index=False)
print("Results saved to 'dgms_bert_cv_results.csv'")

# Pooled validation predictions of every fold, with a per-row correctness flag
predictions_df = pd.DataFrame({
    'true_label': all_true_labels,
    'predicted_label': all_predictions,
})
predictions_df['correct'] = np.array(all_true_labels) == np.array(all_predictions)
predictions_df.to_csv('dgms_bert_predictions.csv', index=False)
print("Predictions saved to 'dgms_bert_predictions.csv'")

# Index -> column-name lookup for the features that were used
feature_df = pd.DataFrame({
    'feature_index': range(len(dataset.feature_columns)),
    'feature_name': dataset.feature_columns,
})
feature_df.to_csv('dgms_feature_list.csv', index=False)
print("Feature list saved to 'dgms_feature_list.csv'")

# Closing summary, emitted as one multi-line message
print("\n".join([
    "\nDGMs v3 BERT model training and evaluation completed successfully!",
    "\nKey differences from Raw Data approach:",
    "- Used DGMs v3/3s processed data instead of Raw Data/all",
    f"- Trained on ALL {len(dataset.feature_columns)} features instead of 6 selected ones",
    "- Adapted text representation for high-dimensional feature space",
    "- Used feature chunking to manage BERT token limits",
]))