# Part 1 Language Family Prediction Using Neural Networks
# This script implements two neural network approaches for predicting language families from phonological features

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
# =====================================================================
# Configuration and Setup
# =====================================================================

class ConfigurationSettings:
    """Configuration settings for the neural network models"""
    def __init__(self):
        # System settings
        self.random_seed = 42
        self.computation_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # Dataset paths
        self.phonological_features_path = 'cs_assignment3_data_1.csv'
        self.language_families_path = 'cs_assignment3_data_2.csv'
        
        # Training hyperparameters
        self.mini_batch_size = 128
        self.gradient_learning_rate = 0.001
        self.training_epochs = 25
        self.early_stopping_patience = 5
        
        # Model architecture
        self.feature_hidden_neurons = 256
        self.embedding_dimension = 64
        self.dropout_probability = 0.3
        
        # Results directory
        self.results_directory = 'model_results'
        os.makedirs(self.results_directory, exist_ok=True)

# Set up configuration
config = ConfigurationSettings()

# Set random seeds for reproducibility
torch.manual_seed(config.random_seed)
np.random.seed(config.random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(config.random_seed)

print(f"{'='*30} LANGUAGE FAMILY PREDICTION {'='*30}")
print(f"Computation device: {config.computation_device}")
start_time = time.time()


Computation device: cpu


In [3]:
# =====================================================================
# Data Loading and Preprocessing
# =====================================================================

def load_and_preprocess_data(config):
    """
    Load and preprocess the phonological features and language family datasets
    
    Args:
        config: Configuration settings object
        
    Returns:
        preprocessed_data: Dictionary containing processed data
    """
    print(f"\n{'='*20} LOADING AND PREPROCESSING DATA {'='*20}")
    
    # Load datasets
    print(f"Loading phonological features from: {config.phonological_features_path}")
    phonological_data = pd.read_csv(config.phonological_features_path, low_memory=False)
    
    print(f"Loading language families from: {config.language_families_path}")
    language_family_data = pd.read_csv(config.language_families_path, low_memory=False)
    
    print(f"Phonological features shape: {phonological_data.shape}")
    print(f"Language families shape: {language_family_data.shape}")
    
    # Extract unique language-family mappings
    language_to_family_mapping = language_family_data[['Glottocode', 'Family_Name']].drop_duplicates('Glottocode')
    print(f"Unique languages with family information: {len(language_to_family_mapping)}")
    
    # Check for languages with multiple family assignments
    duplicate_languages = language_to_family_mapping['Glottocode'].duplicated(keep=False)
    if duplicate_languages.any():
        print(f"WARNING: {duplicate_languages.sum()} languages have multiple family assignments!")
        print("Keeping first occurrence for each language")
        language_to_family_mapping = language_to_family_mapping.drop_duplicates('Glottocode', keep='first')
    
    # Merge datasets to get language family for each sound
    print("Merging datasets on Glottocode...")
    merged_data = phonological_data.merge(
        language_to_family_mapping, 
        on='Glottocode', 
        how='inner'
    )
    
    # Check for missing values in Family_Name
    missing_family = merged_data['Family_Name'].isna().sum()
    if missing_family > 0:
        print(f"WARNING: {missing_family} samples have missing family information. Removing these samples.")
        merged_data = merged_data.dropna(subset=['Family_Name'])
    
    print(f"Total samples after merging: {len(merged_data)}")
    
    # Display language family distribution
    family_distribution = merged_data['Family_Name'].value_counts()
    print(f"\nLanguage family distribution (top 10 of {len(family_distribution)} families):")
    for i, (family, count) in enumerate(family_distribution.head(10).items()):
        print(f"  {i+1}. {family}: {count} sounds")
    
    min_sounds = family_distribution.min()
    max_sounds = family_distribution.max()
    print(f"Sounds per family - Min: {min_sounds}, Max: {max_sounds}, Ratio: {max_sounds/min_sounds:.1f}x")
    
    # Extract and binarize phonological features
    phonological_feature_columns = merged_data.loc[:, 'tone':'click'].columns.tolist()
    print(f"\nExtracting {len(phonological_feature_columns)} phonological features")
    
    # Transform features to binary as per requirements
    print("Transforming features to binary format ('+' → 1, '-' and '0' → 0)")
    for feature_column in phonological_feature_columns:
        merged_data[feature_column] = merged_data[feature_column].replace({'+': 1, '-': 0, '0': 0}).astype('float32')
    
    # Extract feature vectors
    feature_vectors = merged_data[phonological_feature_columns].values
    print(f"Feature matrix shape: {feature_vectors.shape} (samples × features)")
    
    # Encode each unique sound with a numerical ID for the embedding model
    print("\nEncoding sounds for embedding model...")
    merged_data['sound_id'] = pd.Categorical(merged_data['Phoneme']).codes
    sound_ids = merged_data['sound_id'].values
    unique_sound_count = merged_data['sound_id'].nunique()
    print(f"Total unique sounds: {unique_sound_count}")
    
    # Encode language families as target classes
    print("\nEncoding language families as target classes...")
    merged_data['family_class_id'] = pd.Categorical(merged_data['Family_Name']).codes
    target_classes = merged_data['family_class_id'].values
    class_count = merged_data['family_class_id'].nunique()
    print(f"Total language families (target classes): {class_count}")
    
    # Create mapping dictionaries for interpretation
    sound_to_id = {sound: id for id, sound in enumerate(merged_data['Phoneme'].unique())}
    id_to_sound = {id: sound for sound, id in sound_to_id.items()}
    
    family_to_id = {family: id for id, family in enumerate(merged_data['Family_Name'].unique())}
    id_to_family = {id: family for family, id in family_to_id.items()}
    
    # Store processed data in dictionary
    preprocessed_data = {
        'feature_vectors': feature_vectors,
        'sound_ids': sound_ids.reshape(-1, 1),  # Reshape for embedding input
        'target_classes': target_classes,
        'unique_sound_count': unique_sound_count,
        'class_count': class_count,
        'feature_columns': phonological_feature_columns,
        'sound_to_id': sound_to_id,
        'id_to_sound': id_to_sound,
        'family_to_id': family_to_id,
        'id_to_family': id_to_family,
        'merged_data': merged_data
    }
    
    return preprocessed_data

# Load and preprocess data
preprocessed_data = load_and_preprocess_data(config)



Loading phonological features from: cs_assignment3_data_1.csv
Loading language families from: cs_assignment3_data_2.csv
Phonological features shape: (105488, 48)
Language families shape: (2886, 13)
Unique languages with family information: 2059
Merging datasets on Glottocode...
Total samples after merging: 97051

Language family distribution (top 10 of 173 families):
  1. Atlantic-Congo: 20002 sounds
  2. Indo-European: 11555 sounds
  3. Sino-Tibetan: 6793 sounds
  4. Pama-Nyungan: 6210 sounds
  5. Afro-Asiatic: 6021 sounds
  6. Austronesian: 3673 sounds
  7. Austroasiatic: 2788 sounds
  8. Uralic: 2424 sounds
  9. Mande: 2304 sounds
  10. Dravidian: 2167 sounds
Sounds per family - Min: 11, Max: 20002, Ratio: 1818.4x

Extracting 37 phonological features
Transforming features to binary format ('+' → 1, '-' and '0' → 0)
Feature matrix shape: (97051, 37) (samples × features)

Encoding sounds for embedding model...
Total unique sounds: 3032

Encoding language families as target classes...

In [4]:
# =====================================================================
# Data Splitting and DataLoader Creation
# =====================================================================

def prepare_data_loaders(preprocessed_data, config):
    """
    Prepare training and testing datasets and dataloaders
    
    Args:
        preprocessed_data: Dictionary containing processed data
        config: Configuration settings object
        
    Returns:
        Dictionary containing dataloaders and dataset info
    """
    print(f"\n{'='*20} PREPARING DATASETS AND DATALOADERS {'='*20}")
    
    # Extract data from dictionary
    feature_vectors = preprocessed_data['feature_vectors']
    sound_ids = preprocessed_data['sound_ids']
    target_classes = preprocessed_data['target_classes']
    
    # Create stratified train/test split
    print(f"Creating stratified train/test split with test_size=0.2, random_state={config.random_seed}")
    features_train, features_test, sounds_train, sounds_test, targets_train, targets_test = train_test_split(
        feature_vectors, 
        sound_ids, 
        target_classes,
        test_size=0.2, 
        random_state=config.random_seed, 
        stratify=target_classes
    )
    
    print(f"Training set: {len(features_train)} samples")
    print(f"Testing set: {len(features_test)} samples")
    
    # Convert to PyTorch tensors
    print("Converting data to PyTorch tensors...")
    # For Model 1 (Feature vectors)
    features_train_tensor = torch.FloatTensor(features_train)
    features_test_tensor = torch.FloatTensor(features_test)
    
    # For Model 2 (Embeddings)
    sounds_train_tensor = torch.LongTensor(sounds_train)
    sounds_test_tensor = torch.LongTensor(sounds_test)
    
    # Target tensors (same for both models)
    targets_train_tensor = torch.LongTensor(targets_train)
    targets_test_tensor = torch.LongTensor(targets_test)
    
    # Create TensorDatasets
    print("Creating TensorDatasets...")
    feature_train_dataset = TensorDataset(features_train_tensor, targets_train_tensor)
    feature_test_dataset = TensorDataset(features_test_tensor, targets_test_tensor)
    
    sound_train_dataset = TensorDataset(sounds_train_tensor, targets_train_tensor)
    sound_test_dataset = TensorDataset(sounds_test_tensor, targets_test_tensor)
    
    # Create DataLoaders
    print(f"Creating DataLoaders with batch_size={config.mini_batch_size}...")
    feature_train_loader = DataLoader(
        feature_train_dataset, 
        batch_size=config.mini_batch_size, 
        shuffle=True,
        pin_memory=True if config.computation_device.type == 'cuda' else False
    )
    
    feature_test_loader = DataLoader(
        feature_test_dataset, 
        batch_size=config.mini_batch_size, 
        shuffle=False,
        pin_memory=True if config.computation_device.type == 'cuda' else False
    )
    
    sound_train_loader = DataLoader(
        sound_train_dataset, 
        batch_size=config.mini_batch_size, 
        shuffle=True,
        pin_memory=True if config.computation_device.type == 'cuda' else False
    )
    
    sound_test_loader = DataLoader(
        sound_test_dataset, 
        batch_size=config.mini_batch_size, 
        shuffle=False,
        pin_memory=True if config.computation_device.type == 'cuda' else False
    )
    
    data_loaders = {
        'feature_train_loader': feature_train_loader,
        'feature_test_loader': feature_test_loader,
        'sound_train_loader': sound_train_loader,
        'sound_test_loader': sound_test_loader,
        'features_train': features_train,
        'features_test': features_test,
        'sounds_train': sounds_train,
        'sounds_test': sounds_test,
        'targets_train': targets_train,
        'targets_test': targets_test
    }
    
    return data_loaders

# Prepare data loaders
data_loaders = prepare_data_loaders(preprocessed_data, config)




Creating stratified train/test split with test_size=0.2, random_state=42
Training set: 77640 samples
Testing set: 19411 samples
Converting data to PyTorch tensors...
Creating TensorDatasets...
Creating DataLoaders with batch_size=128...


In [5]:
# =====================================================================
# Model Definitions
# =====================================================================

class PhonologicalFeatureModel(nn.Module):
    """
    Neural network model that takes phonological feature vectors as input
    and predicts language family.
    
    Model 1 as per requirements: Uses feature vectors
    """
    def __init__(self, input_features, hidden_size, num_classes, dropout_prob=0.3):
        """
        Args:
            input_features: Number of input features (phonological features)
            hidden_size: Size of hidden layer
            num_classes: Number of output classes (language families)
            dropout_prob: Dropout probability for regularization
        """
        super(PhonologicalFeatureModel, self).__init__()
        
        self.model_layers = nn.Sequential(
            # Input layer
            nn.Linear(input_features, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
            
            # Output layer
            nn.Linear(hidden_size, num_classes)
        )
    
    def forward(self, features_batch):
        """Forward pass through the network"""
        return self.model_layers(features_batch)

class PhonemeEmbeddingModel(nn.Module):
    """
    Neural network model that uses sound embeddings as input
    and predicts language family.
    
    Model 2 as per requirements: Uses embeddings instead of feature vectors
    """
    def __init__(self, vocabulary_size, embedding_dim, hidden_size, num_classes, dropout_prob=0.3):
        """
        Args:
            vocabulary_size: Number of unique sounds in the dataset
            embedding_dim: Size of the embedding vectors
            hidden_size: Size of hidden layer
            num_classes: Number of output classes (language families)
            dropout_prob: Dropout probability for regularization
        """
        super(PhonemeEmbeddingModel, self).__init__()
        
        # Embedding layer maps each sound ID to a dense vector
        self.embedding_layer = nn.Embedding(vocabulary_size, embedding_dim)
        
        # Neural network layers after embedding
        self.model_layers = nn.Sequential(
            nn.Linear(embedding_dim, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
            
            nn.Linear(hidden_size, num_classes)
        )
    
    def forward(self, sound_ids_batch):
        """Forward pass through the network"""
        # Get embeddings and remove extra dimension
        embedded_sounds = self.embedding_layer(sound_ids_batch).squeeze(1)
        
        # Pass through the rest of the network
        return self.model_layers(embedded_sounds)

In [6]:
# =====================================================================
# Model Training and Evaluation
# =====================================================================

def train_model(model, data_loader, optimizer, criterion, device, epoch, total_epochs):
    """
    Train the model for one epoch
    
    Args:
        model: PyTorch model to train
        data_loader: DataLoader with training data
        optimizer: Optimizer for weight updates
        criterion: Loss function
        device: Device to run training on (CPU/GPU)
        epoch: Current epoch number
        total_epochs: Total number of epochs
        
    Returns:
        average_loss: Average loss over the epoch
    """
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    
    # Create progress bar
    progress_bar = tqdm(data_loader, desc=f"Epoch {epoch}/{total_epochs} [Train]")
    
    for inputs, targets in progress_bar:
        # Move tensors to the configured device
        inputs = inputs.to(device)
        targets = targets.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        # Update statistics
        running_loss += loss.item() * inputs.size(0)
        
        # Calculate accuracy
        _, predicted = torch.max(outputs, 1)
        total_samples += targets.size(0)
        correct_predictions += (predicted == targets).sum().item()
        
        # Update progress bar
        progress_bar.set_postfix(
            loss=f"{loss.item():.4f}", 
            accuracy=f"{100 * correct_predictions / total_samples:.2f}%"
        )
    
    average_loss = running_loss / len(data_loader.dataset)
    accuracy = 100 * correct_predictions / total_samples
    
    return average_loss, accuracy

def evaluate_model(model, data_loader, criterion, device, id_to_family=None):
    """
    Evaluate the model on the test set
    
    Args:
        model: Trained PyTorch model
        data_loader: DataLoader with test data
        criterion: Loss function
        device: Device to run evaluation on (CPU/GPU)
        id_to_family: Dictionary mapping class IDs to family names
        
    Returns:
        metrics: Dictionary containing evaluation metrics
    """
    model.eval()
    running_loss = 0.0
    all_predictions = []
    all_targets = []
    
    # Disable gradient calculation for evaluation
    with torch.no_grad():
        for inputs, targets in tqdm(data_loader, desc="Evaluating"):
            # Move tensors to the configured device
            inputs = inputs.to(device)
            targets = targets.to(device)
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            
            # Update statistics
            running_loss += loss.item() * inputs.size(0)
            
            # Get predictions
            _, predicted = torch.max(outputs, 1)
            
            # Store predictions and targets for metrics calculation
            all_predictions.extend(predicted.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
    
    # Calculate metrics
    accuracy = accuracy_score(all_targets, all_predictions)
    mcc = matthews_corrcoef(all_targets, all_predictions)
    
    # Calculate per-class metrics
    class_report = classification_report(
        all_targets, 
        all_predictions, 
        output_dict=True,
        zero_division=0
    )
    
    # Convert numeric indices to family names if provided
    if id_to_family:
        class_report_named = {}
        for class_id, metrics in class_report.items():
            if class_id in ['accuracy', 'macro avg', 'weighted avg']:
                class_report_named[class_id] = metrics
            else:
                family_name = id_to_family[int(class_id)]
                class_report_named[family_name] = metrics
        class_report = class_report_named
    
    # Calculate confusion matrix (take a sample if too many classes)
    num_classes = len(set(all_targets))
    if num_classes <= 20:  # Only create confusion matrix for reasonable number of classes
        conf_matrix = confusion_matrix(all_targets, all_predictions)
    else:
        conf_matrix = None
    
    # Average loss
    average_loss = running_loss / len(data_loader.dataset)
    
    metrics = {
        'loss': average_loss,
        'accuracy': accuracy,
        'mcc': mcc,
        'classification_report': class_report,
        'confusion_matrix': conf_matrix,
        'predictions': all_predictions,
        'targets': all_targets
    }
    
    return metrics

def train_and_evaluate(model_name, model, train_loader, test_loader, config, id_to_family=None):
    """
    Complete training and evaluation pipeline for a model
    
    Args:
        model_name: Name of the model for logging
        model: PyTorch model to train
        train_loader: DataLoader with training data
        test_loader: DataLoader with test data
        config: Configuration settings
        id_to_family: Dictionary mapping class IDs to family names
        
    Returns:
        metrics: Dictionary containing evaluation metrics and training history
    """
    print(f"\n{'='*20} TRAINING {model_name} {'='*20}")
    
    # Move model to device
    model = model.to(config.computation_device)
    
    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(
        model.parameters(), 
        lr=config.gradient_learning_rate
    )
    
    # Training history
    history = {
        'train_loss': [],
        'train_accuracy': []
    }
    
    # Early stopping variables
    best_loss = float('inf')
    patience_counter = 0
    best_model_state = None
    
    # Training loop
    for epoch in range(1, config.training_epochs + 1):
        # Train for one epoch
        epoch_loss, epoch_accuracy = train_model(
            model=model,
            data_loader=train_loader,
            optimizer=optimizer,
            criterion=criterion,
            device=config.computation_device,
            epoch=epoch,
            total_epochs=config.training_epochs
        )
        
        # Update history
        history['train_loss'].append(epoch_loss)
        history['train_accuracy'].append(epoch_accuracy)
        
        # Print epoch results
        print(f"Epoch {epoch}/{config.training_epochs} - Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")
        
        # Early stopping check
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            patience_counter = 0
            best_model_state = model.state_dict().copy()
        else:
            patience_counter += 1
            
        if patience_counter >= config.early_stopping_patience:
            print(f"Early stopping at epoch {epoch} - No improvement for {config.early_stopping_patience} epochs")
            break
    
    # Restore best model state if early stopping occurred
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    
    # Evaluate the model
    print(f"\n{'='*20} EVALUATING {model_name} {'='*20}")
    evaluation_metrics = evaluate_model(
        model=model,
        data_loader=test_loader,
        criterion=criterion,
        device=config.computation_device,
        id_to_family=id_to_family
    )
    
    # Combine history and evaluation metrics
    metrics = {
        'history': history,
        **evaluation_metrics
    }
    
    # Print evaluation results
    print(f"\n{model_name} Evaluation Results:")
    print(f"  Accuracy: {evaluation_metrics['accuracy']:.4f}")
    print(f"  MCC: {evaluation_metrics['mcc']:.4f}")
    print(f"  Loss: {evaluation_metrics['loss']:.4f}")
    
    # Print top 5 and bottom 5 performing language families
    if id_to_family:
        class_report = evaluation_metrics['classification_report']
        family_f1_scores = {}
        
        for class_name, metrics in class_report.items():
            if class_name not in ['accuracy', 'macro avg', 'weighted avg']:
                family_f1_scores[class_name] = metrics['f1-score']
        
        # Sort by F1 score
        sorted_families = sorted(family_f1_scores.items(), key=lambda x: x[1], reverse=True)
        
        print("\n  Top 5 performing language families (F1 score):")
        for i, (family, f1) in enumerate(sorted_families[:5]):
            print(f"    {i+1}. {family}: {f1:.4f}")
        
        print("\n  Bottom 5 performing language families (F1 score):")
        for i, (family, f1) in enumerate(sorted_families[-5:]):
            print(f"    {i+1}. {family}: {f1:.4f}")
    
    return metrics

In [7]:
# =====================================================================
# Initialize and Train Models
# =====================================================================

# Initialize Model 1: Phonological Feature Model
feature_model = PhonologicalFeatureModel(
    input_features=preprocessed_data['feature_vectors'].shape[1],
    hidden_size=config.feature_hidden_neurons,
    num_classes=preprocessed_data['class_count'],
    dropout_prob=config.dropout_probability
)

# Initialize Model 2: Phoneme Embedding Model
embedding_model = PhonemeEmbeddingModel(
    vocabulary_size=preprocessed_data['unique_sound_count'] + 1,  # +1 for padding
    embedding_dim=config.embedding_dimension,
    hidden_size=config.feature_hidden_neurons,
    num_classes=preprocessed_data['class_count'],
    dropout_prob=config.dropout_probability
)

# Display model architectures
print("\nModel 1: Phonological Feature Model")
print(feature_model)
print(f"Total parameters: {sum(p.numel() for p in feature_model.parameters() if p.requires_grad)}")

print("\nModel 2: Phoneme Embedding Model")
print(embedding_model)
print(f"Total parameters: {sum(p.numel() for p in embedding_model.parameters() if p.requires_grad)}")

# Train and evaluate Model 1
feature_model_metrics = train_and_evaluate(
    model_name="Model 1 (Phonological Feature Model)",
    model=feature_model,
    train_loader=data_loaders['feature_train_loader'],
    test_loader=data_loaders['feature_test_loader'],
    config=config,
    id_to_family=preprocessed_data['id_to_family']
)

# Train and evaluate Model 2
embedding_model_metrics = train_and_evaluate(
    model_name="Model 2 (Phoneme Embedding Model)",
    model=embedding_model,
    train_loader=data_loaders['sound_train_loader'],
    test_loader=data_loaders['sound_test_loader'],
    config=config,
    id_to_family=preprocessed_data['id_to_family']
)



Model 1: Phonological Feature Model
PhonologicalFeatureModel(
  (model_layers): Sequential(
    (0): Linear(in_features=37, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=173, bias=True)
  )
)
Total parameters: 54701

Model 2: Phoneme Embedding Model
PhonemeEmbeddingModel(
  (embedding_layer): Embedding(3033, 64)
  (model_layers): Sequential(
    (0): Linear(in_features=64, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=173, bias=True)
  )
)
Total parameters: 255725



Epoch 1/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:07<00:00, 77.14it/s, accuracy=23.19%, loss=3.0602]


Epoch 1/25 - Loss: 3.3622, Accuracy: 23.19%


Epoch 2/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:07<00:00, 77.62it/s, accuracy=25.45%, loss=3.1400]


Epoch 2/25 - Loss: 3.2044, Accuracy: 25.45%


Epoch 3/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:07<00:00, 76.43it/s, accuracy=25.86%, loss=3.4403]


Epoch 3/25 - Loss: 3.1634, Accuracy: 25.86%


Epoch 4/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:07<00:00, 77.86it/s, accuracy=26.07%, loss=2.7988]


Epoch 4/25 - Loss: 3.1439, Accuracy: 26.07%


Epoch 5/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:07<00:00, 75.90it/s, accuracy=26.28%, loss=3.5029]


Epoch 5/25 - Loss: 3.1303, Accuracy: 26.28%


Epoch 6/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:07<00:00, 78.36it/s, accuracy=26.47%, loss=3.2064]


Epoch 6/25 - Loss: 3.1190, Accuracy: 26.47%


Epoch 7/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:08<00:00, 74.63it/s, accuracy=26.51%, loss=3.2525]


Epoch 7/25 - Loss: 3.1100, Accuracy: 26.51%


Epoch 8/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:07<00:00, 78.12it/s, accuracy=26.62%, loss=2.9615]


Epoch 8/25 - Loss: 3.1045, Accuracy: 26.62%


Epoch 9/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:07<00:00, 77.79it/s, accuracy=26.63%, loss=3.1863]


Epoch 9/25 - Loss: 3.0988, Accuracy: 26.63%


Epoch 10/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 78.19it/s, accuracy=26.69%, loss=3.3027]


Epoch 10/25 - Loss: 3.0926, Accuracy: 26.69%


Epoch 11/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 80.18it/s, accuracy=26.83%, loss=2.9497]


Epoch 11/25 - Loss: 3.0888, Accuracy: 26.83%


Epoch 12/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:06<00:00, 86.90it/s, accuracy=26.84%, loss=3.3771]


Epoch 12/25 - Loss: 3.0847, Accuracy: 26.84%


Epoch 13/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 77.80it/s, accuracy=26.84%, loss=3.4607]


Epoch 13/25 - Loss: 3.0808, Accuracy: 26.84%


Epoch 14/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 76.87it/s, accuracy=26.82%, loss=3.4685]


Epoch 14/25 - Loss: 3.0775, Accuracy: 26.82%


Epoch 15/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 79.35it/s, accuracy=26.88%, loss=2.5757]


Epoch 15/25 - Loss: 3.0736, Accuracy: 26.88%


Epoch 16/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 79.13it/s, accuracy=26.99%, loss=3.3477]


Epoch 16/25 - Loss: 3.0721, Accuracy: 26.99%


Epoch 17/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 76.74it/s, accuracy=26.84%, loss=2.9809]


Epoch 17/25 - Loss: 3.0680, Accuracy: 26.84%


Epoch 18/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 80.72it/s, accuracy=27.00%, loss=3.2541]


Epoch 18/25 - Loss: 3.0657, Accuracy: 27.00%


Epoch 19/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 80.37it/s, accuracy=27.03%, loss=3.2996]


Epoch 19/25 - Loss: 3.0618, Accuracy: 27.03%


Epoch 20/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 77.75it/s, accuracy=27.06%, loss=3.3921]


Epoch 20/25 - Loss: 3.0599, Accuracy: 27.06%


Epoch 21/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:08<00:00, 75.53it/s, accuracy=27.03%, loss=2.6460]


Epoch 21/25 - Loss: 3.0583, Accuracy: 27.03%


Epoch 22/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 78.08it/s, accuracy=27.08%, loss=2.8932]


Epoch 22/25 - Loss: 3.0566, Accuracy: 27.08%


Epoch 23/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 78.73it/s, accuracy=27.22%, loss=3.2032]


Epoch 23/25 - Loss: 3.0544, Accuracy: 27.22%


Epoch 24/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 78.79it/s, accuracy=27.12%, loss=3.1393]


Epoch 24/25 - Loss: 3.0526, Accuracy: 27.12%


Epoch 25/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:07<00:00, 78.71it/s, accuracy=27.11%, loss=2.7166]


Epoch 25/25 - Loss: 3.0502, Accuracy: 27.11%



Evaluating: 100%|██████████████████████████████████████████████████████████████████████████| 152/152 [00:00<00:00, 216.35it/s]



Model 1 (Phonological Feature Model) Evaluation Results:
  Accuracy: 0.2723
  MCC: 0.1570
  Loss: 3.0844

  Top 5 performing language families (F1 score):
    1. Gumuz: 0.4651
    2. Baining: 0.4255
    3. Tai-Kadai: 0.3990
    4. Chocoan: 0.3857
    5. Central Sudanic: 0.2561

  Bottom 5 performing language families (F1 score):
    1. Marrku-Wurrugu: 0.0000
    2. Jarrakan: 0.0000
    3. Mirndi: 0.0000
    4. Tangkic: 0.0000
    5. Yangmanic: 0.0000



Epoch 1/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:09<00:00, 66.33it/s, accuracy=22.72%, loss=3.3166]


Epoch 1/25 - Loss: 3.4262, Accuracy: 22.72%


Epoch 2/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:09<00:00, 65.40it/s, accuracy=25.33%, loss=2.9951]


Epoch 2/25 - Loss: 3.2509, Accuracy: 25.33%


Epoch 3/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:09<00:00, 65.88it/s, accuracy=26.08%, loss=3.1626]


Epoch 3/25 - Loss: 3.1918, Accuracy: 26.08%


Epoch 4/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:08<00:00, 68.70it/s, accuracy=26.61%, loss=3.0324]


Epoch 4/25 - Loss: 3.1526, Accuracy: 26.61%


Epoch 5/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:09<00:00, 63.46it/s, accuracy=26.92%, loss=3.3784]


Epoch 5/25 - Loss: 3.1264, Accuracy: 26.92%


Epoch 6/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:09<00:00, 63.09it/s, accuracy=27.39%, loss=3.3053]


Epoch 6/25 - Loss: 3.1044, Accuracy: 27.39%


Epoch 7/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:09<00:00, 63.37it/s, accuracy=27.63%, loss=2.9480]


Epoch 7/25 - Loss: 3.0837, Accuracy: 27.63%


Epoch 8/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:09<00:00, 62.58it/s, accuracy=27.79%, loss=2.8564]


Epoch 8/25 - Loss: 3.0679, Accuracy: 27.79%


Epoch 9/25 [Train]: 100%|█████████████████████████████████████| 607/607 [00:09<00:00, 62.31it/s, accuracy=27.93%, loss=3.1460]


Epoch 9/25 - Loss: 3.0531, Accuracy: 27.93%


Epoch 10/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 61.83it/s, accuracy=28.19%, loss=2.8380]


Epoch 10/25 - Loss: 3.0399, Accuracy: 28.19%


Epoch 11/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 61.62it/s, accuracy=28.26%, loss=3.1906]


Epoch 11/25 - Loss: 3.0303, Accuracy: 28.26%


Epoch 12/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 63.18it/s, accuracy=28.45%, loss=2.8238]


Epoch 12/25 - Loss: 3.0193, Accuracy: 28.45%


Epoch 13/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 63.21it/s, accuracy=28.61%, loss=3.4549]


Epoch 13/25 - Loss: 3.0080, Accuracy: 28.61%


Epoch 14/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 63.17it/s, accuracy=28.64%, loss=3.2057]


Epoch 14/25 - Loss: 2.9995, Accuracy: 28.64%


Epoch 15/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 63.56it/s, accuracy=28.77%, loss=2.9287]


Epoch 15/25 - Loss: 2.9921, Accuracy: 28.77%


Epoch 16/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 63.73it/s, accuracy=29.04%, loss=2.6620]


Epoch 16/25 - Loss: 2.9813, Accuracy: 29.04%


Epoch 17/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 63.43it/s, accuracy=28.97%, loss=3.0640]


Epoch 17/25 - Loss: 2.9749, Accuracy: 28.97%


Epoch 18/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 63.70it/s, accuracy=29.07%, loss=2.8634]


Epoch 18/25 - Loss: 2.9709, Accuracy: 29.07%


Epoch 19/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 62.75it/s, accuracy=29.21%, loss=3.1435]


Epoch 19/25 - Loss: 2.9630, Accuracy: 29.21%


Epoch 20/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 63.91it/s, accuracy=29.20%, loss=2.9633]


Epoch 20/25 - Loss: 2.9586, Accuracy: 29.20%


Epoch 21/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 62.88it/s, accuracy=29.40%, loss=3.1370]


Epoch 21/25 - Loss: 2.9533, Accuracy: 29.40%


Epoch 22/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 62.33it/s, accuracy=29.41%, loss=2.9449]


Epoch 22/25 - Loss: 2.9478, Accuracy: 29.41%


Epoch 23/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 62.72it/s, accuracy=29.52%, loss=2.9676]


Epoch 23/25 - Loss: 2.9431, Accuracy: 29.52%


Epoch 24/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 62.66it/s, accuracy=29.55%, loss=2.7222]


Epoch 24/25 - Loss: 2.9369, Accuracy: 29.55%


Epoch 25/25 [Train]: 100%|████████████████████████████████████| 607/607 [00:09<00:00, 62.38it/s, accuracy=29.61%, loss=2.8260]


Epoch 25/25 - Loss: 2.9340, Accuracy: 29.61%



Evaluating: 100%|██████████████████████████████████████████████████████████████████████████| 152/152 [00:00<00:00, 231.36it/s]



Model 2 (Phoneme Embedding Model) Evaluation Results:
  Accuracy: 0.2799
  MCC: 0.1668
  Loss: 3.1341

  Top 5 performing language families (F1 score):
    1. Chocoan: 0.4519
    2. Tai-Kadai: 0.4073
    3. Central Sudanic: 0.2922
    4. Timor-Alor-Pantar: 0.2574
    5. Kiwaian: 0.1865

  Bottom 5 performing language families (F1 score):
    1. Marrku-Wurrugu: 0.0000
    2. Jarrakan: 0.0000
    3. Mirndi: 0.0000
    4. Tangkic: 0.0000
    5. Yangmanic: 0.0000
