# Architecture Model Comparison

This notebook compares the performance of Decision Tree, Random Forest, and AdaBoost models, both globally and per language, and explores stacking approaches for similar languages.

## Import libraries and set global configuration

Imports all necessary libraries for data processing, graph analysis, machine learning, and model evaluation. Sets the global random seed and defines the feature columns used throughout the notebook.

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import ast
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, zero_one_loss, roc_auc_score, make_scorer, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.combine import SMOTEENN
import joblib
import os
import warnings
warnings.filterwarnings("ignore")

# Global configuration
SEED = 42
np.random.seed(SEED)

# Feature columns
FEATURE_COLUMNS = [
    'degree', 'closeness', 'betweenness', ' eccentricity',
    'leaf_node', 'farness', 'subtree_height'
]

## Define centrality calculation function

Defines a function to compute various centrality measures and subtree heights for each node in a graph, returning a dictionary of features.

In [2]:
def centralities(edgelist):
    
    T = nx.from_edgelist(edgelist)
    
    # Calculate centrality measures
    degree = nx.degree_centrality(T)  
    closeness = nx.closeness_centrality(T)  
    betweenness = nx.betweenness_centrality(T)  
    eccentricity = nx.eccentricity(T)  
    leaf_node = [v for v, d in T.degree() if d == 1]  
    Farness = {v: 1 / (eccentricity[v] + 1) for v in T.nodes() if eccentricity[v] > 0}
    
    # Calculate subtree heights
    subtree_height = {}
    for node in T.nodes():
        subtree_height[node] = nx.single_source_shortest_path_length(T, node)
    for node, distances in subtree_height.items():
        subtree_height[node] = max(distances.values()) if distances else 0

    features = {}
    for v in T:
        features[v] = (
            degree[v],
            closeness[v],
            betweenness[v], 
            eccentricity[v],
            leaf_node.count(v),
            Farness[v],
            subtree_height[v]  # Added subtree height feature
        )
    return features


## Define data splitting function

Defines a function to split the dataset into training and validation sets by randomly selecting unique sentences for validation, ensuring no overlap.

In [3]:
import random

def split_data_set(data, seed=SEED, test_ratio=0.2): # Use global SEED
    random.seed(seed)
    unique_ids = list(set(data['sentence']))
    
    # Ensure test_size is not larger than the number of unique_ids
    if not unique_ids: # Handle empty data
        return pd.DataFrame(columns=data.columns), pd.DataFrame(columns=data.columns)
    
    actual_test_size = int(len(unique_ids) * test_ratio)
    if actual_test_size == 0 and len(unique_ids) > 0 and test_ratio > 0: # Ensure at least one sample if possible
        actual_test_size = 1 
    if actual_test_size > len(unique_ids): # Cap test size
        actual_test_size = len(unique_ids)

    if actual_test_size == 0 : # if still zero (e.g. unique_ids is empty or test_ratio is 0)
         test_ids = set()
    else:
        test_ids = set(random.sample(unique_ids, actual_test_size))

    train_ids = set(unique_ids) - test_ids

    train_set = data[data['sentence'].isin(train_ids)]
    val_set = data[data['sentence'].isin(test_ids)]

    return train_set, val_set


## Utility functions for feature extraction, normalization, and cross-validation

Defines utility functions for centrality calculation, data splitting, feature expansion, normalization, and k-fold cross-validation index creation. Also includes the main data preparation function.

In [4]:
# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def centralities(edgelist):
    """Calculate centrality measures for graph nodes."""
    if not edgelist:
        return {}
    
    T = nx.from_edgelist(edgelist)
    
    degree = nx.degree_centrality(T)
    closeness = nx.closeness_centrality(T)
    betweenness = nx.betweenness_centrality(T)
    eccentricity = nx.eccentricity(T)
    leaf_node = [v for v, d in T.degree() if d == 1]
    farness = {v: 1 / (eccentricity[v] + 1) for v in T.nodes() if eccentricity[v] > 0}
    
    # Calculate subtree heights
    subtree_height = {}
    for node in T.nodes():
        distances = nx.single_source_shortest_path_length(T, node)
        subtree_height[node] = max(distances.values()) if distances else 0

    features = {}
    for v in T:
        features[v] = (
            degree[v], closeness[v], betweenness[v], eccentricity[v],
            leaf_node.count(v), farness[v], subtree_height[v]
        )
    return features

def split_data_set(data, seed=SEED, test_ratio=0.2):
    """Split data by unique sentences."""
    random.seed(seed)
    unique_ids = list(set(data['sentence']))
    
    if not unique_ids:
        return pd.DataFrame(columns=data.columns), pd.DataFrame(columns=data.columns)
    
    test_size = max(1, int(len(unique_ids) * test_ratio))
    test_size = min(test_size, len(unique_ids))
    
    if test_size == 0:
        test_ids = set()
    else:
        test_ids = set(random.sample(unique_ids, test_size))
    
    train_ids = set(unique_ids) - test_ids
    
    train_set = data[data['sentence'].isin(train_ids)]
    val_set = data[data['sentence'].isin(test_ids)]
    
    return train_set, val_set

def get_expanded_data(data, train=True):
    """Expand data with centrality features."""
    expanded_set = []
    
    for index, row in data.iterrows():
        edgelist_str = row.get('rand_edgelist', '[]')
        try:
            edgelist = ast.literal_eval(edgelist_str)
            if not isinstance(edgelist, list):
                edgelist = []
        except (ValueError, SyntaxError):
            edgelist = []

        central_edges = centralities(edgelist)
        language = row['language']
        sentence = row['sentence']
        n = row['n']
        
        if train:
            root = row['root']
            for vertex, values in central_edges.items():
                expanded_set.append((language, sentence, n, vertex, *values, vertex==root))
        else:
            row_id = row.get('id', None)
            for vertex, values in central_edges.items():
                expanded_set.append((row_id, language, sentence, n, vertex, *values))

    if train:
        columns = ['language', 'sentence', 'n', 'vertex'] + FEATURE_COLUMNS + ['is_root']
    else:
        columns = ['id', 'language', 'sentence', 'n', 'vertex'] + FEATURE_COLUMNS
    
    return pd.DataFrame(expanded_set, columns=columns)

def normalize_by_sentence(df, feature_columns, groupby_cols=['language', 'sentence']):
    """Normalize features within each sentence group."""
    grouped = df.groupby(groupby_cols)
    normalized_groups = []
    numerical_features = df[feature_columns].select_dtypes(include=[np.number]).columns.tolist()

    for (lang, sentence), group in grouped:
        scaler = MinMaxScaler()
        normalized = scaler.fit_transform(group[numerical_features])
        normalized_df = pd.DataFrame(normalized, index=group.index, columns=numerical_features)
        combined = group.drop(columns=numerical_features).join(normalized_df)
        normalized_groups.append(combined)

    return pd.concat(normalized_groups)

def create_k_folds_indices(data, k=5, seed_base=SEED):
    """Create k-fold indices for cross-validation."""
    splits = []
    
    if not data.index.is_unique:
        data = data.reset_index(drop=True)
        
    for i in range(k):
        current_seed = seed_base + i
        fold_train_set, fold_val_set = split_data_set(data, seed=current_seed, test_ratio=0.2)
        
        train_indices = data.index.get_indexer(fold_train_set.index.values)
        val_indices = data.index.get_indexer(fold_val_set.index.values)
        
        splits.append((train_indices, val_indices))
    
    return splits

# =============================================================================
# DATA PREPARATION
# =============================================================================

def prepare_data():
    """Load and prepare training and test data."""
    train = pd.read_csv('train-random.csv')
    test = pd.read_csv('test-random.csv')
    
    # Split training data
    train_set, val_set = split_data_set(train, seed=SEED, test_ratio=0.2)
    
    # Expand data with features
    expanded_data_train = get_expanded_data(train_set)
    expanded_data_val = get_expanded_data(val_set)
    expanded_data_test = get_expanded_data(test, train=False)
    
    # Sort data
    expanded_data_train.sort_values(by=['language', 'sentence', 'n', 'vertex'], inplace=True)
    expanded_data_val.sort_values(by=['language', 'sentence', 'n', 'vertex'], inplace=True)
    expanded_data_test.sort_values(by=['id', 'language', 'sentence', 'n', 'vertex'], inplace=True)
    
    # Normalize features
    train_scaled = normalize_by_sentence(expanded_data_train, FEATURE_COLUMNS)
    val_scaled = normalize_by_sentence(expanded_data_val, FEATURE_COLUMNS)
    test_scaled = normalize_by_sentence(expanded_data_test, FEATURE_COLUMNS)
    
    return train_scaled, val_scaled, test_scaled

## Prepare the data for training and validation

Calls the data preparation function to load, process, and normalize the training, validation, and test datasets.

In [5]:
train_scaled, val_scaled, test_scaled = prepare_data()

## Model training functions

Defines functions to train Decision Tree, Random Forest, and AdaBoost models, both globally and per language. Each function handles resampling, training, validation, and model saving.

In [6]:
# =============================================================================
# MODEL TRAINING
# =============================================================================

import os

def train_individual_dt_models(train_scaled, val_scaled):
    """Train Decision Tree models: one for all data, and one for each language."""
    grouped_train = train_scaled.groupby(['language'])
    grouped_val = val_scaled.groupby(['language']) 
    models = {}

    # Ensure directory exists
    dt_dir = "./dt_models"
    os.makedirs(dt_dir, exist_ok=True)

    # Train on all data first
    X_all = train_scaled.drop(columns=['language', 'n', 'vertex', 'sentence', 'is_root'])
    y_all = train_scaled['is_root'].astype(int)
    smote_enn = SMOTEENN(random_state=SEED, sampling_strategy=0.5, n_jobs=-1)
    X_all_resampled, y_all_resampled = smote_enn.fit_resample(X_all, y_all)
    model_all = DecisionTreeClassifier(class_weight='balanced', random_state=SEED)
    model_all.fit(X_all_resampled, y_all_resampled)
    #models['all_languages'] = model_all

    # Validation on all data
    X_val_all = val_scaled.drop(columns=['language', 'n', 'vertex', 'sentence', 'is_root'])
    y_val_all = val_scaled['is_root']
    
    y_pred_all = model_all.predict(X_val_all)
    accuracy_all = classification_report(y_val_all, y_pred_all)
    print(f"Decision Tree Results for ALL languages:")
    print(accuracy_all)
    filename_all = os.path.join(dt_dir, "dt_model_all_languages.joblib")
    joblib.dump(model_all, filename_all)

    # Train per-language models
    for lang, group in grouped_train:
        # If lang is a tuple (e.g., ('Arabic',)), extract the string
        if isinstance(lang, tuple):
            lang_str = lang[0]
        else:
            lang_str = lang

        group_data = group.drop(columns=['language', 'n', 'vertex'])
        
        X = group_data.drop(columns=['sentence', 'is_root'])
        y = group_data['is_root'].astype(int)
        
        # Apply SMOTEENN resampling
        smote_enn = SMOTEENN(random_state=SEED, sampling_strategy=0.5, n_jobs=-1)
        X_resampled, y_resampled = smote_enn.fit_resample(X, y)
        
        # Train model
        model = DecisionTreeClassifier(class_weight='balanced', random_state=SEED)
        model.fit(X_resampled, y_resampled)
        
        models[lang_str] = model
        
        X_val_group = grouped_val.get_group(lang)
        y_val_group = X_val_group['is_root']
        X_val_group = X_val_group.drop(columns=['language', 'sentence', 'n', 'vertex', 'is_root'])

        y_pred = model.predict(X_val_group)
        accuracy = classification_report(y_val_group, y_pred)
        #print(f"Decision Tree Results for {lang_str}:")
        #print(accuracy)
        
        # Save model
        filename = os.path.join(dt_dir, f"dt_model_{lang_str}_.joblib")
        joblib.dump(model, filename)
    
    return models

def train_individual_rf_models(train_scaled, val_scaled):
    """Train Random Forest models: one for all data, and one for each language."""
    grouped_train = train_scaled.groupby(['language'])
    grouped_val = val_scaled.groupby(['language'])
    models = {}

    # Ensure directory exists
    rf_dir = "./rf_models"
    os.makedirs(rf_dir, exist_ok=True)

    # Train on all data first
    X_all = train_scaled.drop(columns=['language', 'n', 'vertex', 'sentence', 'is_root'])
    y_all = train_scaled['is_root'].astype(int)
    smote_enn = SMOTEENN(random_state=SEED, sampling_strategy=0.5, n_jobs=-1)
    X_all_resampled, y_all_resampled = smote_enn.fit_resample(X_all, y_all)
    model_all = RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=SEED,
        n_jobs=-1
    )
    model_all.fit(X_all_resampled, y_all_resampled)
    #models['all_languages'] = model_all

    # Validation on all data
    X_val_all = val_scaled.drop(columns=['language', 'n', 'vertex', 'sentence', 'is_root'])
    y_val_all = val_scaled['is_root']
    y_pred_all = model_all.predict(X_val_all)
    accuracy_all = classification_report(y_val_all, y_pred_all)
    print(f"Random Forest Results for ALL languages:")
    print(accuracy_all)
    filename_all = os.path.join(rf_dir, "rf_model_all_languages.joblib")
    joblib.dump(model_all, filename_all)

    # Train per-language models
    for lang, group in grouped_train:
        if isinstance(lang, tuple):
            lang_str = lang[0]
        else:
            lang_str = lang

        group_data = group.drop(columns=['language', 'n', 'vertex'])
        
        X = group_data.drop(columns=['sentence', 'is_root'])
        y = group_data['is_root'].astype(int)
        
        # Apply SMOTEENN resampling
        smote_enn = SMOTEENN(random_state=SEED, sampling_strategy=0.5, n_jobs=-1)
        X_resampled, y_resampled = smote_enn.fit_resample(X, y)
        
        # Train model
        model = RandomForestClassifier(
            n_estimators=100,
            class_weight='balanced',
            random_state=SEED,
            n_jobs=-1
        )
        model.fit(X_resampled, y_resampled)
        
        models[lang_str] = model
        
        X_val_group = grouped_val.get_group(lang)
        y_val_group = X_val_group['is_root']
        X_val_group = X_val_group.drop(columns=['language', 'sentence', 'n', 'vertex', 'is_root'])

        y_pred = model.predict(X_val_group)
        accuracy = classification_report(y_val_group, y_pred)
        #print(f"Random Forest Results for {lang_str}:")
        #print(accuracy)
        
        # Save model
        filename = os.path.join(rf_dir, f"rf_model_{lang_str}_.joblib")
        joblib.dump(model, filename)
    
    return models

def train_individual_ada_models(train_scaled, val_scaled):
    """Train AdaBoost models: one for all data, and one for each language."""
    grouped_train = train_scaled.groupby(['language'])
    grouped_val = val_scaled.groupby(['language'])
    models = {}

    # Ensure directory exists
    ada_dir = "./ada_models"
    os.makedirs(ada_dir, exist_ok=True)

    # Train on all data first
    X_all = train_scaled.drop(columns=['language', 'n', 'vertex', 'sentence', 'is_root'])
    y_all = train_scaled['is_root'].astype(int)
    smote_enn = SMOTEENN(random_state=SEED, sampling_strategy=0.5, n_jobs=-1)
    X_all_resampled, y_all_resampled = smote_enn.fit_resample(X_all, y_all)
    base_dt = DecisionTreeClassifier(
        class_weight='balanced',
        random_state=SEED
    )
    model_all = AdaBoostClassifier(
        estimator=base_dt,
        n_estimators=100,
        random_state=SEED
    )
    model_all.fit(X_all_resampled, y_all_resampled)
    #models['all_languages'] = model_all

    # Validation on all data
    X_val_all = val_scaled.drop(columns=['language', 'n', 'vertex', 'sentence', 'is_root'])
    y_val_all = val_scaled['is_root']
    y_pred_all = model_all.predict(X_val_all)
    accuracy_all = classification_report(y_val_all, y_pred_all)
    print(f"AdaBoost Results for ALL languages:")
    print(accuracy_all)
    filename_all = os.path.join(ada_dir, "ada_model_all_languages.joblib")
    joblib.dump(model_all, filename_all)

    # Train per-language models
    for lang, group in grouped_train:
        if isinstance(lang, tuple):
            lang_str = lang[0]
        else:
            lang_str = lang

        group_data = group.drop(columns=['language', 'n', 'vertex'])
        
        X = group_data.drop(columns=['sentence', 'is_root'])
        y = group_data['is_root'].astype(int)
        
        # Apply SMOTEENN resampling
        smote_enn = SMOTEENN(random_state=SEED, sampling_strategy=0.5, n_jobs=-1)
        X_resampled, y_resampled = smote_enn.fit_resample(X, y)
        
        # Train model
        base_dt = DecisionTreeClassifier(
            class_weight='balanced',
            random_state=SEED
        )
        model = AdaBoostClassifier(
            estimator=base_dt,
            n_estimators=50,
            random_state=SEED
        )
        model.fit(X_resampled, y_resampled)
        
        models[lang_str] = model
        
        X_val_group = grouped_val.get_group(lang)
        y_val_group = X_val_group['is_root']
        X_val_group = X_val_group.drop(columns=['language', 'sentence', 'n', 'vertex', 'is_root'])

        y_pred = model.predict(X_val_group)
        accuracy = classification_report(y_val_group, y_pred)
        #print(f"AdaBoost Results for {lang_str}:")
        #print(accuracy)
        
        # Save model
        filename = os.path.join(ada_dir, f"ada_model_{lang_str}_.joblib")
        joblib.dump(model, filename)
    
    return models

## Train all models

Trains Decision Tree, Random Forest, and AdaBoost models on the training data and saves the resulting models.

In [7]:
dtmodels = train_individual_dt_models(train_scaled,val_scaled)
rfmodels = train_individual_rf_models(train_scaled,val_scaled)
ada_models = train_individual_ada_models(train_scaled,val_scaled)

Decision Tree Results for ALL languages:
              precision    recall  f1-score   support

       False       0.96      0.88      0.92     38134
        True       0.16      0.42      0.24      2100

    accuracy                           0.86     40234
   macro avg       0.56      0.65      0.58     40234
weighted avg       0.92      0.86      0.89     40234

Random Forest Results for ALL languages:
              precision    recall  f1-score   support

       False       0.97      0.91      0.93     38134
        True       0.19      0.40      0.26      2100

    accuracy                           0.88     40234
   macro avg       0.58      0.65      0.60     40234
weighted avg       0.92      0.88      0.90     40234

AdaBoost Results for ALL languages:
              precision    recall  f1-score   support

       False       0.96      0.89      0.92     38134
        True       0.17      0.41      0.24      2100

    accuracy                           0.86     40234
   macro a

## Calculate pairwise model similarities

Defines a function to compute pairwise similarities between models trained on different languages, based on prediction agreement on validation sets.

In [8]:

def calculate_model_similarities(loaded_models, val_scaled, similarity_threshold=0.3):
    """Calculate pairwise model similarities."""
    pairwise_similarity = {}
    languages_val = sorted(set(val_scaled['language']))
    
    # Prepare validation data
    val_data_X = {}
    val_data_y = {}
    grouped_val = val_scaled.groupby(['language'])
    
    for lang in languages_val:
        if lang in grouped_val.groups:
            val_group_df = grouped_val.get_group(lang)
            X_val = val_group_df[FEATURE_COLUMNS].copy()
            y_val = val_group_df['is_root'].astype(int).copy()
            val_data_X[lang] = X_val
            val_data_y[lang] = y_val
    
    model_names = list(loaded_models.keys())
    #print(model_names)
    
    for i in range(len(model_names)):
        for j in range(i + 1, len(model_names)):
            lang_i = languages_val[i]
            lang_j = languages_val[j]
            
            model_i = loaded_models[model_names[i]]
            model_j = loaded_models[model_names[j]]
            
            X_val_i = val_data_X[lang_i]
            X_val_j = val_data_X[lang_j]
            
            # Calculate similarities
            preds_i_on_i = model_i.predict(X_val_i)
            preds_j_on_i = model_j.predict(X_val_i)
            preds_i_on_j = model_i.predict(X_val_j)
            preds_j_on_j = model_j.predict(X_val_j)
            
            similarity_on_i = accuracy_score(preds_i_on_i, preds_j_on_i)
            similarity_on_j = accuracy_score(preds_i_on_j, preds_j_on_j)
            
            if similarity_on_i >= similarity_threshold and similarity_on_j >= similarity_threshold:
                if lang_i not in pairwise_similarity:
                    pairwise_similarity[lang_i] = []
                if lang_j not in pairwise_similarity:
                    pairwise_similarity[lang_j] = []
                
                if lang_j not in pairwise_similarity[lang_i]:
                    pairwise_similarity[lang_i].append(lang_j)
                if lang_i not in pairwise_similarity[lang_j]:
                    pairwise_similarity[lang_j].append(lang_i)
    
    return pairwise_similarity

## Compute pairwise similarities for all model types

Calculates pairwise similarities for Decision Tree, Random Forest, and AdaBoost models using a specified similarity threshold.

In [9]:
pairwise_similarity_dt = calculate_model_similarities(dtmodels, val_scaled, similarity_threshold=0.85)
pairwise_similarity_rf = calculate_model_similarities(rfmodels, val_scaled, similarity_threshold=0.85)
pairwise_similarity_ada = calculate_model_similarities(ada_models, val_scaled, similarity_threshold=0.85)

## Find language groups based on model similarity

Defines a function to find groups of languages whose models are mutually similar, using a clique-finding approach.

In [10]:
def find_language_groups(lang_dict):
    """
    Find groups of languages where each language appears in all other group members' lists.
    Each language can only be grouped once.
    """
    
    def can_form_group(languages):
        """Check if a set of languages can form a valid group"""
        for lang in languages:
            if lang not in lang_dict:
                return False
            # Check if all other languages in the group are in this language's list
            others = languages - {lang}
            if not others.issubset(set(lang_dict[lang])):
                return False
        return True
    
    def find_all_cliques(available_langs):
        """Find all maximal cliques from available languages"""
        cliques = []
        
        # Start with each language as a potential clique
        for start_lang in available_langs:
            if start_lang not in lang_dict:
                continue
                
            # Build clique starting from this language
            current_clique = {start_lang}
            candidates = set(lang_dict[start_lang]) & available_langs
            
            # Greedily add languages that are connected to all in current clique
            for candidate in sorted(candidates):  # Sort for consistent results
                if candidate in available_langs and can_form_group(current_clique | {candidate}):
                    current_clique.add(candidate)
            
            # Only keep if it's a valid clique of size > 1
            if len(current_clique) > 1 and can_form_group(current_clique):
                cliques.append(current_clique)
        
        return cliques
    
    # Find groups iteratively
    groups = []
    used_languages = set()
    available_languages = set(lang_dict.keys())
    
    while available_languages:
        # Find all possible cliques from remaining languages
        cliques = find_all_cliques(available_languages)
        
        if not cliques:
            break
            
        # Choose the largest clique (greedy approach)
        best_clique = max(cliques, key=len)
        groups.append(sorted(list(best_clique)))
        
        # Remove used languages
        used_languages.update(best_clique)
        available_languages -= best_clique
    
    return groups

## Group languages for stacking

Finds and stores groups of similar languages for each model type, to be used in stacking.

In [11]:
group_dt = find_language_groups(pairwise_similarity_dt)
group_rf = find_language_groups(pairwise_similarity_rf)
group_ada = find_language_groups(pairwise_similarity_ada)

## Create stacked models for language groups

Defines a function to create stacked models for groups of similar languages, using a meta-classifier (Decision Tree, Random Forest, or AdaBoost) as the final estimator.

In [12]:
def create_stacked_models(loaded_models, train_scaled, pred_group, meta_type='rf'):
    """
    Create stacked models for similar language groups.

    Parameters:
    - loaded_models: dict of base models
    - train_scaled: DataFrame with training data
    - meta_type: str, one of {'rf', 'dt', 'ada'} to select meta-classifier
        'rf'  = RandomForestClassifier (default)
        'dt'  = DecisionTreeClassifier
        'ada' = AdaBoostClassifier
    """
    
    # Define predefined groups
    predefined_groups_lang_names = pred_group
    
    model_groups_for_stacking = []
    
    # Convert language names to tuple format
    for name_group in predefined_groups_lang_names:
        tuple_group = []
        for lang_name in name_group:
            if lang_name in loaded_models:
                tuple_group.append(lang_name)
        if tuple_group:
            model_groups_for_stacking.append(sorted(tuple_group))

    
    # Add remaining languages as individual groups
    all_languages = set(loaded_models.keys())
    languages_in_groups = set()

    for group in model_groups_for_stacking:
        for lang_tuple in group:
            languages_in_groups.add(lang_tuple)
    
    for lang_tuple in all_languages:
        if lang_tuple not in languages_in_groups:
            model_groups_for_stacking.append([lang_tuple])
    
    # Prepare resampled training data
    train_X_resampled_map = {}
    train_y_resampled_map = {}
    grouped_train = train_scaled.groupby(['language'])
    smote_enn = SMOTEENN(random_state=SEED, sampling_strategy=0.5, n_jobs=-1)
    
    for lang_tuple, group_df in grouped_train:
        if isinstance(lang_tuple, tuple):
            lang_str = lang_tuple[0]
        X_res = group_df[FEATURE_COLUMNS].copy()
        y_res = group_df['is_root'].astype(int).copy()
        
        X_resampled, y_resampled = smote_enn.fit_resample(X_res, y_res)
        
        train_X_resampled_map[lang_str] = pd.DataFrame(X_resampled, columns=FEATURE_COLUMNS)
        train_y_resampled_map[lang_str] = pd.Series(y_resampled, name='is_root')
    
    # Select meta-estimator based on meta_type
    if meta_type == 'rf':
        meta_estimator = RandomForestClassifier(
            n_estimators=100, random_state=SEED, class_weight='balanced', n_jobs=-1
        )
    elif meta_type == 'dt':
        meta_estimator = DecisionTreeClassifier(
            max_depth=10, random_state=SEED, class_weight='balanced'
        )
    elif meta_type == 'ada':
        meta_estimator = AdaBoostClassifier(
            estimator=DecisionTreeClassifier(max_depth=5, random_state=SEED, class_weight='balanced'),
            n_estimators=50, random_state=SEED
        )
    else:
        raise ValueError("meta_type must be one of {'rf', 'dt', 'ada'}")
    
    # Create stacked models
    stacked_models_final = {}
    print (model_groups_for_stacking)
    for i, lang_group in enumerate(model_groups_for_stacking):
        group_name = f"stacked_group_{i+1}"
        
        X_train_list = []
        y_train_list = []
        base_estimators = []
        
        for lang_tuple in lang_group:
            
            X_train_list.append(train_X_resampled_map[lang_tuple])
            y_train_list.append(train_y_resampled_map[lang_tuple])
        
            if isinstance(lang_tuple, tuple):
                estimator_name = lang_tuple[0]
            else:
                estimator_name = lang_tuple
            base_estimators.append((estimator_name, loaded_models[lang_tuple]))
      
        X_train_combined = pd.concat(X_train_list, ignore_index=True)
        y_train_combined = pd.concat(y_train_list, ignore_index=True)
        
        if len(base_estimators) == 1:
            # Use original model for single-model groups
            stacked_models_final[estimator_name] = base_estimators[0][1]
        else:
            # Create stacking classifier
            #group_splits = create_k_folds_indices(X_train_combined)
            
            stacking_clf = StackingClassifier(
                estimators=base_estimators,
                final_estimator=meta_estimator,
                stack_method='auto',
                n_jobs=-1,
                passthrough=False
            )
            
            stacking_clf.fit(X_train_combined, y_train_combined)
            stacked_models_final[group_name] = stacking_clf
            
            # Save model
            if meta_type == 'dt':
                filename = f"./dt_models/{group_name}_stacked_dt.joblib"
            elif meta_type == 'rf':
                filename = f"./rf_models/{group_name}_stacked_rf.joblib"
            elif meta_type == 'ada':
                filename = f"./ada_models/{group_name}_stacked_ada.joblib"
            joblib.dump(stacking_clf, filename)
    
    # Create language to model mapping
    # language_to_model_map = {}
    # for i, lang_group in enumerate(model_groups_for_stacking):
    #     group_key = f"stacked_group_{i+1}"
    #     if group_key in stacked_models_final:
    #         model = stacked_models_final[group_key]
    #         for lang_tuple in lang_group:
    #             print(lang_tuple)
    #             language_to_model_map[lang_tuple] = model
    
    return stacked_models_final

## Train stacked models for each model type

Creates stacked models for each group of similar languages, for Decision Tree, Random Forest, and AdaBoost base models.

In [13]:
language_to_model_map_dt = create_stacked_models(dtmodels, train_scaled, group_dt, meta_type='dt')
language_to_model_map_rf = create_stacked_models(rfmodels, train_scaled, group_rf, meta_type='rf')
language_to_model_map_ada = create_stacked_models(ada_models, train_scaled, group_ada, meta_type='ada')

[['Galician', 'Icelandic', 'Spanish', 'Swedish'], ['German', 'Italian', 'Turkish'], ['Arabic', 'Finnish', 'Polish'], ['Indonesian', 'Portuguese', 'Russian'], ['Thai'], ['Japanese'], ['English'], ['French'], ['Korean'], ['Chinese'], ['Czech'], ['Hindi']]
[['Arabic', 'Chinese', 'Czech', 'Finnish', 'French', 'Galician', 'Hindi', 'Indonesian', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Spanish', 'Swedish', 'Thai'], ['English', 'German', 'Italian', 'Japanese'], ['Icelandic'], ['Turkish']]
[['Czech', 'Galician', 'Indonesian', 'Polish', 'Portuguese', 'Swedish'], ['German', 'Italian', 'Spanish', 'Turkish'], ['Arabic', 'Icelandic', 'Russian'], ['English', 'Finnish'], ['Thai'], ['Japanese'], ['French'], ['Korean'], ['Chinese'], ['Hindi']]


## Compute classification reports for stacked models

Defines a function to compute and print classification reports for the stacked models on the validation set, aggregating predictions across all languages.

In [14]:
def compute_accuracy_separate_datasets(stacked_models, val_scaled):
    import numpy as np
    from sklearn.metrics import classification_report as skl_classification_report

    all_predictions = []
    all_true_labels = []
    
    grouped_val = val_scaled.groupby(['language'])

    # Corrected version: properly matches languages to models and handles groupings
    for group_name, model in stacked_models.items():
        # Determine which languages this model is responsible for
        if isinstance(model, StackingClassifier):
            # For stacking, group_name is like 'stacked_group_X', so need to infer languages
            # Assume language names are in the base estimators
            language_names = [name for name, _ in model.estimators]
        else:
            language_names = [group_name]

        for lang in language_names:
            # grouped_val.groups keys are language names, not tuples
            if lang in grouped_val.groups:
                X_val_group = grouped_val.get_group(lang)[FEATURE_COLUMNS]
                y_val_group = grouped_val.get_group(lang)['is_root']

                # Make predictions for this group
                y_pred_group = model.predict(X_val_group)

                # Collect predictions and true labels
                all_predictions.extend(y_pred_group)
                all_true_labels.extend(y_val_group)

    # Convert to numpy arrays
    all_predictions = np.array(all_predictions)
    all_true_labels = np.array(all_true_labels)
    # Calculate metrics
    # Avoid shadowing the imported classification_report
    classification_rep = skl_classification_report(all_true_labels, all_predictions)

    return classification_rep


## Print classification reports for all model types

Computes and prints the classification reports for Decision Tree, Random Forest, and AdaBoost stacked models on the validation data.

In [15]:
classification_report_dt = compute_accuracy_separate_datasets(language_to_model_map_dt, val_scaled)
classification_report_rf = compute_accuracy_separate_datasets(language_to_model_map_rf, val_scaled)
classification_report_ada = compute_accuracy_separate_datasets(language_to_model_map_ada, val_scaled)
print("Decision Tree Classification Report:")
print(classification_report_dt)
print("Random Forest Classification Report:")
print(classification_report_rf)
print("AdaBoost Classification Report:")
print(classification_report_ada)


Decision Tree Classification Report:
              precision    recall  f1-score   support

       False       0.96      0.89      0.92     38134
        True       0.16      0.39      0.22      2100

    accuracy                           0.86     40234
   macro avg       0.56      0.64      0.57     40234
weighted avg       0.92      0.86      0.89     40234

Random Forest Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.86      0.91     38134
        True       0.16      0.48      0.24      2100

    accuracy                           0.84     40234
   macro avg       0.56      0.67      0.57     40234
weighted avg       0.93      0.84      0.87     40234

AdaBoost Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.84      0.90     38134
        True       0.15      0.50      0.23      2100

    accuracy                           0.83     40234
   macro avg       0.5