# Modelling
In this phase, the different models are developed and evaluated.

# Modelling Notebook
According to the exploration of the boxplots, the root nodes tend to have higher values in all of the metrics.
The proposed baseline solution is therefore a simple program that selects the node with the highest values.

## Configurations of the Notebook

In [23]:
# Global variables
# Name of the label column; passed as `target` to test_model in the cells below.
target = 'is_root'
# Directory passed as `folds_path` to test_model, which reads
# fold_{k}_train.csv / fold_{k}_test.csv from it.
folds_path = './data/cross_validation'

In [None]:
# Function declaration
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

def get_metrics(df, y_pred, target='is_root'):
    """
    Score root prediction at (sentence, language) level.

    For each (sentence, language) group, the vertex with the highest predicted
    score is taken as the predicted root and compared with the vertex of the
    first row where ``df[target] == 1``. Groups without such a row are skipped
    and count towards neither ``total`` nor ``correct``.

    Args:
        df: DataFrame with 'sentence', 'language', 'vertex' and ``target``
            columns. A copy is used, so the caller's frame is not modified.
        y_pred: predicted scores, stored as a new 'y_pred' column of the copy
            (one score per row).
        target: name of the column that marks the true root with 1.

    Returns:
        accuracy (float): overall accuracy (0 when no group was scored)
        lang_stats (dict): per-language {'correct': int, 'total': int, 'accuracy': float}
        total (int): total number of predictions
        correct (int): number of correct predictions

    Raises:
        ValueError: if every predicted score of a scored group is NaN.
    """
    df = df.copy()
    df['y_pred'] = y_pred

    total = 0
    correct = 0
    lang_stats = {}

    # Group by sentence and language
    for (_, language), group in df.groupby(['sentence', 'language']):
        # Skip if no actual root
        true_root_row = group[group[target] == 1]
        if true_root_row.empty:
            continue

        true_vertex = true_root_row['vertex'].values[0]

        # Pick the best row by position rather than by index label: a
        # label-based lookup returns several rows when index labels repeat
        # inside a group (e.g. frames concatenated without resetting the
        # index). NaN scores are ignored; ties resolve to the first row.
        best_pos = np.nanargmax(group['y_pred'].to_numpy())
        predicted_vertex = group['vertex'].iloc[best_pos]

        is_correct = int(predicted_vertex == true_vertex)
        correct += is_correct
        total += 1

        # Track per-language
        stats = lang_stats.setdefault(language, {'correct': 0, 'total': 0})
        stats['correct'] += is_correct
        stats['total'] += 1

    # Compute per-language accuracy
    for stats in lang_stats.values():
        lang_total = stats['total']
        stats['accuracy'] = stats['correct'] / lang_total if lang_total > 0 else 0

    accuracy = correct / total if total > 0 else 0
    return accuracy, lang_stats, total, correct

def extract_features(df, features):
    """
    Return the columns listed in ``features`` as an array (``.values``).

    If 'avg_metrics' is requested and ``df`` has no such column, it is first
    added to ``df`` in place as the row-wise mean of 'degree_norm',
    'closeness_norm', 'betweenness_norm' and 'pagerank_norm'.
    """
    wants_avg = 'avg_metrics' in features
    if wants_avg and 'avg_metrics' not in df.columns:
        source_cols = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm']
        # Note: this writes the derived column into the caller's DataFrame.
        df['avg_metrics'] = df[source_cols].mean(axis=1)

    selected = df[features]
    return selected.values


import os
import pandas as pd

def test_model(model, folds_path, features, target, num_folds=5):
    """
    Fit and score ``model`` on pre-split cross-validation folds.

    For every fold k in 1..num_folds, 'fold_{k}_train.csv' and
    'fold_{k}_test.csv' are read from ``folds_path``, the model is fitted on
    the train split, and both splits are scored with ``get_metrics``. The
    per-fold accuracies are printed as they are computed.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``. The same
            object is refitted on every fold.
        folds_path: directory containing the fold CSV files.
        features: column names forwarded to ``extract_features``.
        target: name of the label column.
        num_folds: number of folds to evaluate.

    Returns:
        metrics (dict): {'train': DataFrame, 'test': DataFrame}, each with one
            'Accuracy' row per fold, indexed 'Fold k'.
        language_stats (dict): {'train': {...}, 'test': {...}} mapping
            'Fold k' to the per-language stats returned by ``get_metrics``.
    """
    metrics = {
        'train': pd.DataFrame(columns=['Accuracy']),
        'test': pd.DataFrame(columns=['Accuracy'])
    }

    language_stats = {
        'train': {},
        'test': {}
    }

    for fold in range(1, num_folds + 1):
        train_path = os.path.join(folds_path, f'fold_{fold}_train.csv')
        test_path = os.path.join(folds_path, f'fold_{fold}_test.csv')

        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        # Train model (test labels are only needed by get_metrics, which
        # reads them from test_df directly)
        X_train = extract_features(train_df, features)
        y_train = train_df[target].values
        X_test = extract_features(test_df, features)
        model.fit(X_train, y_train)

        # Predict
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        # Evaluate top-vertex prediction
        train_acc, train_lang_stats, train_total, train_correct = get_metrics(train_df, y_pred_train, target)
        test_acc, test_lang_stats, test_total, test_correct = get_metrics(test_df, y_pred_test, target)

        print(f"Fold-{fold}")
        print(f"  Train Accuracy: {train_acc:.4f} ({train_correct}/{train_total})")
        print(f"  Test Accuracy:  {test_acc:.4f} ({test_correct}/{test_total})")

        # Save metrics
        fold_label = f'Fold {fold}'
        metrics['train'].loc[fold_label] = [train_acc]
        metrics['test'].loc[fold_label] = [test_acc]
        language_stats['train'][fold_label] = train_lang_stats
        language_stats['test'][fold_label] = test_lang_stats

    return metrics, language_stats


def print_metrics(metrics):
    """
    Print, for the train and then the test split, the column means of the
    per-fold metrics frame followed by the frame itself.
    """
    print("\n=== Average Metrics Across Folds ===")
    for heading, split in (("\nTrain:", 'train'), ("\nTest:", 'test')):
        print(heading)
        print(metrics[split].mean())
        print(metrics[split])


def summarize_model_results(model_name, metrics, language_stats):
    """
    Build a one-row summary DataFrame for a model.

    Columns:
    - model: the given model name
    - Train Accuracy / Test Accuracy: mean of the per-fold 'Accuracy' values
    - one column per language: test-set correct / total pooled over all folds
      (None when a language has no scored predictions)
    """
    train_mean = metrics['train']['Accuracy'].mean()
    test_mean = metrics['test']['Accuracy'].mean()

    # Pool the raw test counts over folds so each language is weighted by
    # its number of predictions rather than averaging per-fold accuracies.
    pooled = {}
    for per_fold in language_stats['test'].values():
        for language, counts in per_fold.items():
            bucket = pooled.setdefault(language, {'correct': 0, 'total': 0})
            bucket['correct'] += counts['correct']
            bucket['total'] += counts['total']

    row = {
        'model': model_name,
        'Train Accuracy': train_mean,
        'Test Accuracy': test_mean,
    }
    for language, bucket in pooled.items():
        row[language] = bucket['correct'] / bucket['total'] if bucket['total'] > 0 else None

    return pd.DataFrame([row])

def save_model(summary_df, results_path="./data/models/all_models.csv"):
    """
    Insert or replace model summary rows in the shared results CSV.

    Rows already stored under the same 'model' name(s) as ``summary_df`` are
    dropped, the new rows are appended, and the file is rewritten. The parent
    directory of ``results_path`` is created when the file does not exist yet.

    Args:
        summary_df: DataFrame with a 'model' column (e.g. the output of
            summarize_model_results).
        results_path: CSV file that accumulates the summaries of all models.

    Raises:
        ValueError: if ``summary_df`` or the existing file lacks a 'model'
            column.
    """
    # Validate the input before touching the filesystem
    if 'model' not in summary_df.columns:
        raise ValueError("The summary_df must contain a 'model' column.")

    if os.path.exists(results_path):
        existing = pd.read_csv(results_path)

        # Ensure 'model' column exists in file
        if 'model' not in existing.columns:
            raise ValueError("Existing file is missing the 'model' column.")

        # Drop stale rows of the models being saved, then append the new ones
        existing = existing[~existing['model'].isin(summary_df['model'])]
        combined_results = pd.concat([existing, summary_df], ignore_index=True)
    else:
        # First save: make sure the target directory exists, and skip the
        # concat with an empty placeholder frame.
        parent_dir = os.path.dirname(results_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        combined_results = summary_df.reset_index(drop=True)

    combined_results.to_csv(results_path, index=False)

    print("Updated combined results:")
    print(combined_results.head())
    print("✅ Saved summary to:", results_path)



## Linear Models

### Linear Regression V1 - Baseline

In [41]:
# Example usage
# Baseline: ordinary least-squares regression on the five '*_norm' columns,
# scored by picking the highest-scoring vertex per sentence (see get_metrics).
features = ['n_norm', 'degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm']
lin_reg = LinearRegression()

# Fit and score on every fold found under folds_path (num_folds defaults to 5)
metrics, language_stats = test_model(lin_reg, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "LinReg_V1"
summary_df = summarize_model_results("LinReg_V1", metrics, language_stats)
# NOTE(review): this is not the last expression of the cell, so its value is not displayed
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.2746 (2307/8400)
  Test Accuracy:  0.2571 (540/2100)
Fold-2
  Train Accuracy: 0.2689 (2259/8400)
  Test Accuracy:  0.2786 (585/2100)
Fold-3
  Train Accuracy: 0.2668 (2241/8400)
  Test Accuracy:  0.2886 (606/2100)
Fold-4
  Train Accuracy: 0.2717 (2282/8400)
  Test Accuracy:  0.2767 (581/2100)
Fold-5
  Train Accuracy: 0.2742 (2303/8400)
  Test Accuracy:  0.2552 (536/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.271238
dtype: float64
        Accuracy
Fold 1  0.274643
Fold 2  0.268929
Fold 3  0.266786
Fold 4  0.271667
Fold 5  0.274167

Test:
Accuracy    0.271238
dtype: float64
        Accuracy
Fold 1  0.257143
Fold 2  0.278571
Fold 3  0.288571
Fold 4  0.276667
Fold 5  0.255238
Updated combined results:
       model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0  LinReg_V2        0.270762       0.270762   0.294    0.252   0.21   
1  LinReg_V1        0.271238       0.271238   0.294    0.256   0.21   

   Indonesian  Polish   Thai  C

### Linear Regression with Feature Engineering

In this model we apply the same linear model, but with one additional variable: the average of the metrics extracted from the sentences.



In [42]:
# Same linear model as V1 plus 'avg_metrics'; extract_features derives that
# column on the fly when it is missing from the fold CSVs.
features = ['n_norm', 'degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'avg_metrics']
lin_reg = LinearRegression()

# Fit and score on every fold found under folds_path (num_folds defaults to 5)
metrics, language_stats = test_model(lin_reg, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "LinReg_V2"
summary_df = summarize_model_results("LinReg_V2", metrics, language_stats)
# NOTE(review): this is not the last expression of the cell, so its value is not displayed
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.2745 (2306/8400)
  Test Accuracy:  0.2562 (538/2100)
Fold-2
  Train Accuracy: 0.2676 (2248/8400)
  Test Accuracy:  0.2786 (585/2100)
Fold-3
  Train Accuracy: 0.2657 (2232/8400)
  Test Accuracy:  0.2871 (603/2100)
Fold-4
  Train Accuracy: 0.2707 (2274/8400)
  Test Accuracy:  0.2762 (580/2100)
Fold-5
  Train Accuracy: 0.2752 (2312/8400)
  Test Accuracy:  0.2557 (537/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.270762
dtype: float64
        Accuracy
Fold 1  0.274524
Fold 2  0.267619
Fold 3  0.265714
Fold 4  0.270714
Fold 5  0.275238

Test:
Accuracy    0.270762
dtype: float64
        Accuracy
Fold 1  0.256190
Fold 2  0.278571
Fold 3  0.287143
Fold 4  0.276190
Fold 5  0.255714
Updated combined results:
       model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0  LinReg_V1        0.271238       0.271238   0.294    0.256   0.21   
1  LinReg_V2        0.270762       0.270762   0.294    0.252   0.21   

   Indonesian  Polish   Thai  C

### Linear Model V3 - Tuning Parameters
Applied Ridge Regression

In [46]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Same five features as LinReg_V1 (no 'avg_metrics')
features = ['n_norm', 'degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm']

# Define parameter grid for tuning alpha
param_grid = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}

# Ridge model with grid search (inner CV to pick alpha)
# NOTE(review): alpha is selected by row-level squared error, while the
# reported metric is the per-sentence top-vertex accuracy from get_metrics;
# confirm that this proxy is intended.
ridge_model = GridSearchCV(
    Ridge(),
    param_grid,
    cv=3,  # inner CV within each train fold
    scoring='neg_mean_squared_error',  # can be adjusted
    n_jobs=-1
)

# test_model calls fit() on the GridSearchCV object, so the alpha search is
# repeated on the training split of every outer fold.
metrics, language_stats = test_model(ridge_model, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "Ridge_V1"
summary_df = summarize_model_results("Ridge_V1", metrics, language_stats)
# NOTE(review): this is not the last expression of the cell, so its value is not displayed
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.2738 (2300/8400)
  Test Accuracy:  0.2571 (540/2100)
Fold-2
  Train Accuracy: 0.2675 (2247/8400)
  Test Accuracy:  0.2781 (584/2100)
Fold-3
  Train Accuracy: 0.2661 (2235/8400)
  Test Accuracy:  0.2876 (604/2100)
Fold-4
  Train Accuracy: 0.2693 (2262/8400)
  Test Accuracy:  0.2748 (577/2100)
Fold-5
  Train Accuracy: 0.2745 (2306/8400)
  Test Accuracy:  0.2548 (535/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.270238
dtype: float64
        Accuracy
Fold 1  0.273810
Fold 2  0.267500
Fold 3  0.266071
Fold 4  0.269286
Fold 5  0.274524

Test:
Accuracy    0.270476
dtype: float64
        Accuracy
Fold 1  0.257143
Fold 2  0.278095
Fold 3  0.287619
Fold 4  0.274762
Fold 5  0.254762
Updated combined results:
       model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0  LinReg_V1        0.271238       0.271238   0.294    0.256  0.210   
1  LinReg_V2        0.270762       0.270762   0.294    0.252  0.210   
2      RF_V1        0.859310    

## Non Linear Models

### Random Forest V1 - Baseline (no feature engineering)

In [43]:
from sklearn.ensemble import RandomForestRegressor
# V1
features = ['n_norm', 'degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm']
# Default hyperparameters: textbook overfitting (compare train vs. test accuracy).
# The seed is fixed so that re-running the cell reproduces the same numbers.
rf = RandomForestRegressor(random_state=42)

# Fit and score on every fold found under folds_path (num_folds defaults to 5)
metrics, language_stats = test_model(rf, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "RF_V1"
summary_df = summarize_model_results("RF_V1", metrics, language_stats)
# NOTE(review): this is not the last expression of the cell, so its value is not displayed
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.8586 (7212/8400)
  Test Accuracy:  0.2490 (523/2100)
Fold-2
  Train Accuracy: 0.8587 (7213/8400)
  Test Accuracy:  0.2633 (553/2100)
Fold-3
  Train Accuracy: 0.8602 (7226/8400)
  Test Accuracy:  0.2662 (559/2100)
Fold-4
  Train Accuracy: 0.8594 (7219/8400)
  Test Accuracy:  0.2629 (552/2100)
Fold-5
  Train Accuracy: 0.8596 (7221/8400)
  Test Accuracy:  0.2514 (528/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.85931
dtype: float64
        Accuracy
Fold 1  0.858571
Fold 2  0.858690
Fold 3  0.860238
Fold 4  0.859405
Fold 5  0.859643

Test:
Accuracy    0.258571
dtype: float64
        Accuracy
Fold 1  0.249048
Fold 2  0.263333
Fold 3  0.266190
Fold 4  0.262857
Fold 5  0.251429
Updated combined results:
       model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0  LinReg_V1        0.271238       0.271238   0.294    0.256   0.21   
1  LinReg_V2        0.270762       0.270762   0.294    0.252   0.21   
2      RF_V1        0.859310     

### Random Forest V2 - Feature Engineering

In [44]:
# V2
# Same forest as V1 plus 'avg_metrics'; extract_features derives that column
# on the fly when it is missing from the fold CSVs.
features = ['n_norm', 'degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'avg_metrics']
# Default hyperparameters: textbook overfitting (compare train vs. test accuracy).
# The seed is fixed so that re-running the cell reproduces the same numbers.
rf = RandomForestRegressor(random_state=42)

# Fit and score on every fold found under folds_path (num_folds defaults to 5)
metrics, language_stats = test_model(rf, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "RF_V2"
summary_df = summarize_model_results("RF_V2", metrics, language_stats)
# NOTE(review): this is not the last expression of the cell, so its value is not displayed
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.8580 (7207/8400)
  Test Accuracy:  0.2424 (509/2100)
Fold-2
  Train Accuracy: 0.8583 (7210/8400)
  Test Accuracy:  0.2581 (542/2100)
Fold-3
  Train Accuracy: 0.8605 (7228/8400)
  Test Accuracy:  0.2629 (552/2100)
Fold-4
  Train Accuracy: 0.8596 (7221/8400)
  Test Accuracy:  0.2648 (556/2100)
Fold-5
  Train Accuracy: 0.8596 (7221/8400)
  Test Accuracy:  0.2433 (511/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.859214
dtype: float64
        Accuracy
Fold 1  0.857976
Fold 2  0.858333
Fold 3  0.860476
Fold 4  0.859643
Fold 5  0.859643

Test:
Accuracy    0.254286
dtype: float64
        Accuracy
Fold 1  0.242381
Fold 2  0.258095
Fold 3  0.262857
Fold 4  0.264762
Fold 5  0.243333
Updated combined results:
       model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0  LinReg_V1        0.271238       0.271238   0.294    0.256  0.210   
1  LinReg_V2        0.270762       0.270762   0.294    0.252  0.210   
2      RF_V1        0.859310    

### Random Forest V3 - Changing Parameters of the tree

We have decided to use the features without "avg_metrics", since the test results are slightly better without it.

In [45]:
# V3
# Regularised forest: shallower trees and larger leaves than the default
# settings used for RF_V1 / RF_V2, on the feature set without 'avg_metrics'.
features = ['n_norm', 'degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm']
rf = RandomForestRegressor(max_depth=5,               # Restrict tree depth
    min_samples_split=10,      # Fewer splits
    min_samples_leaf=5,        # Larger leaves
    n_estimators=100,          # Keep moderate number of trees
    random_state=42)

# Fit and score on every fold found under folds_path (num_folds defaults to 5)
metrics, language_stats = test_model(rf, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "RF_V3"
summary_df = summarize_model_results("RF_V3", metrics, language_stats)
# NOTE(review): this is not the last expression of the cell, so its value is not displayed
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.2958 (2485/8400)
  Test Accuracy:  0.2762 (580/2100)
Fold-2
  Train Accuracy: 0.2894 (2431/8400)
  Test Accuracy:  0.3048 (640/2100)
Fold-3
  Train Accuracy: 0.2863 (2405/8400)
  Test Accuracy:  0.3062 (643/2100)
Fold-4
  Train Accuracy: 0.2906 (2441/8400)
  Test Accuracy:  0.2905 (610/2100)
Fold-5
  Train Accuracy: 0.2962 (2488/8400)
  Test Accuracy:  0.2681 (563/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.291667
dtype: float64
        Accuracy
Fold 1  0.295833
Fold 2  0.289405
Fold 3  0.286310
Fold 4  0.290595
Fold 5  0.296190

Test:
Accuracy    0.289143
dtype: float64
        Accuracy
Fold 1  0.276190
Fold 2  0.304762
Fold 3  0.306190
Fold 4  0.290476
Fold 5  0.268095
Updated combined results:
       model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0  LinReg_V1        0.271238       0.271238   0.294    0.256  0.210   
1  LinReg_V2        0.270762       0.270762   0.294    0.252  0.210   
2      RF_V1        0.859310    