# Modelling
In this phase, the different models are developed and evaluated.

# Modelling Notebook
According to the exploration of the boxplots, the roots tend to have higher values in all of the metrics.
The proposed baseline solution is therefore a simple program that selects the node with the highest values.

## Configurations of the Notebook

In [18]:
# Notebook-wide settings shared by the experiment cells
folds_path = "./data/cross_validation"  # folder holding the fold_{k}_train.csv / fold_{k}_test.csv files
target = "is_root"                      # label column; a value of 1 marks the root vertex

In [19]:
# Function declaration
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

def get_metrics(df, y_pred, target='is_root'):
    """
    Score root prediction at sentence level.

    For each (sentence, language) group, the vertex with the highest predicted
    score is taken as the predicted root and compared with the actual root
    (the first row of the group where ``target == 1``). Groups that contain no
    actual root are skipped and do not count towards any total.

    Args:
        df (pd.DataFrame): one row per vertex; must contain the columns
            'sentence', 'language', 'vertex' and ``target``. It is copied,
            never modified.
        y_pred (array-like): one predicted score per row of ``df``.
        target (str): name of the root-label column.

    Returns:
        accuracy (float): overall accuracy (0 when no group was scored)
        lang_stats (dict): per-language {'correct': int, 'total': int, 'accuracy': float}
        total (int): total number of predictions
        correct (int): number of correct predictions
    """
    df = df.copy()
    df['y_pred'] = y_pred
    # The index labels carry no meaning here, but repeated labels (e.g. after
    # concatenating frames) would make the .loc lookup below return a Series
    # instead of a single vertex. A fresh RangeIndex guarantees a scalar.
    # The scores are attached first so that a Series y_pred still aligns on
    # the caller's original index.
    df = df.reset_index(drop=True)

    total = 0
    correct = 0
    lang_stats = {}

    for (_, language), group in df.groupby(['sentence', 'language']):
        true_root_row = group[group[target] == 1]
        if true_root_row.empty:
            # No labelled root in this sentence: nothing to compare against.
            continue

        true_vertex = true_root_row['vertex'].values[0]
        predicted_vertex = group.loc[group['y_pred'].idxmax(), 'vertex']

        hit = int(predicted_vertex == true_vertex)
        correct += hit
        total += 1

        # Track per-language counts
        stats = lang_stats.setdefault(language, {'correct': 0, 'total': 0})
        stats['correct'] += hit
        stats['total'] += 1

    # Every language entry was created while counting a prediction, so its
    # total is at least 1 and the division is safe.
    for stats in lang_stats.values():
        stats['accuracy'] = stats['correct'] / stats['total']

    accuracy = correct / total if total > 0 else 0
    return accuracy, lang_stats, total, correct

def extract_features(df, features):
    """
    Return the requested feature columns of ``df`` as a 2-D array.

    When 'avg_metrics' is requested but not yet a column of ``df``, it is
    first added to ``df`` in place as the row-wise mean of the degree,
    closeness, betweenness and pagerank ``*_norm`` columns.
    """
    derived = 'avg_metrics'
    must_derive = derived in features and derived not in df.columns

    if must_derive:
        source_cols = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm']
        df[derived] = df[source_cols].mean(axis=1)

    selected = df[features]
    return selected.values


import os
import pandas as pd

def _predict_scores(model, X):
    """Return one score per row of X, where a higher score means 'more likely the root'."""
    if hasattr(model, 'predict_proba'):
        # Second column = probability of the positive class
        # (assumes the fitted classes are ordered [0, 1]).
        return model.predict_proba(X)[:, 1]
    if hasattr(model, 'decision_function'):
        # Margin-based classifiers: the signed distance ranks rows the same way.
        return model.decision_function(X)
    # Regressors: the raw prediction is used directly as the ranking score.
    return model.predict(X)


def test_model(model, folds_path, features, target, num_folds=5):
    """
    Evaluate ``model`` on pre-built cross-validation folds stored on disk.

    For every fold k in 1..num_folds the files ``fold_{k}_train.csv`` and
    ``fold_{k}_test.csv`` are read from ``folds_path``; the model is fitted on
    the train split and scored on both splits with ``get_metrics`` (one root
    prediction per sentence). Per-fold accuracies are printed as they are
    computed. The same estimator object is refitted on every fold.

    Args:
        model: estimator exposing ``fit`` and at least one of
            ``predict_proba``, ``decision_function`` or ``predict``.
        folds_path (str): directory containing the fold CSV files.
        features (list[str]): feature column names passed to ``extract_features``.
        target (str): name of the label column.
        num_folds (int): number of folds to read.

    Returns:
        metrics (dict): {'train': DataFrame, 'test': DataFrame}, each with an
            'Accuracy' column and one row per fold labelled 'Fold k'.
        language_stats (dict): {'train': {...}, 'test': {...}} mapping
            'Fold k' to the per-language stats returned by ``get_metrics``.
    """
    metrics = {
        'train': pd.DataFrame(columns=['Accuracy']),
        'test': pd.DataFrame(columns=['Accuracy'])
    }

    language_stats = {
        'train': {},
        'test': {}
    }

    for fold in range(1, num_folds + 1):
        train_path = os.path.join(folds_path, f'fold_{fold}_train.csv')
        test_path = os.path.join(folds_path, f'fold_{fold}_test.csv')

        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        # Train model
        X_train = extract_features(train_df, features)
        y_train = train_df[target].values
        X_test = extract_features(test_df, features)
        model.fit(X_train, y_train)

        # Predict a ranking score for every vertex
        train_scores = _predict_scores(model, X_train)
        test_scores = _predict_scores(model, X_test)

        # Evaluate top-vertex prediction
        train_acc, train_lang_stats, train_total, train_correct = get_metrics(train_df, train_scores, target)
        test_acc, test_lang_stats, test_total, test_correct = get_metrics(test_df, test_scores, target)

        print(f"Fold-{fold}")
        print(f"  Train Accuracy: {train_acc:.4f} ({train_correct}/{train_total})")
        print(f"  Test Accuracy:  {test_acc:.4f} ({test_correct}/{test_total})")

        # Save metrics
        fold_label = f'Fold {fold}'
        metrics['train'].loc[fold_label] = [train_acc]
        metrics['test'].loc[fold_label] = [test_acc]
        language_stats['train'][fold_label] = train_lang_stats
        language_stats['test'][fold_label] = test_lang_stats

    return metrics, language_stats


def print_metrics(metrics):
    """Print the mean of each metric column across folds, train split first, then test."""
    print("\n=== Average Metrics Across Folds ===")
    for heading, split in (("\nTrain:", 'train'), ("\nTest:", 'test')):
        print(heading)
        print(metrics[split].mean())


def summarize_model_results(model_name, metrics, language_stats):
    """
    Build a one-row summary DataFrame for a model.

    Columns:
    - 'model': the given model name
    - 'Train Accuracy' / 'Test Accuracy': mean of the per-fold accuracies
    - one column per language: test accuracy pooled over all folds
      (sum of correct / sum of total), or None when the pooled total is 0
    """
    row = {'model': model_name}
    row['Train Accuracy'] = metrics['train']['Accuracy'].mean()
    row['Test Accuracy'] = metrics['test']['Accuracy'].mean()

    # Pool the test-split counts of every fold, language by language
    pooled = {}
    for per_fold in language_stats['test'].values():
        for language, counts in per_fold.items():
            bucket = pooled.setdefault(language, {'correct': 0, 'total': 0})
            bucket['correct'] += counts['correct']
            bucket['total'] += counts['total']

    # Turn pooled counts into one accuracy column per language
    for language, bucket in pooled.items():
        seen = bucket['total']
        row[language] = bucket['correct'] / seen if seen > 0 else None

    return pd.DataFrame([row])

def save_model(summary_df, results_path="./data/models/all_models.csv"):
    """
    Insert or replace model summary rows in the shared results CSV.

    Rows already stored under the same 'model' name as a row of
    ``summary_df`` are dropped, the new rows are appended, and the file is
    rewritten. The parent directory is created when missing. The first rows
    of the combined table and the output path are printed.

    Args:
        summary_df (pd.DataFrame): rows to store; must contain a 'model' column.
        results_path (str): CSV file to update.

    Raises:
        ValueError: if ``summary_df`` or the existing file has no 'model' column.
    """
    # Validate the input before touching the file system
    if 'model' not in summary_df.columns:
        raise ValueError("The summary_df must contain a 'model' column.")

    if os.path.exists(results_path):
        existing = pd.read_csv(results_path)

        # Ensure 'model' column exists in file
        if 'model' not in existing.columns:
            raise ValueError("Existing file is missing the 'model' column.")

        # Drop stored rows that are about to be replaced
        existing = existing[~existing['model'].isin(summary_df['model'])]

        if existing.empty:
            # Concatenating an empty frame is deprecated in pandas (FutureWarning);
            # keep the stored column layout by reindexing the new rows instead.
            all_columns = list(existing.columns) + [
                col for col in summary_df.columns if col not in existing.columns
            ]
            combined_results = summary_df.reindex(columns=all_columns).reset_index(drop=True)
        else:
            combined_results = pd.concat([existing, summary_df], ignore_index=True)
    else:
        combined_results = summary_df.reset_index(drop=True)

    # to_csv does not create missing directories
    out_dir = os.path.dirname(results_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    combined_results.to_csv(results_path, index=False)

    print("Updated combined results:")
    print(combined_results.head())
    print("✅ Saved summary to:", results_path)



## Linear Models

### Logistic Regression V1 - Baseline

In [20]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with library-default settings on the five *_norm columns
log_reg = LogisticRegression()
features = [
    "n_norm",
    "degree_norm",
    "closeness_norm",
    "betweenness_norm",
    "pagerank_norm",
]

# Fit and score on every stored fold, then show the across-fold averages
metrics, language_stats = test_model(
    model=log_reg, folds_path=folds_path, features=features, target=target
)
print_metrics(metrics)

# Collapse the run into a single row and store it in the shared results file
summary_df = summarize_model_results(
    model_name="LogReg_V1", metrics=metrics, language_stats=language_stats
)
summary_df.head()  # result is discarded: this is not the last expression of the cell
save_model(summary_df)

Fold-1
  Train Accuracy: 0.2740 (2302/8400)
  Test Accuracy:  0.2581 (542/2100)
Fold-2
  Train Accuracy: 0.2688 (2258/8400)
  Test Accuracy:  0.2814 (591/2100)
Fold-3
  Train Accuracy: 0.2664 (2238/8400)
  Test Accuracy:  0.2867 (602/2100)
Fold-4
  Train Accuracy: 0.2707 (2274/8400)
  Test Accuracy:  0.2743 (576/2100)
Fold-5
  Train Accuracy: 0.2752 (2312/8400)
  Test Accuracy:  0.2562 (538/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.271048
dtype: float64

Test:
Accuracy    0.271333
dtype: float64
Updated combined results:
       model  Train Accuracy  Test Accuracy  ...  German  Turkish  Russian
0  LogReg_V1        0.271048       0.271333  ...   0.274      0.3    0.344

[1 rows x 24 columns]
✅ Saved summary to: ./data/models/all_models.csv


  combined_results = pd.concat([combined_results, summary_df], ignore_index=True)


### Logistic Regression with Feature Engineering

In this model we apply the same linear model, but with one additional variable: `avg_metrics`, the average of the metrics extracted from the sentences.



In [21]:
# Same default logistic regression, with the derived 'avg_metrics' column added
log_reg = LogisticRegression()
features = [
    "n_norm",
    "degree_norm",
    "closeness_norm",
    "betweenness_norm",
    "pagerank_norm",
    "avg_metrics",
]

# Fit and score on every stored fold, then show the across-fold averages
metrics, language_stats = test_model(
    model=log_reg, folds_path=folds_path, features=features, target=target
)
print_metrics(metrics)

# Collapse the run into a single row and store it in the shared results file
summary_df = summarize_model_results(
    model_name="LogReg_V2", metrics=metrics, language_stats=language_stats
)
summary_df.head()  # result is discarded: this is not the last expression of the cell
save_model(summary_df)

Fold-1
  Train Accuracy: 0.2739 (2301/8400)
  Test Accuracy:  0.2581 (542/2100)
Fold-2
  Train Accuracy: 0.2680 (2251/8400)
  Test Accuracy:  0.2810 (590/2100)
Fold-3
  Train Accuracy: 0.2667 (2240/8400)
  Test Accuracy:  0.2871 (603/2100)
Fold-4
  Train Accuracy: 0.2708 (2275/8400)
  Test Accuracy:  0.2733 (574/2100)
Fold-5
  Train Accuracy: 0.2755 (2314/8400)
  Test Accuracy:  0.2586 (543/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.270976
dtype: float64

Test:
Accuracy    0.271619
dtype: float64
Updated combined results:
       model  Train Accuracy  Test Accuracy  ...  German  Turkish  Russian
0  LogReg_V1        0.271048       0.271333  ...   0.274    0.300    0.344
1  LogReg_V2        0.270976       0.271619  ...   0.276    0.304    0.346

[2 rows x 24 columns]
✅ Saved summary to: ./data/models/all_models.csv


### Linear Model V3 - Tuning Parameters
Applied logistic regression with an L2 (ridge-style) penalty and stronger regularization (C=0.1).

In [22]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# NOTE: Ridge, GridSearchCV and param_grid are not used by the model below;
# no grid search is run in this cell, the regularization strength is fixed.
param_grid = {"alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}

features = [
    "n_norm",
    "degree_norm",
    "closeness_norm",
    "betweenness_norm",
    "pagerank_norm",
]

# L2-penalised logistic regression with a fixed C
log_reg = LogisticRegression(
    C=0.1,               # smaller than the default of 1.0, i.e. stronger regularization
    penalty="l2",        # could be switched to 'l1' or 'elasticnet'
    solver="liblinear",  # 'elasticnet' would need the 'saga' solver instead
    random_state=42,
    max_iter=1000,
)

# Fit and score on every stored fold, then show the across-fold averages
metrics, language_stats = test_model(
    model=log_reg, folds_path=folds_path, features=features, target=target
)
print_metrics(metrics)

# Collapse the run into a single row and store it in the shared results file
summary_df = summarize_model_results(
    model_name="LogReg_Regularized", metrics=metrics, language_stats=language_stats
)
summary_df.head()  # result is discarded: this is not the last expression of the cell
save_model(summary_df)

Fold-1
  Train Accuracy: 0.2744 (2305/8400)
  Test Accuracy:  0.2562 (538/2100)
Fold-2
  Train Accuracy: 0.2682 (2253/8400)
  Test Accuracy:  0.2795 (587/2100)
Fold-3
  Train Accuracy: 0.2660 (2234/8400)
  Test Accuracy:  0.2867 (602/2100)
Fold-4
  Train Accuracy: 0.2700 (2268/8400)
  Test Accuracy:  0.2757 (579/2100)
Fold-5
  Train Accuracy: 0.2755 (2314/8400)
  Test Accuracy:  0.2533 (532/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.27081
dtype: float64

Test:
Accuracy    0.270286
dtype: float64
Updated combined results:
                model  Train Accuracy  Test Accuracy  ...  German  Turkish  Russian
0           LogReg_V1        0.271048       0.271333  ...   0.274    0.300    0.344
1           LogReg_V2        0.270976       0.271619  ...   0.276    0.304    0.346
2  LogReg_Regularized        0.270810       0.270286  ...   0.276    0.294    0.346

[3 rows x 24 columns]
✅ Saved summary to: ./data/models/all_models.csv


## Non Linear Models

### Random Forest V1 - Baseline (no feature engineering)

In [23]:
from sklearn.ensemble import RandomForestClassifier
# V1
features = ['n_norm', 'degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm']
# Default (unconstrained) forest: textbook overfitting, train accuracy ends up
# far above test accuracy. The seed is fixed so that reruns reproduce the same
# numbers, matching the seeded RF_V3 / XGB_V3 experiments.
rf = RandomForestClassifier(random_state=42)

metrics, language_stats = test_model(rf, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# One-row summary, stored in the shared results file
summary_df = summarize_model_results("RF_V1", metrics, language_stats)
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.8581 (7208/8400)
  Test Accuracy:  0.2495 (524/2100)
Fold-2
  Train Accuracy: 0.8586 (7212/8400)
  Test Accuracy:  0.2686 (564/2100)
Fold-3
  Train Accuracy: 0.8601 (7225/8400)
  Test Accuracy:  0.2576 (541/2100)
Fold-4
  Train Accuracy: 0.8602 (7226/8400)
  Test Accuracy:  0.2714 (570/2100)
Fold-5
  Train Accuracy: 0.8596 (7221/8400)
  Test Accuracy:  0.2557 (537/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.859333
dtype: float64

Test:
Accuracy    0.260571
dtype: float64
Updated combined results:
                model  Train Accuracy  Test Accuracy  ...  German  Turkish  Russian
0           LogReg_V1        0.271048       0.271333  ...   0.274    0.300    0.344
1           LogReg_V2        0.270976       0.271619  ...   0.276    0.304    0.346
2  LogReg_Regularized        0.270810       0.270286  ...   0.276    0.294    0.346
3               RF_V1        0.859333       0.260571  ...   0.298    0.248    0.334

[4 rows x 24 columns]
✅ Save

### Random Forest V2 - Feature Engineering

In [24]:
# V2
features = ['n_norm', 'degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'avg_metrics']
# Default (unconstrained) forest: textbook overfitting, train accuracy ends up
# far above test accuracy. The seed is fixed so that reruns reproduce the same
# numbers, matching the seeded RF_V3 / XGB_V3 experiments.
rf = RandomForestClassifier(random_state=42)

metrics, language_stats = test_model(rf, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# One-row summary, stored in the shared results file
summary_df = summarize_model_results("RF_V2", metrics, language_stats)
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.8581 (7208/8400)
  Test Accuracy:  0.2490 (523/2100)
Fold-2
  Train Accuracy: 0.8586 (7212/8400)
  Test Accuracy:  0.2638 (554/2100)
Fold-3
  Train Accuracy: 0.8601 (7225/8400)
  Test Accuracy:  0.2695 (566/2100)
Fold-4
  Train Accuracy: 0.8604 (7227/8400)
  Test Accuracy:  0.2624 (551/2100)
Fold-5
  Train Accuracy: 0.8596 (7221/8400)
  Test Accuracy:  0.2590 (544/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.859357
dtype: float64

Test:
Accuracy    0.260762
dtype: float64
Updated combined results:
                model  Train Accuracy  Test Accuracy  ...  German  Turkish  Russian
0           LogReg_V1        0.271048       0.271333  ...   0.274    0.300    0.344
1           LogReg_V2        0.270976       0.271619  ...   0.276    0.304    0.346
2  LogReg_Regularized        0.270810       0.270286  ...   0.276    0.294    0.346
3               RF_V1        0.859333       0.260571  ...   0.298    0.248    0.334
4               RF_V2        

### Random Forest V3 - Changing Parameters of the tree

We have decided to use the features without "avg_metrics", since adding it does not bring a meaningful improvement (mean test accuracy 0.2606 without it vs. 0.2608 with it).

In [25]:
# V3: constrained forest on the five *_norm columns (no 'avg_metrics')
features = [
    "n_norm",
    "degree_norm",
    "closeness_norm",
    "betweenness_norm",
    "pagerank_norm",
]
rf = RandomForestClassifier(
    n_estimators=100,      # moderate number of trees
    max_depth=5,           # cap the depth of every tree
    min_samples_split=10,  # a node needs at least 10 samples to be split
    min_samples_leaf=5,    # every leaf keeps at least 5 samples
    random_state=42,
)

# Fit and score on every stored fold, then show the across-fold averages
metrics, language_stats = test_model(
    model=rf, folds_path=folds_path, features=features, target=target
)
print_metrics(metrics)

# Collapse the run into a single row and store it in the shared results file
summary_df = summarize_model_results(
    model_name="RF_V3", metrics=metrics, language_stats=language_stats
)
summary_df.head()  # result is discarded: this is not the last expression of the cell
save_model(summary_df)

Fold-1
  Train Accuracy: 0.2960 (2486/8400)
  Test Accuracy:  0.2767 (581/2100)
Fold-2
  Train Accuracy: 0.2879 (2418/8400)
  Test Accuracy:  0.2986 (627/2100)
Fold-3
  Train Accuracy: 0.2882 (2421/8400)
  Test Accuracy:  0.3033 (637/2100)
Fold-4
  Train Accuracy: 0.2895 (2432/8400)
  Test Accuracy:  0.2914 (612/2100)
Fold-5
  Train Accuracy: 0.2944 (2473/8400)
  Test Accuracy:  0.2676 (562/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.29119
dtype: float64

Test:
Accuracy    0.287524
dtype: float64
Updated combined results:
                model  Train Accuracy  Test Accuracy  ...  German  Turkish  Russian
0           LogReg_V1        0.271048       0.271333  ...   0.274    0.300    0.344
1           LogReg_V2        0.270976       0.271619  ...   0.276    0.304    0.346
2  LogReg_Regularized        0.270810       0.270286  ...   0.276    0.294    0.346
3               RF_V1        0.859333       0.260571  ...   0.298    0.248    0.334
4               RF_V2        0

### Boosting V1 - Baseline

In [26]:
from xgboost import XGBClassifier

# Baseline: XGBoost classifier built with no arguments (library defaults)
xgb_model = XGBClassifier()

# Feature columns used by this experiment
features = [
    "n_norm",
    "degree_norm",
    "closeness_norm",
    "betweenness_norm",
    "pagerank_norm",
]

# Fit and score on every stored fold, then show the across-fold averages
metrics, language_stats = test_model(
    model=xgb_model, folds_path=folds_path, features=features, target=target
)
print_metrics(metrics)

# Collapse the run into a single row and store it in the shared results file
summary_df = summarize_model_results(
    model_name="XGB_V1", metrics=metrics, language_stats=language_stats
)
summary_df.head()  # result is discarded: this is not the last expression of the cell
save_model(summary_df)


Fold-1
  Train Accuracy: 0.3537 (2971/8400)
  Test Accuracy:  0.2800 (588/2100)
Fold-2
  Train Accuracy: 0.3615 (3037/8400)
  Test Accuracy:  0.2971 (624/2100)
Fold-3
  Train Accuracy: 0.3499 (2939/8400)
  Test Accuracy:  0.2905 (610/2100)
Fold-4
  Train Accuracy: 0.3507 (2946/8400)
  Test Accuracy:  0.2919 (613/2100)
Fold-5
  Train Accuracy: 0.3548 (2980/8400)
  Test Accuracy:  0.2738 (575/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.354119
dtype: float64

Test:
Accuracy    0.286667
dtype: float64
Updated combined results:
                model  Train Accuracy  Test Accuracy  ...  German  Turkish  Russian
0           LogReg_V1        0.271048       0.271333  ...   0.274    0.300    0.344
1           LogReg_V2        0.270976       0.271619  ...   0.276    0.304    0.346
2  LogReg_Regularized        0.270810       0.270286  ...   0.276    0.294    0.346
3               RF_V1        0.859333       0.260571  ...   0.298    0.248    0.334
4               RF_V2        

### Boosting V2 - Adding feature engineering

In [27]:
from xgboost import XGBClassifier

# XGBoost classifier built with no arguments (library defaults)
xgb_model = XGBClassifier()

# Feature columns used by this experiment, including the derived 'avg_metrics'
features = [
    "n_norm",
    "degree_norm",
    "closeness_norm",
    "betweenness_norm",
    "pagerank_norm",
    "avg_metrics",
]

# Fit and score on every stored fold, then show the across-fold averages
metrics, language_stats = test_model(
    model=xgb_model, folds_path=folds_path, features=features, target=target
)
print_metrics(metrics)

# Collapse the run into a single row and store it in the shared results file
summary_df = summarize_model_results(
    model_name="XGB_V2", metrics=metrics, language_stats=language_stats
)
summary_df.head()  # result is discarded: this is not the last expression of the cell
save_model(summary_df)


Fold-1
  Train Accuracy: 0.3527 (2963/8400)
  Test Accuracy:  0.2729 (573/2100)
Fold-2
  Train Accuracy: 0.3421 (2874/8400)
  Test Accuracy:  0.3033 (637/2100)
Fold-3
  Train Accuracy: 0.3517 (2954/8400)
  Test Accuracy:  0.3067 (644/2100)
Fold-4
  Train Accuracy: 0.3533 (2968/8400)
  Test Accuracy:  0.2867 (602/2100)
Fold-5
  Train Accuracy: 0.3512 (2950/8400)
  Test Accuracy:  0.2710 (569/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.350214
dtype: float64

Test:
Accuracy    0.288095
dtype: float64
Updated combined results:
                model  Train Accuracy  Test Accuracy  ...  German  Turkish  Russian
0           LogReg_V1        0.271048       0.271333  ...   0.274    0.300    0.344
1           LogReg_V2        0.270976       0.271619  ...   0.276    0.304    0.346
2  LogReg_Regularized        0.270810       0.270286  ...   0.276    0.294    0.346
3               RF_V1        0.859333       0.260571  ...   0.298    0.248    0.334
4               RF_V2        

### Boosting V3 - Changing Boosting Parameters

In [28]:
from xgboost import XGBClassifier

# Feature columns used by this experiment
features = [
    "n_norm",
    "degree_norm",
    "closeness_norm",
    "betweenness_norm",
    "pagerank_norm",
]

# XGBoost classifier with explicit depth, sampling and regularization settings
xgb_model = XGBClassifier(
    random_state=42,
    eval_metric="logloss",
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,  # L2 regularization
    reg_alpha=0.1,   # L1 regularization
)

# Fit and score on every stored fold, then show the across-fold averages
metrics, language_stats = test_model(
    model=xgb_model, folds_path=folds_path, features=features, target=target
)
print_metrics(metrics)

# Collapse the run into a single row and store it in the shared results file
summary_df = summarize_model_results(
    model_name="XGB_V3", metrics=metrics, language_stats=language_stats
)
summary_df.head()  # result is discarded: this is not the last expression of the cell
save_model(summary_df)


Fold-1
  Train Accuracy: 0.2975 (2499/8400)
  Test Accuracy:  0.2795 (587/2100)
Fold-2
  Train Accuracy: 0.2904 (2439/8400)
  Test Accuracy:  0.3048 (640/2100)
Fold-3
  Train Accuracy: 0.2917 (2450/8400)
  Test Accuracy:  0.3067 (644/2100)
Fold-4
  Train Accuracy: 0.2907 (2442/8400)
  Test Accuracy:  0.2914 (612/2100)
Fold-5
  Train Accuracy: 0.2961 (2487/8400)
  Test Accuracy:  0.2671 (561/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.293262
dtype: float64

Test:
Accuracy    0.289905
dtype: float64
Updated combined results:
                model  Train Accuracy  Test Accuracy  ...  German  Turkish  Russian
0           LogReg_V1        0.271048       0.271333  ...   0.274    0.300    0.344
1           LogReg_V2        0.270976       0.271619  ...   0.276    0.304    0.346
2  LogReg_Regularized        0.270810       0.270286  ...   0.276    0.294    0.346
3               RF_V1        0.859333       0.260571  ...   0.298    0.248    0.334
4               RF_V2        