# Modelling
In this phase, the different models are developed.

## Linear Classification
According to the boxplots from the exploration phase, the root nodes appear to have higher values in all of the metrics.
The proposed baseline is therefore a simple program that selects the node with the highest values.

In [45]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

def get_top_vertex_metrics(df, y_pred, target='is_root'):
    """
    Score root prediction at the (sentence, language) level.

    For each (sentence, language) group, the vertex with the highest predicted
    score is taken as the predicted root and compared with the vertex of the
    first row whose ``target`` column equals 1. Groups without such a row are
    skipped and do not count towards the total.

    Args:
        df (pd.DataFrame): One row per vertex; must contain the columns
            'sentence', 'language', 'vertex' and ``target``. Not modified.
        y_pred (array-like): Predicted scores, one per row of ``df``.
        target (str): Name of the column that flags the true root with 1.

    Returns:
        float: Proportion of scored groups whose top-scored vertex is the true
        root, or 0.0 when no group could be scored.
    """
    scored = df.copy()
    scored['y_pred'] = y_pred
    # idxmax() returns an index *label*; with repeated labels .loc would hand
    # back several rows and the comparison below would become ambiguous.
    # A fresh RangeIndex on the working copy guarantees one row per label.
    scored = scored.reset_index(drop=True)

    total = 0
    correct = 0

    for _, group in scored.groupby(['sentence', 'language']):
        root_vertices = group.loc[group[target] == 1, 'vertex']
        if root_vertices.empty:
            continue  # no annotated root in this group, nothing to compare

        true_vertex = root_vertices.iloc[0]

        # On ties idxmax() keeps the first row with the maximum score
        predicted_vertex = group.loc[group['y_pred'].idxmax(), 'vertex']

        if predicted_vertex == true_vertex:
            correct += 1
        total += 1

    return correct / total if total > 0 else 0.0

def extract_features(df, features,
                     metric_columns=('degree_norm', 'closeness_norm',
                                     'betweenness_norm', 'pagerank_norm')):
    """
    Build the feature matrix for the requested columns.

    If 'avg_metrics' is requested and ``df`` does not already have that
    column, it is derived as the row-wise mean of ``metric_columns``. The
    derived column lives on a new frame, so the caller's ``df`` is left
    untouched.

    Args:
        df (pd.DataFrame): Source data, one row per sample.
        features (sequence of str): Column names to extract, in output order.
        metric_columns (sequence of str): Columns averaged into 'avg_metrics'.

    Returns:
        np.ndarray: Array of shape (len(df), len(features)).
    """
    features = list(features)
    if 'avg_metrics' in features and 'avg_metrics' not in df.columns:
        # assign() returns a new DataFrame instead of writing into the input
        df = df.assign(avg_metrics=df[list(metric_columns)].mean(axis=1))
    return df[features].values


def test_linear_model(folds_path, features, target, num_folds=5):
    """
    Cross-validate a linear regression used as a root-vertex scorer.

    For each fold ``k`` in 1..num_folds, reads ``fold_{k}_train.csv`` and
    ``fold_{k}_test.csv`` from ``folds_path``, fits a fresh LinearRegression
    on the training features, and measures on both splits how often the
    top-scored vertex of each (sentence, language) group is the true root.
    Per-fold accuracies and the last fold's coefficients are printed.

    Args:
        folds_path (str): Directory containing the fold CSV files.
        features (list of str): Feature column names passed to
            ``extract_features``.
        target (str): Name of the target column (1 marks the root).
        num_folds (int): Number of folds to evaluate.

    Returns:
        dict: ``{'train': DataFrame, 'test': DataFrame}``; each frame has one
        'Accuracy' column and one row per fold labelled 'Fold k'.

    Raises:
        ValueError: If ``num_folds`` is smaller than 1.
    """
    if num_folds < 1:
        raise ValueError("num_folds must be at least 1")

    fold_labels = []
    train_accuracies = []
    test_accuracies = []
    model = None

    for fold in range(1, num_folds + 1):
        train_path = os.path.join(folds_path, f'fold_{fold}_train.csv')
        test_path = os.path.join(folds_path, f'fold_{fold}_test.csv')

        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        X_train = extract_features(train_df, features)
        y_train = train_df[target].values
        X_test = extract_features(test_df, features)

        # A new estimator per fold makes it explicit that no state is shared
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predict
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        # Evaluate top-vertex prediction
        train_scores = get_top_vertex_metrics(train_df, y_pred_train, target)
        test_scores = get_top_vertex_metrics(test_df, y_pred_test, target)

        fold_labels.append(f'Fold {fold}')
        train_accuracies.append(train_scores)
        test_accuracies.append(test_scores)

        print(f"\nFold {fold} Results:")
        print(f"  Train - Accuracy: {train_scores:.4f}")
        print(f"  Test  - Accuracy: {test_scores:.4f}")

    # Coefficients
    print("\nFinal Model Coefficients (from last fold):")
    for feature, coef in zip(features, model.coef_):
        print(f"  {feature}: {coef:.4f}")
    print(f"Intercept: {model.intercept_:.4f}")

    # Build each result frame once rather than growing it row by row
    metrics = {
        'train': pd.DataFrame({'Accuracy': train_accuracies}, index=fold_labels),
        'test': pd.DataFrame({'Accuracy': test_accuracies}, index=fold_labels),
    }
    return metrics

# Baseline run: linear model on the five *_norm columns
folds_path = './data/cross_validation'
target = 'is_root'
features = ['n_norm', 'degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm']

results_v1_dict = test_linear_model(folds_path, features, target)

# Mean accuracy over the folds, reported per split
print("\n=== Average Metrics Across Folds ===")
for heading, split in (("\nTrain:", 'train'), ("\nTest:", 'test')):
    print(heading)
    print(results_v1_dict[split].mean())



Fold 1 Results:
  Train - Accuracy: 0.2746
  Test  - Accuracy: 0.2571

Fold 2 Results:
  Train - Accuracy: 0.2689
  Test  - Accuracy: 0.2786

Fold 3 Results:
  Train - Accuracy: 0.2668
  Test  - Accuracy: 0.2886

Fold 4 Results:
  Train - Accuracy: 0.2717
  Test  - Accuracy: 0.2767

Fold 5 Results:
  Train - Accuracy: 0.2742
  Test  - Accuracy: 0.2552

Final Model Coefficients (from last fold):
  n_norm: -0.1137
  degree_norm: 0.0209
  closeness_norm: 0.0333
  betweenness_norm: 0.1122
  pagerank_norm: 0.0474
Intercept: 0.0199

=== Average Metrics Across Folds ===

Train:
Accuracy    0.271238
dtype: float64

Test:
Accuracy    0.271238
dtype: float64


In [46]:
def prepare_results(results, model_name):
    """
    Turn a model's per-fold test scores into a single summary row.

    Args:
        results (dict): Must contain a 'test' DataFrame with one row per fold
            and a single metric column (as returned by ``test_linear_model``).
        model_name (str): Label stored in the 'Model' column.

    Returns:
        pd.DataFrame: One row labelled 'Metric' whose columns are 'Model',
        'Mean', followed by the fold labels of ``results['test']``.
    """
    fold_scores = results['test']

    # Mean across folds, shaped as a single row labelled 'Mean'
    mean_row = fold_scores.mean().to_frame().T
    mean_row.index = ['Mean']

    # Row carrying the model name so it becomes the first column after .T
    model_row = pd.DataFrame([[model_name]], columns=fold_scores.columns, index=['Model'])

    # Stack in the desired column order, then flip so folds become columns
    summary = pd.concat([model_row, mean_row, fold_scores]).T

    # Relabel the first row as 'Metric' by assigning a new index; writing into
    # ``index.values`` would mutate the buffer of an immutable Index.
    summary.index = ['Metric'] + list(summary.index[1:])

    summary['Model'] = model_name
    return summary


# Summarise the baseline model
results_v1 = prepare_results(results_v1_dict, "LM_v1")

# Stack the per-model summary rows (only one model at this point)
combined_results = pd.concat([results_v1], ignore_index=True)

# Save to CSV; create the output folder first so a fresh checkout does not fail
os.makedirs("./data/models", exist_ok=True)
combined_results.to_csv("./data/models/all_models.csv", index=False)

# Display
combined_results.head()

Unnamed: 0,Model,Mean,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5
0,LM_v1,0.271238,0.257143,0.278571,0.288571,0.276667,0.255238


### Linear Model with Feature Engineering

In this model we apply the same linear model, but with one additional variable: the average of the metrics extracted from the sentences.



In [47]:
# Second run: same setup plus the derived 'avg_metrics' feature
folds_path = './data/cross_validation'
target = 'is_root'
features = ['n_norm', 'degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'avg_metrics']

results_v2_dict = test_linear_model(folds_path, features, target)

# Mean accuracy over the folds, reported per split
print("\n=== Average Metrics Across Folds ===")
for heading, split in (("\nTrain:", 'train'), ("\nTest:", 'test')):
    print(heading)
    print(results_v2_dict[split].mean())



Fold 1 Results:
  Train - Accuracy: 0.2745
  Test  - Accuracy: 0.2562

Fold 2 Results:
  Train - Accuracy: 0.2676
  Test  - Accuracy: 0.2786

Fold 3 Results:
  Train - Accuracy: 0.2657
  Test  - Accuracy: 0.2871

Fold 4 Results:
  Train - Accuracy: 0.2707
  Test  - Accuracy: 0.2762

Fold 5 Results:
  Train - Accuracy: 0.2752
  Test  - Accuracy: 0.2557

Final Model Coefficients (from last fold):
  n_norm: -0.1137
  degree_norm: 0.0102
  closeness_norm: 0.0226
  betweenness_norm: 0.1015
  pagerank_norm: 0.0367
  avg_metrics: 0.0427
Intercept: 0.0199

=== Average Metrics Across Folds ===

Train:
Accuracy    0.270762
dtype: float64

Test:
Accuracy    0.270762
dtype: float64


In [None]:
# Summarise the feature-engineered model
results_v2 = prepare_results(results_v2_dict, "LM_v2")

# Concatenate vertically: one row per model
combined_results = pd.concat([results_v1, results_v2], ignore_index=True)

# Save to CSV; create the output folder first so a fresh checkout does not fail
os.makedirs("./data/models", exist_ok=True)
combined_results.to_csv("./data/models/all_models.csv", index=False)

# Sort by mean accuracy (best first) and transpose so each model is a column
sorted_df = combined_results.sort_values(by='Mean', ascending=False).T

# Display the whole frame: after the transpose it has one row per field
# (Model, Mean, Fold 1..N), so .head() would hide the last folds
sorted_df

Unnamed: 0,Model,Mean,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5
0,LM_v1,0.271238,0.257143,0.278571,0.288571,0.276667,0.255238
1,LM_v2,0.270762,0.25619,0.278571,0.287143,0.27619,0.255714


### Testing

In [None]:
# TODO: add code that picks the best model