# Modelling
In this phase, the different models are developed and evaluated.

## Linear Classification
According to the boxplots from the exploration phase, root nodes tend to have higher values than non-root nodes across all of the metrics.
The proposed baseline is therefore a simple program that scores every node from its metrics and selects the node with the highest score as the root.

In [16]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

def get_top_vertex_metrics(df, y_pred, target='is_root'):
    """
    Top-1 root-selection accuracy over (sentence, language) groups.

    For each (sentence, language) group, the vertex with the highest predicted
    score is compared with the vertex of the first row where ``df[target] == 1``.
    Groups that contain no such row are skipped and do not count towards the
    total.

    Args:
        df (pd.DataFrame): Must contain the columns 'sentence', 'language',
            'vertex' and ``target``. The caller's frame is not modified.
        y_pred (array-like): One predicted score per row of ``df``.
        target (str): Name of the column flagging the true root with 1.

    Returns:
        accuracy (float): Proportion of correct root predictions, or 0 when
            no group contains a root.
    """
    df = df.copy()
    df['y_pred'] = y_pred

    total = 0
    correct = 0

    for _, group in df.groupby(['sentence', 'language']):
        # Get actual root vertex
        true_root_row = group[group[target] == 1]
        if true_root_row.empty:
            continue  # skip if no root in this group

        true_vertex = true_root_row['vertex'].values[0]

        # Pick the top-scoring row by POSITION rather than by index label:
        # a label lookup returns several rows when index labels repeat inside
        # a group, which makes the equality test below ambiguous. NaN scores
        # are ignored and ties resolve to the first occurrence.
        best_position = np.nanargmax(group['y_pred'].to_numpy(dtype=float))
        predicted_vertex = group['vertex'].to_numpy()[best_position]

        if predicted_vertex == true_vertex:
            correct += 1
        total += 1

    return correct / total if total > 0 else 0



def test_linear_model(folds_path, features, target, num_folds=5):
    """
    Fit and score a linear-regression root scorer on pre-split CV folds.

    For each fold ``k`` in ``1..num_folds`` the files ``fold_{k}_train.csv``
    and ``fold_{k}_test.csv`` are read from ``folds_path``. A
    ``LinearRegression`` is fitted on the training rows (``features`` ->
    ``target``), and its predictions are scored on both splits with
    ``get_top_vertex_metrics``. Per-fold accuracies and the coefficients of
    the last fitted model are printed.

    Args:
        folds_path (str): Directory containing the fold CSV files.
        features (list[str]): Columns used as model inputs.
        target (str): Column used as regression target and as the root flag
            passed to ``get_top_vertex_metrics``.
        num_folds (int): Number of folds to read, numbered from 1.

    Returns:
        dict: ``{'train': DataFrame, 'test': DataFrame}``; each frame has a
            single 'Accuracy' column and one row per fold labelled
            ``'Fold k'``. Both frames are empty when ``num_folds < 1``.
    """
    fold_labels = []
    train_accuracies = []
    test_accuracies = []
    model = None

    for fold in range(1, num_folds + 1):
        train_path = os.path.join(folds_path, f'fold_{fold}_train.csv')
        test_path = os.path.join(folds_path, f'fold_{fold}_test.csv')

        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        X_train = train_df[features].values
        y_train = train_df[target].values
        X_test = test_df[features].values

        # A fresh estimator per fold keeps the folds independent of each other.
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Evaluate top-vertex prediction on both splits
        train_scores = get_top_vertex_metrics(train_df, model.predict(X_train), target)
        test_scores = get_top_vertex_metrics(test_df, model.predict(X_test), target)

        fold_labels.append(f'Fold {fold}')
        train_accuracies.append(train_scores)
        test_accuracies.append(test_scores)

        print(f"\nFold {fold} Results:")
        print(f"  Train - Accuracy: {train_scores:.4f}")
        print(f"  Test  - Accuracy: {test_scores:.4f}")

    # Build each frame once instead of enlarging an empty frame row by row.
    metrics = {
        'train': pd.DataFrame(
            {'Accuracy': pd.Series(train_accuracies, index=fold_labels, dtype='float64')}
        ),
        'test': pd.DataFrame(
            {'Accuracy': pd.Series(test_accuracies, index=fold_labels, dtype='float64')}
        ),
    }

    # Coefficients only exist once at least one fold has been fitted.
    if model is not None:
        print("\nFinal Model Coefficients (from last fold):")
        for feature, coef in zip(features, model.coef_):
            print(f"  {feature}: {coef:.4f}")
        print(f"Intercept: {model.intercept_:.4f}")

    return metrics

# Run the linear baseline over the cross-validation folds
folds_path = './data/cross_validation'
target = 'is_root'
features = [
    'n_norm',
    'degree_norm',
    'closeness_norm',
    'betweenness_norm',
    'pagerank_norm',
]

results = test_linear_model(folds_path, features, target)

# Report the mean accuracy of each split across all folds
print("\n=== Average Metrics Across Folds ===")
for split_heading, split_key in (("\nTrain:", 'train'), ("\nTest:", 'test')):
    print(split_heading)
    print(results[split_key].mean())



Fold 1 Results:
  Train - Accuracy: 0.2746
  Test  - Accuracy: 0.2571

Fold 2 Results:
  Train - Accuracy: 0.2689
  Test  - Accuracy: 0.2786

Fold 3 Results:
  Train - Accuracy: 0.2668
  Test  - Accuracy: 0.2886

Fold 4 Results:
  Train - Accuracy: 0.2717
  Test  - Accuracy: 0.2767

Fold 5 Results:
  Train - Accuracy: 0.2742
  Test  - Accuracy: 0.2552

Final Model Coefficients (from last fold):
  n_norm: -0.1137
  degree_norm: 0.0209
  closeness_norm: 0.0333
  betweenness_norm: 0.1122
  pagerank_norm: 0.0474
Intercept: 0.0199

=== Average Metrics Across Folds ===

Train:
Accuracy    0.271238
dtype: float64

Test:
Accuracy    0.271238
dtype: float64


In [None]:
# Build a one-row summary of the linear model's test accuracy
# (model name, mean, then one column per fold) and save it as CSV.
test_df = results['test'].copy()

# Mean over folds, shown right after the model name
mean_row = pd.DataFrame(test_df.mean()).T
mean_row.index = ['Mean']

# Model name goes first
model_row = pd.DataFrame([['LM_v1']], columns=test_df.columns, index=['Model'])

# Concatenate rows in desired order
test_df = pd.concat([model_row, mean_row, test_df])

# Transpose so each metric becomes one row and each fold one column
test_df_results = test_df.T

# Blank the first row label by assigning a new index. Writing into
# `index.values` mutates the Index's backing array in place, which pandas
# does not support and which may be shared with other frames' columns.
row_labels = test_df_results.index.tolist()
row_labels[0] = ''
test_df_results.index = row_labels

# Save to CSV (create the output directory on a fresh checkout)
output_path = "./data/models/LM_v1.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
test_df_results.to_csv(output_path, index=False)

# Display
test_df_results.head()


Index(['Accuracy'], dtype='object')
Index(['Model', 'Mean', 'Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5'], dtype='object')


Unnamed: 0,Model,Mean,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5
,LM_v1,0.271238,0.257143,0.278571,0.288571,0.276667,0.255238


### Testing

In [None]:
# create a code that picks the best 