# Test


## Preprocessing Test Submission

In [None]:
import pandas as pd
import os
import ast
import numpy as np
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Load the raw test set from disk.
df = pd.read_csv("./data/raw_files/test.csv")
# The CSV stores every edge list as text; parse each one back into a Python
# literal so later cells receive real lists of node pairs.
df['edgelist'] = [ast.literal_eval(raw_edges) for raw_edges in df['edgelist']]

# Quick sanity report: available columns and number of rows.
print("Dataframe columns", df.columns, sep="\n")
print("Dataframe rows", len(df), sep="\n")

Dataframe columns
Index(['id', 'language', 'sentence', 'n', 'edgelist'], dtype='object')
Dataframe rows
10395


In [13]:
import networkx as nx

def centralities(edgelist):
    """
    Compute six centrality scores for every vertex of the graph built from `edgelist`.

    - edgelist: list of node pairs, e.g. [(7,2),(1,7),(1,9),...]
    - returns: dictionary vertex -> tuple of scores, in this order:
      (degree, harmonic, betweenness, pagerank, katz, load)
    """
    graph = nx.from_edgelist(edgelist)

    # One vertex -> score mapping per measure; the tuple order below is the
    # order of the values in each returned tuple.
    score_maps = (
        nx.degree_centrality(graph),
        nx.harmonic_centrality(graph),
        nx.betweenness_centrality(graph),
        nx.pagerank(graph),
        nx.katz_centrality_numpy(graph),
        nx.load_centrality(graph),
    )

    return {node: tuple(scores[node] for scores in score_maps) for node in graph}


In [None]:
# Columns copied unchanged from the sentence row onto each of its vertex rows.
sentence_fields = ('id', 'language', 'sentence', 'n')
# Output column names, in the same order as the tuples returned by centralities().
metric_names = ('degree', 'closeness', 'betweenness', 'pagerank', 'katz', 'load')

rows = []
for _, sentence_row in df.iterrows():
    shared = {field: sentence_row[field] for field in sentence_fields}
    per_vertex = centralities(sentence_row['edgelist'])

    # Emit one flat record per vertex of the sentence graph.
    for vertex, scores in per_vertex.items():
        rows.append({**shared, 'vertex': vertex, **dict(zip(metric_names, scores))})

df_filtered = pd.DataFrame(rows)

print(df_filtered.head())
df_filtered.to_csv("./data/preprocessed/test/test_preprocessed.csv", index=False)



   id  language  sentence   n  vertex    degree  closeness  betweenness  \
0   1  Japanese         1  43      38  0.047619   8.953882     0.047619   
1   1  Japanese         1  43      33  0.023810   7.094756     0.000000   
2   1  Japanese         1  43      10  0.095238  11.348363     0.335656   
3   1  Japanese         1  43      24  0.047619   6.855212     0.047619   
4   1  Japanese         1  43      16  0.023810   5.649594     0.000000   

   pagerank      katz      load  
0  0.024647  0.152204  0.047619  
1  0.013964  0.135660  0.000000  
2  0.043718  0.181985  0.335656  
3  0.026723  0.149016  0.047619  
4  0.014845  0.135341  0.000000  


In [15]:
import numpy as np
import pandas as pd

# Min-max normalization function
def min_max_normalize(metric_dict):
    """Rescale the values of a mapping linearly so they span [0, 1].

    Args:
        metric_dict: Mapping of key -> numeric value.

    Returns:
        A new dict with the same keys. The smallest value maps to 0.0 and the
        largest to 1.0. If all values are equal every key maps to 0.0, and an
        empty mapping yields an empty dict.
    """
    # np.min / np.max raise ValueError on a zero-size array, so handle the
    # empty mapping explicitly.
    if not metric_dict:
        return {}

    values = np.array(list(metric_dict.values()), dtype=np.float64)
    min_val = np.min(values)
    max_val = np.max(values)
    if max_val == min_val:
        return {k: 0.0 for k in metric_dict}  # Avoid division by zero

    span = max_val - min_val
    return {k: (v - min_val) / span for k, v in metric_dict.items()}

# Normalize centralities per sentence-language pair
def normalize_centralities(df):
    """Add min-max normalized copies of every centrality column, per group.

    Rows are grouped by ('sentence', 'language'). Within each group every
    metric is rescaled with min_max_normalize and stored in a new
    '<metric>_norm' column, looked up by the row's 'vertex' value.

    Args:
        df: DataFrame containing the columns listed in `required_columns`.

    Returns:
        A new DataFrame made of the groups concatenated in groupby iteration
        order, with a fresh integer index.

    Raises:
        ValueError: If any required column is missing.
    """
    required_columns = ['id', 'sentence', 'language', 'vertex', 'degree', 'closeness', 'betweenness', 'pagerank', 'katz', 'load']
    missing = set(required_columns) - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    metrics = ['degree', 'closeness', 'betweenness', 'pagerank', 'katz', 'load']

    normalized_groups = []
    for _, group in df.groupby(['sentence', 'language']):
        scaled = group.copy()
        for metric in metrics:
            # vertex -> normalized score for this metric inside the group
            lookup = min_max_normalize(dict(zip(group['vertex'], group[metric])))
            scaled[f'{metric}_norm'] = scaled['vertex'].map(lookup)
        normalized_groups.append(scaled)

    return pd.concat(normalized_groups, ignore_index=True)

# Normalize length 
def normalize_length(df):
    """Add an 'n_norm' column holding the min-max scaled value of column 'n'.

    The scaling uses the minimum and maximum of 'n' over the whole frame.
    When every 'n' is identical the range is zero; 'n_norm' is then set to
    0.0 instead of the NaN that a 0/0 division would produce.

    Args:
        df: DataFrame with a numeric 'n' column. It is modified in place.

    Returns:
        The same DataFrame object, with the 'n_norm' column added.
    """
    lowest = df['n'].min()
    span = df['n'].max() - lowest
    if span == 0:
        # Degenerate range: avoid dividing zero by zero.
        df['n_norm'] = 0.0
    else:
        df['n_norm'] = (df['n'] - lowest) / span
    return df


# Normalize the centralities per group first, then the sentence length,
# and write the result to disk.
df_normalized = normalize_length(normalize_centralities(df_filtered))
df_normalized.to_csv("./data/preprocessed/test/test_normalized.csv", index=False)


## Linear Regression

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

def get_top_vertex_metrics(df, y_pred, target='is_root'):
    """
    Measure how often the best-scored vertex of a group is its true root.

    Rows are grouped by (sentence, language). In each group the vertex with the
    highest score in `y_pred` is compared with the vertex of the first row where
    `target` equals 1. Groups without such a row are ignored.

    Args:
        df: DataFrame with 'sentence', 'language', 'vertex' and `target` columns.
        y_pred: One score per row of `df`, in row order.
        target: Name of the column that marks the root with 1.

    Returns:
        accuracy (float): Proportion of evaluated groups predicted correctly,
        or 0 when no group could be evaluated.
    """
    scored = df.copy()  # keep the caller's frame untouched
    scored['y_pred'] = y_pred

    hits = 0
    evaluated = 0
    for _, block in scored.groupby(['sentence', 'language']):
        root_vertices = block.loc[block[target] == 1, 'vertex']
        if root_vertices.empty:
            continue  # nothing to compare against in this group

        evaluated += 1
        best_label = block['y_pred'].idxmax()
        if block.loc[best_label, 'vertex'] == root_vertices.values[0]:
            hits += 1

    if evaluated == 0:
        return 0
    return hits / evaluated

def extract_features(df, features):
    """Return the requested feature columns of `df` as a 2-D array.

    Args:
        df: DataFrame holding the feature columns. If 'avg_metrics' is
            requested but absent, it is added to `df` IN PLACE as the row-wise
            mean of the six '*_norm' centrality columns.
        features: List of column names. The special name 'language' is replaced
            by its one-hot dummy columns (first category dropped); every column
            whose name starts with 'language_' is then included.

    Returns:
        The `.values` array of the selected columns, one row per row of `df`.
    """
    norm_columns = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'katz_norm', 'load_norm']

    wants_average = 'avg_metrics' in features
    if wants_average and 'avg_metrics' not in df.columns:
        df['avg_metrics'] = df[norm_columns].mean(axis=1)

    # Without the categorical column the selection is a plain column lookup.
    if 'language' not in features:
        return df[features].values

    encoded = pd.get_dummies(df, columns=['language'], drop_first=True)
    selected = [name for name in features if name != 'language']
    selected.extend(name for name in encoded.columns if name.startswith('language_'))
    return encoded[selected].values
    
def get_test_results(df, y_pred):
    """
    Build the submission table: for every 'id', the vertex with the highest score.

    Args:
        df: DataFrame with 'id' and 'vertex' columns.
        y_pred: One score per row of `df`, in row order.

    Returns:
        DataFrame with integer columns 'id' and 'root', one row per distinct id.
    """
    scored = df.copy()  # keep the caller's frame untouched
    scored['y_pred'] = y_pred

    # Pair each id with the row that carries its highest score.
    winners = (
        (sentence_id, rows.loc[rows['y_pred'].idxmax()])
        for sentence_id, rows in scored.groupby('id')
    )
    records = [
        {'id': int(sentence_id), 'root': int(best_row['vertex'])}
        for sentence_id, best_row in winners
    ]
    return pd.DataFrame(records)


def test_linear_model(model, features, target,
                      train_path='./data/preprocessed/data_normalized.csv',
                      test_path='./data/preprocessed/test/test_normalized.csv'):
    """
    Fit `model` on the training file and predict one root vertex per test id.

    Despite the name, any estimator exposing `fit` and `predict_proba` is
    accepted. The training top-vertex accuracy is printed as a side effect.

    Args:
        model: Unfitted classifier with `fit` and `predict_proba`.
        features: Column names passed to extract_features.
        target: Name of the label column in the training file.
        train_path: CSV with the normalized training data (default: the
            project's preprocessed training file).
        test_path: CSV with the normalized test data (default: the
            project's preprocessed test file).

    Returns:
        DataFrame produced by get_test_results for the test file
        (columns 'id' and 'root').
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Train
    X_train = extract_features(train_df, features)
    y_train = train_df[target].values
    X_test = extract_features(test_df, features)
    model.fit(X_train, y_train)

    # Predict: keep the second probability column as the score
    # (presumably the positive class of a 0/1 target — confirm for other labels).
    y_proba_train = model.predict_proba(X_train)[:, 1]
    y_proba_test = model.predict_proba(X_test)[:, 1]

    # Accuracy (top vertex match) on the training data only; this function
    # has no labels for the test file.
    train_accuracy = get_top_vertex_metrics(train_df, y_proba_train, target)
    print(f"Train - Accuracy: {train_accuracy:.4f}")

    # Return test root predictions
    return get_test_results(test_df, y_proba_test)

## Logistic Regression Test

In [21]:
# Feature columns: the '<metric>_norm' version of each centrality measure.
features = [
    f'{metric}_norm'
    for metric in ('degree', 'closeness', 'betweenness', 'pagerank', 'katz', 'load')
]
target = 'is_root'
model = LogisticRegression()

# Fit, predict the test roots and store them as a submission file.
test_predictions = test_linear_model(model, features, target)
test_predictions.to_csv('./data/results/LogReg_V1.csv', index=False)
print(test_predictions.head())

Train - Accuracy: 0.2717
   id  root
0   1    37
1   2    46
2   3     2
3   4    11
4   5     3


## Random Forest Test

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so that repeated runs write the same submission file;
# 42 matches the seed used for the XGBoost model in this notebook.
model = RandomForestClassifier(random_state=42)
features = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'katz_norm', 'load_norm']
target = 'is_root'

# Fit, predict the test roots and store them as a submission file.
test_predictions = test_linear_model(model, features, target)
test_predictions.to_csv('./data/results/RF_V4.csv', index=False)
print(test_predictions.head())


Train - Accuracy: 0.8521
   id  root
0   1     2
1   2    15
2   3     2
3   4     5
4   5     1


## Boosting

In [26]:
from xgboost import XGBClassifier

# Hyperparameters of the boosted-tree model, kept together for readability.
xgb_params = {
    'n_estimators': 100,
    'max_depth': 4,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,   # L1 regularization
    'reg_lambda': 1.0,  # L2 regularization
    'eval_metric': 'logloss',
    'random_state': 42,
}
model = XGBClassifier(**xgb_params)
features = [
    f'{metric}_norm'
    for metric in ('degree', 'closeness', 'betweenness', 'pagerank', 'katz', 'load')
]
target = 'is_root'

# Fit, predict the test roots and store them as a submission file.
test_predictions = test_linear_model(model, features, target)
test_predictions.to_csv('./data/results/XGB.csv', index=False)
print(test_predictions.head())

Train - Accuracy: 0.2893
   id  root
0   1    37
1   2    46
2   3     2
3   4    11
4   5     3


In [None]:
# Feature columns: the '<metric>_norm' version of each centrality measure.
features = [
    f'{metric}_norm'
    for metric in ('degree', 'closeness', 'betweenness', 'pagerank', 'katz', 'load')
]
target = 'is_root'
model = QuadraticDiscriminantAnalysis()

# Fit, predict the test roots and store them as a submission file.
test_predictions = test_linear_model(model, features, target)
test_predictions.to_csv('./data/results/QDA.csv', index=False)
print(test_predictions.head())




Train - Accuracy: 0.2435
   id  root
0   1    25
1   2    24
2   3    11
3   4    11
4   5     1


## Best models by language (with train data)


In [None]:
# Names of the models that already have a submission file. splitext removes
# only the final extension, so a file name containing another dot
# (e.g. "XGB_lr0.1.csv") keeps its full stem.
executed_models = [
    os.path.splitext(f)[0]
    for f in os.listdir('./data/results')
    if f.endswith('.csv')
]

# NOTE(review): `models_metrics` is not defined in the cells shown here; it is
# presumably a DataFrame created elsewhere with one row per model, a 'model'
# column and one accuracy column per language — confirm before running this
# cell in a fresh kernel.
models_metrics = models_metrics[models_metrics['model'].isin(executed_models)]

# Every column that is not a model name or an overall accuracy is treated as
# a per-language accuracy column.
exclude_columns = ['model', 'Train Accuracy', 'Test Accuracy']
language_columns = [col for col in models_metrics.columns if col not in exclude_columns]

# For each language keep the model with the highest accuracy
# (idxmax returns the first one on ties).
results = []
for language in language_columns:
    best_idx = models_metrics[language].idxmax()
    results.append({
        'language': language,
        'best_model': models_metrics.loc[best_idx, 'model'],
        'accuracy': models_metrics.loc[best_idx, language],
    })

best_models_df = pd.DataFrame(results)
best_models_df = best_models_df.sort_values(by='best_model')

best_models_df.to_csv('./data/models/best_model_per_language.csv', index=False)
print("Best models per language saved to: ./data/models/best_model_per_language.csv")
best_models_df.head()


✅ Best models per language saved to: ./data/models/best_model_per_language.csv


Unnamed: 0,language,best_model,accuracy
16,Chinese,LogReg_V1,0.29
0,Arabic,RF_V3,0.31
18,German,RF_V3,0.288
17,Galician,RF_V3,0.306
15,Italian,RF_V3,0.262


## All models tested


In [33]:
# Directory that holds one prediction CSV (columns 'id' and 'root') per model
test_file_path = './data/results'

# Load the reference roots (columns 'id' and 'leaked_root')
final_results = pd.read_csv('./data/tests/results.csv')

# Score every prediction file; sorting makes the report order deterministic,
# since os.listdir returns entries in arbitrary order.
for model_file in sorted(os.listdir(test_file_path)):
    if not model_file.endswith('.csv'):
        continue  # skip non-CSV files

    model_path = os.path.join(test_file_path, model_file)
    model_df = pd.read_csv(model_path)

    # Left merge keeps every reference id: a sentence the model did not
    # predict gets a missing 'root' and therefore counts as wrong, instead of
    # being dropped from the denominator as an inner merge would do.
    merged = final_results.merge(model_df, on='id', how='left', suffixes=('_true', '_pred'))

    # Count matches
    correct = (merged['root'] == merged['leaked_root']).sum()
    accuracy = correct / len(merged) if len(merged) else 0.0

    print(f"Model: {model_file} — Accuracy: {accuracy:.4f} ({correct}/{len(merged)})\n")


Model: LinReg_V1.csv — Accuracy: 0.2977 (3095/10395)

Model: LogReg_V1.csv — Accuracy: 0.2933 (3049/10395)

Model: QDA.csv — Accuracy: 0.2618 (2721/10395)

Model: RF_V3.csv — Accuracy: 0.2516 (2615/10395)

Model: RF_V4.csv — Accuracy: 0.2567 (2668/10395)

Model: XGB.csv — Accuracy: 0.3135 (3259/10395)

