# Modelling Notebook
According to the boxplot exploration, the root nodes appear to have higher values on all of the metrics.
The proposed baseline is therefore a simple program that selects the node with the highest values.

## Configurations of the Notebook

In [None]:
# Global variables shared by the experiment cells below
target = 'is_root'  # label column name passed to test_model / get_metrics
folds_path = './data/cross_validation'  # directory test_model reads the fold_<k>_train/test CSVs from

In [None]:
# Function declaration
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


def get_metrics(df, y_pred, target='is_root'):
    """
    Score "pick the root" predictions per (sentence, language) group.

    For each (sentence, language) group the vertex with the highest predicted
    score is selected and compared with the actual root vertex (the first row
    of the group where ``df[target] == 1``). Groups without any root row are
    skipped and do not count towards the totals.

    Args:
        df (pd.DataFrame): must contain the columns 'sentence', 'language',
            'vertex' and the ``target`` column. It is not modified.
        y_pred: one score per row of ``df`` (anything assignable as a column).
        target (str): name of the 0/1 root-indicator column.

    Returns:
        accuracy (float): overall accuracy (0.0 when no group was evaluated)
        lang_stats (dict): per-language {'correct': int, 'total': int, 'accuracy': float}
        total (int): total number of evaluated groups
        correct (int): number of groups whose top-scored vertex is the root
    """
    scored = df.copy()
    scored['y_pred'] = y_pred
    # Switch to a fresh RangeIndex *after* attaching the scores: with duplicate
    # index labels, a label lookup of idxmax() would return several rows
    # instead of a single vertex and break the comparison below.
    scored = scored.reset_index(drop=True)

    total = 0
    correct = 0
    lang_stats = {}

    for (_, language), group in scored.groupby(['sentence', 'language']):
        true_roots = group.loc[group[target] == 1, 'vertex']
        if true_roots.empty:
            # No actual root in this group: nothing to compare against
            continue

        true_vertex = true_roots.iloc[0]
        predicted_vertex = group.at[group['y_pred'].idxmax(), 'vertex']

        hit = int(bool(predicted_vertex == true_vertex))
        correct += hit
        total += 1

        # Track per-language counts
        per_lang = lang_stats.setdefault(language, {'correct': 0, 'total': 0})
        per_lang['correct'] += hit
        per_lang['total'] += 1

    # Compute per-language accuracy
    for per_lang in lang_stats.values():
        lang_total = per_lang['total']
        per_lang['accuracy'] = per_lang['correct'] / lang_total if lang_total > 0 else 0

    accuracy = correct / total if total > 0 else 0.0
    return accuracy, lang_stats, total, correct

def extract_features(df, features):
    """
    Return the requested feature columns of `df` as an array.

    When 'avg_metrics' is requested but not yet a column of `df`, it is
    computed as the row-wise mean of the six normalised centrality columns
    and stored on `df` itself (the caller's frame gains that column).
    """
    needs_average = 'avg_metrics' in features and 'avg_metrics' not in df.columns
    if needs_average:
        metric_columns = [
            'degree_norm',
            'closeness_norm',
            'betweenness_norm',
            'pagerank_norm',
            'katz_norm',
            'load_norm',
        ]
        df['avg_metrics'] = df[metric_columns].mean(axis=1)

    selected = df[features]
    return selected.values


import os
import pandas as pd

def test_model(model, folds_path, features, target, num_folds=5):
    """
    Fit and evaluate `model` on pre-split cross-validation folds.

    For every fold k in 1..num_folds the files ``fold_<k>_train.csv`` and
    ``fold_<k>_test.csv`` are read from `folds_path`, the model is (re)fitted
    on the training file and both splits are scored with `get_metrics`
    (top-scored vertex per sentence/language vs. the actual root).

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict_proba(X)``. The
            second probability column is used as the per-vertex score
            (presumably the probability of the root class for a 0/1 target).
        folds_path (str): directory containing the fold CSV files.
        features (list[str]): feature columns, resolved by `extract_features`.
        target (str): name of the label column.
        num_folds (int): number of folds to read.

    Returns:
        metrics (dict): {'train': DataFrame, 'test': DataFrame}, each with one
            'Accuracy' row per fold, indexed 'Fold <k>'.
        language_stats (dict): {'train': {...}, 'test': {...}} mapping
            'Fold <k>' to the per-language stats returned by `get_metrics`.
    """
    metrics = {
        'train': pd.DataFrame(columns=['Accuracy']),
        'test': pd.DataFrame(columns=['Accuracy'])
    }

    language_stats = {
        'train': {},
        'test': {}
    }

    for fold in range(1, num_folds + 1):
        train_path = os.path.join(folds_path, f'fold_{fold}_train.csv')
        test_path = os.path.join(folds_path, f'fold_{fold}_test.csv')

        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        # Train model (test labels are only needed inside get_metrics)
        X_train = extract_features(train_df, features)
        y_train = train_df[target].values
        X_test = extract_features(test_df, features)
        model.fit(X_train, y_train)

        # Predict: keep only the second probability column as the score
        y_proba_train = model.predict_proba(X_train)[:, 1]
        y_proba_test = model.predict_proba(X_test)[:, 1]

        # Evaluate top-vertex prediction
        train_acc, train_lang_stats, train_total, train_correct = get_metrics(train_df, y_proba_train, target)
        test_acc, test_lang_stats, test_total, test_correct = get_metrics(test_df, y_proba_test, target)

        print(f"Fold-{fold}")
        print(f"  Train Accuracy: {train_acc:.4f} ({train_correct}/{train_total})")
        print(f"  Test Accuracy:  {test_acc:.4f} ({test_correct}/{test_total})")

        # Save metrics
        fold_label = f'Fold {fold}'
        metrics['train'].loc[fold_label] = [train_acc]
        metrics['test'].loc[fold_label] = [test_acc]
        language_stats['train'][fold_label] = train_lang_stats
        language_stats['test'][fold_label] = test_lang_stats

    return metrics, language_stats


def print_metrics(metrics):
    """Print the column-wise mean of the per-fold metrics for the train and test splits."""
    print("\n=== Average Metrics Across Folds ===")
    sections = (
        ("\nTrain:", 'train'),
        ("\nTest:", 'test'),
    )
    for header, split in sections:
        print(header)
        print(metrics[split].mean())


def summarize_model_results(model_name, metrics, language_stats):
    """
    Build a one-row summary DataFrame for a model.

    Columns:
    - 'model': the given model name
    - 'Train Accuracy' / 'Test Accuracy': mean of the per-fold accuracies
    - one column per language: test accuracy pooled over all folds
      (sum of correct / sum of total), or None when the language has no
      evaluated groups
    """
    row = {
        'model': model_name,
        'Train Accuracy': metrics['train']['Accuracy'].mean(),
        'Test Accuracy': metrics['test']['Accuracy'].mean()
    }

    # Pool the test counts of every fold: language -> [correct, total]
    pooled = {}
    for per_fold in language_stats['test'].values():
        for language, counts in per_fold.items():
            tally = pooled.setdefault(language, [0, 0])
            tally[0] += counts['correct']
            tally[1] += counts['total']

    # One accuracy column per language, in first-seen order
    for language, (n_correct, n_total) in pooled.items():
        row[language] = n_correct / n_total if n_total > 0 else None

    return pd.DataFrame([row])

def save_model(summary_df, results_path="./data/models/all_models.csv"):
    """
    Insert or replace model summary rows in the shared results CSV.

    Rows already stored for any model named in `summary_df` are dropped, the
    new rows are appended at the end and the file is rewritten.

    Args:
        summary_df (pd.DataFrame): summary rows; must contain a 'model' column.
        results_path (str): CSV file to update. Its parent directory is
            created when the file does not exist yet.

    Raises:
        ValueError: if `summary_df` or the existing file lacks a 'model' column.
    """
    # Validate the input before touching the filesystem
    if 'model' not in summary_df.columns:
        raise ValueError("The summary_df must contain a 'model' column.")

    if os.path.exists(results_path):
        previous = pd.read_csv(results_path)

        # Ensure 'model' column exists in file
        if 'model' not in previous.columns:
            raise ValueError("Existing file is missing the 'model' column.")

        # Drop stale rows of the models being saved, then append the new ones
        kept = previous[~previous['model'].isin(summary_df['model'])]
        combined_results = pd.concat([kept, summary_df], ignore_index=True)
    else:
        # First save: nothing to merge with, but the target folder may be missing
        os.makedirs(os.path.dirname(results_path) or '.', exist_ok=True)
        combined_results = summary_df.reset_index(drop=True)

    combined_results.to_csv(results_path, index=False)

    print("Updated combined results:")
    print(combined_results.head())
    print("✅ Saved summary to:", results_path)



## Linear Models

### Logistic Regression V1 - Baseline

In [None]:
# Baseline: default LogisticRegression on the six normalised centrality metrics
features = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'katz_norm', 'load_norm']
log_reg = LogisticRegression()

# Fit and score on every cross-validation fold
metrics, language_stats = test_model(log_reg, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "LogReg_V1"
summary_df = summarize_model_results("LogReg_V1", metrics, language_stats)
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.2749 (2309/8400)
  Test Accuracy:  0.2590 (544/2100)
Fold-2
  Train Accuracy: 0.2701 (2269/8400)
  Test Accuracy:  0.2767 (581/2100)
Fold-3
  Train Accuracy: 0.2681 (2252/8400)
  Test Accuracy:  0.2886 (606/2100)
Fold-4
  Train Accuracy: 0.2724 (2288/8400)
  Test Accuracy:  0.2762 (580/2100)
Fold-5
  Train Accuracy: 0.2750 (2310/8400)
  Test Accuracy:  0.2605 (547/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.272095
dtype: float64

Test:
Accuracy    0.27219
dtype: float64
Updated combined results:
                model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0           LogReg_V2        0.270976       0.271619   0.292    0.250  0.208   
1  LogReg_Regularized        0.270810       0.270286   0.292    0.252  0.208   
2               RF_V1        0.859333       0.260571   0.286    0.282  0.172   
3               RF_V2        0.859357       0.260762   0.292    0.274  0.168   
4               RF_V3        0.291190       0.2875

### Logistic Regression with Feature Engineering

In this model we apply the same linear model, but with one additional feature: the average of the metrics extracted from the sentences.



In [4]:
# Same six metrics plus 'avg_metrics' (their row-wise mean, added by extract_features)
features = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'katz_norm', 'load_norm', 'avg_metrics']
log_reg = LogisticRegression()

# Fit and score on every cross-validation fold
metrics, language_stats = test_model(log_reg, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "LogReg_V2"
summary_df = summarize_model_results("LogReg_V2", metrics, language_stats)
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.2751 (2311/8400)
  Test Accuracy:  0.2600 (546/2100)
Fold-2
  Train Accuracy: 0.2707 (2274/8400)
  Test Accuracy:  0.2771 (582/2100)
Fold-3
  Train Accuracy: 0.2679 (2250/8400)
  Test Accuracy:  0.2886 (606/2100)
Fold-4
  Train Accuracy: 0.2717 (2282/8400)
  Test Accuracy:  0.2771 (582/2100)
Fold-5
  Train Accuracy: 0.2749 (2309/8400)
  Test Accuracy:  0.2595 (545/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.272048
dtype: float64

Test:
Accuracy    0.272476
dtype: float64
Updated combined results:
                model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0  LogReg_Regularized        0.270810       0.270286   0.292    0.252  0.208   
1               RF_V1        0.859333       0.260571   0.286    0.282  0.172   
2               RF_V2        0.859357       0.260762   0.292    0.274  0.168   
3               RF_V3        0.291190       0.287524   0.308    0.280  0.224   
4              XGB_V1        0.354119       0.286

### Linear Model V3 - Tuning Parameters
Applied L2 (ridge-style) regularisation to the logistic regression.

In [5]:
# NOTE(review): Ridge, GridSearchCV and param_grid are not used anywhere in this cell;
# the model below is a LogisticRegression with a fixed C, so no grid search is run here.
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

features = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'katz_norm', 'load_norm']

# Candidate alpha values (currently unused in this cell)
param_grid = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}

# L2-regularised logistic regression with a fixed regularisation strength
log_reg = LogisticRegression(
    penalty='l2',          # Change to 'l1' or 'elasticnet' if needed
    C=0.1,                 # Stronger regularization (default is 1.0)
    solver='liblinear',    # Use 'saga' for elasticnet
    max_iter=1000,
    random_state=42
)

# Fit and score on every cross-validation fold
metrics, language_stats = test_model(log_reg, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "LogReg_Regularized"
summary_df = summarize_model_results("LogReg_Regularized", metrics, language_stats)
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.2754 (2313/8400)
  Test Accuracy:  0.2571 (540/2100)
Fold-2
  Train Accuracy: 0.2696 (2265/8400)
  Test Accuracy:  0.2776 (583/2100)
Fold-3
  Train Accuracy: 0.2674 (2246/8400)
  Test Accuracy:  0.2871 (603/2100)
Fold-4
  Train Accuracy: 0.2710 (2276/8400)
  Test Accuracy:  0.2786 (585/2100)
Fold-5
  Train Accuracy: 0.2750 (2310/8400)
  Test Accuracy:  0.2571 (540/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.271667
dtype: float64

Test:
Accuracy    0.271524
dtype: float64
Updated combined results:
    model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  Indonesian  \
0   RF_V1        0.859333       0.260571   0.286    0.282  0.172       0.296   
1   RF_V2        0.859357       0.260762   0.292    0.274  0.168       0.286   
2   RF_V3        0.291190       0.287524   0.308    0.280  0.224       0.300   
3  XGB_V1        0.354119       0.286667   0.304    0.298  0.216       0.310   
4  XGB_V2        0.350214       0.288095   0.308 

## Non Linear Models

### Random Forest V1 - Baseline (no feature engineering)

In [None]:
# V1: default RandomForestClassifier on the six normalised centrality metrics
features = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'katz_norm', 'load_norm']
rf = RandomForestClassifier() # textbook overfitting

# Fit and score on every cross-validation fold
metrics, language_stats = test_model(rf, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "RF_V1"
summary_df = summarize_model_results("RF_V1", metrics, language_stats)
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.8588 (7214/8400)
  Test Accuracy:  0.2448 (514/2100)
Fold-2
  Train Accuracy: 0.8605 (7228/8400)
  Test Accuracy:  0.2700 (567/2100)
Fold-3
  Train Accuracy: 0.8617 (7238/8400)
  Test Accuracy:  0.2686 (564/2100)
Fold-4
  Train Accuracy: 0.8613 (7235/8400)
  Test Accuracy:  0.2671 (561/2100)
Fold-5
  Train Accuracy: 0.8610 (7232/8400)
  Test Accuracy:  0.2471 (519/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.860643
dtype: float64

Test:
Accuracy    0.259524
dtype: float64
Updated combined results:
    model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  Indonesian  \
0   RF_V2        0.859357       0.260762   0.292    0.274  0.168       0.286   
1   RF_V3        0.291190       0.287524   0.308    0.280  0.224       0.300   
2  XGB_V1        0.354119       0.286667   0.304    0.298  0.216       0.310   
3  XGB_V2        0.350214       0.288095   0.308    0.288  0.200       0.316   
4  XGB_V3        0.293262       0.289905   0.312 

### Random Forest V2 - Feature Engineering

In [7]:
# V2: default RandomForestClassifier with the extra 'avg_metrics' feature
features = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'katz_norm', 'load_norm', 'avg_metrics']
rf = RandomForestClassifier() # textbook overfitting

# Fit and score on every cross-validation fold
metrics, language_stats = test_model(rf, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "RF_V2"
summary_df = summarize_model_results("RF_V2", metrics, language_stats)
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.8585 (7211/8400)
  Test Accuracy:  0.2486 (522/2100)
Fold-2
  Train Accuracy: 0.8611 (7233/8400)
  Test Accuracy:  0.2776 (583/2100)
Fold-3
  Train Accuracy: 0.8619 (7240/8400)
  Test Accuracy:  0.2738 (575/2100)
Fold-4
  Train Accuracy: 0.8619 (7240/8400)
  Test Accuracy:  0.2700 (567/2100)
Fold-5
  Train Accuracy: 0.8605 (7228/8400)
  Test Accuracy:  0.2538 (533/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.860762
dtype: float64

Test:
Accuracy    0.264762
dtype: float64
Updated combined results:
       model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0      RF_V3        0.291190       0.287524   0.308    0.280  0.224   
1     XGB_V1        0.354119       0.286667   0.304    0.298  0.216   
2     XGB_V2        0.350214       0.288095   0.308    0.288  0.200   
3     XGB_V3        0.293262       0.289905   0.312    0.282  0.224   
4  LogReg_V1        0.272095       0.272190   0.292    0.252  0.208   

   Indonesian  Polish 

### Random Forest V3 - Changing Parameters of the tree

We decided to use the feature set without "avg_metrics", since the results are better without it.

In [8]:
# V3: constrained random forest (shallower trees, larger leaves) on the six base metrics
features = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'katz_norm', 'load_norm']
rf = RandomForestClassifier(max_depth=5,               # Restrict tree depth
    min_samples_split=10,      # Fewer splits
    min_samples_leaf=5,        # Larger leaves
    n_estimators=100,          # Keep moderate number of trees
    random_state=42)

# Fit and score on every cross-validation fold
metrics, language_stats = test_model(rf, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "RF_V3"
summary_df = summarize_model_results("RF_V3", metrics, language_stats)
summary_df.head()
save_model(summary_df)

Fold-1
  Train Accuracy: 0.2937 (2467/8400)
  Test Accuracy:  0.2776 (583/2100)
Fold-2
  Train Accuracy: 0.2880 (2419/8400)
  Test Accuracy:  0.2990 (628/2100)
Fold-3
  Train Accuracy: 0.2874 (2414/8400)
  Test Accuracy:  0.3024 (635/2100)
Fold-4
  Train Accuracy: 0.2889 (2427/8400)
  Test Accuracy:  0.2924 (614/2100)
Fold-5
  Train Accuracy: 0.2969 (2494/8400)
  Test Accuracy:  0.2676 (562/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.290976
dtype: float64

Test:
Accuracy    0.28781
dtype: float64
Updated combined results:
       model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0     XGB_V1        0.354119       0.286667   0.304    0.298  0.216   
1     XGB_V2        0.350214       0.288095   0.308    0.288  0.200   
2     XGB_V3        0.293262       0.289905   0.312    0.282  0.224   
3  LogReg_V1        0.272095       0.272190   0.292    0.252  0.208   
4  LogReg_V2        0.272048       0.272476   0.292    0.254  0.208   

   Indonesian  Polish  

### Boosting V1 - Baseline

In [None]:
# Boosting baseline: default XGBClassifier on the six normalised centrality metrics
features = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'katz_norm', 'load_norm']

xgb_model = XGBClassifier()

# Fit and score on every cross-validation fold
metrics, language_stats = test_model(xgb_model, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "XGB_V1"
summary_df = summarize_model_results("XGB_V1", metrics, language_stats)
summary_df.head()
save_model(summary_df)


Fold-1
  Train Accuracy: 0.3417 (2870/8400)
  Test Accuracy:  0.2800 (588/2100)
Fold-2
  Train Accuracy: 0.3436 (2886/8400)
  Test Accuracy:  0.3024 (635/2100)
Fold-3
  Train Accuracy: 0.3473 (2917/8400)
  Test Accuracy:  0.2995 (629/2100)
Fold-4
  Train Accuracy: 0.3398 (2854/8400)
  Test Accuracy:  0.2981 (626/2100)
Fold-5
  Train Accuracy: 0.3480 (2923/8400)
  Test Accuracy:  0.2743 (576/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.344048
dtype: float64

Test:
Accuracy    0.290857
dtype: float64
Updated combined results:
                model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0              XGB_V2        0.350214       0.288095   0.308    0.288  0.200   
1              XGB_V3        0.293262       0.289905   0.312    0.282  0.224   
2           LogReg_V1        0.272095       0.272190   0.292    0.252  0.208   
3           LogReg_V2        0.272048       0.272476   0.292    0.254  0.208   
4  LogReg_Regularized        0.271667       0.271

### Boosting V2 - Adding feature engineering

In [None]:

# Default XGBClassifier with the extra 'avg_metrics' feature
features = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'katz_norm', 'load_norm', 'avg_metrics']

xgb_model = XGBClassifier()

# Fit and score on every cross-validation fold
metrics, language_stats = test_model(xgb_model, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "XGB_V2"
summary_df = summarize_model_results("XGB_V2", metrics, language_stats)
summary_df.head()
save_model(summary_df)


Fold-1
  Train Accuracy: 0.3477 (2921/8400)
  Test Accuracy:  0.2748 (577/2100)
Fold-2
  Train Accuracy: 0.3475 (2919/8400)
  Test Accuracy:  0.3048 (640/2100)
Fold-3
  Train Accuracy: 0.3424 (2876/8400)
  Test Accuracy:  0.3010 (632/2100)
Fold-4
  Train Accuracy: 0.3405 (2860/8400)
  Test Accuracy:  0.2976 (625/2100)
Fold-5
  Train Accuracy: 0.3412 (2866/8400)
  Test Accuracy:  0.2676 (562/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.343857
dtype: float64

Test:
Accuracy    0.289143
dtype: float64
Updated combined results:
                model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0              XGB_V3        0.293262       0.289905   0.312    0.282  0.224   
1           LogReg_V1        0.272095       0.272190   0.292    0.252  0.208   
2           LogReg_V2        0.272048       0.272476   0.292    0.254  0.208   
3  LogReg_Regularized        0.271667       0.271524   0.292    0.254  0.208   
4               RF_V1        0.860643       0.259

### Boosting V3 - Changing Boosting Parameters

In [None]:
# XGBClassifier with explicit depth, sampling and regularisation settings on the six base metrics
features = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'katz_norm', 'load_norm']

xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,  # L1 regularization
    reg_lambda=1.0, # L2 regularization
    eval_metric='logloss',
    random_state=42)

# Fit and score on every cross-validation fold
metrics, language_stats = test_model(xgb_model, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "XGB_V3"
summary_df = summarize_model_results("XGB_V3", metrics, language_stats)
summary_df.head()
save_model(summary_df)


Fold-1
  Train Accuracy: 0.2931 (2462/8400)
  Test Accuracy:  0.2781 (584/2100)
Fold-2
  Train Accuracy: 0.2900 (2436/8400)
  Test Accuracy:  0.2986 (627/2100)
Fold-3
  Train Accuracy: 0.2885 (2423/8400)
  Test Accuracy:  0.3014 (633/2100)
Fold-4
  Train Accuracy: 0.2883 (2422/8400)
  Test Accuracy:  0.2933 (616/2100)
Fold-5
  Train Accuracy: 0.2931 (2462/8400)
  Test Accuracy:  0.2648 (556/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.290595
dtype: float64

Test:
Accuracy    0.287238
dtype: float64
Updated combined results:
                model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0           LogReg_V1        0.272095       0.272190   0.292    0.252  0.208   
1           LogReg_V2        0.272048       0.272476   0.292    0.254  0.208   
2  LogReg_Regularized        0.271667       0.271524   0.292    0.254  0.208   
3               RF_V1        0.860643       0.259524   0.240    0.284  0.172   
4               RF_V2        0.860762       0.264

### Quadratic Discriminant Analysis

In [None]:
# Default QuadraticDiscriminantAnalysis on the six normalised centrality metrics
features = ['degree_norm', 'closeness_norm', 'betweenness_norm', 'pagerank_norm', 'katz_norm', 'load_norm']

qda_model = QuadraticDiscriminantAnalysis()

# Fit and score on every cross-validation fold
metrics, language_stats = test_model(qda_model, folds_path, features, target)

# Average metrics summary
print_metrics(metrics)

# Build the one-row summary and store it under the name "QDA"
summary_df = summarize_model_results("QDA", metrics, language_stats)
summary_df.head()
save_model(summary_df)




Fold-1
  Train Accuracy: 0.2468 (2073/8400)
  Test Accuracy:  0.2195 (461/2100)




Fold-2
  Train Accuracy: 0.2390 (2008/8400)
  Test Accuracy:  0.2595 (545/2100)




Fold-3
  Train Accuracy: 0.2256 (1895/8400)
  Test Accuracy:  0.2462 (517/2100)




Fold-4
  Train Accuracy: 0.2456 (2063/8400)
  Test Accuracy:  0.2495 (524/2100)




Fold-5
  Train Accuracy: 0.2487 (2089/8400)
  Test Accuracy:  0.2290 (481/2100)

=== Average Metrics Across Folds ===

Train:
Accuracy    0.241143
dtype: float64

Test:
Accuracy    0.240762
dtype: float64
Updated combined results:
                model  Train Accuracy  Test Accuracy  Arabic  English  Hindi  \
0           LogReg_V1        0.272095       0.272190   0.292    0.252  0.208   
1           LogReg_V2        0.272048       0.272476   0.292    0.254  0.208   
2  LogReg_Regularized        0.271667       0.271524   0.292    0.254  0.208   
3               RF_V1        0.860643       0.259524   0.240    0.284  0.172   
4               RF_V2        0.860762       0.264762   0.268    0.290  0.172   

   Indonesian  Polish   Thai  Czech  ...  Korean  Spanish  Finnish  Icelandic  \
0       0.288   0.286  0.248  0.282  ...   0.306    0.302    0.336      0.330   
1       0.288   0.286  0.248  0.280  ...   0.306    0.302    0.332      0.330   
2       0.282   0.284  0.248  0.284  ...   0.