# Hypertuning

## Import necessary libraries and set up global configuration

This cell imports all required libraries for data processing, graph analysis, machine learning, and model evaluation. It also sets a global random seed and defines the feature columns used throughout the notebook.

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import ast
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, zero_one_loss, roc_auc_score, make_scorer, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from imblearn.combine import SMOTEENN
import joblib
import os
import warnings
import time
warnings.filterwarnings("ignore")

# Global configuration
SEED = 42
np.random.seed(SEED)

# Feature columns
FEATURE_COLUMNS = [
    'degree', 'closeness', 'betweenness', ' eccentricity',
    'leaf_node', 'farness', 'subtree_height'
]

## Define function to compute centrality features

This cell defines the `centralities` function, which computes various graph centrality measures for each node in a given edgelist using NetworkX. The function returns a dictionary of features for each node.

In [2]:
def centralities(edgelist):
    
    T = nx.from_edgelist(edgelist)
    
    # Calculate centrality measures
    degree = nx.degree_centrality(T)  
    closeness = nx.closeness_centrality(T)  
    betweenness = nx.betweenness_centrality(T)  
    eccentricity = nx.eccentricity(T)  
    leaf_node = [v for v, d in T.degree() if d == 1]  
    Farness = {v: 1 / (eccentricity[v] + 1) for v in T.nodes() if eccentricity[v] > 0}
    
    # Calculate subtree heights
    subtree_height = {}
    for node in T.nodes():
        subtree_height[node] = nx.single_source_shortest_path_length(T, node)
    for node, distances in subtree_height.items():
        subtree_height[node] = max(distances.values()) if distances else 0

    features = {}
    for v in T:
        features[v] = (
            degree[v],
            closeness[v],
            betweenness[v], 
            eccentricity[v],
            leaf_node.count(v),
            Farness[v],
            subtree_height[v]  # Added subtree height feature
        )
    return features


## Split the training data into train and validation sets
This cell defines and uses the `split_data_set` function to split the training data into training and validation sets. The split is performed by randomly selecting a subset of unique sentences for validation, ensuring no sentence appears in both sets.

In [3]:
import random

def split_data_set(data, seed=SEED, test_ratio=0.2): # Use global SEED
    random.seed(seed)
    unique_ids = list(set(data['sentence']))
    
    # Ensure test_size is not larger than the number of unique_ids
    if not unique_ids: # Handle empty data
        return pd.DataFrame(columns=data.columns), pd.DataFrame(columns=data.columns)
    
    actual_test_size = int(len(unique_ids) * test_ratio)
    if actual_test_size == 0 and len(unique_ids) > 0 and test_ratio > 0: # Ensure at least one sample if possible
        actual_test_size = 1 
    if actual_test_size > len(unique_ids): # Cap test size
        actual_test_size = len(unique_ids)

    if actual_test_size == 0 : # if still zero (e.g. unique_ids is empty or test_ratio is 0)
         test_ids = set()
    else:
        test_ids = set(random.sample(unique_ids, actual_test_size))

    train_ids = set(unique_ids) - test_ids

    train_set = data[data['sentence'].isin(train_ids)]
    val_set = data[data['sentence'].isin(test_ids)]

    return train_set, val_set

## Utility functions for feature extraction, normalization, and cross-validation

Defines utility functions for centrality calculation, data splitting, feature expansion, normalization, and k-fold cross-validation index creation. Also includes the main data preparation function.

In [4]:
# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def centralities(edgelist):
    """Calculate centrality measures for graph nodes."""
    if not edgelist:
        return {}
    
    T = nx.from_edgelist(edgelist)
    
    degree = nx.degree_centrality(T)
    closeness = nx.closeness_centrality(T)
    betweenness = nx.betweenness_centrality(T)
    eccentricity = nx.eccentricity(T)
    leaf_node = [v for v, d in T.degree() if d == 1]
    farness = {v: 1 / (eccentricity[v] + 1) for v in T.nodes() if eccentricity[v] > 0}
    
    # Calculate subtree heights
    subtree_height = {}
    for node in T.nodes():
        distances = nx.single_source_shortest_path_length(T, node)
        subtree_height[node] = max(distances.values()) if distances else 0

    features = {}
    for v in T:
        features[v] = (
            degree[v], closeness[v], betweenness[v], eccentricity[v],
            leaf_node.count(v), farness[v], subtree_height[v]
        )
    return features

def split_data_set(data, seed=SEED, test_ratio=0.2):
    """Split data by unique sentences."""
    random.seed(seed)
    unique_ids = list(set(data['sentence']))
    
    if not unique_ids:
        return pd.DataFrame(columns=data.columns), pd.DataFrame(columns=data.columns)
    
    test_size = max(1, int(len(unique_ids) * test_ratio))
    test_size = min(test_size, len(unique_ids))
    
    if test_size == 0:
        test_ids = set()
    else:
        test_ids = set(random.sample(unique_ids, test_size))
    
    train_ids = set(unique_ids) - test_ids
    
    train_set = data[data['sentence'].isin(train_ids)]
    val_set = data[data['sentence'].isin(test_ids)]
    
    return train_set, val_set

def get_expanded_data(data, train=True):
    """Expand data with centrality features."""
    expanded_set = []
    
    for index, row in data.iterrows():
        edgelist_str = row.get('rand_edgelist', '[]')
        try:
            edgelist = ast.literal_eval(edgelist_str)
            if not isinstance(edgelist, list):
                edgelist = []
        except (ValueError, SyntaxError):
            edgelist = []

        central_edges = centralities(edgelist)
        language = row['language']
        sentence = row['sentence']
        n = row['n']
        
        if train:
            root = row['root']
            for vertex, values in central_edges.items():
                expanded_set.append((language, sentence, n, vertex, *values, vertex==root))
        else:
            row_id = row.get('id', None)
            for vertex, values in central_edges.items():
                expanded_set.append((row_id, language, sentence, n, vertex, *values))

    if train:
        columns = ['language', 'sentence', 'n', 'vertex'] + FEATURE_COLUMNS + ['is_root']
    else:
        columns = ['id', 'language', 'sentence', 'n', 'vertex'] + FEATURE_COLUMNS
    
    return pd.DataFrame(expanded_set, columns=columns)

def normalize_by_sentence(df, feature_columns, groupby_cols=['language', 'sentence']):
    """Normalize features within each sentence group."""
    grouped = df.groupby(groupby_cols)
    normalized_groups = []
    numerical_features = df[feature_columns].select_dtypes(include=[np.number]).columns.tolist()

    for (lang, sentence), group in grouped:
        scaler = MinMaxScaler()
        normalized = scaler.fit_transform(group[numerical_features])
        normalized_df = pd.DataFrame(normalized, index=group.index, columns=numerical_features)
        combined = group.drop(columns=numerical_features).join(normalized_df)
        normalized_groups.append(combined)

    return pd.concat(normalized_groups)

def create_k_folds_indices(data, k=5, seed_base=SEED):
    """Create k-fold indices for cross-validation."""
    splits = []
    
    if not data.index.is_unique:
        data = data.reset_index(drop=True)
        
    for i in range(k):
        current_seed = seed_base + i
        fold_train_set, fold_val_set = split_data_set(data, seed=current_seed, test_ratio=0.2)
        
        train_indices = data.index.get_indexer(fold_train_set.index.values)
        val_indices = data.index.get_indexer(fold_val_set.index.values)
        
        splits.append((train_indices, val_indices))
    
    return splits

# =============================================================================
# DATA PREPARATION
# =============================================================================

def prepare_data():
    """Load and prepare training and test data."""
    train = pd.read_csv('train-random.csv')
    test = pd.read_csv('test-random.csv')
    
    # Split training data
    train_set, val_set = split_data_set(train, seed=SEED, test_ratio=0.2)
    
    # Expand data with features
    expanded_data_train = get_expanded_data(train_set)
    expanded_data_val = get_expanded_data(val_set)
    expanded_data_test = get_expanded_data(test, train=False)
    
    # Sort data
    expanded_data_train.sort_values(by=['language', 'sentence', 'n', 'vertex'], inplace=True)
    expanded_data_val.sort_values(by=['language', 'sentence', 'n', 'vertex'], inplace=True)
    expanded_data_test.sort_values(by=['id', 'language', 'sentence', 'n', 'vertex'], inplace=True)
    
    # Normalize features
    train_scaled = normalize_by_sentence(expanded_data_train, FEATURE_COLUMNS)
    val_scaled = normalize_by_sentence(expanded_data_val, FEATURE_COLUMNS)
    test_scaled = normalize_by_sentence(expanded_data_test, FEATURE_COLUMNS)
    
    return train_scaled, val_scaled, test_scaled

## Prepare the data for training and validation

Calls the `prepare_data` function to load, process, and normalize the training, validation, and test datasets.

In [5]:
train_scaled, val_scaled, test_scaled = prepare_data()

## Resample the training data and set up hyperparameter search

Resamples the training data using SMOTEENN to address class imbalance, sets up the RandomForestClassifier, defines the hyperparameter search space, and configures the RandomizedSearchCV for model selection.

In [6]:
from sklearn.model_selection import RandomizedSearchCV
train_resampled = train_scaled.drop(columns=['language', 'n', 'vertex'])

smoteenn = SMOTEENN(random_state=42, sampling_strategy=0.5, n_jobs=-1)

train_resampled, _ = smoteenn.fit_resample(train_resampled, train_resampled.is_root)

splits = create_k_folds_indices(train_resampled)

X_resampled = train_resampled.drop(columns=['sentence', 'is_root'])
y_resampled = train_resampled['is_root'].astype(int)


rf_model = RandomForestClassifier(random_state=SEED, n_jobs=-1,oob_score=True)

ntrees = [100, 200, 300]
max_depth = [50, 100, 200]
min_samples_split = [4, 6, 8]
min_samples_leaf = [2, 4, 6]
balance = ['balanced', 'balanced_subsample']
max_features = ['sqrt', 'log2', None, 0.5, 0.7]
criterion = ['gini', 'entropy']
bootstrap = [True, False]

f1_class_0_scorer = make_scorer(f1_score, pos_label=0)
f1_class_1_scorer = make_scorer(f1_score, pos_label=1)

scoring_dict = {
    'f1_mac': 'f1_macro',
    'f1_class_0': f1_class_0_scorer,
    'f1_class_1': f1_class_1_scorer,
    'acc': 'accuracy'
}

trc = RandomizedSearchCV(estimator=rf_model,
                   scoring=scoring_dict,
                   param_distributions={
                       'n_estimators': ntrees,
                       'max_depth':max_depth,
                       'min_samples_split':min_samples_split,
                       'min_samples_leaf':min_samples_leaf, 
                       'class_weight':balance,
                       'max_features': max_features,
                       'criterion': criterion,
                       'bootstrap': bootstrap
                   },
                   n_iter=500,
                   cv=splits,
                   return_train_score=False,
                   refit='f1_mac',
                   random_state=SEED,
                   n_jobs=-1)

model_5CV = trc.fit(X_resampled, y_resampled)
importances = model_5CV.best_estimator_.feature_importances_

## Analyze and save cross-validation results

Creates a DataFrame from the cross-validation results, saves them to a CSV file, and displays the top results sorted by mean macro F1 score.

In [7]:
scoring_cols = [
    'param_n_estimators', 'param_max_depth', 'param_min_samples_split',
    'param_min_samples_leaf', 'param_class_weight', 'param_max_features',
    'param_criterion', 'param_bootstrap',
    'mean_test_f1_mac', 'mean_test_f1_class_0', 'mean_test_f1_class_1',
    'mean_test_acc'
]
# Create DataFrame from CV results
cv_results_df = pd.DataFrame(model_5CV.cv_results_)

# Save all results to CSV
cv_results_df.to_csv('randomized_search_results.csv', index=False)

pd.DataFrame(model_5CV.cv_results_).sort_values(by='mean_test_f1_mac', ascending=False)[scoring_cols].head()

Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_class_weight,param_max_features,param_criterion,param_bootstrap,mean_test_f1_mac,mean_test_f1_class_0,mean_test_f1_class_1,mean_test_acc
365,200,100,6,2,balanced,sqrt,entropy,True,0.886425,0.926852,0.845997,0.900831
296,200,200,6,2,balanced,sqrt,entropy,True,0.886425,0.926852,0.845997,0.900831
452,300,100,6,2,balanced,log2,entropy,True,0.88604,0.926601,0.84548,0.900492
376,300,200,6,2,balanced,sqrt,entropy,True,0.88604,0.926601,0.84548,0.900492
420,300,50,6,2,balanced,sqrt,entropy,True,0.885985,0.92656,0.845409,0.90044


In [3]:
# Show only selected columns for the top 10 results sorted by mean_test_f1_mac

selected_cols = [
    'param_n_estimators', 'param_max_depth', 'param_min_samples_split',
    'param_min_samples_leaf', 'param_class_weight', 'param_max_features',
    'param_criterion', 'param_bootstrap',
    'mean_test_f1_mac', 'mean_test_f1_class_0', 'mean_test_f1_class_1',
    'mean_test_acc'
]

top10 = cv_results_df.sort_values(by='mean_test_f1_mac', ascending=False).head(10)
top10[selected_cols]

Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_class_weight,param_max_features,param_criterion,param_bootstrap,mean_test_f1_mac,mean_test_f1_class_0,mean_test_f1_class_1,mean_test_acc
365,200,100,6,2,balanced,sqrt,entropy,True,0.886425,0.926852,0.845997,0.900831
296,200,200,6,2,balanced,sqrt,entropy,True,0.886425,0.926852,0.845997,0.900831
452,300,100,6,2,balanced,log2,entropy,True,0.88604,0.926601,0.84548,0.900492
376,300,200,6,2,balanced,sqrt,entropy,True,0.88604,0.926601,0.84548,0.900492
420,300,50,6,2,balanced,sqrt,entropy,True,0.885985,0.92656,0.845409,0.90044
140,100,200,6,2,balanced,log2,entropy,True,0.885785,0.926426,0.845144,0.900261
436,100,200,6,2,balanced,sqrt,entropy,True,0.885785,0.926426,0.845144,0.900261
412,100,50,6,2,balanced,sqrt,entropy,True,0.885722,0.926359,0.845085,0.900188
362,200,50,8,2,balanced,log2,entropy,True,0.885712,0.926222,0.845202,0.900087
308,300,50,4,2,balanced_subsample,log2,entropy,True,0.885711,0.926505,0.844916,0.900286


## Extract best hyperparameters from cross-validation

Finds and displays the best hyperparameter set based on the highest mean macro F1 score from the cross-validation results.

In [8]:
best_params = pd.DataFrame(model_5CV.cv_results_).sort_values(by='mean_test_f1_mac',ascending=False)[['params']].iloc[0,0]
best_params

{'n_estimators': 200,
 'min_samples_split': 6,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 100,
 'criterion': 'entropy',
 'class_weight': 'balanced',
 'bootstrap': True}

## Train the final model with best hyperparameters and evaluate

Trains a new RandomForestClassifier using the best hyperparameters, fits it to the resampled training data, saves the model, and evaluates its performance on the validation set.

In [9]:
rf_model_tuned = RandomForestClassifier(**best_params)
rf_model_tuned.fit(X_resampled, y_resampled)

#save the model
joblib.dump(rf_model_tuned, './rf_model_tuned.pkl')

X_val = val_scaled.drop(columns=['language', 'n', 'vertex', 'sentence', 'is_root'])
y_val = val_scaled['is_root'].astype(int)

y_pred = rf_model_tuned.predict(X_val)

print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.96      0.93      0.95     38134
           1       0.23      0.34      0.27      2100

    accuracy                           0.90     40234
   macro avg       0.59      0.64      0.61     40234
weighted avg       0.92      0.90      0.91     40234



## Generate predictions for the test set and create submission file

Uses the tuned model to predict probabilities for the test set, selects the most likely root for each graph, and saves the results in a submission CSV file.

In [11]:
X_test = test_scaled.drop(columns=['id', 'language', 'n', 'vertex', 'sentence'])

y_pred_proba = rf_model_tuned.predict_proba(X_test)[:, 1]  # Get probability of positive class

test_scaled['probability'] = y_pred_proba
submission = {}
# Group predictions by ID
grouped = test_scaled.groupby('id')

for graph_id, group in grouped:
    # Get the row with the highest predicted root probability
    best_row = group.loc[group['probability'].idxmax()]
    submission[graph_id] = best_row['vertex']

sub = pd.DataFrame(submission.items(), columns=['id','root'])

sub['root'] = sub['root'].astype(int)
print(type(sub.root.iloc[0]))

sub.to_csv("./submission.csv", index=False)



<class 'numpy.int32'>
