## Import necessary libraries and load train/test data
This cell imports required libraries (pandas, numpy, networkx, ast, torch) and loads the training and test datasets from CSV files.

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import ast
import torch

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Define function to compute centrality features
This cell defines a function `centralities` that computes various centrality measures for a given edgelist using NetworkX.

In [2]:
from sklearn.preprocessing import MinMaxScaler

def centralities(edgelist):
    
    T = nx.from_edgelist(edgelist)
    
    # Your existing centrality measures
    degree = nx.degree_centrality(T)
    eigenvector = nx.katz_centrality(T)
    closeness = nx.closeness_centrality(T)
    # current_flow_closeness = nx.current_flow_closeness_centrality(T)
    betweenness = nx.betweenness_centrality(T)
    # current_flow_betweenness = nx.current_flow_betweenness_centrality(T)
    communicability_betweenness = nx.communicability_betweenness_centrality(T)
    # load = nx.load_centrality(T)
    subgraph = nx.subgraph_centrality(T)
    harmonic = nx.harmonic_centrality(T)
    # percolation = nx.percolation_centrality(T)
    second_order = nx.second_order_centrality(T)
    voterank = nx.voterank(T)
    laplacian = nx.laplacian_centrality(T)

    features = {}
    
    for v in T:
        features[v] = (
            degree[v], 
            eigenvector[v], 
            closeness[v],  
            betweenness[v], 
            communicability_betweenness[v],
            subgraph[v], 
            harmonic[v],  
            second_order[v],
            1 if v in voterank else 0, 
            laplacian[v],
        )
    return features


## Split the training data into train and validation sets
This cell defines and applies a function to split the training data into training and validation sets based on unique sentences.

In [3]:
import random

def split_data_set(data, seed=42, test_ratio=0.2):
    random.seed(seed)
    unique_ids = list(set(data['sentence']))
    test_size = int(len(unique_ids) * test_ratio)

    test_ids = set(random.sample(unique_ids, test_size))
    train_ids = set(unique_ids) - test_ids

    train_set = data[data['sentence'].isin(train_ids)]
    val_set = data[data['sentence'].isin(test_ids)]


    return train_set, val_set

train_set, val_set = split_data_set(train, seed=42, test_ratio=0.2)

## Expand the data with centrality features for each vertex
This cell defines a function to expand the dataset by computing centrality features for each vertex in each sentence's edgelist, and applies it to train, validation, and test sets.

In [4]:
def get_expanded_data(data, train=True):
    expanded_set = []
    for index, row in data.iterrows():
        edgelist = ast.literal_eval(row['edgelist'])
        central_edges = centralities(edgelist)
        language = row['language']
        sentence = row['sentence']
        n = row['n']
        if train:
            root = row['root']
        else:
            id = row['id']
        for vertex, values in central_edges.items():
            if train:
                expanded_set.append((language, sentence, n, vertex, *values, vertex==root))
            else:
                expanded_set.append((id, language, sentence, n, vertex, *values))

    if train:
        return pd.DataFrame(expanded_set, columns=['language', 'sentence', 'n', 'vertex', 
                        'degree', 'eigenvector', 'closeness',
                        'betweenness', 'communicability_betweenness',
                        'subgraph', 'harmonic',
                        'second_order', 'voterank', 'laplacian', 
                        'is_root'])
    
    else:
        return pd.DataFrame(expanded_set, columns=['id', 'language', 'sentence', 'n', 'vertex', 
                                                    'degree', 'eigenvector', 'closeness',
                                                    'betweenness', 'communicability_betweenness',
                                                    'subgraph', 'harmonic',
                                                    'second_order', 'voterank', 'laplacian',
                                            ])


## Sort the expanded dataframes
This cell sorts the expanded train, validation, and test dataframes by relevant columns for consistency.

In [5]:
expanded_data_train = get_expanded_data(train_set)
expanded_data_val = get_expanded_data(val_set)
expanded_data_test = get_expanded_data(test, train=False)

expanded_data_train.sort_values(by=['language', 'sentence', 'n', 'vertex'], inplace=True)
expanded_data_val.sort_values(by=['language', 'sentence', 'n', 'vertex'], inplace=True)
expanded_data_test.sort_values(by=['id', 'language', 'sentence', 'n', 'vertex'], inplace=True)

## Copy expanded dataframes for further processing
This cell creates copies of the expanded train, validation, and test dataframes for further processing.

In [6]:
train_expanded = expanded_data_train.copy()
val_expanded = expanded_data_val.copy()
test_expanded = expanded_data_test.copy()

## Define and apply normalization by sentence
This cell defines a function to normalize feature columns within each sentence group using MinMaxScaler, and lists the feature columns to be normalized.

In [7]:
from sklearn.preprocessing import MinMaxScaler

feature_columns = [
   'degree', 'eigenvector', 'closeness',
    'betweenness', 'communicability_betweenness',
    'subgraph', 'harmonic', 
    'second_order', 'voterank', 'laplacian',
]


def normalize_by_sentence(df, feature_columns, groupby_cols=['language', 'sentence']):
    """
    Normalize features within each sentence group.
    """
    grouped = df.groupby(groupby_cols)
    normalized_groups = []

    numerical_features = df[feature_columns].select_dtypes(include=[np.number]).columns.tolist()

    for (lang, sentence), group in grouped:
        # Create a StandardScaler for each group
        scaler = MinMaxScaler()

        # Fit and transform the numerical features
        normalized = scaler.fit_transform(group[numerical_features])

        # Create a DataFrame with the same index and columns as the original group
        normalized_df = pd.DataFrame(normalized, index=group.index, columns=numerical_features)

        # Combine with non-features columns
        combined = group.drop(columns=numerical_features).join(normalized_df)
        normalized_groups.append(combined)

    normalized_df = pd.concat(normalized_groups)
    return normalized_df

## Normalize the train, validation, and test sets
This cell applies the normalization function to the train, validation, and test expanded dataframes.

In [8]:
train_scaled = normalize_by_sentence(train_expanded, feature_columns)
val_scaled = normalize_by_sentence(val_expanded, feature_columns)
test_scaled = normalize_by_sentence(test_expanded, feature_columns)

## Train and evaluate a Logistic Regression model
This cell prepares the data, applies SMOTEENN for class balancing, trains a Logistic Regression model, and prints a classification report on the validation set.

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.combine import SMOTEENN

# Prepare training data
y_train = train_scaled['is_root']
X_train = train_scaled.drop(columns=['language', 'sentence', 'n', 'vertex', 'is_root'])

# Prepare validation data
y_val = val_scaled['is_root']
X_val = val_scaled.drop(columns=['language', 'sentence', 'n', 'vertex', 'is_root'])

# Apply SMOTE to the training data
smote = SMOTEENN(random_state=42, sampling_strategy=1, n_jobs=-1)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Initialize and fit Logistic Regression
logreg = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
logreg.fit(X_train_res, y_train_res)

# Predict on validation set
y_pred = logreg.predict(X_val)

# Compute and print metrics
print("Logistic Regression Classification Report on Validation Set:")
print(classification_report(y_val, y_pred))


Logistic Regression Classification Report on Validation Set:
              precision    recall  f1-score   support

       False       0.98      0.70      0.82     38134
        True       0.13      0.81      0.22      2100

    accuracy                           0.70     40234
   macro avg       0.56      0.75      0.52     40234
weighted avg       0.94      0.70      0.79     40234

