# Dataset Preparation 

In [22]:

import networkx as nx

def _centrality_or_zeros(metric_fn, graph):
    """Return ``metric_fn(graph)``; if the call raises, return 0.0 for every node."""
    try:
        return metric_fn(graph)
    except Exception:
        # Deliberate best-effort fallback: a metric that cannot be computed
        # for this graph contributes a neutral 0.0 instead of aborting.
        return {node: 0.0 for node in graph.nodes}


def compute_node_features(edgelist):
    """Compute several centrality scores for every node of a graph.

    Parameters
    ----------
    edgelist : iterable of node pairs
        Edges of the graph, passed unchanged to ``nx.from_edgelist``.

    Returns
    -------
    dict
        Maps each node to a dict with the keys ``"degree"``, ``"harmonic"``,
        ``"betweenness"``, ``"closeness"``, ``"pagerank"``, ``"katz"`` and
        ``"load"``. ``"katz"`` and ``"load"`` fall back to 0.0 for all nodes
        when networkx raises while computing them; errors from the other
        metrics propagate to the caller.
    """
    G = nx.from_edgelist(edgelist)

    # Only these two metrics are guarded. The eccentricity that used to be
    # computed here was never stored in the result, so it has been dropped.
    katz = _centrality_or_zeros(nx.katz_centrality_numpy, G)
    load = _centrality_or_zeros(nx.load_centrality, G)

    degree = nx.degree_centrality(G)
    harmonic = nx.harmonic_centrality(G)
    betweenness = nx.betweenness_centrality(G)
    closeness = nx.closeness_centrality(G)
    pagerank = nx.pagerank(G)

    return {
        node: {
            "degree": degree[node],
            "harmonic": harmonic[node],
            "betweenness": betweenness[node],
            "closeness": closeness[node],
            "pagerank": pagerank[node],
            "katz": katz[node],
            "load": load[node],
        }
        for node in G.nodes
    }


Configure the paths and the libraries used for preprocessing data

In [23]:
import pandas as pd
import os
import ast
import numpy as np

# Show the working directory: the data paths used in the following cells are
# relative ("./data/..."), so they resolve against this location.
print(os.getcwd())


d:\Users\gabri\Desktop\MDS\Semester_2\ML\01ml-project_testing\src


### Data Extraction
First, we load the raw training data from the CSV file and parse the `edgelist` column into Python objects.

In [24]:
# Load the raw training set.
df = pd.read_csv("./data/raw_files/train.csv")

# The CSV stores each edge list as text; literal_eval turns that text back
# into the Python object it spells out.
df['edgelist'] = df['edgelist'].map(ast.literal_eval)

# Quick look at what was loaded: column names and number of rows.
print("Dataframe columns", df.columns, sep="\n")
print("Dataframe rows", len(df), sep="\n")



Dataframe columns
Index(['language', 'sentence', 'n', 'edgelist', 'root'], dtype='object')
Dataframe rows
10500


### Transforming the dataframe
- To meet the project's requirements, the following code breaks each sentence down to a finer granularity.
- Instead of working with full sentences, we now operate at the level of vertices (the nodes of each sentence's tree), producing one row per vertex.

In [25]:
import networkx as nx

def centralities(edgelist):
    """
    Build a graph from ``edgelist`` and score each of its vertices.

    - edgelist: list of node pairs, e.g. [(7,2),(1,7),(1,9),...]
    - returns: dict mapping every vertex to the tuple
      (degree, harmonic, betweenness, pagerank, katz, load)
    """
    graph = nx.from_edgelist(edgelist)

    # One {vertex: score} mapping per metric, in the order of the output tuple.
    per_metric = (
        nx.degree_centrality(graph),
        nx.harmonic_centrality(graph),
        nx.betweenness_centrality(graph),
        nx.pagerank(graph),
        nx.katz_centrality_numpy(graph),
        nx.load_centrality(graph),
    )

    return {
        vertex: tuple(scores[vertex] for scores in per_metric)
        for vertex in graph
    }

In [26]:
# One flat record per (sentence, vertex) pair is accumulated here
rows = []

for _, row in df.iterrows():
    # Per-vertex centrality tuples for this sentence's graph
    vertex_scores = centralities(row['edgelist'])

    # NOTE: the 'closeness' column receives the second tuple entry, which
    # centralities() computes with nx.harmonic_centrality.
    rows.extend(
        {
            'language': row['language'],
            'sentence': row['sentence'],
            'n': row['n'],
            'vertex': vertex,
            'degree': deg,
            'closeness': clos,
            'betweenness': betw,
            'pagerank': pr,
            'katz': katz,
            'load': load,
            # 1 for the vertex labelled as the sentence root, else 0
            'is_root': int(vertex == row['root']),
        }
        for vertex, (deg, clos, betw, pr, katz, load) in vertex_scores.items()
    )

# Materialise the records as a DataFrame in a single step
df_filtered = pd.DataFrame(rows)

print(df_filtered.head())
df_filtered.to_csv("./data/preprocessed/data_preprocessed.csv", index=False)


   language  sentence   n  vertex    degree  closeness  betweenness  pagerank  \
0  Japanese         2  23       6  0.090909   5.823846     0.090909  0.048565   
1  Japanese         2  23       4  0.045455   4.561122     0.000000  0.027162   
2  Japanese         2  23       2  0.136364   6.991703     0.255411  0.066901   
3  Japanese         2  23      23  0.045455   5.157179     0.000000  0.025477   
4  Japanese         2  23      20  0.090909   7.146825     0.311688  0.042552   

       katz      load  is_root  
0  0.209086  0.090909        0  
1  0.188298  0.000000        0  
2  0.228660  0.255411        0  
3  0.190256  0.000000        0  
4  0.213357  0.311688        0  


### Outlier Detection
We have decided to keep the outliers unless a later modelling step turns out to be very sensitive to them.
Keeping outliers ensures that the full variability and complexity of the data are preserved, which may reflect meaningful linguistic or structural phenomena. Removing them could eliminate rare but important patterns that are critical for understanding or modeling real-world sentence structures.

### Data normalization

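The centrality metrics are min-max normalized within each (sentence, language) group, while the sentence length `n` is min-max normalized across the whole dataset.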

In [27]:
import numpy as np
import pandas as pd

def min_max_normalize(metric_dict):
    """Rescale the values of a mapping linearly onto the range [0, 1].

    Parameters
    ----------
    metric_dict : dict
        Maps arbitrary keys to numeric values.

    Returns
    -------
    dict
        The same keys mapped to ``(value - min) / (max - min)``.
        An empty mapping yields an empty dict, and a mapping whose values
        are all equal yields 0.0 for every key.
    """
    if not metric_dict:
        # np.min / np.max raise ValueError on a zero-size array.
        return {}

    values = np.array(list(metric_dict.values()), dtype=np.float64)
    min_val = np.min(values)
    max_val = np.max(values)
    if max_val == min_val:
        return {k: 0.0 for k in metric_dict}  # Avoid division by zero

    span = max_val - min_val  # loop-invariant, computed once
    return {k: (v - min_val) / span for k, v in metric_dict.items()}

def normalize_centralities(df):
    """Min-max normalize the centrality columns within each sentence/language group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``sentence``, ``language``, ``vertex``,
        ``is_root`` and the metric columns ``degree``, ``closeness``,
        ``betweenness``, ``pagerank``, ``katz`` and ``load``.

    Returns
    -------
    pandas.DataFrame
        A new frame with one ``<metric>_norm`` column added per metric.
        Rows are returned group by group, in the order ``groupby`` yields
        the (sentence, language) groups, with a fresh 0..n-1 index. If there
        are no groups, an empty frame with the same columns is returned.

    Raises
    ------
    ValueError
        If any required column is missing.
    """
    metrics = ['degree', 'closeness', 'betweenness', 'pagerank', 'katz', 'load']
    # Every metric that is normalized below must be validated up front;
    # otherwise a missing column surfaces later as a bare KeyError.
    required_columns = ['sentence', 'language', 'vertex', *metrics, 'is_root']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"Missing required columns: {set(required_columns) - set(df.columns)}")

    result_frames = []

    # Group by sentence and language
    for _, group in df.groupby(['sentence', 'language']):
        norm_df = group.copy()
        for metric in metrics:
            # vertex -> normalized score for this group and metric
            scaled = min_max_normalize(dict(zip(group['vertex'], group[metric])))
            norm_df[f'{metric}_norm'] = norm_df['vertex'].map(scaled)
        result_frames.append(norm_df)

    if not result_frames:
        # pd.concat raises on an empty list; return an empty frame that
        # still carries the normalized columns.
        empty = df.iloc[0:0].copy()
        for metric in metrics:
            empty[f'{metric}_norm'] = empty[metric].astype('float64')
        return empty.reset_index(drop=True)

    # Combine all normalized groups
    return pd.concat(result_frames, ignore_index=True)

def normalize_length(df):
    """Add an ``n_norm`` column: ``n`` min-max scaled over the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric column ``n``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; ``n_norm`` is added in place.
        When every ``n`` is identical, ``n_norm`` is 0.0 for all rows,
        matching what ``min_max_normalize`` does for constant input,
        instead of the NaN that a 0/0 division would produce.
    """
    n_min = df['n'].min()
    span = df['n'].max() - n_min
    if span == 0:
        df['n_norm'] = 0.0  # constant length: avoid 0/0 -> NaN
    else:
        df['n_norm'] = (df['n'] - n_min) / span
    return df


# Scale the centralities per sentence/language group, then scale the
# sentence length column, and write the result to disk.
df_normalized = normalize_length(normalize_centralities(df_filtered))
df_normalized.to_csv("./data/preprocessed/data_normalized.csv", index=False)
