# Dataset Preparation 

Import the libraries used for preprocessing and print the current working directory, against which the relative data paths below are resolved.

In [7]:
import pandas as pd
import os
import ast
import numpy as np

# Print the working directory: the relative "./data/..." paths used later in
# this notebook are resolved against it.
print(os.getcwd())


c:\Users\Adria Espinoza\FIB\mds\Q2\ML\ml-project\src


### Data Extraction
First, we load the raw training data from `train.csv` and parse each edge list from its string form into a Python object.

In [None]:
# Read the raw training set from disk.
df = pd.read_csv("./data/raw_files/train.csv")

# Each 'edgelist' cell is read from the CSV as text; literal_eval turns that
# text into the Python literal it spells out, so later cells receive a real
# Python object instead of a string.
df['edgelist'] = df['edgelist'].apply(ast.literal_eval)

# Quick overview of what was loaded: the column names, then the row count.
print("Dataframe columns", df.columns, sep="\n")
print("Dataframe rows", len(df), sep="\n")



Dataframe columns
Index(['language', 'sentence', 'n', 'edgelist', 'root'], dtype='object')
Dataframe rows
10500


### Function declaration 

In [9]:
import networkx as nx

def centralities(edgelist):
    """
    Compute four centrality scores for every vertex of the graph described
    by ``edgelist``.

    - edgelist: a list of node pairs, e.g. [(7,2),(1,7),(1,9),...]
    - returns: a dictionary mapping each vertex to the tuple
      (degree centrality, harmonic centrality, betweenness centrality, pagerank)
    """
    graph = nx.from_edgelist(edgelist)

    degree = nx.degree_centrality(graph)
    harmonic = nx.harmonic_centrality(graph)
    betweenness = nx.betweenness_centrality(graph)
    pagerank = nx.pagerank(graph)

    # Assemble the per-vertex tuples in the graph's own node order.
    scores = {}
    for node in graph:
        scores[node] = (degree[node], harmonic[node], betweenness[node], pagerank[node])
    return scores


### Transforming the dataframe
- To meet the project's requirements, the following code breaks each sentence down to a finer granularity.
- Instead of working with whole sentences, we now operate at the level of vertices (the nodes of each sentence's tree): one row per vertex, holding its centrality scores and a flag marking whether it is the root.

In [None]:
# Collect one flat record per (sentence, vertex); the DataFrame is built once
# at the end rather than grown row by row.
rows = []

# Iterate over DataFrame rows
for idx, row in df.iterrows():
    # Per-vertex centrality tuples for this sentence's edge list
    centrality_dict = centralities(row['edgelist'])

    # Build a flat row per node
    for vertex, (deg, clos, betw, pr) in centrality_dict.items():
        rows.append({
            # Index of the source row in df. This must be the loop variable
            # `idx`: the bare name `id` is Python's built-in function, which
            # would be stored (and written to the CSV) in every row.
            'id': idx,
            'language': row['language'],
            'sentence': row['sentence'],
            'n': row['n'],
            'vertex': vertex,
            'degree': deg,
            # Second element of the tuple returned by centralities()
            'closeness': clos,
            'betweenness': betw,
            'pagerank': pr,
            # 1 when this vertex is the sentence's root, 0 otherwise
            'is_root': int(vertex == row['root'])
        })

# Convert list of rows to a DataFrame
df_filtered = pd.DataFrame(rows)

# head must be *called*; printing the bare attribute shows the bound method
# rather than the first rows.
print(df_filtered.head())
df_filtered.to_csv("./data/preprocessed/data_preprocessed.csv", index=False)


### Data normalization

Each metric (`n`, degree, closeness, betweenness, PageRank) is min-max normalized within each (sentence, language) group.

In [5]:
import numpy as np
import pandas as pd

# Min-max normalization function
def min_max_normalize(metric_dict):
    """
    Rescale the values of a dictionary linearly so that the smallest becomes
    0 and the largest becomes 1.

    Parameters
    ----------
    metric_dict : dict
        Maps arbitrary keys to numeric values.

    Returns
    -------
    dict
        Same keys, with each value replaced by (v - min) / (max - min).
        If all values are equal, every key maps to 0.0.
        An empty input yields an empty dictionary.
    """
    # np.min / np.max raise ValueError on an empty array, so handle it up front.
    if not metric_dict:
        return {}

    values = np.array(list(metric_dict.values()), dtype=np.float64)
    min_val = np.min(values)
    max_val = np.max(values)
    if max_val == min_val:
        return {k: 0.0 for k in metric_dict}  # Avoid division by zero

    span = max_val - min_val  # loop-invariant, computed once
    return {k: (v - min_val) / span for k, v in metric_dict.items()}

# Normalize centralities per sentence-language pair
def normalize_centralities(df):
    """
    Min-max normalize each metric within every (sentence, language) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'sentence', 'language', 'vertex', 'n',
        'degree', 'closeness', 'betweenness', 'pagerank' and 'is_root'.

    Returns
    -------
    pandas.DataFrame
        The original columns plus one '<metric>_norm' column per metric.
        Rows are emitted group by group, in groupby order, with a fresh
        0..N-1 index. If there are no groups, a zero-row frame with the
        same columns (including the '_norm' ones) is returned.

    Raises
    ------
    ValueError
        If any required column is missing.
    """
    metrics = ['n', 'degree', 'closeness', 'betweenness', 'pagerank']

    # Every metric is read below, so all of them (including 'n') are required;
    # otherwise a missing metric surfaces as a bare KeyError inside the loop.
    required_columns = ['sentence', 'language', 'vertex', *metrics, 'is_root']
    missing = set(required_columns) - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    result_frames = []

    # Group by sentence and language; the key itself is not needed.
    for _, group in df.groupby(['sentence', 'language']):
        norm_df = group.copy()

        for metric in metrics:
            # Normalized value per vertex within this group, mapped back onto
            # the rows through the 'vertex' column.
            # NOTE(review): if 'n' is constant within a group, 'n_norm' is 0.0
            # for every row of that group - confirm this is intended.
            metric_dict = dict(zip(group['vertex'], group[metric]))
            norm_df[f'{metric}_norm'] = norm_df['vertex'].map(min_max_normalize(metric_dict))

        result_frames.append(norm_df)

    if not result_frames:
        # pd.concat raises on an empty list; return an empty frame that still
        # carries the expected columns.
        empty = df.iloc[0:0].copy()
        for metric in metrics:
            empty[f'{metric}_norm'] = pd.Series(dtype='float64')
        return empty.reset_index(drop=True)

    # Combine all normalized groups
    return pd.concat(result_frames, ignore_index=True)

# Normalize the per-vertex table (df_filtered) and write the result, without
# the index column, to the preprocessed data folder.
df_normalized = normalize_centralities(df_filtered)
df_normalized.to_csv("./data/preprocessed/data_normalized.csv", index=False)
