# Dataset Preparation 

Configure the paths and the libraries used for preprocessing data

In [None]:
import pandas as pd
import os
import ast

print(os.getcwd())


c:\Users\Adria Espinoza G\FIB\mds\Q2\ML\ml-project\src


### Data Extraction
First we have extracted the data

In [None]:

df = pd.read_csv("./data/raw_files/train.csv")
df['edgelist'] = df['edgelist'].apply(ast.literal_eval) # to verify that we are sending a list instead of an array

print("Dataframe columns")
print(df.columns)

print("Dataframe rows")
print(len(df))



Dataframe columns
Index(['language', 'sentence', 'n', 'edgelist', 'root'], dtype='object')
Dataframe rows
10500


### Function declaration 

In [42]:
import networkx as nx

def centralities(edgelist):
    """
    - edgelist is a list of node pairs e.g. [(7,2),(1,7),(1,9),...]
    - returns a dictionary of vertex -> (centrality values)
    """
    T = nx.from_edgelist(edgelist)
    dc = nx.degree_centrality(T)
    cc = nx.harmonic_centrality(T)
    bc = nx.betweenness_centrality(T)
    pc = nx.pagerank(T)
    return {v: (dc[v], cc[v], bc[v], pc[v]) for v in T}


### Transforming the dataframe
-  To meet the project's requirements, the following code breaks down sentences into finer granularity.
- Instead of working with full sentences, we now operate at the level of vertices (as nodes in a sentence treemap).

In [43]:
# List to collect rows for the final DataFrame
rows = []

# Iterate over DataFrame rows
for idx, row in df.iterrows():
    # Compute centralities
    centrality_dict = centralities(row['edgelist'])
    
    # Build a flat row per node
    for vertex, (deg, clos, betw, pr) in centrality_dict.items():
        rows.append({
            'language': row['language'],
            'sentence': row['sentence'],
            'n': row['n'],
            'vertex': vertex,
            'degree': deg,
            'closeness': clos,
            'betweenness': betw,
            'pagerank': pr,
            'is_root': int(vertex == row['root'])
        })

# Convert list of rows to a DataFrame
df_filtered = pd.DataFrame(rows)

print(df_filtered.head)
df_filtered.to_csv("./data/preprocessed/data_preprocessed_v1.csv", index=False)


<bound method NDFrame.head of         language  sentence   n  vertex    degree  closeness  betweenness  \
0       Japanese         2  23       6  0.090909   5.823846     0.090909   
1       Japanese         2  23       4  0.045455   4.561122     0.000000   
2       Japanese         2  23       2  0.136364   6.991703     0.255411   
3       Japanese         2  23      23  0.045455   5.157179     0.000000   
4       Japanese         2  23      20  0.090909   7.146825     0.311688   
...          ...       ...  ..     ...       ...        ...          ...   
197474   Russian       995  19      19  0.055556   5.005159     0.000000   
197475   Russian       995  19       1  0.055556   6.034524     0.000000   
197476   Russian       995  19      14  0.055556   6.034524     0.000000   
197477   Russian       995  19       5  0.111111   6.701190     0.111111   
197478   Russian       995  19      16  0.055556   5.005159     0.000000   

        pagerank  is_root  
0       0.048565        0  
1