In [1]:
from conllu import parse_tree

In [2]:
language = "UD_Spanish-GSD"
file_name = "es_gsd-ud-train"

conllu_file = f"../data/deep/{language}/{file_name}.conllu"

with open(conllu_file, 'r', encoding='utf-8') as file:
        data = file.read()
    
sentences = parse_tree(data)



In [36]:
token = sentences[138]

In [37]:
print(token.print_tree())

(deprel:root) form:encuentra lemma:encontrar upos:VERB [3]
    (deprel:nsubj) form:Batavia lemma:batavia upos:PROPN [1]
    (deprel:iobj) form:se lemma:él upos:PRON [2]
    (deprel:advcl) form:ubicada lemma:ubicado upos:PROPN [4]
        (deprel:nmod) form:coordenadas lemma:coordenada upos:NOUN [7]
            (deprel:case) form:en lemma:en upos:ADP [5]
            (deprel:det) form:las lemma:el upos:DET [6]
    (deprel:punct) form:. lemma:. upos:PUNCT [8]
None


In [38]:
token.token["upos"]

'VERB'

In [87]:
token.children

[TokenTree<token={id=1, form=Además}, children=None>,
 TokenTree<token={id=2, form=se}, children=None>,
 TokenTree<token={id=3, form=le}, children=None>,
 TokenTree<token={id=7, form=empresas}, children=[...]>,
 TokenTree<token={id=15, form=hagan}, children=[...]>,
 TokenTree<token={id=29, form=.}, children=None>]

In [6]:
import networkx as nx
import os

final_dir = f"../data/graphs_labeled/{language}"
if not os.path.exists(final_dir):
    os.makedirs(final_dir)


def add_edges(graph, nodo):
    children = nodo.children
    if children!=[]:
        for hijo in nodo.children:
            graph.add_node(hijo.token["id"], form=hijo.token["form"])
            graph.add_edge(nodo.token["id"], hijo.token["id"])
            
    return list(children)
 

def get_tree(root):
    graph = nx.DiGraph()
    actual_node = root
    graph.graph["phrase"] = actual_node.metadata["text"]
    graph.graph["root"] = actual_node.token["id"]
    nodes = root.children
    while nodes!=[]:
        graph.add_node(actual_node.token["id"], form=actual_node.token["form"], label=actual_node.token["upos"])
        nodes = nodes + add_edges(graph=graph, nodo=actual_node)
        actual_node = nodes.pop(0)
    
    return graph
        

for i, sentence in enumerate(sentences):
    root = sentence
    graph = get_tree(root)
    nx.write_graphml(graph, f"../data/graphs/{language}/{file_name}_{i}.graphml")

# Crear tabla con centralidad

In [6]:
import networkx as nx
from asg_cen.all_subgraphs_centrality import all_subgraphs_centrality as asg
import pandas as pd


G = nx.read_graphml("UD_Spanish-GSD/es_gsd-ud-train_2571.graphml").to_undirected()

bet = nx.betweenness_centrality(G)
pr = nx.pagerank(G)
close = nx.closeness_centrality(G)
harm = nx.harmonic_centrality(G)
asg_cen = asg(G)
deg = nx.degree(G)

dicc = {}
for node in G.nodes():
    dicc[G.nodes[node].get("form")] = {} 
    dicc[G.nodes[node].get("form")]["Betweenness"] = bet[node]
    dicc[G.nodes[node].get("form")]["PageRank"] = pr[node]
    dicc[G.nodes[node].get("form")]["Closeness"] = close[node]
    dicc[G.nodes[node].get("form")]["Harmonic"] = harm[node]
    dicc[G.nodes[node].get("form")]["All_subgraphs"] = asg_cen[node]
    dicc[G.nodes[node].get("form")]["Degree"] = deg[node]

print(dicc)


{'realizó': {'Betweenness': 0.05094905094905095, 'PageRank': 0.02060452564421404, 'Closeness': 0.10025706940874037, 'Harmonic': 11.943647447323922, 'All_subgraphs': 52.974663803391344, 'Degree': 3}, 'Tesla': {'Betweenness': 0.02564102564102564, 'PageRank': 0.013841277759893373, 'Closeness': 0.1491395793499044, 'Harmonic': 14.986544011544002, 'All_subgraphs': 52.88167955255407, 'Degree': 2}, 'trabajos': {'Betweenness': 0.07492507492507493, 'PageRank': 0.01306872400945453, 'Closeness': 0.11063829787234042, 'Harmonic': 12.885095460095457, 'All_subgraphs': 53.296591898278706, 'Degree': 2}, '.': {'Betweenness': 0.0, 'PageRank': 0.007737343416799795, 'Closeness': 0.0912280701754386, 'Harmonic': 9.76257835301952, 'All_subgraphs': 51.974663803391344, 'Degree': 1}, 'avanzados': {'Betweenness': 0.09856809856809856, 'PageRank': 0.01254349454502634, 'Closeness': 0.12302839116719243, 'Harmonic': 14.470626595626584, 'All_subgraphs': 53.5596263041125, 'Degree': 2}, 'más': {'Betweenness': 0.1455211455

In [10]:
pd.DataFrame(dicc).T.to_csv("../2571.csv", index=False)

# Pruebas stanza and conllu

In [8]:
!pip3 install stanza

Collecting stanza
  Using cached stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Using cached emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Using cached stanza-1.10.1-py3-none-any.whl (1.1 MB)
Using cached emoji-2.14.1-py3-none-any.whl (590 kB)
Installing collected packages: emoji, stanza
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [stanza]2m1/2[0m [stanza]
[1A[2KSuccessfully installed emoji-2.14.1 stanza-1.10.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import stanza
from stanza.utils.conll import CoNLL
import pandas as pd
import logging
# Silenciar los logs informativos de Stanza
logging.getLogger('stanza').setLevel(logging.ERROR)
stanza.download('es')

corpus_file_path = '../data/deep/UD_Spanish-GSD/es_gsd-ud-train.conllu' # <--- CAMBIA ESTO A LA RUTA DE TU ARCHIVO

try:
    # Abrir y leer el archivo CoNLL-U
    # CoNLL.conll2doc() convierte el texto CoNLL-U a un objeto Document de Stanza
    print(f"Abriendo y procesando el archivo: {corpus_file_path}...")
    # Leer directamente el archivo CoNLL-U usando CoNLL.conll2doc
    doc = CoNLL.conll2doc(corpus_file_path)

except FileNotFoundError:
    print(f"Error: El archivo {corpus_file_path} no se encontró.")



Abriendo y procesando el archivo: ../data/deep/UD_Spanish-GSD/es_gsd-ud-train.conllu...


AttributeError: 'Sentence' object has no attribute 'print_tree'

In [None]:
id2word = {w.id: w for w in doc.sentences[0].words}
def get_depth(word):
        # Root has head = 0 → depth 0
        if word.head == 0:
            return 0
        # Otherwise, 1 + depth of its head
        return 1 + get_depth(id2word[word.head])
    
w1 = id2word[13]      # word with id 5
w2 = id2word[8]      # word with id 8
    
    #    or by text (first match):
    # w_text = {w.text: w for w in sent.words}
    # w1, w2 = w_text['word1'], w_text['word2']
    
    # 5. Compute and compare
depth1 = get_depth(w1)
depth2 = get_depth(w2)
if depth1 == depth2:
    print(f"Words '{w1.text}' and '{w2.text}' are at the same dependency level ({depth1}).")
else:
    print(f"'{w1.text}' (level {depth1}) vs '{w2.text}' (level {depth2}) – not the same level.")

Words 'que' and 'interesadas' are at the same dependency level (2).


In [28]:
doc.sentences[0].words[3]

{
  "id": 4,
  "text": "pediría",
  "lemma": "pedir",
  "upos": "VERB",
  "feats": "Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin",
  "head": 0,
  "deprel": "root",
  "deps": "0:root"
}

In [30]:
doc.sentences[0].words[0]

{
  "id": 1,
  "text": "Además",
  "lemma": "además",
  "upos": "ADV",
  "head": 4,
  "deprel": "advmod",
  "deps": "4:advmod"
}