# Análisis de niveles lingüísticos

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ivanvladimir/analisis_linguistico/blob/main/Analisis%20niveles%20linguisticos.ipynb)
[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/ivanvladimir/analisis_linguistico/blob/main/Analisis%20niveles%20linguisticos.ipynb)

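Este cuaderno contiene el código para ejemplificar el análisis lingüístico computacional en sus diferentes niveles: tokens, partes del habla, lemas, árbol de dependencias y entidades nombradas.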

### Instrucciones

Ejecutar las celdas en el orden que se encuentran.

### Licencia de la notebook

<a rel="license" href="http://creativecommons.org/licenses/by/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by/4.0/80x15.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</a>.

### Información general

> **Author(s)**: <a href="https://twitter.com/ivanvladimir">@ivanvladimir</a> <br />
> **Last updated**: 15/06/2025

# ❶ Preparar librerías

In [None]:
# Install the libraries used by this notebook.
# The leading "!" is an IPython shell escape: the line is run as a shell command.
!pip install stanza

In [None]:
# Cargar librerias
import pandas as pd
import stanza
import os
import random
from stanza import DownloadMethod

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import FancyBboxPatch, ConnectionPatch
import networkx as nx
from typing import List, Tuple, Dict
import json

# ❷ Preparar datos 

In [None]:
# Download the "mañanera" data by cloning the dataset repository into the
# working directory (IPython shell escape); the loading cell below walks the
# resulting "conferencias_matutinas_amlo/" folder.
!git clone https://github.com/NOSTRODATA/conferencias_matutinas_amlo.git

In [None]:
# Load every "mananera*.csv" file found under the cloned repository and
# combine them into a single dataframe named `df`.

dataframes = []

for root, _dirs, files in os.walk("conferencias_matutinas_amlo/", topdown=False):
    for name in files:
        if not (name.startswith('mananera') and name.endswith(".csv")):
            continue
        # Build the path before the try so the error message can always use it.
        filename = os.path.join(root, name)
        try:
            # A dedicated name keeps the loop from rebinding the final `df`.
            file_df = pd.read_csv(filename)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error reading {filename}: {e}")
            continue
        # Remember which file each row came from.
        file_df['source_file'] = filename
        dataframes.append(file_df)

try:
    df = pd.concat(dataframes, ignore_index=True, sort=False)
except Exception as e:
    # Reached, for example, when no file could be read and the list is empty.
    # Bind `df` explicitly so later cells never see a stale or undefined value.
    print(f"Error combining dataframes: {e}")
    df = pd.DataFrame()


In [None]:
# Bare expression at the end of a notebook cell: displays the combined dataframe.
df

# ❸ Escoger aleatoriamente un texto

In [None]:
# Keep only the rows whose speaker is the president, then pick one at random.
amlo_df = df[df['Participante'] == 'PRESIDENTE ANDRES MANUEL LOPEZ OBRADOR']
# randrange excludes its upper bound, so the result is always a valid position
# for iloc; randint(0, len(amlo_df)) could return len(amlo_df) itself and
# make the lookup below fail with IndexError.
indice_texto = random.randrange(len(amlo_df))
text = amlo_df.iloc[indice_texto]['Texto']  # <--- change the index here to pick another text

print(f"Texto a analizar [{indice_texto}]: {text}")

# ❹ Tokens

In [None]:
# Tokenization. More information: https://stanfordnlp.github.io/stanza/tokenize.html

nlp = stanza.Pipeline(
    lang='es',
    processors='tokenize',
    download_method=DownloadMethod.REUSE_RESOURCES,
)
doc = nlp(text)
# Print one header per sentence followed by one line per token.
for sentence_number, tokenized in enumerate(doc.sentences, start=1):
    print(f'====== Sentence {sentence_number} tokens =======')
    token_lines = [f'id: {tok.id}\ttext: {tok.text}' for tok in tokenized.tokens]
    print('\n'.join(token_lines))

In [None]:
# Multi-word tokens: for every token, list the words attached to it.

nlp = stanza.Pipeline(
    lang='es',
    processors='tokenize,mwt',
    download_method=DownloadMethod.REUSE_RESOURCES,
)
doc = nlp(text)
for sentence_number, sentence in enumerate(doc.sentences, start=1):
    print(f'====== Sentence {sentence_number} tokens =======')
    for tok in sentence.tokens:
        expanded = ", ".join(w.text for w in tok.words)
        print(f'token: {tok.text}\twords: {expanded}')

# ❺ Etiquetación de partes del habla

In [None]:
# Part-of-speech tagging. See more: https://stanfordnlp.github.io/stanza/pos.html
nlp = stanza.Pipeline(lang='es', processors='tokenize,mwt,pos')
doc = nlp(text)
pos_lines = []
for sentence in doc.sentences:
    for word in sentence.words:
        # "_" is printed when the word carries no features.
        morph = word.feats or "_"
        pos_lines.append(
            f'word: {word.text}\tupos: {word.upos}\txpos: {word.xpos}\tfeats: {morph}'
        )
print('\n'.join(pos_lines))

# ❻ Lematización

In [None]:
# Lemmatization. See more: https://stanfordnlp.github.io/stanza/lemma.html

nlp = stanza.Pipeline(lang='es', processors='tokenize,mwt,pos,lemma')
doc = nlp(text)
lemma_lines = []
for sentence in doc.sentences:
    for word in sentence.words:
        # The word text is followed by a single space before the tab.
        lemma_lines.append(f'word: {word.text + " "}\tlemma: {word.lemma}')
print('\n'.join(lemma_lines))

# ❼ Árbol de dependencias

In [None]:
# Dependency parsing. See more: https://stanfordnlp.github.io/stanza/depparse.html
nlp = stanza.Pipeline(lang='es', processors='tokenize,mwt,pos,lemma,depparse')
doc = nlp(text)
dependency_lines = []
for sentence in doc.sentences:
    for word in sentence.words:
        # Head ids are looked up at position head-1 of the sentence's word list;
        # a head that is not positive is reported as "root".
        if word.head > 0:
            head_text = sentence.words[word.head - 1].text
        else:
            head_text = "root"
        dependency_lines.append(
            f'id: {word.id}\tword: {word.text}\thead id: {word.head}\thead: {head_text}\tdeprel: {word.deprel}'
        )
print('\n'.join(dependency_lines))

In [None]:
def extract_dependencies(doc: stanza.Document) -> List[Dict]:
    """
    Collect word attributes and dependency arcs for every sentence.

    Args:
        doc (stanza.Document): Parsed Stanza document

    Returns:
        List[Dict]: One dict per sentence with the keys 'sentence_idx',
        'text', 'words' (a list of per-word attribute dicts) and
        'dependencies' (a list of (head, id, deprel) tuples).
    """
    def describe(word) -> Dict:
        # Attributes copied from a single word.
        return {
            'id': word.id,
            'text': word.text,
            'lemma': word.lemma,
            'pos': word.upos,
            'xpos': word.xpos,
            'head': word.head,
            'deprel': word.deprel,
        }

    collected = []
    for position, sentence in enumerate(doc.sentences):
        word_records = [describe(w) for w in sentence.words]
        # Words whose head is 0 (the root) contribute no arc.
        arcs = [
            (w.head, w.id, w.deprel)
            for w in sentence.words
            if w.head != 0
        ]
        collected.append({
            'sentence_idx': position,
            'text': sentence.text,
            'words': word_records,
            'dependencies': arcs,
        })

    return collected

def visualize_dependency_tree(doc, sentence_idx=0):
    """Draw the dependency tree of one sentence as a directed NetworkX graph.

    Args:
        doc: Parsed document; the sentence is taken from ``doc.sentences``.
        sentence_idx: Position of the sentence to draw (default 0).

    Each word becomes a node labelled with its text and ``word.pos``. Every
    word whose head is not 0 gets an edge from its head to itself, labelled
    with the dependency relation. The figure is shown with ``plt.show()``;
    nothing is returned.
    """
    sentence = doc.sentences[sentence_idx]

    # Build the graph: one node per word, one edge per non-root word.
    tree = nx.DiGraph()
    for word in sentence.words:
        tree.add_node(word.id, label=f"{word.text}\n({word.pos})")
        if word.head == 0:
            # The root has no incoming edge.
            continue
        tree.add_edge(word.head, word.id, label=word.deprel)

    # Spring layout; no seed is passed to it.
    layout = nx.spring_layout(tree, k=3, iterations=50)

    plt.figure(figsize=(12, 8))

    # Nodes and arrows first, then the text on top of them.
    nx.draw_networkx_nodes(
        tree, layout, node_color='lightblue', node_size=2000, alpha=0.7
    )
    nx.draw_networkx_edges(
        tree, layout, edge_color='gray', arrows=True, arrowsize=20, arrowstyle='->'
    )
    nx.draw_networkx_labels(
        tree, layout, labels=nx.get_node_attributes(tree, 'label'), font_size=8
    )
    nx.draw_networkx_edge_labels(
        tree, layout, edge_labels=nx.get_edge_attributes(tree, 'label'), font_size=6
    )

    sentence_text = ' '.join(w.text for w in sentence.words)
    plt.title(f"Dependency Tree: {sentence_text}")
    plt.axis('off')
    plt.tight_layout()
    plt.show()


# Draw the dependency tree of the first sentence of `doc`.
# The function defined above is visualize_dependency_tree; the previous name,
# visualize_tree_matplotlib, is not defined in this notebook (NameError).
visualize_dependency_tree(doc, 0)

# ❽ Reconocimiento de entidades nombradas y más

In [None]:
# Named-entity recognition: print every entity with its type, one per line.
nlp = stanza.Pipeline(lang='es', processors='tokenize,ner')
doc = nlp(text)
entity_lines = []
for sentence in doc.sentences:
    for entity in sentence.ents:
        entity_lines.append(f'entity: {entity.text}\ttype: {entity.type}')
print('\n'.join(entity_lines))

In [None]:
# Sentence-level sentiment.
# The text is Spanish, so the Spanish pipeline is used instead of the English one.
# NOTE(review): relies on Stanza providing a Spanish sentiment model — confirm
# against the Stanza sentiment documentation for the installed version.
nlp = stanza.Pipeline(lang='es', processors='tokenize,sentiment')
doc = nlp(text)
# Stanza documents the sentiment classes as 0 = negative, 1 = neutral,
# 2 = positive; comparing against 1 only mislabelled neutral and positive.
etiquetas_sentimiento = {0: "negativo", 1: "neutral", 2: "positivo"}
for i, sentence in enumerate(doc.sentences):
    etiqueta = etiquetas_sentimiento.get(sentence.sentiment, "desconocido")
    print(f"{i} -> {etiqueta}")