# Análisis de datos lingüísticos

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ivanvladimir/analisis_linguistico/blob/main/An%C3%A1lisis%20de%20corpus.ipynb)
[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/ivanvladimir/analisis_linguistico/blob/main/An%C3%A1lisis%20de%20corpus.ipynb)

Este es el código para ejemplificar análisis computacional lingüístico.

### Instrucciones

Ejecutar las celdas en el orden en que se encuentran.

### Licencia de la notebook

<a rel="license" href="http://creativecommons.org/licenses/by/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by/4.0/80x15.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</a>.

### Información general

> **Author(s)**: <a href="https://twitter.com/ivanvladimir">@ivanvladimir</a> <br />
> **Last updated**: 15/06/2025

# ❶  Preparar librerías 

In [None]:
# Cargar librerías

import nltk
import pandas as pd
import os

In [None]:
# Download the NLTK resources this notebook needs:
#   - 'punkt_tab': tokenizer data (presumably what word_tokenize relies on — confirm
#     against the installed NLTK version)
#   - 'stopwords': the stop-word corpus read later via stopwords.words('spanish')
nltk.download('punkt_tab')
nltk.download('stopwords')

# ❷ Preparar datos 

In [None]:
# Download the "mañanera" (morning conference) data by cloning its GitHub
# repository into ./conferencias_matutinas_amlo (IPython shell command).
!git clone https://github.com/NOSTRODATA/conferencias_matutinas_amlo.git

In [None]:
# Load every "mananera*.csv" file found under the cloned repository into a
# single dataframe. Each row keeps the path of the file it came from in the
# 'source_file' column.

dataframes = []

for root, _dirs, files in os.walk("conferencias_matutinas_amlo/", topdown=False):
    for name in files:
        if not (name.startswith('mananera') and name.endswith(".csv")):
            continue
        # Build the path outside the try block so the error message below can
        # always refer to it.
        filename = os.path.join(root, name)
        try:
            frame = pd.read_csv(filename)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error reading {filename}: {str(e)}")
            continue
        frame['source_file'] = filename
        dataframes.append(frame)

# pd.concat raises ValueError when nothing was loaded (e.g. the repository
# was not cloned); report it instead of aborting the cell.
try:
    df = pd.concat(dataframes, ignore_index=True, sort=False)
except Exception as e:
    print(f"Error combining dataframes: {str(e)}")


In [None]:
# Show the dataframe (the notebook renders the cell's last expression).
df

# ❸ Calcular concordancias

In [None]:
from nltk.text import Text, ConcordanceIndex
from nltk.tokenize import word_tokenize

# Build one corpus string from every row whose speaker is the president.
# Missing 'Texto' values are dropped first: astype(str) would otherwise turn
# NaN into the literal word "nan" and inject it into the corpus.
speaker_mask = df['Participante'] == 'PRESIDENTE ANDRES MANUEL LOPEZ OBRADOR'
text = '\n'.join(df.loc[speaker_mask, 'Texto'].dropna().astype(str))

# The corpus is Spanish, so use the Spanish Punkt sentence model instead of
# the default English one.
tokens = word_tokenize(text, language='spanish')

# Index token positions so keyword occurrences can be looked up by offset.
# NOTE: lookups are case-sensitive unless a `key` function is supplied.
concordance_index = ConcordanceIndex(tokens)

In [None]:
# Keyword-in-context extraction: for every occurrence of the keyword, keep
# up to `width` tokens on each side together with ready-to-print strings.
offsets = concordance_index.offsets("Pemex")
width = 60  # context window size, in tokens, on each side of the keyword


def _concordance_entry(position):
    """Return the context record for the keyword occurrence at *position*."""
    # Clamp the window to the token list on both ends.
    window_start = max(0, position - width)
    window_end = min(len(tokens), position + width + 1)

    before = tokens[window_start:position]
    after = tokens[position + 1:window_end]

    return {
        'position': position,
        'left_context': before,
        'keyword': tokens[position],
        'right_context': after,
        'left_text': ' '.join(before),
        'right_text': ' '.join(after),
        'full_context': ' '.join(tokens[window_start:window_end]),
    }


concordances = [_concordance_entry(position) for position in offsets]

print(f"Total de concordancias {len(concordances)}")

In [None]:
# Slice bounds selecting which concordance entries to print.
INI = 0
FIN = 20

# Print each entry as one aligned line: right-aligned left context, the
# keyword in brackets, left-aligned right context. The padding follows
# `width` (the same value used to truncate the text) so the columns stay
# aligned if `width` is changed.
for conc in concordances[INI:FIN]:
    left = conc['left_text'][-width:]
    right = conc['right_text'][:width]
    print(f"{left:>{width}} [{conc['keyword']}] {right:<{width}}")

# ❹ Calcular colocaciones

In [None]:
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.corpus import stopwords
import string

min_freq = 2           # discard n-grams that occur fewer than this many times
num_collocations = 20  # number of bigrams listed per association measure

# Remove stopwords and punctuation.
# - The stop-word comparison is case-insensitive: the NLTK list is lowercase,
#   so capitalized forms such as "La" or "El" would otherwise be kept.
# - A token is kept only if it has at least one alphanumeric character. This
#   also removes Spanish marks ("¿", "¡") and multi-character punctuation
#   tokens ("...", "``", "''") that are not in string.punctuation.
# NOTE: this rebinds `tokens` to the filtered list. Re-run the tokenization
# cell before recomputing concordances, since their offsets refer to the
# unfiltered token list.
stop_words = set(stopwords.words('spanish'))
tokens = [
    token for token in tokens
    if token.lower() not in stop_words
    and any(char.isalnum() for char in token)
]


def _print_ngrams(title, ngrams):
    """Print *title* followed by one indented, space-joined n-gram per line."""
    print(f"\n{title}")
    for ngram in ngrams:
        print(f"  {' '.join(ngram)}")


# Bigram collocations
bigram_finder = BigramCollocationFinder.from_words(tokens)
bigram_finder.apply_freq_filter(min_freq)

# Different scoring methods
print("=== BIGRAM COLLOCATIONS ===")

pmi_bigrams = bigram_finder.nbest(BigramAssocMeasures.pmi, num_collocations)
_print_ngrams(f"Top {num_collocations} by PMI (Pointwise Mutual Information):", pmi_bigrams)

chi_sq_bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, num_collocations)
_print_ngrams(f"Top {num_collocations} by Chi-square:", chi_sq_bigrams)

likelihood_bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, num_collocations)
_print_ngrams(f"Top {num_collocations} by Likelihood Ratio:", likelihood_bigrams)

# Trigram collocations (half as many results as for bigrams)
trigram_finder = TrigramCollocationFinder.from_words(tokens)
trigram_finder.apply_freq_filter(min_freq)

print("\n=== TRIGRAM COLLOCATIONS ===")
pmi_trigrams = trigram_finder.nbest(TrigramAssocMeasures.pmi, num_collocations // 2)
_print_ngrams(f"Top {num_collocations // 2} by PMI:", pmi_trigrams)