In [None]:
import nltk
# Download the NLTK resources this notebook relies on.
nltk.download('gutenberg')
nltk.download('punkt')
# Newer NLTK releases make sent_tokenize load the 'punkt_tab' resource instead
# of the pickled 'punkt' models; request both so the notebook runs on either.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
from nltk.corpus import gutenberg
from nltk.corpus import stopwords
import re

In [None]:
# Pick the text to analyse: the raw contents of Jane Austen's "Emma"
text = gutenberg.raw(fileids='austen-emma.txt')

In [None]:
# Preprocess: lowercase, then blank out digits and symbols.
# Two deliberate choices in the pattern/replacement:
#   * '.', '!' and '?' are kept, because the sentence tokenizer that is run on
#     `text` afterwards needs them to locate sentence boundaries; stripping
#     them collapses the whole book into a single "sentence".
#   * removed characters become a space rather than nothing, so words joined
#     only by punctuation (e.g. "came--a") do not fuse into one bogus token.
text = text.lower()
text = re.sub(r'[^a-z\s.!?]', ' ', text)

In [None]:
# Split the text into sentences with NLTK's sentence tokenizer.
# 'english' is the tokenizer's default language, spelled out here for clarity.
sentences = nltk.tokenize.sent_tokenize(text, language='english')

In [None]:
# Sanity check: report how many sentences were produced and show the first one
n_sentences = len(sentences)
print(f"Número de frases cargadas: {n_sentences}")
print("\nPrimeras 1 frases:", sentences[:1], sep="\n")

## Bag of Words


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Plain bag-of-words model: single words only, English stop words dropped.
# ngram_range=(1, 1) is the default, written out to mirror the n-gram cells.
vectorizer_bow = CountVectorizer(stop_words='english', ngram_range=(1, 1))
X_bow = vectorizer_bow.fit_transform(raw_documents=sentences)

In [None]:
# Rank the vocabulary by total count.
# Summing over axis 0 collapses the rows (sentences), leaving one total per
# vocabulary column; vocabulary_ maps each word to its column index.
sum_words = X_bow.sum(axis=0)
words_freq = sorted(
    ((word, sum_words[0, idx]) for word, idx in vectorizer_bow.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Keep the 20 most frequent (word, count) pairs and print them
top_words = words_freq[:20]
print("\nTop 20 palabras más frecuentes (Bag of Words):", top_words, sep="\n")

In [None]:
# Horizontal bar chart of the top words.
# The sequences are reversed so the most frequent word ends up at the top.
words, counts = zip(*top_words)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(tuple(reversed(words)), tuple(reversed(counts)))
ax.set_title("Top 20 Palabras - Bag of Words")
ax.set_xlabel("Frecuencia")
ax.grid(True)
plt.show()

## N-grams (Bigrams y Trigrams)

In [None]:
# Bigrams: count sequences of two tokens.
# NOTE: scikit-learn removes stop words before assembling n-grams, so a pair
# may join words that were separated by stop words in the original text.
vectorizer_bigram = CountVectorizer(stop_words='english', ngram_range=(2, 2))
X_bigram = vectorizer_bigram.fit_transform(raw_documents=sentences)

# Column totals give the count of each bigram over all sentences
sum_bigrams = X_bigram.sum(axis=0)
bigrams_freq = sorted(
    ((bigram, sum_bigrams[0, idx]) for bigram, idx in vectorizer_bigram.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

print("\nTop 10 Bigrams más frecuentes:", bigrams_freq[:10], sep="\n")

In [None]:
# Trigrams: count sequences of three tokens.
# NOTE: scikit-learn removes stop words before assembling n-grams, so a triple
# may join words that were separated by stop words in the original text.
vectorizer_trigram = CountVectorizer(stop_words='english', ngram_range=(3, 3))
X_trigram = vectorizer_trigram.fit_transform(raw_documents=sentences)

# Column totals give the count of each trigram over all sentences
sum_trigrams = X_trigram.sum(axis=0)
trigrams_freq = sorted(
    ((trigram, sum_trigrams[0, idx]) for trigram, idx in vectorizer_trigram.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

print("\nTop 10 Trigrams más frecuentes:", trigrams_freq[:10], sep="\n")

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF model over the sentences, with English stop words dropped
vectorizer_tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer_tfidf.fit_transform(raw_documents=sentences)

In [None]:
# Average TF-IDF weight of each term across all sentences.
# The mean is taken on the sparse matrix itself: calling .toarray() first
# would allocate a dense (n_sentences x n_terms) array just to average it,
# which becomes very large once the text is split into thousands of sentences.
# The sparse mean comes back as a 1 x n_terms np.matrix; .A1 flattens it to 1-D.
tfidf_means = X_tfidf.mean(axis=0).A1
vocab_tfidf = vectorizer_tfidf.get_feature_names_out()

# Indices of the 20 largest mean weights, highest first
top_indices = tfidf_means.argsort()[::-1][:20]
top_tfidf_words = [(vocab_tfidf[i], tfidf_means[i]) for i in top_indices]

print("\nTop 20 palabras más distintivas por TF-IDF:")
print(top_tfidf_words)

In [None]:
# Horizontal bar chart of the top TF-IDF terms.
# The sequences are reversed so the highest-weighted term ends up at the top.
words, scores = zip(*top_tfidf_words)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(tuple(reversed(words)), tuple(reversed(scores)))
ax.set_title("Top 20 Palabras más importantes - TF-IDF")
ax.set_xlabel("Peso TF-IDF")
ax.grid(True)
plt.show()