### Bag of Words

In [None]:
from collections import Counter
import math

In [None]:
# 1. Define the example sentences (one document per string)
corpus = ["el gato duerme", "el perro corre"]

In [None]:
# 2. Tokenize the sentences: split each one on whitespace
tokenized_corpus = [line.split() for line in corpus]

In [None]:
# Inspect the tokenized sentences (notebook cell output).
tokenized_corpus

In [None]:
# 3. Build the vocabulary: every token of the corpus, de-duplicated and sorted
all_words = [word for sentence in tokenized_corpus for word in sentence]

vocabulary = list(set(all_words))
vocabulary.sort()
print("Vocabulario:", vocabulary)

In [None]:
# 4. Build the Bag of Words (BoW)
def build_bow(tokens, vocabulary):
    """Return the count vector of ``tokens`` aligned with ``vocabulary``.

    Position ``k`` of the result holds how many times ``vocabulary[k]``
    occurs in ``tokens``; terms that never occur get 0.
    """
    frequencies = Counter(tokens)
    vector = []
    for term in vocabulary:
        # Counter lookups of unseen terms yield 0 instead of raising.
        vector.append(frequencies[term])
    return vector

# One count vector per tokenized sentence
bows = [build_bow(doc_tokens, vocabulary) for doc_tokens in tokenized_corpus]

print("\nBag of Words (conteo):")
for number, bow in enumerate(bows, start=1):
    print(f"Frase {number}: {bow}")


## N-grams, Bigramas

In [None]:
# 5. Build bigrams
def generate_bigrams(tokens):
    """Return the adjacent token pairs of ``tokens`` as a list of 2-tuples.

    A sequence with fewer than two tokens produces an empty list.
    """
    last_start = len(tokens) - 1
    return [(tokens[i], tokens[i + 1]) for i in range(last_start)]

print("\nBigrams:")
for number, sentence_tokens in enumerate(tokenized_corpus, start=1):
    sentence_bigrams = generate_bigrams(sentence_tokens)
    print(f"Frase {number}: {sentence_bigrams}")

In [None]:
# Slice demo: `[1:]` drops the first element (here, the first tokenized sentence).
tokenized_corpus[1:]

#### Otro ejemplo con bigramas

In [None]:
# Small example corpus
corpus = [
    "el gato duerme",
    "el perro ladra",
    "el gato salta",
    "el perro corre",
    "el gato maulla",
]

# Step 1: tokenize -- split every sentence on whitespace and flatten the
# words of all sentences into a single list
tokens = [word for sentence in corpus for word in sentence.split()]

print("Tokens:", tokens)


In [None]:
# Count words and bigrams.
# nltk already provides a helper (`bigrams`) that generates the bigrams.
import matplotlib.pyplot as plt
from collections import Counter
from nltk import bigrams

# Word (unigram) frequencies
unigram_counts = Counter(tokens)

# Bigram frequencies.
# NOTE(review): `tokens` is the whole corpus flattened into one list, so pairs
# that straddle two sentences (e.g. ("duerme", "el")) are presumably counted
# as bigrams too -- confirm this is intended.
bigram_counts = Counter(bigrams(tokens))

print("\nConteo de unigramas:", unigram_counts, sep="\n")

print("\nConteo de bigramas:", bigram_counts, sep="\n")




In [None]:
# Unigram counts over the flat token list (notebook cell output).
Counter(tokens)

In [None]:
# Bigram counts over the flat token list (notebook cell output).
Counter(bigrams(tokens))

In [None]:
# Same idea with three-token windows: trigram counts over the flat token list.
from nltk import trigrams
Counter(trigrams(tokens))

### Calculando la probabilidad del bigrama

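![image.png](attachment:03cf8ff2-ef5d-40fa-9dad-ed31a741c85f.png)

Estimación por máxima verosimilitud (MLE), tal como se implementa en la celda siguiente:

$$P(w_2 \mid w_1) = \frac{C(w_1, w_2)}{C(w_1)}$$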


In [None]:
def mle_probability(word1, word2):
    """Estimate P(word2 | word1) by maximum likelihood.

    Reads the module-level ``bigram_counts`` and ``unigram_counts`` and
    returns count(word1, word2) / count(word1).

    Returns 0.0 when ``word1`` has a unigram count of zero, which avoids a
    division by zero.
    """
    context_total = unigram_counts[word1]
    if context_total == 0:
        return 0.0  # Avoid division by zero

    pair_total = bigram_counts[(word1, word2)]
    return pair_total / context_total

# Examples: (word1, word2) pairs whose conditional probability we inspect
pairs_to_check = [
    ("el", "gato"), ("el", "perro"),
    ("gato", "duerme"),
    ("perro", "ladra"), ("perro", "corre"),
]

print("\nProbabilidades estimadas:")
for w1, w2 in pairs_to_check:
    print(f"P({w2} | {w1}) = {mle_probability(w1, w2):.3f}")


In [None]:
# Compute the probability of every pair, together with its bar label
labels = [f"{w1} → {w2}" for w1, w2 in pairs_to_check]
probs = [mle_probability(w1, w2) for w1, w2 in pairs_to_check]

# Draw the bar chart
plt.figure(figsize=(10, 6))
plt.bar(labels, probs)
plt.xlabel('Bigramas (word1 → word2)')
plt.ylabel('Probabilidad (MLE)')
plt.title('Probabilidad MLE de Bigramas en el Corpus')
plt.ylim(0, 1)  # Probabilities lie between 0 and 1
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.show()

### TF-IDF

In [None]:
# 6. Compute TF (term frequency)
def compute_tf(tokens, vocabulary):
    """Return the term-frequency vector of one tokenized document.

    Args:
        tokens: The document's tokens.
        vocabulary: Ordered terms; the result is aligned with this order.

    Returns:
        A list with ``count(term) / len(tokens)`` for every term in
        ``vocabulary``. An empty document yields 0.0 for every term
        instead of raising ``ZeroDivisionError``.
    """
    total_terms = len(tokens)
    if total_terms == 0:
        # No tokens means no term can have a non-zero frequency; returning
        # zeros also avoids dividing by zero.
        return [0.0 for _ in vocabulary]

    counts = Counter(tokens)
    return [counts[word] / total_terms for word in vocabulary]

# TF vector of every tokenized sentence.
# NOTE(review): `tokenized_corpus` and `vocabulary` were built from the first
# two-sentence corpus; `corpus` was later rebound to five sentences without
# recomputing them -- confirm this is intended.
tfs = [compute_tf(doc_tokens, vocabulary) for doc_tokens in tokenized_corpus]

In [None]:
# Inspect the TF vectors (notebook cell output).
tfs

In [None]:
# 7. Compute IDF (inverse document frequency)
def compute_idf(tokenized_corpus, vocabulary):
    """Return a dict mapping each vocabulary term to its IDF.

    IDF is ``log(N / df)`` (natural log), where ``N`` is the number of
    documents in ``tokenized_corpus`` and ``df`` is the number of documents
    that contain the term. A term found in no document gets 0.0.
    """
    total_docs = len(tokenized_corpus)
    idf = {}
    for term in vocabulary:
        # Document frequency: in how many documents does the term appear?
        doc_freq = 0
        for document in tokenized_corpus:
            if term in document:
                doc_freq += 1

        if doc_freq > 0:
            idf[term] = math.log(total_docs / doc_freq)
        else:
            # Unseen term: fall back to 0.0 rather than dividing by zero.
            idf[term] = 0.0
    return idf

# IDF of every vocabulary term over the tokenized corpus
idf = compute_idf(tokenized_corpus, vocabulary)
print("\nIDF por palabra:", idf)

In [None]:
# 8. Compute TF-IDF
def compute_tfidf(tf, idf, vocabulary):
    """Return the TF-IDF vector of one document.

    ``tf`` is the document's term-frequency vector aligned with
    ``vocabulary``; ``idf`` maps each term to its IDF. Each output entry is
    the term's TF multiplied by its IDF.
    """
    weights = []
    for term_tf, term in zip(tf, vocabulary):
        weights.append(term_tf * idf[term])
    return weights

# One TF-IDF vector per sentence, built from its TF vector
tfidfs = [compute_tfidf(tf_vector, idf, vocabulary) for tf_vector in tfs]

print("\nTF-IDF:")
for number, tfidf in enumerate(tfidfs, start=1):
    print(f"Frase {number}: {tfidf}")