# 1. Language Modeling

For language modeling, we'll use nltk's `ngrams` function to build bigrams and trigrams, then count their frequencies to demonstrate a simple n-gram model.

In [8]:
import nltk
from nltk.util import ngrams
from collections import Counter

In [9]:
# Sample corpus: two short sentences that share most of their words,
# so some n-grams occur more than once.
text = (
    "The cat sat on the mat. "
    "The dog lay on the mat."
)

In [11]:
# Tokenize the text. It is lower-cased first so that "The" and "the" end up
# as the same token.
# NOTE: nltk.word_tokenize needs NLTK's Punkt tokenizer data to be installed.
lowercased_text = text.lower()
tokens = nltk.word_tokenize(lowercased_text)
print(tokens)


['the', 'cat', 'sat', 'on', 'the', 'mat', '.', 'the', 'dog', 'lay', 'on', 'the', 'mat', '.']


In [13]:
# Slide a window of width 2 (bigrams) and width 3 (trigrams) over the tokens.
# ngrams() yields lazily, so each result is unpacked into a list.
bigrams = [*ngrams(tokens, 2)]
trigrams = [*ngrams(tokens, 3)]

# One list per line: bigrams first, then trigrams
print(bigrams, trigrams, sep="\n")

[('the', 'cat'), ('cat', 'sat'), ('sat', 'on'), ('on', 'the'), ('the', 'mat'), ('mat', '.'), ('.', 'the'), ('the', 'dog'), ('dog', 'lay'), ('lay', 'on'), ('on', 'the'), ('the', 'mat'), ('mat', '.')]
[('the', 'cat', 'sat'), ('cat', 'sat', 'on'), ('sat', 'on', 'the'), ('on', 'the', 'mat'), ('the', 'mat', '.'), ('mat', '.', 'the'), ('.', 'the', 'dog'), ('the', 'dog', 'lay'), ('dog', 'lay', 'on'), ('lay', 'on', 'the'), ('on', 'the', 'mat'), ('the', 'mat', '.')]


In [14]:
# Frequency distribution: count how often each distinct bigram / trigram occurs
bigrams_freq, trigrams_freq = Counter(bigrams), Counter(trigrams)

In [16]:
# Show both frequency tables; a Counter prints its entries most-common first
print("Bigrams:", bigrams_freq, sep="\n")
print("\nTrigrams:", trigrams_freq, sep="\n")

Bigrams:
Counter({('on', 'the'): 2, ('the', 'mat'): 2, ('mat', '.'): 2, ('the', 'cat'): 1, ('cat', 'sat'): 1, ('sat', 'on'): 1, ('.', 'the'): 1, ('the', 'dog'): 1, ('dog', 'lay'): 1, ('lay', 'on'): 1})

Trigrams:
Counter({('on', 'the', 'mat'): 2, ('the', 'mat', '.'): 2, ('the', 'cat', 'sat'): 1, ('cat', 'sat', 'on'): 1, ('sat', 'on', 'the'): 1, ('mat', '.', 'the'): 1, ('.', 'the', 'dog'): 1, ('the', 'dog', 'lay'): 1, ('dog', 'lay', 'on'): 1, ('lay', 'on', 'the'): 1})


## 2nd method: bigrams with sentence-boundary padding

In [17]:
import nltk
from nltk.lm.preprocessing import pad_both_ends, flatten

# Example sentence
sentence = "The cat is on the mat."

# Tokenize, pad with the <s> / </s> boundary markers, and create bigrams.
#
# The bigram helper is called as `nltk.bigrams` rather than imported with
# `from nltk.util import bigrams`: that import would rebind the global name
# `bigrams`, which the first section uses for its *list* of bigrams, so
# re-running the earlier frequency cell afterwards would fail.
#
# NOTE: `tokens` is deliberately reassigned here; the n-gram cell in the next
# section reads this sentence's tokens from it.
tokens = nltk.word_tokenize(sentence.lower())
bigram_model = list(nltk.bigrams(pad_both_ends(tokens, n=2)))

# Print bigrams, one per line
for bigram in bigram_model:
    print(bigram)

('<s>', 'the')
('the', 'cat')
('cat', 'is')
('is', 'on')
('on', 'the')
('the', 'mat')
('mat', '.')
('.', '</s>')


# 2. N-grams

We can create n-grams using nltk's ngrams function.

In [18]:
# Build n-grams of order `n` (3 = trigrams) from the current `tokens` list,
# which at this point holds the tokenized example sentence from the previous cell.
n = 3
n_grams = [*ngrams(tokens, n)]

# Header line followed by the list of n-grams
print(f"{n}-grams:", n_grams, sep="\n")

3-grams:
[('the', 'cat', 'is'), ('cat', 'is', 'on'), ('is', 'on', 'the'), ('on', 'the', 'mat'), ('the', 'mat', '.')]


# 3. Bag of Words (BoW)

A Bag of Words model can be implemented by counting word frequencies.

In [20]:
from collections import defaultdict

In [21]:
# Split the lower-cased sample text into word tokens
# (punctuation marks come out as tokens of their own).
normalized_text = text.lower()
words = nltk.word_tokenize(normalized_text)

In [24]:
# Bag of words as a word -> count mapping.
# defaultdict(int) makes an unseen word start at 0, so no membership check is needed.
word_freq = defaultdict(int)
for word in words:
    word_freq[word] = word_freq[word] + 1

In [25]:
# Convert to a plain dict so the printout shows only the word -> count pairs
print("Bag of Words:", dict(word_freq), sep="\n")

Bag of Words:
{'the': 4, 'cat': 1, 'sat': 1, 'on': 2, 'mat': 2, '.': 2, 'dog': 1, 'lay': 1}


In [38]:
# 2nd Method: one bag-of-words Counter per document
doc1 = "The cat sat on the mat."
doc2 = "The dog lay on the mat."

# Lower-case and tokenize each document (the generator keeps its loop
# variable out of the notebook namespace), then count the tokens.
tokens1, tokens2 = (nltk.word_tokenize(doc.lower()) for doc in (doc1, doc2))
bow1, bow2 = Counter(tokens1), Counter(tokens2)

# Print BoW vectors as plain dicts
print(f"Document 1 BoW: {dict(bow1)}")
print(f"Document 2 BoW: {dict(bow2)}")


Document 1 BoW: {'the': 2, 'cat': 1, 'sat': 1, 'on': 1, 'mat': 1, '.': 1}
Document 2 BoW: {'the': 2, 'dog': 1, 'lay': 1, 'on': 1, 'mat': 1, '.': 1}


# 4. TF-IDF

For TF-IDF, we'll use sklearn's `TfidfVectorizer`, which takes the raw documents and handles both tokenization and weighting for us.

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
# Sample documents: each string is treated as one document of the corpus
documents = ["The cat sat on the mat.", "The dog lay on the mat."]

In [31]:
# Create a TfidfVectorizer with its default settings.
# Judging by the feature names printed further down, the defaults lower-case
# the text and drop punctuation, so the raw strings can be passed in directly
# without a separate NLTK tokenization step.
vectorizer = TfidfVectorizer()

In [35]:
# Fit and transform the documents: learn the vocabulary and weights from
# `documents` and score those same documents in a single call.
tfidf_matrix = vectorizer.fit_transform(documents)
# Dense view for display: one row per document, one column per vocabulary
# term (2 x 7 for this corpus).
tfidf_matrix.toarray()

array([[0.44554752, 0.        , 0.        , 0.31701073, 0.31701073,
        0.44554752, 0.63402146],
       [0.        , 0.44554752, 0.44554752, 0.31701073, 0.31701073,
        0.        , 0.63402146]])

In [33]:
# Get feature names: the term behind each matrix column, i.e. feature_names[j]
# labels column j of tfidf_matrix (used below to label the printed scores).
feature_names = vectorizer.get_feature_names_out()

In [36]:
# Print the TF-IDF scores.
# Each row of the dense matrix holds one document's scores; pairing the row
# with feature_names labels every score with its term.
for doc_number, doc_scores in enumerate(tfidf_matrix.toarray(), start=1):
    print(f"\nDocument {doc_number} TF-IDF Scores:")
    for term, score in zip(feature_names, doc_scores):
        print(f"{term}: {score:.4f}")


Document 1 TF-IDF Scores:
cat: 0.4455
dog: 0.0000
lay: 0.0000
mat: 0.3170
on: 0.3170
sat: 0.4455
the: 0.6340

Document 2 TF-IDF Scores:
cat: 0.0000
dog: 0.4455
lay: 0.4455
mat: 0.3170
on: 0.3170
sat: 0.0000
the: 0.6340
