# 4.1 N-grams

4.1.2 Generating N-grams in Python

In [None]:
from nltk import ngrams
from collections import Counter
import nltk
# word_tokenize relies on the Punkt tokenizer data. Newer NLTK releases load
# it from the 'punkt_tab' package while older ones use 'punkt', so fetch both;
# nltk.download() skips packages that are already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample text
text = "Natural Language Processing is a fascinating field of study."

# Tokenize the text into words
tokens = nltk.word_tokenize(text)

# Function to generate N-grams
def generate_ngrams(tokens, n):
    """Return the n-grams of ``tokens`` as space-joined strings.

    Args:
        tokens: sequence of string tokens.
        n: number of consecutive tokens in each n-gram.

    Returns:
        A list with one string per n-gram, in the order ``ngrams`` yields them.
    """
    joined = []
    for gram in ngrams(tokens, n):
        joined.append(' '.join(gram))
    return joined

# Generate unigrams, bigrams, and trigrams
unigrams = generate_ngrams(tokens, 1)
bigrams = generate_ngrams(tokens, 2)
trigrams = generate_ngrams(tokens, 3)

# A single backslash makes "\n" a real newline, so a blank line separates
# the sections instead of the two characters "\" and "n" being printed.
print("Unigrams:")
print(unigrams)
print("\nBigrams:")
print(bigrams)
print("\nTrigrams:")
print(trigrams)

4.1.4 Training an N-gram Language Model

In [None]:
from collections import defaultdict

# Sample text corpus
corpus = [
    "Natural Language Processing is a fascinating field of study.",
    "Machine learning and NLP are closely related.",
    "Language models are essential for NLP tasks."
]

# Tokenize the text into words (one token list per sentence)
tokenized_corpus = [nltk.word_tokenize(sentence) for sentence in corpus]

# Function to calculate bigram probabilities
def train_bigram_model(tokenized_corpus):
    """Estimate bigram probabilities P(w2 | w1) by relative frequency.

    Args:
        tokenized_corpus: iterable of token sequences, one per sentence.

    Returns:
        A nested defaultdict where ``result[w1][w2]`` is the count of the
        bigram (w1, w2) divided by the count of all bigrams starting with w1.
    """
    table = defaultdict(lambda: defaultdict(lambda: 0))

    # First pass: raw bigram counts
    for tokens in tokenized_corpus:
        for first, second in ngrams(tokens, 2):
            table[first][second] += 1

    # Second pass: normalise each row so its values sum to one
    for followers in table.values():
        row_total = float(sum(followers.values()))
        for second in followers:
            followers[second] /= row_total

    return table

# Fit the bigram probabilities on the tokenized sample corpus
bigram_model = train_bigram_model(tokenized_corpus=tokenized_corpus)

# Function to get the probability of a bigram
def get_bigram_probability(bigram_model, w1, w2):
    """Return P(w2 | w1) from a trained bigram model without modifying it.

    Args:
        bigram_model: mapping ``w1 -> {w2: probability}``.
        w1: the conditioning (first) word.
        w2: the following (second) word.

    Returns:
        The stored probability, or 0 when the bigram was never observed.
    """
    # Use .get() rather than indexing: indexing a defaultdict inserts the
    # missing key, so every query for an unseen word would grow the model.
    followers = bigram_model.get(w1)
    if followers is None:
        return 0
    return followers.get(w2, 0)

# Label and value go on separate lines via sep="\n"
print("Bigram Probability (NLP | for):", get_bigram_probability(bigram_model, 'for', 'NLP'), sep="\n")

# 4.2 Hidden Markov Models

4.2.3 Implementing HMMs in Python

In [None]:
!pip install hmmlearn

In [None]:
import numpy as np
from hmmlearn import hmm

# Define the states and observations
states = ["Noun", "Verb"]
n_states = len(states)

observations = ["I", "run", "to", "the", "store"]
n_observations = len(observations)

# Transition probability matrix (A)
transition_probability = np.array([
    [0.7, 0.3],  # From Noun to [Noun, Verb]
    [0.4, 0.6]   # From Verb to [Noun, Verb]
])

# Emission probability matrix (B)
emission_probability = np.array([
    [0.2, 0.3, 0.2, 0.1, 0.2],  # From Noun to ["I", "run", "to", "the", "store"]
    [0.1, 0.6, 0.1, 0.1, 0.1]   # From Verb to ["I", "run", "to", "the", "store"]
])

# Initial probability vector (pi)
start_probability = np.array([0.6, 0.4])  # [Noun, Verb]

# Create the HMM model.
# hmmlearn >= 0.3 models sequences of integer symbol ids with CategoricalHMM;
# MultinomialHMM there expects per-sample count vectors instead. Older
# releases only ship MultinomialHMM (with the categorical behaviour), so fall
# back to it when CategoricalHMM is missing.
hmm_class = getattr(hmm, "CategoricalHMM", hmm.MultinomialHMM)
model = hmm_class(n_components=n_states)
model.startprob_ = start_probability
model.transmat_ = transition_probability
model.emissionprob_ = emission_probability

# Encode the observations to integers
observation_sequence = [0, 1, 2, 3, 4]  # "I", "run", "to", "the", "store"
# hmmlearn expects a column vector: one row per time step
observation_sequence = np.array(observation_sequence).reshape(-1, 1)

# Predict the hidden states (decoding problem)
logprob, hidden_states = model.decode(observation_sequence, algorithm="viterbi")

print("Observations:", [observations[i] for i in observation_sequence.flatten()])
print("Hidden states:", [states[i] for i in hidden_states])

4.2.4 Solving the Three Fundamental Problems of HMMs

In [None]:
# Sample data: sequences of observations
training_sequences = [
    [0, 1, 2, 3, 4],  # "I run to the store"
    [4, 2, 0, 1, 3],  # "store to I run the"
    [1, 2, 3, 0, 4],  # "run to the I store"
]

# Convert the sequences to a format suitable for hmmlearn:
# one stacked column of symbols plus the length of each original sequence
training_sequences = [np.array(seq).reshape(-1, 1) for seq in training_sequences]
lengths = [len(seq) for seq in training_sequences]
training_data = np.concatenate(training_sequences)

# Create and train the HMM model.
# CategoricalHMM is the integer-symbol model in hmmlearn >= 0.3; older
# releases expose the same behaviour as MultinomialHMM. A fixed random_state
# makes the randomly initialised EM fit reproducible across runs.
hmm_class = getattr(hmm, "CategoricalHMM", hmm.MultinomialHMM)
model = hmm_class(n_components=n_states, n_iter=100, random_state=42)
model.fit(training_data, lengths)

print("Learned start probabilities:")
print(model.startprob_)

print("Learned transition probabilities:")
print(model.transmat_)

print("Learned emission probabilities:")
print(model.emissionprob_)

# 4.3 Recurrent Neural Networks (RNNs)

4.3.3 Implementing RNNs in Python with TensorFlow/Keras


In [None]:
!pip install tensorflow

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN
from tensorflow.keras.utils import to_categorical

# Sample text corpus
text = "hello world"

# Character-level vocabulary with lookup tables in both directions
chars = sorted(set(text))
char_to_idx = dict(zip(chars, range(len(chars))))
idx_to_char = dict(enumerate(chars))

# Each training example is a window of `sequence_length` characters; its
# target is the character that immediately follows the window.
sequence_length = 3
X = []
y = []
for start in range(len(text) - sequence_length):
    stop = start + sequence_length
    X.append([char_to_idx[char] for char in text[start:stop]])
    y.append(char_to_idx[text[stop]])

# Inputs gain a trailing feature axis -> (samples, timesteps, 1);
# targets become one-hot rows over the vocabulary.
X = np.expand_dims(np.array(X), axis=-1)
y = to_categorical(y, num_classes=len(chars))

# Define the RNN model: one recurrent layer feeding a softmax over characters
model = Sequential([
    SimpleRNN(50, input_shape=(sequence_length, 1)),
    Dense(len(chars), activation='softmax'),
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model
model.fit(X, y, verbose=1, epochs=200)

# Function to generate text using the trained model
def generate_text(model, start_string, num_generate):
    """Greedily extend ``start_string`` one character at a time.

    Characters are mapped through the module-level ``char_to_idx`` and
    ``idx_to_char`` tables. At each step the model scores the current window
    and the highest-scoring character is appended; the window then slides
    forward by one position.

    Args:
        model: trained model whose ``predict`` takes a
            (1, timesteps, 1) array and returns per-character scores.
        start_string: seed text; every character must be in ``char_to_idx``.
        num_generate: number of characters to generate.

    Returns:
        ``start_string`` followed by the generated characters.
    """
    input_eval = [char_to_idx[s] for s in start_string]
    input_eval = np.array(input_eval).reshape((1, len(input_eval), 1))

    text_generated = []

    for _ in range(num_generate):
        # verbose=0 keeps Keras from printing a progress bar on every step
        predictions = model.predict(input_eval, verbose=0)
        predicted_id = int(np.argmax(predictions[-1]))

        # The new step must be 3-D (1, 1, 1) to match the window; appending a
        # 2-D [[id]] along axis 1 raises a dimension-mismatch ValueError.
        next_step = np.array([[[predicted_id]]], dtype=input_eval.dtype)
        input_eval = np.concatenate([input_eval[:, 1:], next_step], axis=1)
        text_generated.append(idx_to_char[predicted_id])

    return start_string + ''.join(text_generated)

# Seed the generator with "hel" and extend it by five characters
start_string = "hel"
generated_text = generate_text(model, start_string, num_generate=5)
print("Generated text:", generated_text, sep="\n")

# 4.3.4 Evaluating RNN Performance

4.3.5 Improving RNNs

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Derive the layer sizes and data splits from the X / y arrays prepared in the
# SimpleRNN example so this cell runs on a fresh kernel instead of raising
# NameError on undefined names.
num_features = X.shape[2]
num_classes = y.shape[1]

# Hold out the last two windows for validation
n_val = 2
X_train, y_train = X[:-n_val], y[:-n_val]
X_val, y_val = X[-n_val:], y[-n_val:]

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(sequence_length, num_features)))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Train the model
model.fit(X_train, y_train, epochs=20, validation_data=(X_val, y_val))



# 4.4 Long Short-Term Memory Networks (LSTMs)

4.4.2 Implementing LSTMs in Python with TensorFlow/Keras

In [None]:
!pip install tensorflow

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.utils import to_categorical

# Sample text corpus
text = "hello world"

# Character-level vocabulary with lookup tables in both directions
chars = sorted(set(text))
char_to_idx = dict(zip(chars, range(len(chars))))
idx_to_char = dict(enumerate(chars))

# Each training example is a window of `sequence_length` characters; its
# target is the character that immediately follows the window.
sequence_length = 3
X = []
y = []
for start in range(len(text) - sequence_length):
    stop = start + sequence_length
    X.append([char_to_idx[char] for char in text[start:stop]])
    y.append(char_to_idx[text[stop]])

# Inputs gain a trailing feature axis -> (samples, timesteps, 1);
# targets become one-hot rows over the vocabulary.
X = np.expand_dims(np.array(X), axis=-1)
y = to_categorical(y, num_classes=len(chars))

# Define the LSTM model: one recurrent layer feeding a softmax over characters
model = Sequential([
    LSTM(50, input_shape=(sequence_length, 1)),
    Dense(len(chars), activation='softmax'),
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model
model.fit(X, y, verbose=1, epochs=200)

# Function to generate text using the trained model
def generate_text(model, start_string, num_generate):
    """Greedily extend ``start_string`` one character at a time.

    Characters are mapped through the module-level ``char_to_idx`` and
    ``idx_to_char`` tables. At each step the model scores the current window
    and the highest-scoring character is appended; the window then slides
    forward by one position.

    Args:
        model: trained model whose ``predict`` takes a
            (1, timesteps, 1) array and returns per-character scores.
        start_string: seed text; every character must be in ``char_to_idx``.
        num_generate: number of characters to generate.

    Returns:
        ``start_string`` followed by the generated characters.
    """
    input_eval = [char_to_idx[s] for s in start_string]
    input_eval = np.array(input_eval).reshape((1, len(input_eval), 1))

    text_generated = []

    for _ in range(num_generate):
        # verbose=0 keeps Keras from printing a progress bar on every step
        predictions = model.predict(input_eval, verbose=0)
        predicted_id = int(np.argmax(predictions[-1]))

        # The new step must be 3-D (1, 1, 1) to match the window; appending a
        # 2-D [[id]] along axis 1 raises a dimension-mismatch ValueError.
        next_step = np.array([[[predicted_id]]], dtype=input_eval.dtype)
        input_eval = np.concatenate([input_eval[:, 1:], next_step], axis=1)
        text_generated.append(idx_to_char[predicted_id])

    return start_string + ''.join(text_generated)

# Seed the generator with "hel" and extend it by five characters
start_string = "hel"
generated_text = generate_text(model, start_string, num_generate=5)
print("Generated text:", generated_text, sep="\n")

# Chapter-4 Assignments

Exercise 1: N-grams

In [None]:
from nltk import ngrams
import nltk
# word_tokenize relies on the Punkt tokenizer data. Newer NLTK releases load
# it from the 'punkt_tab' package while older ones use 'punkt', so fetch both;
# nltk.download() skips packages that are already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample text
text = "Natural Language Processing with Python"

# Tokenize the text into words
tokens = nltk.word_tokenize(text)

# Generate trigrams (ngrams returns a one-shot iterator of token tuples)
trigrams = ngrams(tokens, 3)

print("Trigrams:")
for grams in trigrams:
    print(grams)

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 2: Bigram Language Model

In [None]:
# Toy three-sentence corpus for the bigram exercise; the training cell that
# follows defines the same list again before tokenizing it.
corpus = [
    "Natural Language Processing is fascinating.",
    "Language models are important in NLP.",
    "Machine learning and NLP are closely related."
]

In [None]:
from collections import defaultdict
import numpy as np
from nltk import ngrams
import nltk
# word_tokenize relies on the Punkt tokenizer data. Newer NLTK releases load
# it from the 'punkt_tab' package while older ones use 'punkt', so fetch both;
# nltk.download() skips packages that are already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample text corpus
corpus = [
    "Natural Language Processing is fascinating.",
    "Language models are important in NLP.",
    "Machine learning and NLP are closely related."
]

# Tokenize the text into words (one token list per sentence)
tokenized_corpus = [nltk.word_tokenize(sentence) for sentence in corpus]

# Function to calculate bigram probabilities
def train_bigram_model(tokenized_corpus):
    """Estimate bigram probabilities P(w2 | w1) by relative frequency.

    Args:
        tokenized_corpus: iterable of token sequences, one per sentence.

    Returns:
        A nested defaultdict where ``result[w1][w2]`` is the count of the
        bigram (w1, w2) divided by the count of all bigrams starting with w1.
    """
    table = defaultdict(lambda: defaultdict(lambda: 0))

    # First pass: raw bigram counts
    for tokens in tokenized_corpus:
        for first, second in ngrams(tokens, 2):
            table[first][second] += 1

    # Second pass: normalise each row so its values sum to one
    for followers in table.values():
        row_total = float(sum(followers.values()))
        for second in followers:
            followers[second] /= row_total

    return table

# Fit the bigram probabilities on the tokenized exercise corpus
bigram_model = train_bigram_model(tokenized_corpus=tokenized_corpus)

# Function to get the probability of a bigram
def get_bigram_probability(bigram_model, w1, w2):
    """Return P(w2 | w1) from a trained bigram model without modifying it.

    Args:
        bigram_model: mapping ``w1 -> {w2: probability}``.
        w1: the conditioning (first) word.
        w2: the following (second) word.

    Returns:
        The stored probability, or 0 when the bigram was never observed.
    """
    # Use .get() rather than indexing: indexing a defaultdict inserts the
    # missing key, so every query for an unseen word would grow the model.
    followers = bigram_model.get(w1)
    if followers is None:
        return 0
    return followers.get(w2, 0)

# Label and value go on separate lines via sep="\n"
print("Bigram Probability (Processing | Language):", get_bigram_probability(bigram_model, 'Language', 'Processing'), sep="\n")

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 3: HMM for Part-of-Speech Tagging

In [None]:
# Two tokenized sentences used as data for the POS-tagging exercise.
sentences = [
    ["I", "run", "to", "the", "store"],
    ["She", "jumps", "over", "the", "fence"]
]

# One tag list per sentence; presumably tags[i][j] is the part of speech of
# sentences[i][j] (both lists have five entries per sentence).
tags = [
    ["PRON", "VERB", "ADP", "DET", "NOUN"],
    ["PRON", "VERB", "ADP", "DET", "NOUN"]
]

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

In [None]:
import numpy as np
from hmmlearn import hmm

# Define the states and observations
states = ["PRON", "VERB", "ADP", "DET", "NOUN"]
n_states = len(states)

observations = ["I", "run", "to", "the", "store", "She", "jumps", "over", "fence"]
n_observations = len(observations)

# Encode the states and observations
state_to_idx = {state: idx for idx, state in enumerate(states)}
observation_to_idx = {obs: idx for idx, obs in enumerate(observations)}

# Create the sequences for training
X = [[observation_to_idx[word] for word in sentence] for sentence in sentences]
y = [[state_to_idx[tag] for tag in tag_sequence] for tag_sequence in tags]

# Estimate the HMM parameters from the labelled data by counting.
# An unsupervised fit() learns hidden states in an arbitrary order, so state k
# would not correspond to states[k]; counting against the gold tags ties each
# state index to its tag. Counts start at 1 (add-one smoothing) so that no
# transition or emission has zero probability.
start_counts = np.ones(n_states)
transition_counts = np.ones((n_states, n_states))
emission_counts = np.ones((n_states, n_observations))
for word_ids, tag_ids in zip(X, y):
    start_counts[tag_ids[0]] += 1
    for prev_tag, next_tag in zip(tag_ids, tag_ids[1:]):
        transition_counts[prev_tag, next_tag] += 1
    for tag_id, word_id in zip(tag_ids, word_ids):
        emission_counts[tag_id, word_id] += 1

# Convert to numpy arrays (one stacked column of symbols plus sequence lengths)
X = np.concatenate([np.array(x).reshape(-1, 1) for x in X])
lengths = [len(x) for x in sentences]
y = np.concatenate(y)

# Create the HMM model.
# CategoricalHMM is the integer-symbol model in hmmlearn >= 0.3; older
# releases expose the same behaviour as MultinomialHMM.
hmm_class = getattr(hmm, "CategoricalHMM", hmm.MultinomialHMM)
model = hmm_class(n_components=n_states)
model.startprob_ = start_counts / start_counts.sum()
model.transmat_ = transition_counts / transition_counts.sum(axis=1, keepdims=True)
model.emissionprob_ = emission_counts / emission_counts.sum(axis=1, keepdims=True)

# Predict the hidden states (decoding problem); passing lengths decodes each
# sentence on its own instead of as one ten-word sequence
logprob, hidden_states = model.decode(X, lengths=lengths, algorithm="viterbi")

# Map the states back to their original labels
hidden_states = [states[state] for state in hidden_states]

print("Observations:", sentences[0] + sentences[1])
print("Predicted states:", hidden_states)

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 4: Simple RNN for Text Generation

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN
from tensorflow.keras.utils import to_categorical

# Sample text corpus
text = "hello world"

# Character-level vocabulary with lookup tables in both directions
chars = sorted(set(text))
char_to_idx = dict(zip(chars, range(len(chars))))
idx_to_char = dict(enumerate(chars))

# Each training example is a window of `sequence_length` characters; its
# target is the character that immediately follows the window.
sequence_length = 3
X = []
y = []
for start in range(len(text) - sequence_length):
    stop = start + sequence_length
    X.append([char_to_idx[char] for char in text[start:stop]])
    y.append(char_to_idx[text[stop]])

# Inputs gain a trailing feature axis -> (samples, timesteps, 1);
# targets become one-hot rows over the vocabulary.
X = np.expand_dims(np.array(X), axis=-1)
y = to_categorical(y, num_classes=len(chars))

# Define the RNN model: one recurrent layer feeding a softmax over characters
model = Sequential([
    SimpleRNN(50, input_shape=(sequence_length, 1)),
    Dense(len(chars), activation='softmax'),
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model
model.fit(X, y, verbose=1, epochs=200)

# Function to generate text using the trained model
def generate_text(model, start_string, num_generate):
    """Greedily extend ``start_string`` one character at a time.

    Characters are mapped through the module-level ``char_to_idx`` and
    ``idx_to_char`` tables. At each step the model scores the current window
    and the highest-scoring character is appended; the window then slides
    forward by one position.

    Args:
        model: trained model whose ``predict`` takes a
            (1, timesteps, 1) array and returns per-character scores.
        start_string: seed text; every character must be in ``char_to_idx``.
        num_generate: number of characters to generate.

    Returns:
        ``start_string`` followed by the generated characters.
    """
    input_eval = [char_to_idx[s] for s in start_string]
    input_eval = np.array(input_eval).reshape((1, len(input_eval), 1))

    text_generated = []

    for _ in range(num_generate):
        # verbose=0 keeps Keras from printing a progress bar on every step
        predictions = model.predict(input_eval, verbose=0)
        predicted_id = int(np.argmax(predictions[-1]))

        # The new step must be 3-D (1, 1, 1) to match the window; appending a
        # 2-D [[id]] along axis 1 raises a dimension-mismatch ValueError.
        next_step = np.array([[[predicted_id]]], dtype=input_eval.dtype)
        input_eval = np.concatenate([input_eval[:, 1:], next_step], axis=1)
        text_generated.append(idx_to_char[predicted_id])

    return start_string + ''.join(text_generated)

# Seed the generator with "hel" and extend it by five characters
start_string = "hel"
generated_text = generate_text(model, start_string, num_generate=5)
print("Generated text:", generated_text, sep="\n")

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 5: LSTM for Text Generation

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.utils import to_categorical

# Sample text corpus
text = "hello world"

# Character-level vocabulary with lookup tables in both directions
chars = sorted(set(text))
char_to_idx = dict(zip(chars, range(len(chars))))
idx_to_char = dict(enumerate(chars))

# Each training example is a window of `sequence_length` characters; its
# target is the character that immediately follows the window.
sequence_length = 3
X = []
y = []
for start in range(len(text) - sequence_length):
    stop = start + sequence_length
    X.append([char_to_idx[char] for char in text[start:stop]])
    y.append(char_to_idx[text[stop]])

# Inputs gain a trailing feature axis -> (samples, timesteps, 1);
# targets become one-hot rows over the vocabulary.
X = np.expand_dims(np.array(X), axis=-1)
y = to_categorical(y, num_classes=len(chars))

# Define the LSTM model: one recurrent layer feeding a softmax over characters
model = Sequential([
    LSTM(50, input_shape=(sequence_length, 1)),
    Dense(len(chars), activation='softmax'),
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model
model.fit(X, y, verbose=1, epochs=200)

# Function to generate text using the trained model
def generate_text(model, start_string, num_generate):
    """Greedily extend ``start_string`` one character at a time.

    Characters are mapped through the module-level ``char_to_idx`` and
    ``idx_to_char`` tables. At each step the model scores the current window
    and the highest-scoring character is appended; the window then slides
    forward by one position.

    Args:
        model: trained model whose ``predict`` takes a
            (1, timesteps, 1) array and returns per-character scores.
        start_string: seed text; every character must be in ``char_to_idx``.
        num_generate: number of characters to generate.

    Returns:
        ``start_string`` followed by the generated characters.
    """
    input_eval = [char_to_idx[s] for s in start_string]
    input_eval = np.array(input_eval).reshape((1, len(input_eval), 1))

    text_generated = []

    for _ in range(num_generate):
        # verbose=0 keeps Keras from printing a progress bar on every step
        predictions = model.predict(input_eval, verbose=0)
        # The assignment must sit on one line; splitting it after "=" is a
        # syntax error.
        predicted_id = int(np.argmax(predictions[-1]))

        # The new step must be 3-D (1, 1, 1) to match the window; appending a
        # 2-D [[id]] along axis 1 raises a dimension-mismatch ValueError.
        next_step = np.array([[[predicted_id]]], dtype=input_eval.dtype)
        input_eval = np.concatenate([input_eval[:, 1:], next_step], axis=1)
        text_generated.append(idx_to_char[predicted_id])

    return start_string + ''.join(text_generated)

# Seed the generator with "hel" and extend it by five characters
start_string = "hel"
generated_text = generate_text(model, start_string, num_generate=5)
print("Generated text:", generated_text, sep="\n")

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  