# Chapter-4 Assignments

**Install the required Python prerequisite packages and libraries.**

In [None]:
!pip install hmmlearn tensorflow keras

Exercise 1: N-grams

In [None]:
from nltk import ngrams
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample text
text = "Natural Language Processing with Python"

# Tokenize the text into words
tokens = nltk.word_tokenize(text)

# Generate trigrams.
# ngrams() returns a one-shot generator, so materialise it as a list:
# otherwise `trigrams` is exhausted by the loop below and any later cell
# (or a re-run of just the loop) would silently see no trigrams at all.
trigrams = list(ngrams(tokens, 3))

print("Trigrams:")
for grams in trigrams:
    print(grams)

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 2: Bigram Language Model

In [None]:
# Three-sentence toy corpus for the bigram language model exercise
corpus = [
    "Natural Language Processing is fascinating.",
    "Language models are important in NLP.",
    "Machine learning and NLP are closely related.",
]

In [None]:
from collections import defaultdict
import numpy as np
from nltk import ngrams
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

# Toy corpus: three short sentences (repeated here so this cell runs on its own)
corpus = [
    "Natural Language Processing is fascinating.",
    "Language models are important in NLP.",
    "Machine learning and NLP are closely related.",
]

# Split every sentence into a list of word tokens with NLTK's tokenizer
tokenized_corpus = list(map(nltk.word_tokenize, corpus))

# Function to calculate bigram probabilities
def train_bigram_model(tokenized_corpus):
    """Estimate P(next word | word) from a tokenized corpus.

    Args:
        tokenized_corpus: Iterable of sentences, each a sequence of tokens.

    Returns:
        A nested defaultdict where ``model[w1][w2]`` is the relative frequency
        with which ``w2`` directly follows ``w1`` (unseen pairs default to 0).
    """
    model = defaultdict(lambda: defaultdict(lambda: 0))

    # First pass: tally how often each word pair occurs back to back
    for sentence in tokenized_corpus:
        for first, second in ngrams(sentence, 2):
            model[first][second] = model[first][second] + 1

    # Second pass: turn each row of counts into a probability distribution
    for followers in model.values():
        row_total = float(sum(followers.values()))
        for second in followers:
            followers[second] = followers[second] / row_total

    return model

# Train the bigram model on the tokenized sentences; the result maps each
# first word w1 to a dict of {w2: P(w2 | w1)}
bigram_model = train_bigram_model(tokenized_corpus)

# Function to get the probability of a bigram
def get_bigram_probability(bigram_model, w1, w2):
    """Return P(w2 | w1) from a trained bigram model.

    The lookup uses ``.get`` rather than indexing so that querying an unseen
    word does not insert a new (empty) entry into a defaultdict-based model,
    and so that plain nested dicts are accepted as well.

    Args:
        bigram_model: Mapping ``{w1: {w2: probability}}``.
        w1: The conditioning (first) word.
        w2: The word that follows ``w1``.

    Returns:
        The stored probability, or 0 if the bigram was never observed.
    """
    followers = bigram_model.get(w1)
    if followers is None:
        return 0
    return followers.get(w2, 0)

# Look up P(Processing | Language): 'Language' is the conditioning word (w1)
# and 'Processing' is the word that follows it (w2)
print("Bigram Probability (Processing | Language):")
print(get_bigram_probability(bigram_model, 'Language', 'Processing'))

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 3: HMM for Part-of-Speech Tagging

In [None]:
# Two toy training sentences, already split into word tokens
sentences = [
    ["I", "run", "to", "the", "store"],
    ["She", "jumps", "over", "the", "fence"],
]

# Part-of-speech tags aligned with the words: tags[i][j] labels sentences[i][j]
tags = [
    ["PRON", "VERB", "ADP", "DET", "NOUN"],
    ["PRON", "VERB", "ADP", "DET", "NOUN"],
]

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

In [None]:
import numpy as np
from hmmlearn import hmm

# Define the states and observations
states = ["PRON", "VERB", "ADP", "DET", "NOUN"]
n_states = len(states)

observations = ["I", "run", "to", "the", "store", "She", "jumps", "over", "fence"]
n_observations = len(observations)

# Encode the states and observations
state_to_idx = {state: idx for idx, state in enumerate(states)}
observation_to_idx = {obs: idx for idx, obs in enumerate(observations)}

# Integer-encode every sentence and its tag sequence (one list per sentence)
X = [[observation_to_idx[word] for word in sentence] for sentence in sentences]
y = [[state_to_idx[tag] for tag in tag_sequence] for tag_sequence in tags]

# Estimate the HMM parameters from the labelled data by counting.
# An unsupervised fit() never sees the tags, so its hidden-state indices are
# arbitrary and cannot be mapped onto `states`; counting ties state i to
# states[i] by construction. Starting every count at 1 (add-one smoothing)
# keeps all probabilities non-zero and every row normalisable.
startprob = np.ones(n_states)
transmat = np.ones((n_states, n_states))
emissionprob = np.ones((n_states, n_observations))
for word_ids, tag_ids in zip(X, y):
    if not tag_ids:
        continue  # an empty sentence contributes no counts
    startprob[tag_ids[0]] += 1
    for prev_tag, next_tag in zip(tag_ids, tag_ids[1:]):
        transmat[prev_tag, next_tag] += 1
    for tag_id, word_id in zip(tag_ids, word_ids):
        emissionprob[tag_id, word_id] += 1
startprob /= startprob.sum()
transmat /= transmat.sum(axis=1, keepdims=True)
emissionprob /= emissionprob.sum(axis=1, keepdims=True)

# Convert to the layout hmmlearn expects: all sentences stacked into one
# (n_samples, 1) column of symbol ids, plus the length of each sentence
X = np.concatenate([np.array(x, dtype=int).reshape(-1, 1) for x in X])
lengths = [len(x) for x in sentences]
y = np.concatenate(y)

# Create the HMM model.
# From hmmlearn 0.3 on, sequences of discrete symbols are modelled by
# CategoricalHMM (MultinomialHMM now expects count vectors); fall back to
# MultinomialHMM only on older releases that lack CategoricalHMM.
hmm_class = getattr(hmm, "CategoricalHMM", hmm.MultinomialHMM)
model = hmm_class(n_components=n_states)
model.startprob_ = startprob
model.transmat_ = transmat
model.emissionprob_ = emissionprob

# Predict the hidden states (decoding problem); passing `lengths` makes
# Viterbi decode each sentence separately instead of as one long sequence
logprob, hidden_states = model.decode(X, lengths, algorithm="viterbi")

# Map the states back to their original labels
hidden_states = [states[state] for state in hidden_states]

print("Observations:", [word for sentence in sentences for word in sentence])
print("Predicted states:", hidden_states)

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 4: Simple RNN for Text Generation

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN
from tensorflow.keras.utils import to_categorical

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation and
# batch shuffling are repeatable across Restart & Run All
tf.keras.utils.set_random_seed(42)

# Sample text corpus
text = "hello world"

# Create a character-level vocabulary
chars = sorted(set(text))
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

# Create input-output pairs for training: every window of sequence_length
# characters is paired with the single character that follows it
sequence_length = 3
X = []
y = []
for i in range(len(text) - sequence_length):
    X.append([char_to_idx[char] for char in text[i:i + sequence_length]])
    y.append(char_to_idx[text[i + sequence_length]])

X = np.array(X)
y = to_categorical(y, num_classes=len(chars))

# Reshape input to (samples, timesteps, features) to be compatible with RNN input
X = X.reshape((X.shape[0], X.shape[1], 1))

# Define the RNN model. The input shape is declared with an explicit Input
# layer; passing input_shape= to the first layer is discouraged by current Keras.
model = Sequential()
model.add(tf.keras.Input(shape=(sequence_length, 1)))
model.add(SimpleRNN(50))
model.add(Dense(len(chars), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Train the model
model.fit(X, y, epochs=200, verbose=1)

# Function to generate text using the trained model
def generate_text(model, start_string, num_generate):
    """Greedily extend ``start_string`` one character at a time.

    Uses the notebook-level ``char_to_idx`` / ``idx_to_char`` vocabularies.

    Args:
        model: Trained model whose ``predict`` maps a ``(1, window, 1)`` array
            of character indices to one row of per-character scores.
        start_string: Seed text; it is encoded into the first input window.
        num_generate: Number of characters to append to the seed.

    Returns:
        ``start_string`` followed by the ``num_generate`` generated characters.

    Raises:
        KeyError: If ``start_string`` contains a character that is not in
            ``char_to_idx``.
    """
    input_eval = np.array([char_to_idx[s] for s in start_string])
    input_eval = input_eval.reshape((1, len(start_string), 1))

    text_generated = []

    for _ in range(num_generate):
        # verbose=0: otherwise Keras prints a progress bar for every generated character
        predictions = model.predict(input_eval, verbose=0)
        # The batch holds a single window, so row 0 is the only prediction;
        # pick the highest-scoring character (greedy decoding)
        predicted_id = int(np.argmax(predictions[0]))
        text_generated.append(idx_to_char[predicted_id])

        # Slide the window: drop the oldest character, append the new one
        next_step = np.array(predicted_id).reshape((1, 1, 1))
        input_eval = np.append(input_eval[:, 1:], next_step, axis=1)

    return start_string + ''.join(text_generated)

# Generate new text
# The seed should be sequence_length (3) characters long, matching the window
# size the model was trained on; generate_text raises KeyError for any seed
# character that is missing from char_to_idx
start_string = "hel"
generated_text = generate_text(model, start_string, 5)
print("Generated text:")
print(generated_text)

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 5: LSTM for Text Generation

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.utils import to_categorical

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation and
# batch shuffling are repeatable across Restart & Run All
tf.keras.utils.set_random_seed(42)

# Sample text corpus
text = "hello world"

# Create a character-level vocabulary
chars = sorted(set(text))
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

# Create input-output pairs for training: every window of sequence_length
# characters is paired with the single character that follows it
sequence_length = 3
X = []
y = []
for i in range(len(text) - sequence_length):
    X.append([char_to_idx[char] for char in text[i:i + sequence_length]])
    y.append(char_to_idx[text[i + sequence_length]])

X = np.array(X)
y = to_categorical(y, num_classes=len(chars))

# Reshape input to (samples, timesteps, features) to be compatible with LSTM input
X = X.reshape((X.shape[0], X.shape[1], 1))

# Define the LSTM model. The input shape is declared with an explicit Input
# layer; passing input_shape= to the first layer is discouraged by current Keras.
model = Sequential()
model.add(tf.keras.Input(shape=(sequence_length, 1)))
model.add(LSTM(50))
model.add(Dense(len(chars), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Train the model
model.fit(X, y, epochs=200, verbose=1)

# Function to generate text using the trained model
def generate_text(model, start_string, num_generate):
    """Greedily extend ``start_string`` one character at a time.

    Uses the notebook-level ``char_to_idx`` / ``idx_to_char`` vocabularies.

    Args:
        model: Trained model whose ``predict`` maps a ``(1, window, 1)`` array
            of character indices to one row of per-character scores.
        start_string: Seed text; it is encoded into the first input window.
        num_generate: Number of characters to append to the seed.

    Returns:
        ``start_string`` followed by the ``num_generate`` generated characters.

    Raises:
        KeyError: If ``start_string`` contains a character that is not in
            ``char_to_idx``.
    """
    input_eval = np.array([char_to_idx[s] for s in start_string])
    input_eval = input_eval.reshape((1, len(start_string), 1))

    text_generated = []

    for _ in range(num_generate):
        # verbose=0: otherwise Keras prints a progress bar for every generated character
        predictions = model.predict(input_eval, verbose=0)
        # The batch holds a single window, so row 0 is the only prediction;
        # pick the highest-scoring character (greedy decoding)
        predicted_id = int(np.argmax(predictions[0]))
        text_generated.append(idx_to_char[predicted_id])

        # Slide the window: drop the oldest character, append the new one
        next_step = np.array(predicted_id).reshape((1, 1, 1))
        input_eval = np.append(input_eval[:, 1:], next_step, axis=1)

    return start_string + ''.join(text_generated)

# Generate new text
# The seed should be sequence_length (3) characters long, matching the window
# size the model was trained on; generate_text raises KeyError for any seed
# character that is missing from char_to_idx
start_string = "hel"
generated_text = generate_text(model, start_string, 5)
print("Generated text:")
print(generated_text)

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  