In [None]:
import nltk
from nltk.corpus import treebank
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, TimeDistributed, Dense, Bidirectional
from sklearn.model_selection import train_test_split

# Fetch the NLTK resources this notebook relies on: the corpus itself and
# the mapping used to translate its tags to the universal tagset.
for resource in ("treebank", "universal_tagset"):
    nltk.download(resource)

# Tagged sentences: each sentence is a sequence of (word, tag) pairs
sentences = treebank.tagged_sents(tagset="universal")

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


In [None]:
# Build vocabularies, encode the corpus as integer sequences, pad, and split.
#
# Index 0 is reserved for padding in BOTH vocabularies. Previously tag indices
# started at 0, so every padded timestep in `y` was indistinguishable from a
# real POS tag (whichever tag happened to get index 0).
PAD_TAG = "<PAD>"

# Flatten the corpus into parallel word / tag lists
words = [word for sentence in sentences for word, tag in sentence]
tags = [tag for sentence in sentences for word, tag in sentence]

# Sort the unique items so that indices are identical on every run
# (iteration order of a set of strings varies between interpreter sessions).
unique_words = sorted(set(words))
# PAD_TAG is part of the tag list so that len(unique_tags) is the number of
# output classes, padding class included.
unique_tags = [PAD_TAG] + sorted(set(tags))

word2idx = {w: i + 1 for i, w in enumerate(unique_words)}  # 0 is the padding index
tag2idx = {t: i for i, t in enumerate(unique_tags)}        # PAD_TAG -> 0

# Convert sentences and tags to numerical indices
X_ids = [[word2idx[w] for w, t in s] for s in sentences]
y_ids = [[tag2idx[t] for w, t in s] for s in sentences]

# Pad sequences to make inputs uniform in length
max_len = 100  # Set max length for padding
X = pad_sequences(X_ids, maxlen=max_len, padding='post')
y_padded = pad_sequences(y_ids, maxlen=max_len, padding='post')

# One-hot encode the tags in a single vectorised call; the result is one array
# with a trailing class axis instead of a Python list of per-sentence arrays.
y = to_categorical(y_padded, num_classes=len(unique_tags))

# Sanity checks: one label row per input row, same padded length
assert X.shape == y_padded.shape
assert y.shape == X.shape + (len(unique_tags),)

# Split the dataset into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Define model parameters
input_dim = len(unique_words) + 1  # Vocabulary size (+1 for padding)
output_dim = len(unique_tags)      # Number of tags
input_length = max_len             # Input sequence length

# Build the model
model = Sequential()
# Declare the input shape with an explicit Input layer rather than the
# `input_length` argument of Embedding, which is deprecated in Keras 3.
model.add(tf.keras.Input(shape=(input_length,)))
# mask_zero=True: word index 0 is the padding index (word2idx starts at 1), so
# padded timesteps are masked out of the recurrent layer, the loss and the
# metrics instead of dominating them.
model.add(Embedding(input_dim=input_dim, output_dim=64, mask_zero=True))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
# One softmax over the tag set per timestep
model.add(TimeDistributed(Dense(output_dim, activation="softmax")))

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Model summary
model.summary()

In [None]:
# Fit the tagger on the training split, holding out 10% of it for validation
fit_options = dict(batch_size=32, epochs=5, validation_split=0.1, verbose=1)
history = model.fit(X_train, np.array(y_train), **fit_options)

Epoch 1/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 309ms/step - accuracy: 0.7576 - loss: 0.8872 - val_accuracy: 0.8430 - val_loss: 0.4890
Epoch 2/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 272ms/step - accuracy: 0.8791 - loss: 0.4080 - val_accuracy: 0.9400 - val_loss: 0.2048
Epoch 3/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 284ms/step - accuracy: 0.9507 - loss: 0.1683 - val_accuracy: 0.9760 - val_loss: 0.0924
Epoch 4/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 277ms/step - accuracy: 0.9824 - loss: 0.0712 - val_accuracy: 0.9850 - val_loss: 0.0561
Epoch 5/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 277ms/step - accuracy: 0.9921 - loss: 0.0364 - val_accuracy: 0.9870 - val_loss: 0.0430


In [None]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, np.array(y_test), verbose=1)
print(f"Test Accuracy: {test_accuracy}")

# Test on a new sentence
test_sentence = "The quick brown fox jumps over the lazy dog"
test_words = test_sentence.split()

# Convert test words to indices and pad
X_test_input = [word2idx.get(w, 0) for w in test_words]  # Use 0 if word not in vocab
# truncating='post' keeps the leading tokens if the sentence exceeds max_len,
# so position k of the prediction always corresponds to test_words[k].
X_test_input = pad_sequences(
    [X_test_input], maxlen=max_len, padding='post', truncating='post'
)

# Predict tags: argmax over the class axis gives one tag index per timestep
predictions = model.predict(X_test_input)
pred_tags = np.argmax(predictions, axis=-1)

# Convert numerical tags back to tag names.
# Keep exactly one prediction per input word by slicing to the sentence length.
# Filtering on `i != 0` instead would silently drop every word whose predicted
# class index is 0 and shift the remaining tags out of alignment with the words.
idx2tag = {idx: tag for tag, idx in tag2idx.items()}  # build the inverse lookup once
n_tokens = min(len(test_words), max_len)
predicted_tags = [idx2tag[int(i)] for i in pred_tags[0][:n_tokens]]

# Print results
print("Sentence:", test_sentence)
print("Predicted POS tags:", predicted_tags)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - accuracy: 0.9867 - loss: 0.0444
Test Accuracy: 0.9864623546600342
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Sentence: The quick brown fox jumps over the lazy dog
Predicted POS tags: ['DET', 'NOUN', 'ADP', 'DET']
