In [4]:
!pip install pandas
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
    


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


ModuleNotFoundError: No module named 'sklearn'

treebank: A large collection of English text that has already been correctly tagged by humans. This is the "answer key" the model will learn from.





In [39]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

universal_tagset: This tells nltk to use a simpler, more universal set of tags (like 'NOUN', 'VERB', 'ADJ') instead of more complex ones (like 'NNP' for proper noun).



In [40]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

This line loads the downloaded treebank data into a single variable called nltk_data. It's now a list of sentences.

In [41]:
nltk_data=list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

This prints the first two sentences from the dataset so you can see what the data looks like. Each sentence is a list of pairs, where each pair is (word, tag)

In [42]:
print(nltk_data[:2])

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


In [43]:
for sent in nltk_data[:2]:
  for tuple in sent:
    print(tuple)

('Pierre', 'NOUN')
('Vinken', 'NOUN')
(',', '.')
('61', 'NUM')
('years', 'NOUN')
('old', 'ADJ')
(',', '.')
('will', 'VERB')
('join', 'VERB')
('the', 'DET')
('board', 'NOUN')
('as', 'ADP')
('a', 'DET')
('nonexecutive', 'ADJ')
('director', 'NOUN')
('Nov.', 'NOUN')
('29', 'NUM')
('.', '.')
('Mr.', 'NOUN')
('Vinken', 'NOUN')
('is', 'VERB')
('chairman', 'NOUN')
('of', 'ADP')
('Elsevier', 'NOUN')
('N.V.', 'NOUN')
(',', '.')
('the', 'DET')
('Dutch', 'NOUN')
('publishing', 'VERB')
('group', 'NOUN')
('.', '.')


The code splits the entire dataset into two parts: a training set (80% of the data) and a test set (20%). The model will learn patterns from the training set and then we'll check how well it performs on the test set, which it has never seen before.

In [44]:
train_set, test_set=train_test_split(nltk_data, train_size=0.80, test_size=0.20, random_state=101)
train_tagged_words=[tup for sent in train_set for tup in sent]
test_tagged_words=[tup for sent in test_set for tup in sent]
print(len(train_tagged_words))
print(len(test_tagged_words))

80310
20366


In [45]:
train_tagged_words[:5]

[('Drink', 'NOUN'),
 ('Carrier', 'NOUN'),
 ('Competes', 'VERB'),
 ('With', 'ADP'),
 ('Cartons', 'NOUN')]

This cell identifies all the unique tags (like 'NOUN', 'VERB', etc.) and all the unique words (the vocabulary) present in the training data. This is needed to build our probability models.

In [46]:
tags={tag for word, tag in train_tagged_words}
print(len(tags))
print(tags)
vocab={word for word, tag in train_tagged_words}

12
{'DET', 'CONJ', '.', 'NUM', 'PRON', 'X', 'ADV', 'ADP', 'NOUN', 'PRT', 'VERB', 'ADJ'}


Defines a function to calculate Emission Probability. In simple terms, it answers the question: "If I know the tag is 'NOUN', what is the probability that the word is 'dog'?" It counts how often a specific word appears with a specific tag

In [47]:
def word_given_tag(word, tag, train_bag=train_tagged_words):
  tag_list=[pair for pair in train_bag if pair[1]==tag]
  count_tag=len(tag_list)
  w_given_tag_list=[pair[0] for pair in tag_list if pair[0]==word]
  count_w_given_tag=len(w_given_tag_list)
  return (count_w_given_tag, count_tag)

Defines a function to calculate Transition Probability. This answers the question: "If the current tag is a 'ADJ' (adjective), what is the probability that the next tag will be a 'NOUN'?" It learns the sequence and structure of the language.

In [48]:
def t2_given_t1(t2, t1, train_bag=train_tagged_words):
  tags=[pair[1] for pair in train_bag]
  count_t1=len([t for t in tags if t==t1])
  count_t2_t1=0
  for index in range(len(tags)-1):
    if tags[index]==t1 and tags[index+1]==t2:
      count_t2_t1+=1
  return (count_t2_t1, count_t1)

This cell uses the function from the previous step to create a big table (a matrix) of all the transition probabilities. It calculates the probability of going from every possible tag to every other possible tag.

In [49]:
tags_matrix=np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
  for j, t2 in enumerate(list(tags)):
    tags_matrix[i, j]=t2_given_t1(t2,t1)[0]/t2_given_t1(t2,t1)[1]
print(tags_matrix)

[[6.03708485e-03 4.31220367e-04 1.73925534e-02 2.28546783e-02
  3.30602261e-03 4.51343954e-02 1.20741697e-02 9.91806854e-03
  6.35906279e-01 2.87480245e-04 4.02472317e-02 2.06410810e-01]
 [1.23490669e-01 5.48847427e-04 3.51262353e-02 4.06147093e-02
  6.03732169e-02 9.33040585e-03 5.70801310e-02 5.59824370e-02
  3.49066973e-01 4.39077942e-03 1.50384188e-01 1.13611415e-01]
 [1.72191828e-01 6.00793920e-02 9.23720598e-02 7.82104954e-02
  6.87694475e-02 2.56410260e-02 5.25694676e-02 9.29084867e-02
  2.18538776e-01 2.78940029e-03 8.96899477e-02 4.61323895e-02]
 [3.57015361e-03 1.42806144e-02 1.19243130e-01 1.84219927e-01
  1.42806140e-03 2.02427700e-01 3.57015361e-03 3.74866128e-02
  3.51660132e-01 2.60621198e-02 2.07068902e-02 3.53445187e-02]
 [9.56719834e-03 5.01138950e-03 4.19134386e-02 6.83371304e-03
  6.83371304e-03 8.83826911e-02 3.69020514e-02 2.23234631e-02
  2.12756261e-01 1.41230067e-02 4.84738052e-01 7.06150308e-02]
 [5.68902567e-02 1.03786280e-02 1.60868734e-01 3.07514891e-03
  5

This simply takes the probability table from the last cell and displays it in a clean, readable format using pandas. You can see, for example, that after an 'ADJ' (adjective), the most likely next tag is a 'NOUN' (with a probability of ~0.70).

In [50]:
tags_df=pd.DataFrame(tags_matrix, columns=list(tags), index=list(tags))
display(tags_df)

Unnamed: 0,DET,CONJ,.,NUM,PRON,X,ADV,ADP,NOUN,PRT,VERB,ADJ
DET,0.006037,0.000431,0.017393,0.022855,0.003306,0.045134,0.012074,0.009918,0.635906,0.000287,0.040247,0.206411
CONJ,0.123491,0.000549,0.035126,0.040615,0.060373,0.00933,0.05708,0.055982,0.349067,0.004391,0.150384,0.113611
.,0.172192,0.060079,0.092372,0.07821,0.068769,0.025641,0.052569,0.092908,0.218539,0.002789,0.08969,0.046132
NUM,0.00357,0.014281,0.119243,0.18422,0.001428,0.202428,0.00357,0.037487,0.35166,0.026062,0.020707,0.035345
PRON,0.009567,0.005011,0.041913,0.006834,0.006834,0.088383,0.036902,0.022323,0.212756,0.014123,0.484738,0.070615
X,0.05689,0.010379,0.160869,0.003075,0.0542,0.075726,0.025754,0.142226,0.061695,0.185086,0.206419,0.017682
ADV,0.071373,0.006982,0.139255,0.029868,0.012025,0.022886,0.081458,0.119472,0.032196,0.01474,0.339022,0.130721
ADP,0.320931,0.001012,0.038724,0.063275,0.069603,0.034548,0.014553,0.016958,0.323589,0.001266,0.008479,0.107062
NOUN,0.013106,0.042454,0.240094,0.009144,0.004659,0.028825,0.016895,0.176827,0.262344,0.043935,0.149134,0.012584
PRT,0.10137,0.002348,0.04501,0.056751,0.017613,0.012133,0.009393,0.019569,0.250489,0.001174,0.401174,0.082975


This cell defines the Viterbi algorithm. Think of this as the "brain" of the tagger. For a new sentence, it takes the words as input. Using the emission and transition probabilities it learned earlier, it calculates the most likely sequence of hidden tags for those words. It's essentially finding the most probable grammatical path through the sentence.

In [51]:
def viterbi(words, train_bag=train_tagged_words):
  state=[]
  T=list(set([pair[1] for pair in train_bag]))
  for key, word in enumerate(words):
    p=[]
    for tag in T:
      if key==0:
        transition_p=tags_df.loc['.', tag]
      else:
        transition_p=tags_df.loc[state[-1], tag]
      emission_p=word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
      state_probability=emission_p*transition_p
      p.append(state_probability)
      pmax=max(p)
      state_max=T[p.index(pmax)]
      state.append(state_max)
  return list(zip(words, state))

To save time, this code randomly picks 10 sentences from the test set to evaluate the Viterbi algorithm on.



In [52]:
random.seed(1234)
rndom=[random.randint(1,len(test_set)) for x in range(10)]
test_run= [test_set[i] for i in rndom]
test_run_base=[tup for sent in test_run for tup in sent]
test_tagged_words=[tup[0] for sent in test_run for tup in sent]

It runs the Viterbi algorithm on the 10 random test sentences and measures how long it takes.

It then compares the tags predicted by the algorithm to the actual correct tags from the test set.

Finally, it calculates and prints the accuracy. The result is quite low (~10.5%), which indicates that this specific implementation is very basic and would need significant improvements to be useful in the real world.

In [53]:
start=time.time()
tagged_seq=viterbi(test_tagged_words)
end=time.time()
difference=end-start
print("Time taken in seconds: ", difference)

check=[i for i, j in zip(tagged_seq, test_run_base) if i == j]
accuracy=len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ', accuracy*100)

Time taken in seconds:  29.11750340461731
Viterbi Algorithm Accuracy:  10.526315789473683
