In [1]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [2]:
from nltk.tag.perceptron import PerceptronTagger
tagger = PerceptronTagger()

In [3]:
tagger.tag('How does this bear on the question ?'.split())

[('How', 'WRB'),
 ('does', 'VBZ'),
 ('this', 'DT'),
 ('bear', 'NN'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('question', 'NN'),
 ('?', '.')]

Bear is tagged as a Noun(NN) when it is a Verb(VB)

In [4]:
tagger.tag('When you quiet, we can start talking .'.split())

[('When', 'WRB'),
 ('you', 'PRP'),
 ('quiet,', 'VBP'),
 ('we', 'PRP'),
 ('can', 'MD'),
 ('start', 'VB'),
 ('talking', 'VBG'),
 ('.', '.')]

Quiet is correctly tagged as a Verb(VBP)

In [6]:
#from google.colab import files
#uploaded = files.upload()

In [7]:
# Read the uploaded Brown Corpus text file
file_name = 'BrownCorpus.txt'
with open(file_name, 'r') as file:
    brown_corpus = file.readlines()

In [8]:
# Read the conversion file and create a mapping dictionary
pos_conversion_dict = {}

with open('BrownToUniversalTagMap.txt','r') as f:
  for line in f:
      original_tag, universal_tag = line.split()
      pos_conversion_dict[original_tag] = universal_tag

# Check the conversion dictionary
print(pos_conversion_dict)

{"'": '.', "''": '.', '(': '.', '(-HL': '.', ')': '.', ')-HL': '.', '*': 'ADV', '*-HL': 'ADV', '*-NC': 'ADV', '*-TL': 'ADV', ',': '.', ',-HL': '.', ',-NC': '.', ',-TL': '.', '--': '.', '---HL': '.', '.': '.', '.-HL': '.', '.-NC': '.', '.-TL': '.', ':': '.', ':-HL': '.', ':-TL': '.', 'ABL': 'PRT', 'ABN': 'PRT', 'ABN-HL': 'PRT', 'ABN-NC': 'PRT', 'ABN-TL': 'PRT', 'ABX': 'DET', 'AP': 'ADJ', 'AP$': 'PRT', 'AP+AP-NC': 'ADJ', 'AP-HL': 'ADJ', 'AP-NC': 'ADJ', 'AP-TL': 'ADJ', 'AT': 'DET', 'AT-HL': 'DET', 'AT-NC': 'DET', 'AT-TL': 'DET', 'AT-TL-HL': 'DET', 'BE': 'VERB', 'BE-HL': 'VERB', 'BE-TL': 'VERB', 'BED': 'VERB', 'BED*': 'VERB', 'BED-NC': 'VERB', 'BEDZ': 'VERB', 'BEDZ*': 'VERB', 'BEDZ-HL': 'VERB', 'BEDZ-NC': 'VERB', 'BEG': 'VERB', 'BEM': 'VERB', 'BEM*': 'VERB', 'BEM-NC': 'VERB', 'BEN': 'VERB', 'BEN-TL': 'VERB', 'BER': 'VERB', 'BER*': 'VERB', 'BER*-NC': 'VERB', 'BER-HL': 'VERB', 'BER-NC': 'VERB', 'BER-TL': 'VERB', 'BEZ': 'VERB', 'BEZ*': 'VERB', 'BEZ-HL': 'VERB', 'BEZ-NC': 'VERB', 'BEZ-TL': 'VE

In [9]:
def process_sentence(sentence):
  processed_sentence = []

  for word_tag in sentence:
    word,tag = word_tag
    if tag in pos_conversion_dict:
      tag = pos_conversion_dict[tag]
    processed_sentence.append((word.lower(),tag))

  return processed_sentence

sentences = []
for line in brown_corpus:
  word_tags = line.strip().split()
  sentence = [tuple(word_tag.split('_')) for word_tag in word_tags]
  processed_sentence = process_sentence(sentence)
  sentences.append(processed_sentence)

In [10]:
# Calculate the index for the 80/20 split
split_index = int(0.8 * len(sentences))

# Training set (first 80% of sentences)
training_set = sentences[:split_index]

# Testing set (last 20% of sentences)
testing_set = sentences[split_index:]

print(f"Training set size: {len(training_set)} sentences")
print(f"Testing set size: {len(testing_set)} sentences")

print("First 5 sentences in training set:")
for sentence in training_set[:5]:
    print(sentence)

print("First 5 sentences in testing set:")
for sentence in testing_set[:5]:
    print(sentence)


Training set size: 45652 sentences
Testing set size: 11414 sentences
First 5 sentences in training set:
[('the', 'DET'), ('fulton', 'NOUN'), ('county', 'NOUN'), ('grand', 'ADJ'), ('jury', 'NOUN'), ('said', 'VERB'), ('friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('no', 'DET'), ('evidence', 'NOUN'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')]
[('the', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('city', 'NOUN'), ('executive', 'ADJ'), ('committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CO

In [11]:
from nltk.corpus import treebank
from nltk.tag.mapping import map_tag
nltk.download('universal_tagset')

# Run the tagger on the test set and convert the Penn tags to Universal tags
converted_test_set = []

for sentence in testing_set:
  # Extract words from the sentence for tagging with the perceptron tagger
  words = [word for word, _ in sentence]

  # Tag the words using the perceptron tagger
  tagged_sentence = tagger.tag(words)

  # Map each Penn Treebank POS tag to the Universal POS tag using map_tag
  converted_sentence = [(word, map_tag('en-ptb','universal', pos)) for word, pos
                        in tagged_sentence]

  converted_test_set.append(converted_sentence)

for sentence in converted_test_set[:5]:
  print(sentence)

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


[('they', 'PRON'), ('are', 'VERB'), ('set', 'VERB'), ('forth', 'NOUN'), ('in', 'ADP'), ('your', 'PRON'), ('own', 'ADJ'), ('newspapers', 'NOUN'), ('.', '.')]
[('you', 'PRON'), ('want', 'VERB'), ('from', 'ADP'), ('me', 'PRON'), ('the', 'DET'), ('story', 'NOUN'), (',', '.'), ('but', 'CONJ'), ('a', 'DET'), ('story', 'NOUN'), ('is', 'VERB'), ('about', 'ADP'), ('why', 'ADV'), ('and', 'CONJ'), ('then', 'ADV'), (',', '.'), ('perhaps', 'ADV'), (',', '.'), ('about', 'ADP'), ('how', 'ADV'), ('.', '.')]
[('the', 'DET'), ('when', 'ADV'), ('you', 'PRON'), ('know', 'VERB'), (';', '.')]
[('yesterday', 'NOUN'), ('morning', 'NOUN'), ('.', '.')]
[('so', 'ADP'), ('what', 'PRON'), ('i', 'NOUN'), ('am', 'VERB'), ('trying', 'VERB'), ('to', 'PRT'), ('tell', 'VERB'), ('you', 'PRON'), ('is', 'VERB'), ('the', 'DET'), ('why', 'ADV'), ('--', '.'), ('that', 'DET'), ('is', 'VERB'), ('my', 'PRON'), ('point', 'NOUN'), ('--', '.'), ('and', 'CONJ'), ('that', 'ADP'), ('concerns', 'VERB'), ('the', 'DET'), ('spirit', 'NOUN

In [12]:
true_positives = 0
false_positives = 0
false_negatives = 0

for i in range(len(testing_set)):
  true_sentence = testing_set[i]
  predicted_sentence = converted_test_set[i]

  #Iterate over words and compare POS tags
  for true_word, true_tag in true_sentence:
    # Find the corresponding predicted word
    predicted_tag = None

    for word, pred_tag in predicted_sentence:
      if true_word == word:
        predicted_tag = pred_tag
        break

    if predicted_tag == true_tag:
      true_positives += 1
    elif predicted_tag != true_tag and predicted_tag is not None:
      false_positives += 1
    elif predicted_tag != true_tag and predicted_tag is None:
      false_negatives += 1

precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1_score = 2 * (precision * recall) / (precision + recall)
accuracy = true_positives / (true_positives + false_positives + false_negatives)

print(f"True Positives: {true_positives}")
print(f"False Positives: {false_positives}")
print(f"False Negatives: {false_negatives}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Accuracy: {accuracy}")

True Positives: 154211
False Positives: 21830
False Negatives: 0
Precision: 0.8759947966666856
Recall: 1.0
F1 Score: 0.9338989620047722
Accuracy: 0.8759947966666856
