Aim: Understand POS tagging challenges in informal, noisy text.

Step 1: Install and import libraries

In [None]:
!pip install nltk
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import twitter_samples



Step 2: Download the required NLTK resources

In [None]:
# NLTK data packages requested by this notebook, fetched in this order.
REQUIRED_RESOURCES = ('twitter_samples', 'punkt', 'averaged_perceptron_tagger_eng')

# Download every package and keep the status value nltk.download() returns.
download_status = [nltk.download(resource) for resource in REQUIRED_RESOURCES]

# Leave the status of the final download as the cell's displayed value.
download_status[-1]

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

Step 3: Load the tweets dataset

In [None]:
# Read the tweet strings stored in 'positive_tweets.json' of the
# twitter_samples corpus.
tweets = twitter_samples.strings('positive_tweets.json')

# Preview the first three raw tweets, numbered from 1.
for position in range(1, 4):
    preview = tweets[position - 1]
    print("Tweet", position)
    print(preview)
    print()

Tweet 1
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Tweet 2
@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!

Tweet 3
@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!



Step 4: Tokenization with TweetTokenizer

In [None]:
# Tweet-aware tokenizer: lower-cases tokens and strips @handles (both visible
# in the printed tokens); reduce_len=True additionally enables NLTK's
# length reduction of repeated characters.
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

# Tokenize only the first three tweets so the output stays readable.
sample_tweets = tweets[:3]
tokenized_tweets = list(map(tokenizer.tokenize, sample_tweets))

# Show each raw tweet next to its token list for a side-by-side comparison.
for number, (raw_tweet, tokens) in enumerate(zip(sample_tweets, tokenized_tweets), start=1):
    print(f"Original Tweet {number}:")
    print(raw_tweet)
    print(f"Tokenized Tweet {number} (processed with advanced tokenizer):")
    print(tokens)
    print()

Original Tweet 1:
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
Tokenized Tweet 1 (processed with advanced tokenizer):
['#followfriday', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']

Original Tweet 2:
@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
Tokenized Tweet 2 (processed with advanced tokenizer):
['hey', 'james', '!', 'how', 'odd', ':/', 'please', 'call', 'our', 'contact', 'centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'many', 'thanks', '!']

Original Tweet 3:
@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
Tokenized Tweet 3 (processed with advanced tokenizer):
['we', 'had', 'a', 'listen', 'last', 'night', ':)', 'as', 'you', 'bleed', 'is', 'an', 'amazing', 'track', '.', 'when', 'are', 

Step 5: POS tagging using NLTK

In [None]:
# Informal sample sentence with irregular casing and emoji-like symbols.
# NOTE(review): the symbols after "bhAAi" and "bunty" look like mis-encoded
# emoji (mojibake) in this copy of the notebook — confirm the intended
# characters against the original file.
text = "Akhil bhAAi üòç is playing with bunty üêª "

# Build the tokenizer inside this cell so it runs on a fresh kernel: no other
# cell defines `tokenization`, so calling it directly raises NameError after
# Restart & Run All. Default settings are deliberate: unlike the lower-casing
# `tokenizer` from step 4, TweetTokenizer() keeps the original casing, which
# the recorded tokens ('Akhil', 'bhAAi') rely on.
tokenization = TweetTokenizer()

# Split the sentence into tokens.
tokenized_text = tokenization.tokenize(text)

# Tag every token with a part-of-speech label; the result is a list of
# (token, tag) pairs that the noun/verb extraction step iterates over.
tagset = nltk.pos_tag(tokenized_text)

print("original text: ",text)
print("Tokenaized text: ",tokenized_text)
print("Pos tags: ",tagset)

original text:  Akhil bhAAi üòç is playing with bunty üêª 
Tokenaized text:  ['Akhil', 'bhAAi', 'üòç', 'is', 'playing', 'with', 'bunty', 'üêª']
Pos tags:  [('Akhil', 'NNP'), ('bhAAi', 'NN'), ('üòç', 'NN'), ('is', 'VBZ'), ('playing', 'VBG'), ('with', 'IN'), ('bunty', 'NN'), ('üêª', 'NN')]


Step 6: Extract nouns and verbs

In [None]:
# Split the tagged tokens by tag prefix: tags starting with 'NN' are collected
# as nouns and tags starting with 'VB' as verbs; every other tag is ignored.
nouns = [token for token, pos in tagset if pos.startswith('NN')]
verbs = [token for token, pos in tagset if pos.startswith('VB')]

print("Nouns: ",nouns)
print("Verbs: ",verbs)

Nouns:  ['Akhil', 'bhAAi', 'üòç', 'bunty', 'üêª']
Verbs:  ['is', 'playing']
