# Part-of-Speech (POS) Tagging

In [None]:
import nltk
from nltk import word_tokenize, pos_tag

In [None]:
# download pos tagger
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# sample text
text = "The quick brown fox jumps over the lazy dog."

In [None]:
# tokenization
tokens = word_tokenize(text)
print(tokens)

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']


In [None]:
# POS Tagging
pos_tags = pos_tag(tokens)
print(pos_tags)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


# Named Entity Recognition (NER)

In [None]:
import spacy

In [None]:
# load english language model
nlp = spacy.load('en_core_web_sm')

In [None]:
# sample text
text = "Apple Inc. was founded by Steve Jobs in Cupertino, California."

In [None]:
# process text
doc = nlp(text)

In [None]:
# extract and display named entities
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple Inc. ORG
Steve Jobs PERSON
Cupertino GPE
California GPE


# Sentiment Analysis

In [None]:
from textblob import TextBlob

In [None]:
# sample text
text = "I love this product! It's amazing."

In [None]:
# create a text blob object
blob = TextBlob(text)

In [None]:
# sentiment analysis
print(f"Sentiment Polarity: {blob.sentiment.polarity}")
print(f"Sentiment Subjectivity: {blob.sentiment.subjectivity}")

Sentiment Polarity: 0.6125
Sentiment Subjectivity: 0.75


# N-grams

In [None]:
import nltk
from nltk import ngrams
from collections import Counter
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# sample text
text = "The quick brown fox jumps over the lazy dog."

In [None]:
# tokenization
tokens = nltk.word_tokenize(text)

In [None]:
# generate bigrams
# Materialise as a list: ngrams() yields its items only once, so a bare
# generator would be exhausted after the display loop runs and re-running
# that cell would print nothing.
bigrams = list(ngrams(tokens, 2))

In [None]:
# display bigrams
for bigram in bigrams:
    print(bigram)

('The', 'quick')
('quick', 'brown')
('brown', 'fox')
('fox', 'jumps')
('jumps', 'over')
('over', 'the')
('the', 'lazy')
('lazy', 'dog')
('dog', '.')


In [None]:
# frequency of n-grams
bigram_freq = Counter(ngrams(tokens, 2))
print(bigram_freq)

Counter({('The', 'quick'): 1, ('quick', 'brown'): 1, ('brown', 'fox'): 1, ('fox', 'jumps'): 1, ('jumps', 'over'): 1, ('over', 'the'): 1, ('the', 'lazy'): 1, ('lazy', 'dog'): 1, ('dog', '.'): 1})


# Language Modeling

In [None]:
from nltk import FreqDist

In [None]:
# sample text
corpus = "I love data science. I enjoy data analysis. Data science is fun."

In [None]:
# tokenization
tokens = nltk.word_tokenize(corpus)

In [None]:
# calculate bigram probabilities
bigrams = list(nltk.bigrams(tokens))
bigram_freq = FreqDist(bigrams)
word_freq = FreqDist(tokens)

In [None]:
# probability of "science" following "data"
prob = bigram_freq[('data', 'science')] / word_freq['data']
print(f"Probability of 'science' following 'data': {prob}")

Probability of 'science' following 'data': 0.5


# Task 1:

## Use spaCy to perform POS tagging and NER on a news article

In [1]:
import spacy

In [2]:
# load english language model
nlp = spacy.load('en_core_web_sm')

In [3]:
# Sample News Article
news_article = """
Elon Musk, CEO of Tesla Inc., announced that the company will invest $1.5 billion in Bitcoin.
This decision aims to diversify Tesla's investment portfolio and boost digital currency adoption.
The announcement led to Bitcoin's price surging by 15% in a single day.
"""

In [4]:
# process the text
doc = nlp(news_article)

In [5]:
# pos tagging
print("=== POS Tagging ===")
for token in doc:
    print(f"{token.text:<15} | {token.pos_:<10} | {token.tag_:<15}")

=== POS Tagging ===

               | SPACE      | _SP            
Elon            | PROPN      | NNP            
Musk            | PROPN      | NNP            
,               | PUNCT      | ,              
CEO             | NOUN       | NN             
of              | ADP        | IN             
Tesla           | PROPN      | NNP            
Inc.            | PROPN      | NNP            
,               | PUNCT      | ,              
announced       | VERB       | VBD            
that            | SCONJ      | IN             
the             | DET        | DT             
company         | NOUN       | NN             
will            | AUX        | MD             
invest          | VERB       | VB             
$               | SYM        | $              
1.5             | NUM        | CD             
billion         | NUM        | CD             
in              | ADP        | IN             
Bitcoin         | PROPN      | NNP            
.               | PUNCT      | .        

In [6]:
# Named Entity Recognition (NER)
print("\n=== Named Entity Recognition (NER) ===")
for ent in doc.ents:
    print(f"{ent.text:<25} | {ent.label_:<15} | {spacy.explain(ent.label_)}")


=== Named Entity Recognition (NER) ===
Elon Musk                 | PERSON          | People, including fictional
Tesla Inc.                | ORG             | Companies, agencies, institutions, etc.
$1.5 billion              | MONEY           | Monetary values, including unit
Bitcoin                   | PERSON          | People, including fictional
Tesla                     | ORG             | Companies, agencies, institutions, etc.
Bitcoin                   | PERSON          | People, including fictional
15%                       | PERCENT         | Percentage, including "%"
a single day              | DATE            | Absolute or relative dates or periods


# Task 2:

## Build a basic sentiment analysis tool for movie reviews using the pre-trained TextBlob model.

In [7]:
from textblob import TextBlob
import pandas as pd

In [8]:
# Sample movie reviews dataset
data = {
    'Review': [
        "The movie was absolutely fantastic! I loved the characters and the plot.",
        "Horrible movie. Waste of time and money.",
        "An average movie with decent performances.",
        "Brilliant cinematography but weak storyline.",
        "I enjoyed the humor, but the pacing was too slow."
    ]
}

# Convert to DataFrame
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Review
0,The movie was absolutely fantastic! I loved th...
1,Horrible movie. Waste of time and money.
2,An average movie with decent performances.
3,Brilliant cinematography but weak storyline.
4,"I enjoyed the humor, but the pacing was too slow."


In [9]:
# function to classify sentiment
def get_sentiment(text, neutral_band=0.0):
    """Classify ``text`` as 'Positive', 'Neutral' or 'Negative'.

    The label is derived from the TextBlob polarity score of ``text``.

    Args:
        text: The review text to score.
        neutral_band: Half-width of the polarity interval around zero that is
            reported as 'Neutral'. With the default of 0.0 only a polarity of
            exactly zero is 'Neutral'.

    Returns:
        One of the strings 'Positive', 'Neutral' or 'Negative'.

    Raises:
        ValueError: If ``neutral_band`` is negative.
    """
    if neutral_band < 0:
        raise ValueError("neutral_band must be non-negative")

    # Read the polarity once and reuse it for every comparison.
    polarity = TextBlob(text).sentiment.polarity
    if polarity > neutral_band:
        return 'Positive'
    if abs(polarity) <= neutral_band:
        return 'Neutral'
    return 'Negative'

In [10]:
# add sentiment column
df['Sentiment'] = df['Review'].apply(get_sentiment)

In [11]:
df

Unnamed: 0,Review,Sentiment
0,The movie was absolutely fantastic! I loved th...,Positive
1,Horrible movie. Waste of time and money.,Negative
2,An average movie with decent performances.,Positive
3,Brilliant cinematography but weak storyline.,Positive
4,"I enjoyed the humor, but the pacing was too slow.",Positive


# Task 3:

## Generate bigrams and trigrams from a text corpus and analyze frequent phrases.

In [13]:
import nltk
from nltk.util import ngrams
from collections import Counter
from nltk import word_tokenize
import re
from nltk.corpus import stopwords

nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
# Sample text corpus (Movie Reviews)
text = """
The movie was absolutely fantastic! The characters were well-developed, and the plot had unexpected twists.
I enjoyed the film's visuals, but the storyline was somewhat predictable.
Despite some flaws, the performances were exceptional. Overall, a great watch!
"""

In [15]:
# Text cleaning function
def clean_text(text):
    """Normalise ``text`` and return its non-stopword tokens.

    Steps: lowercase, delete every character that is neither a word
    character nor whitespace, tokenize with ``word_tokenize``, then drop
    English stopwords.

    Punctuation is deleted rather than replaced by a space, so hyphenated
    words are fused (e.g. "well-developed" becomes "welldeveloped").

    Args:
        text: Raw input string.

    Returns:
        list: The remaining tokens, in their original order.
    """
    text = text.lower()                                # Lowercase
    text = re.sub(r'[^\w\s]', '', text)                # Remove special characters
    tokens = word_tokenize(text)                       # Tokenization
    # Build the stopword set once, outside the filter, so each token costs
    # a single constant-time membership test.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return tokens

In [16]:
# Generate Bigrams
def get_bigrams(tokens):
    """Return the bigrams (2-grams) of ``tokens`` as a list of tuples."""
    return list(ngrams(tokens, 2))

# Generate Trigrams
def get_trigrams(tokens):
    """Return the trigrams (3-grams) of ``tokens`` as a list of tuples."""
    return list(ngrams(tokens, 3))

In [17]:
# Clean the corpus and split it into tokens
tokens = clean_text(text)

# Build each n-gram list and tally how often every n-gram occurs
bigrams = get_bigrams(tokens)
bigram_freq = Counter(bigrams)
trigrams = get_trigrams(tokens)
trigram_freq = Counter(trigrams)

# Report the five most frequent bigrams, then the five most frequent trigrams
for heading, freq in (("Top 5 Bigrams:", bigram_freq),
                      ("\nTop 5 Trigrams:", trigram_freq)):
    print(heading)
    for gram, count in freq.most_common(5):
        print(f"{' '.join(gram)}: {count}")


Top 5 Bigrams:
movie absolutely: 1
absolutely fantastic: 1
fantastic characters: 1
characters welldeveloped: 1
welldeveloped plot: 1

Top 5 Trigrams:
movie absolutely fantastic: 1
absolutely fantastic characters: 1
fantastic characters welldeveloped: 1
characters welldeveloped plot: 1
welldeveloped plot unexpected: 1
