# Task 1 – NLP Pipeline

Import Libraries

In [17]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, movie_reviews
from nltk import FreqDist, NaiveBayesClassifier
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.classify import accuracy
import random

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('movie_reviews')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

Step 2: Input Text

In [3]:

paragraph = """Artificial Intelligence and Machine Learning are transforming the world.
They help automate tasks, improve decision-making, and enhance user experiences across industries."""
print("Paragraph:\n", paragraph)


Paragraph:
 Artificial Intelligence and Machine Learning are transforming the world.
They help automate tasks, improve decision-making, and enhance user experiences across industries.


Step 3: Tokenization

In [4]:



tokens = word_tokenize(paragraph)
print("After Tokenization:\n", tokens)


After Tokenization:
 ['Artificial', 'Intelligence', 'and', 'Machine', 'Learning', 'are', 'transforming', 'the', 'world', '.', 'They', 'help', 'automate', 'tasks', ',', 'improve', 'decision-making', ',', 'and', 'enhance', 'user', 'experiences', 'across', 'industries', '.']


Step 4: Stopword Removal

In [5]:



stop_words = set(stopwords.words('english'))

filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word.isalpha()]

print("After Stopword Removal:\n", filtered_tokens)


After Stopword Removal:
 ['Artificial', 'Intelligence', 'Machine', 'Learning', 'transforming', 'world', 'help', 'automate', 'tasks', 'improve', 'enhance', 'user', 'experiences', 'across', 'industries']


Step 5: Stemming

In [6]:
ps = PorterStemmer()
stemmed_words = [ps.stem(word) for word in filtered_tokens]
print("\n After Stemming:\n", stemmed_words)


 After Stemming:
 ['artifici', 'intellig', 'machin', 'learn', 'transform', 'world', 'help', 'autom', 'task', 'improv', 'enhanc', 'user', 'experi', 'across', 'industri']


Step 6: Lemmatization

In [7]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("\n After Lemmatization:\n", lemmatized_words)


 After Lemmatization:
 ['Artificial', 'Intelligence', 'Machine', 'Learning', 'transforming', 'world', 'help', 'automate', 'task', 'improve', 'enhance', 'user', 'experience', 'across', 'industry']


# Task 2 – Sentiment Analysis

In [8]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [9]:
random.shuffle(documents)

print("Total Reviews:", len(documents))
print("Sample Data:", documents[0][:100])

Total Reviews: 2000
Sample Data: (['the', 'last', 'line', '(', 'or', 'near', 'to', 'that', 'honor', ')', 'is', 'the', 'great', 'butler', ',', 'alfred', '(', 'the', 'ubergod', ',', 'michael', 'gough', ')', 'saying', ',', '"', 'i', 'think', 'we', 'need', 'a', 'bigger', 'bat', 'cave', ',', '"', 'or', 'something', 'to', 'that', 'note', '.', 'that', "'", 's', 'exactly', 'what', 'this', 'film', 'is', '-', 'too', 'big', 'for', 'its', 'own', 'good', 'because', 'it', 'has', 'too', 'damn', 'much', '.', 'cut', 'batgirl', 'out', '.', 'cut', 'one', 'of', 'the', 'villains', '.', 'it', "'", 's', 'too', 'much', 'to', 'handle', 'in', 'one', 'dosage', '.', 'it', "'", 's', 'so', 'much', 'that', 'characters', 'get', 'left', 'behind', '.', 'poor', 'elle', 'gets', 'a', 'mere', '3', 'scenes', 'and', 'a', 'subplot', 'which', 'is', 'introduced', 'but', 'never', 'finished', 'in', 'any', 'way', ',', 'shape', 'or', 'form', '.', 'and', 'elle', 'deserves', 'better', '.', 'this', 'is', 'the', 'fourth', 'in', 'the', 

In [11]:
all_words = FreqDist(w.lower() for w in movie_reviews.words())

In [12]:
word_features = list(all_words)[:2000]

In [13]:
def document_features(document):
    words = set(document)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in words)
    return features

In [15]:
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[:1600], featuresets[1600:]

In [18]:
# Train Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)

# Evaluate accuracy
print("Accuracy:", accuracy(classifier, test_set))

# Show most informative words
classifier.show_most_informative_features(10)

Accuracy: 0.805
Most Informative Features
   contains(outstanding) = True              pos : neg    =      8.3 : 1.0
         contains(damon) = True              pos : neg    =      7.2 : 1.0
        contains(seagal) = True              neg : pos    =      7.1 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.1 : 1.0
        contains(poorly) = True              neg : pos    =      5.8 : 1.0
        contains(wasted) = True              neg : pos    =      5.8 : 1.0
         contains(awful) = True              neg : pos    =      5.7 : 1.0
         contains(waste) = True              neg : pos    =      5.5 : 1.0
           contains(era) = True              pos : neg    =      5.1 : 1.0
            contains(it) = False             neg : pos    =      5.1 : 1.0


In [19]:
test_text = "The movie was fantastic! The performances were amazing and the story was touching."
custom_features = document_features(test_text.split())
print("Predicted Sentiment:", classifier.classify(custom_features))


Predicted Sentiment: neg
