<a href="https://colab.research.google.com/github/kafSaugat7/NLP/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords, movie_reviews
from nltk import pos_tag, ne_chunk
from nltk import FreqDist
import random

# Fetch every NLTK data package the demos below rely on, in a fixed order:
# tokenizer models, the VADER lexicon, the NE chunker and its word list,
# the movie review corpus, the stopword lists, and the POS tagger.
for resource in (
    "punkt",
    "vader_lexicon",
    "maxent_ne_chunker",
    "words",
    "movie_reviews",
    "stopwords",
    "averaged_perceptron_tagger",
):
    nltk.download(resource)

# Sample sentence shared by the tokenization and stopword demos
example_string = "CSIT is a popular undergraduate program in Nepal that equips students with the skills to excel in the field of computer science and information technology."


# Tokenization: split into sentences first, then into word tokens
print("Sentence Tokenization:", sent_tokenize(example_string), sep="\n")
words_in_example = word_tokenize(example_string)
print("Word Tokenization:", words_in_example, sep="\n")


# Stopword removal: keep only tokens whose case-folded form is not an
# English stopword (punctuation tokens are not stopwords, so they remain)
stop_words = set(stopwords.words("english"))
filtered_list = [
    token
    for token in words_in_example
    if token.casefold() not in stop_words
]
print("Filtered List (without stopwords):", filtered_list, sep="\n")


# Sentiment Analysis: score one sentence with the VADER-based analyzer
sia = SentimentIntensityAnalyzer()
sentiment_text = "Python is an awesome programming language."
print(
    "Sentiment Analysis Scores:",
    sia.polarity_scores(sentiment_text),
    sep="\n",
)


# Named Entity Recognition (NER): tokenize -> POS-tag -> chunk into entities
ner_text = "It's a dangerous business, Frodo, going out your door."
words_ner = word_tokenize(ner_text)
tagged_words = pos_tag(words_ner)
named_entities = ne_chunk(tagged_words)
# Printing the chunk result shows the tagged tokens with entity subtrees
print("Named Entities:", named_entities, sep="\n")


# Frequency Distribution: count how often each token occurs in a passage.
# Tokens are counted exactly as produced by word_tokenize, so punctuation
# is included and differently-cased forms are counted separately.
frq_sen = """Untrained neural network models are much like new-born babies:
They are created ignorant of the world (if considering tabula rasa epistemological theory),
and it is only through exposure to the world, i.e. a posteriori knowledge, that their ignorance is slowly revised."""
words_freq = word_tokenize(frq_sen)
frequency_distribution = FreqDist(words_freq)

# Report the 20 most frequent tokens as (token, count) pairs
print(
    "Frequency Distribution Most Common Words:",
    frequency_distribution.most_common(20),
    sep="\n",
)


# Text Classification
# Build one (token list, category label) pair per review file, walking the
# corpus category by category, then shuffle the pairs in place.
documents = [
    (list(movie_reviews.words(review_id)), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]
random.shuffle(documents)

# Create a feature extractor
def extract_features(words):
    """Build a bag-of-words feature dict from an iterable of tokens.

    Every token is case-folded before it becomes a feature key, so tokens
    taken from free text (for example ``"The"`` or ``"I"`` as produced by
    ``word_tokenize``) map to the same feature as their lower-case form
    instead of becoming separate keys.

    Args:
        words: Iterable of string tokens.

    Returns:
        A dict mapping each distinct case-folded token to ``True``.
    """
    # NOTE(review): the movie_reviews tokens used for training look to be
    # lower-cased already, which would make this a no-op for the training
    # data and only affect new, mixed-case input — confirm.
    return {word.casefold(): True for word in words}

# Turn every (tokens, label) document into a (feature dict, label) pair
featuresets = [
    (extract_features(tokens), label)
    for tokens, label in documents
]

# First 1500 pairs are used for training, the remainder for evaluation
train_set = featuresets[:1500]
test_set = featuresets[1500:]

# Fit a Naive Bayes classifier on the training portion
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Measure accuracy on the held-out portion and report it
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Accuracy of Classifier:", accuracy)


# Classify a new review: tokenize it, build its feature dict with the same
# extractor used for training, and print the label the classifier assigns
new_review = "The movie was fantastic and I loved the characters."
new_features = extract_features(word_tokenize(new_review))
predicted_label = classifier.classify(new_features)
print("Classification of New Review:", predicted_label)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Sentence Tokenization:
['CSIT is a popular undergraduate program in Nepal that equips students with the skills to excel in the field of computer science and information technology.']
Word Tokenization:
['CSIT', 'is', 'a', 'popular', 'undergraduate', 'program', 'in', 'Nepal', 'that', 'equips', 'students', 'with', 'the', 'skills', 'to', 'excel', 'in', 'the', 'field', 'of', 'computer', 'science', 'and', 'information', 'technology', '.']
Filtered List (without stopwords):
['CSIT', 'popular', 'undergraduate', 'program', 'Nepal', 'equips', 'students', 'skills', 'excel', 'field', 'computer', 'science', 'information', 'technology', '.']
Sentiment Analysis Scores:
{'neg': 0.0, 'neu': 0.549, 'pos': 0.451, 'compound': 0.6249}
Named Entities:
(S
  It/PRP
  's/VBZ
  a/DT
  dangerous/JJ
  business/NN
  ,/,
  (PERSON Frodo/NNP)
  ,/,
  going/VBG
  out/RP
  your/PRP$
  door/NN
  ./.)
Frequency Distribution Most Common Words:
[(',', 3), ('are', 2), ('the', 2), ('world', 2), ('is', 2), ('.', 2), ('Untra