<a href="https://colab.research.google.com/github/kshitizkool/Jupyter/blob/main/NLP_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install nltk



In [2]:
import nltk

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

#Text Processing

## Tokenization

In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize
text="NLTK is a powerful tool for NLP tasks. It can handle tokenization effectively."
words=word_tokenize(text)
sentences=sent_tokenize(text)

print("Words:", words)
print("Sentences", sentences)


Words: ['NLTK', 'is', 'a', 'powerful', 'tool', 'for', 'NLP', 'tasks', '.', 'It', 'can', 'handle', 'tokenization', 'effectively', '.']
Sentences ['NLTK is a powerful tool for NLP tasks.', 'It can handle tokenization effectively.']


## Stopword Removal

In [5]:
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))
filtered_words=[word for word in words if word.lower() not in stop_words]

print("Filtered Words:", filtered_words)

Filtered Words: ['NLTK', 'powerful', 'tool', 'NLP', 'tasks', '.', 'handle', 'tokenization', 'effectively', '.']


## Stemming and Lemmetization

In [6]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

porter_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words= ["running","ran","jumps","jumping"]
stemmed_words = [porter_stemmer.stem(word) for word in words]
lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]

print("Stemmed Words:", stemmed_words)
print("Lemmatized Words:", lemmatized_words)

Stemmed Words: ['run', 'ran', 'jump', 'jump']
Lemmatized Words: ['run', 'run', 'jump', 'jump']


## Part-of-Speech (POS) Tagging

In [7]:
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [8]:
tagged_words = pos_tag(words)
print("POS Tagging:", tagged_words)

POS Tagging: [('running', 'VBG'), ('ran', 'VBD'), ('jumps', 'NNS'), ('jumping', 'VBG')]


## Named Entity Recognition(NER)

In [9]:
from nltk import ne_chunk
nltk.download('maxent_ne_chunker')
nltk.download('words')

sentence = "My nickname is Kshitiz Kool and I was born in Ramgarh, but I live in Kuju"

tagged_sentence=pos_tag(word_tokenize(sentence))
named_entities=ne_chunk(tagged_sentence)

print("Named Entities:", named_entities)

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


Named Entities: (S
  My/PRP$
  nickname/NN
  is/VBZ
  (PERSON Kshitiz/NNP Kool/NNP)
  and/CC
  I/PRP
  was/VBD
  born/VBN
  in/IN
  (GPE Ramgarh/NNP)
  ,/,
  but/CC
  I/PRP
  live/VBP
  in/IN
  (GPE Kuju/NNP))


#NLTK CORPORA

In [10]:
from nltk.corpus import gutenberg

nltk.download('gutenberg')

print(gutenberg.fileids())


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [11]:
emma = gutenberg.words('austen-emma.txt')
num_words= len(emma)
num_sentence = len(gutenberg.sents('austen-emma.txt'))
avg_word_per_sentence = num_words / num_sentence

print("Number of words:", num_words)
print("Number of Sentences:", num_sentence)
print(" Average Words per sentence", avg_word_per_sentence)

Number of words: 192427
Number of Sentences: 7752
 Average Words per sentence 24.822884416924666


## WordNet
lexical database provivding semantic relationships between words, such as synonyms, hypernys, hyponyms, etc.

In [12]:
from nltk.corpus import wordnet
synsets = wordnet.synsets('happy')
synonyms = [syn.lemmas()[0].name() for syn in synsets]

synsets_dog = wordnet.synsets('dog')
hypernyms = synsets_dog[0].hypernyms()

print("Synonyms of happy:", synonyms)
print("Hypernyms of dog:", hypernyms)

Synonyms of happy: ['happy', 'felicitous', 'glad', 'happy']
Hypernyms of dog: [Synset('canine.n.02'), Synset('domestic_animal.n.01')]


## Using Lexical Resource

In [13]:
defi = wordnet.synset('dog.n.01').definition()
ex = wordnet.synset('dog.n.01').examples()

print("Definition of dog",defi)
print("examples of dogs",ex)

Definition of dog a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds
examples of dogs ['the dog barked all night']


# Text Classification with NLTK

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
documents=["NLTK is a powerful tool for NLP tasks.",
           "Sentiment analysis heps to understand user feelings",
           "Topic modeling finds hidden patterns in data."]

vectorizer=CountVectorizer()
feature_matrix = vectorizer.fit_transform(documents)

print("Feature Matrix")
print(feature_matrix.toarray())
print("Vocabulary:", vectorizer.get_feature_names_out())

Feature Matrix
[[0 0 0 0 1 0 0 0 1 0 1 1 0 1 0 1 0 1 0 0 0]
 [1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1]
 [0 1 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 1 0 0]]
Vocabulary: ['analysis' 'data' 'feelings' 'finds' 'for' 'heps' 'hidden' 'in' 'is'
 'modeling' 'nlp' 'nltk' 'patterns' 'powerful' 'sentiment' 'tasks' 'to'
 'tool' 'topic' 'understand' 'user']


## Building a text classifier using NLTK

In [15]:
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

positive_reviews = [(movie_reviews.words(fileid), 'positive') for fileid in movie_reviews.fileids('pos')]
negetive_reviews = [(movie_reviews.words(fileid), 'negetive') for fileid in movie_reviews.fileids('neg')]
reviews = positive_reviews + negetive_reviews
def extract_features(words):
  return dict([(word, True) for word in words])

feature_sets = [(extract_features(words), sentiment) for(words, sentiment) in reviews]

train_sets = feature_sets[:800]
test_sets = feature_sets[800:]

classifier = NaiveBayesClassifier.train(train_sets)
accuracy = nltk_accuracy(classifier, test_sets)

print("Accuracy:", accuracy)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


Accuracy: 0.16666666666666666


## Evaluating the text classifier

In [16]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, log_loss
import matplotlib.pyplot as plt

In [17]:
#wrong

from nltk.metrics import ConfusionMatrix

true_labels = [sentiment for (_, sentiment) in test_sets]
predicted_labels = [classifier.classify(features) for (features, _) in test_sets]
cm=ConfusionMatrix(true_labels, predicted_labels)

print("Confusion Matrix")
print(cm)

precision = cm['positive','positive'] / cm['positive','positive'] + cm['negetive','positive']
recall = cm['positive','positive'] / cm['positive','positive'] + cm['positive','negetive']
f1_score = 2*(precision*recall)/(precision+recall)

print("precision:", precision)
print("recall:", recall)
print("f1_score", f1_score)

Confusion Matrix
         |    n    p |
         |    e    o |
         |    g    s |
         |    e    i |
         |    t    t |
         |    i    i |
         |    v    v |
         |    e    e |
---------+-----------+
negetive |   <.>1000 |
positive |    . <200>|
---------+-----------+
(row = reference; col = test)

precision: 1001.0
recall: 1.0
f1_score 1.998003992015968


# Sentiment  Analysis with NLTK

sentiment lexicons

In [18]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [19]:
pos_reviews = [(movie_reviews.raw(fileid),'positive') for fileid in movie_reviews.fileids('pos')]
neg_reviews = [(movie_reviews.raw(fileid),'negetive') for fileid in movie_reviews.fileids('neg')]
reviews = pos_reviews+neg_reviews

import random
random.shuffle(reviews)

train_set = reviews[:1600]
test_set = reviews[1600:]

feature_sets = [(extract_features(words), sentiment) for ( words, sentiment) in train_sets]

classifier  = NaiveBayesClassifier.train(feature_sets)

accuracy = nltk_accuracy(classifier, test_sets)
print("Accuracy:", accuracy)

Accuracy: 0.16666666666666666


# Topic Modelling
unsupervised learning - discovers hidden topics or themes in a collection of text documents.

##LDA - Latent Dirichlet Allocation
topic modeling algo - each doc is mixture of topics and each topic is mixture of words.

In [20]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary

documents = [ "This is the first line.",
    "Here comes the second line.",
    "The third line is right here.",
    "Moving on to the fourth line now.",
    "And finally, the fifth line concludes."]

tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]
dictionary = Dictionary(tokenized_docs)
corpus = [ dictionary.doc2bow(doc) for doc in tokenized_docs]
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10)
for topic_num, topic_words in lda_model.print_topics():
  print(f"Total {topic_num+1}: {topic_words}")

Total 1: 0.048*"the" + 0.048*"." + 0.048*"line" + 0.048*"second" + 0.048*"here" + 0.048*"comes" + 0.048*"," + 0.048*"is" + 0.048*"first" + 0.048*"fifth"
Total 2: 0.121*"line" + 0.121*"." + 0.121*"the" + 0.055*"is" + 0.055*"here" + 0.033*"on" + 0.033*"fourth" + 0.033*"to" + 0.033*"moving" + 0.033*"now"


In [21]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import inaugural, brown, reuters, gutenberg, movie_reviews, webtext, nps_chat, treebank, conll2000, names, wordnet
from sklearn. feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
corpus=["Remember that this example is a basic demonstration.", "Real stock price prediction involves more complex models, feature engineering, and careful evaluation." ,"Additionally, the stock market is influenced by numerous factors beyond historical prices, so predictions are inherently uncertain."]
def preprocess_text(text):
   stop_words = set(stopwords.words('english'))
   lennotixer = WordNetLemmatizer()
   words = word_tokenize(text.lower())
   words = [word for word in words if word.isalpha()]
   words = [word for word in words if word not in stop_words]
   words = [lemmatizer.lemmatize(word) for word in words]
   return " ".join(words)
preprocessed_corpus = [preprocess_text(doc) for doc in corpus]
vectorizer =  CountVectorizer()
feature_matrix = vectorizer.fit_transform(preprocessed_corpus)
num_topics = 2
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(feature_matrix)
def display_topics(model, feature_names, nuni_top_words):
  for topic_idx, topic in enumerate(model.components_):
    print(f"Topic{topic_idx+1}:")
    print(" ".join([feature_names[i] for i in topic.argsort() [:-num_top_words - 1:-1]]))
num_top_words = 5
display_topics(lda_model, vectorizer.get_feature_names_out(), num_top_words)


Topic1:
example demonstration basic remember complex
Topic2:
stock price prediction factor numerous


## Visualising Topic Models

In [22]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.24.2 (from pyLDAvis)
  Downloading numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
Collecting pandas>=2.0.0 (from pyLDAvis)
  Downloading pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Collecting tzdata>=2022.1 (from pandas>=2.0.0->pyLDAvis)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [

In [23]:
import pyLDAvis.gensim
from gensim import corpora


# Enable notebook mode for Jupyter Notebook
pyLDAvis.enable_notebook()

# Prepare the data for visualization
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

# Display the visualization
pyLDAvis.display(vis)


ValueError: ignored

#Language Translation

In [24]:
from nltk.translate import IBMModel1
from nltk.tokenize import word_tokenize

# English and French sentences
eng = ['I love NLTK.', 'It is powerful']
fr = ['J\'aime NLTK', 'C\'est puissant']

tokenized_english = [word_tokenize(sent.lower()) for sent in eng]
tokenized_french = [word_tokenize(sent.lower()) for sent in fr]

# Train IBM Model 1
ibm1 = IBMModel1(tokenized_english, tokenized_french, 10)

new_sentence = 'NLTK is amazing.'
tokenized_new = word_tokenize(new_sentence.lower())

# Get translation probabilities for the new sentence
translation_probabilities = ibm1.translation_table[tokenized_new[0]]

# Find the translated word with the maximum probability
translated_word = max(translation_probabilities, key=translation_probabilities.get)
print("Translation:", translated_word)


  and should_run_async(code)


AttributeError: ignored

# Dependency Parsing

In [25]:
from nltk.parse import CoreNLPParser

sentence = "The cat sat on a mat"
parser = CoreNLPParser()
parse_tree = next(parser.raw_parse(sentence))

# Extract the CoNLL format dependencies
dependencies = parse_tree.to_conll(10)  # Using 10 columns for complete information

print("Dependency Parsing in CoNLL format:")
print(dependencies)


  and should_run_async(code)


ConnectionError: ignored

In [28]:
!pip install SpeechRcognition

  and should_run_async(code)


[31mERROR: Could not find a version that satisfies the requirement SpeechRcognition (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for SpeechRcognition[0m[31m
[0m

In [26]:
import speech_recognition as sr
recognizer = sr.Recognizer()
with sr.AudioFile('/content/Audio.mp3') as a:
  ad = recognizer.record(a)
recognized_text = recognizer.recognize_google(ad)
print("Recognized Text:", recognized_text)

  and should_run_async(code)


ModuleNotFoundError: ignored