**Stemming**

In [1]:
!pip install nltk #install library 
import nltk #import library 
nltk.download('punkt') #download tokenize data

ERROR: Invalid requirement: '#install': Expected package name at the start of dependency specifier
    #install
    ^
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [3]:
# Sample text that the preprocessing cells below operate on.
x = (
    "The University of Faisalabad is a private institute. "
    "imparting high quality higher education at undergraduate. "
    "graduate and postgraduate level."
)

In [4]:
# Porter stemmer instance used by the stemming cell below.
ps = PorterStemmer()

In [5]:
# Tokenize the sample text `x` into a list of word and punctuation tokens.
words = word_tokenize(x)

In [6]:
# Reduce every token in `words` to its Porter stem and show the resulting list.
stems = list(map(ps.stem, words))
print("Stemming result:")
print(stems)

Stemming result:
['the', 'univers', 'of', 'faisalabad', 'is', 'a', 'privat', 'institut', '.', 'impart', 'high', 'qualiti', 'higher', 'educ', 'at', 'undergradu', '.', 'graduat', 'and', 'postgradu', 'level', '.']


**Lemmatization**

In [7]:
from nltk.stem import WordNetLemmatizer
# Download the 'wordnet' and 'omw-1.4' data packages before the lemmatizer is used below.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
# WordNet lemmatizer instance used by the lemmatization cell below.
lemmatizer = WordNetLemmatizer()

In [9]:
# Lemmatize every token in `words`. No `pos` argument is passed, so each token is handled
# with the lemmatizer's default part of speech; in the recorded output the tokens come
# back unchanged.
lemmas = list(map(lemmatizer.lemmatize, words))
print("Lemmatization result:")
print(lemmas)

Lemmatization result:
['The', 'University', 'of', 'Faisalabad', 'is', 'a', 'private', 'institute', '.', 'imparting', 'high', 'quality', 'higher', 'education', 'at', 'undergraduate', '.', 'graduate', 'and', 'postgraduate', 'level', '.']


**Tokenization**

In [10]:
from nltk.tokenize import word_tokenize, sent_tokenize #import libraries 

In [11]:
# Word-level tokens of the sample text (the same assignment is repeated in the POS Tagging section).
w = word_tokenize(x)

In [12]:
# Fetch the 'averaged_perceptron_tagger' package; the same download is issued again in the
# POS Tagging section.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
from nltk.tokenize import word_tokenize

In [14]:
# Split the text into words and print the token list.
print(word_tokenize(x))

['The', 'University', 'of', 'Faisalabad', 'is', 'a', 'private', 'institute', '.', 'imparting', 'high', 'quality', 'higher', 'education', 'at', 'undergraduate', '.', 'graduate', 'and', 'postgraduate', 'level', '.']


In [15]:
# Split the text into sentences and print the sentence list.
print(sent_tokenize(x))

['The University of Faisalabad is a private institute.', 'imparting high quality higher education at undergraduate.', 'graduate and postgraduate level.']


**POS Tagging**

In [16]:
from nltk.tag import pos_tag 

In [17]:
# Fetch the 'averaged_perceptron_tagger' package (already requested once in the
# Tokenization section).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [18]:
# Fetch the 'averaged_perceptron_tagger_eng' package as well.
# NOTE(review): presumably the resource name pos_tag looks up on newer NLTK releases — confirm.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [19]:
# Word-level tokens of the sample text, used as input to pos_tag in the next cell.
w = word_tokenize(x)

In [20]:
# Tag every token in `w`; the result is a list of (token, tag) pairs.
p = pos_tag(w)

In [21]:
# Show the (token, tag) pairs as the cell's output.
p

[('The', 'DT'),
 ('University', 'NNP'),
 ('of', 'IN'),
 ('Faisalabad', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('private', 'JJ'),
 ('institute', 'NN'),
 ('.', '.'),
 ('imparting', 'VBG'),
 ('high', 'JJ'),
 ('quality', 'NN'),
 ('higher', 'JJR'),
 ('education', 'NN'),
 ('at', 'IN'),
 ('undergraduate', 'NN'),
 ('.', '.'),
 ('graduate', 'NN'),
 ('and', 'CC'),
 ('postgraduate', 'JJ'),
 ('level', 'NN'),
 ('.', '.')]

**Lowercasing**

In [22]:
# Lowercase the whole sample text; the later cleaning cells start from `text`.
text = x.lower()  # convert uppercase letters to lowercase
print(text)

the university of faisalabad is a private institute. imparting high quality higher education at undergraduate. graduate and postgraduate level.


**Remove special characters**

In [23]:
import re
import string

In [24]:
# Remove punctuation/special characters: keep only the characters of `text` that are not
# listed in string.punctuation, then tokenize the cleaned text.
text = "".join(ch for ch in text if ch not in string.punctuation)
tokens = word_tokenize(text)
print(tokens)

['the', 'university', 'of', 'faisalabad', 'is', 'a', 'private', 'institute', 'imparting', 'high', 'quality', 'higher', 'education', 'at', 'undergraduate', 'graduate', 'and', 'postgraduate', 'level']


**Stopword Removal**

In [25]:
from nltk.corpus import stopwords #import library 

In [26]:
# Fetch the 'stopwords' corpus that stopwords.words('english') reads in the next cell.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
# Build the English stopword set, then keep only the tokens that are not in it.
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token not in stop_words]
print(filtered_tokens)

['university', 'faisalabad', 'private', 'institute', 'imparting', 'high', 'quality', 'higher', 'education', 'undergraduate', 'graduate', 'postgraduate', 'level']


**Vocabulary**

In [28]:
# Vocabulary: the distinct filtered tokens, in sorted order.
vocabulary = list(set(filtered_tokens))
vocabulary.sort()
print(vocabulary)

['education', 'faisalabad', 'graduate', 'high', 'higher', 'imparting', 'institute', 'level', 'postgraduate', 'private', 'quality', 'undergraduate', 'university']


**BOW**

In [29]:
from collections import Counter

# Bag of words: how many times each filtered token occurs.
bow = Counter()
bow.update(filtered_tokens)
print(bow)

Counter({'university': 1, 'faisalabad': 1, 'private': 1, 'institute': 1, 'imparting': 1, 'high': 1, 'quality': 1, 'higher': 1, 'education': 1, 'undergraduate': 1, 'graduate': 1, 'postgraduate': 1, 'level': 1})


In [30]:
# Binary bag-of-words vector in vocabulary order: 1 when the vocabulary word occurs among
# the filtered tokens, otherwise 0.
# The vector is stored under its own name instead of `x`: `x` holds the sample text that
# the earlier cells tokenize and lowercase, so overwriting it with a list makes those
# cells fail when they are re-run. The printed label is kept as "x=".
bow_vector = [1 if word in filtered_tokens else 0 for word in vocabulary]
print("x=", bow_vector)

x= [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


**Generate Vocabulary and BOW**

In [31]:
# Two-sentence corpus used for the vocabulary / bag-of-words walkthrough below.
corpus = [
    "I am loving the NLP class, but sometimes it feels confusing!!!",
    "NLP is a fascinating field — it deals with text, speech, and language understanding.",
]

In [32]:
# Lowercase every sentence of the corpus and list the results, one per line.
lowered = list(map(str.lower, corpus))
print("Lowercase:")
for s in lowered:
    print("-", s)

Lowercase:
- i am loving the nlp class, but sometimes it feels confusing!!!
- nlp is a fascinating field — it deals with text, speech, and language understanding.


In [33]:
# Remove special characters: delete everything that is not a lowercase ASCII letter, a
# digit, or whitespace. The dash in the second sentence is deleted as well, which leaves
# two consecutive spaces in the recorded output.
cleaned = [re.sub(r'[^a-z0-9\s]', '', sentence) for sentence in lowered]
for s in cleaned:
    print("-", s)

- i am loving the nlp class but sometimes it feels confusing
- nlp is a fascinating field  it deals with text speech and language understanding


In [34]:
# Split each cleaned sentence into word tokens and print them, numbering sentences from 1.
tokenized = list(map(word_tokenize, cleaned))
for i, toks in enumerate(tokenized, start=1):
    print(f"Sentence {i} tokens:", toks)

Sentence 1 tokens: ['i', 'am', 'loving', 'the', 'nlp', 'class', 'but', 'sometimes', 'it', 'feels', 'confusing']
Sentence 2 tokens: ['nlp', 'is', 'a', 'fascinating', 'field', 'it', 'deals', 'with', 'text', 'speech', 'and', 'language', 'understanding']


In [35]:
# Remove stopwords from every tokenized sentence using NLTK's English stopword list.
# NOTE(review): `filtered_tokens` was a flat list of strings in the Stopword Removal
# section; from this cell on it is a list of token lists, one per sentence.
stop_words = set(stopwords.words('english'))
filtered_tokens = [
    [token for token in sentence_tokens if token not in stop_words]
    for sentence_tokens in tokenized
]
for i, toks in enumerate(filtered_tokens, start=1):
    print(f"Sentence {i} filtered tokens:", toks)

Sentence 1 filtered tokens: ['loving', 'nlp', 'class', 'sometimes', 'feels', 'confusing']
Sentence 2 filtered tokens: ['nlp', 'fascinating', 'field', 'deals', 'text', 'speech', 'language', 'understanding']


**Create Vocabulary**

In [36]:
# Vocabulary: pool the filtered tokens of all sentences, drop duplicates, and sort.
all_tokens = []
for sent in filtered_tokens:
    all_tokens.extend(sent)
vocabulary = sorted(set(all_tokens))
print(vocabulary)

['class', 'confusing', 'deals', 'fascinating', 'feels', 'field', 'language', 'loving', 'nlp', 'sometimes', 'speech', 'text', 'understanding']


**Generate BOW**

In [37]:
# Bag of words: for each sentence, count its filtered tokens and read the counts off in
# vocabulary order, so every sentence yields one vector of len(vocabulary) integers.
bow_counts = [
    [counts[word] for word in vocabulary]
    for counts in map(Counter, filtered_tokens)
]
for i, vec in enumerate(bow_counts, start=1):
    print(f"Sentence {i} BoW counts:", vec)

Sentence 1 BoW counts: [1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0]
Sentence 2 BoW counts: [0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1]
