Text Preprocessing Steps
- 

Importing Libraries
- 

In [3]:
import nltk  # main NLP library
from nltk.tokenize import word_tokenize  # split text into words
from nltk.corpus import stopwords        # common words like 'is', 'the'
from nltk.stem import PorterStemmer      # reduce words to root form
from nltk.stem import WordNetLemmatizer  # convert words to dictionary form
from nltk import pos_tag                 # label words as noun, verb, etc.


In [21]:
nltk.download('punkt')                     
nltk.download('stopwords')                
nltk.download('wordnet')                 
nltk.download('averaged_perceptron_tagger') 
import nltk
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

 INPUT TEXT
-

In [14]:
sentence = "Hamna is learning Python in the NLP class, and she finds it fun and challenging!"


Step 1: Convert sentence to lower case
-

In [15]:
sentence_lower = sentence.lower()   #Convert the whole sentence into lowercase letters 
print("Lowercased:", sentence_lower)


Lowercased: hamna is learning python in the nlp class, and she finds it fun and challenging!


Step 2: Tokenization
-

In [16]:
#Tokenize the sentence into words

tokens = word_tokenize(sentence)
print("\n1. Tokens:", tokens)


1. Tokens: ['Hamna', 'is', 'learning', 'Python', 'in', 'the', 'NLP', 'class', ',', 'and', 'she', 'finds', 'it', 'fun', 'and', 'challenging', '!']


Step 3: Stopword Removal
-

In [17]:
# Remove stopwords and non-alphabetic tokens (like punctuation and numbers)

stop_words = set(stopwords.words("english"))           # Get the list of English stopwords
filtered = [w for w in tokens if w.lower() not in stop_words and w.isalpha()]     # Remove stopwords and keep only alphabetic words
print("\n2. After Stopword Removal:", filtered)


2. After Stopword Removal: ['Hamna', 'learning', 'Python', 'NLP', 'class', 'finds', 'fun', 'challenging']


Step 4: Stemming
-

In [18]:
# Apply Stemming (reduces words to root form)

ps = PorterStemmer()              # create stemmer
stemmed = [ps.stem(w) for w in filtered]  # reduce words to root form
print("\n3. After Stemming:", stemmed)   # show result



3. After Stemming: ['hamna', 'learn', 'python', 'nlp', 'class', 'find', 'fun', 'challeng']


Step 5: lemmatization
-

In [12]:
lemmatizer = WordNetLemmatizer()              # create lemmatizer
lemmatized = [lemmatizer.lemmatize(w) for w in filtered]  # convert words to dictionary form
print("\n4. After Lemmatization:", lemmatized)            # show result



4. After Lemmatization: ['Hamna', 'learning', 'Python', 'NLP', 'class', 'find', 'fun', 'challenging']


Step 6: POS tagging
-

In [22]:
#  Part-of-Speech (POS) Tagging

pos_tags = pos_tag(tokens)          # Assigns POS tags to each token
print("\n5. POS Tagging:", pos_tags)


5. POS Tagging: [('Hamna', 'NNP'), ('is', 'VBZ'), ('learning', 'VBG'), ('Python', 'NNP'), ('in', 'IN'), ('the', 'DT'), ('NLP', 'NNP'), ('class', 'NN'), (',', ','), ('and', 'CC'), ('she', 'PRP'), ('finds', 'VBZ'), ('it', 'PRP'), ('fun', 'NN'), ('and', 'CC'), ('challenging', 'NN'), ('!', '.')]


TASK 2
-

In [61]:
# Import necessary libraries
import string
from sklearn.feature_extraction.text import CountVectorizer       #convert text data into a matrix of token counts

 Step 1: Input corpus (list of sentences)
 -

In [60]:
sentence1 = "I am loving the NLP class, but sometimes it feels confusing"
sentence2 = "NLP is a fascinating field it deals with text speech and language understanding"


# Preprocessing function

In [63]:
def preprocess(sentence):
    # Lowercase
    sentence = sentence.lower()
    # Remove punctuation
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    # Split into words
    words = sentence.split()
    # Optional: remove stopwords
    stopwords = ['the', 'is', 'it', 'a', 'am', 'but', 'with', 'and']
    words = [w for w in words if w not in stopwords]
    return words

# Apply preprocessing
word1 = preprocess(sentence1)
word2 = preprocess(sentence2)

Step 2: Make all words lowercase and split into list of words
-

In [64]:
#  Make all words lowercase and split into list of words
words1 = sentence1.lower().split()
words2 = sentence2.lower().split()
print(words1)
print(words2)

['i', 'am', 'loving', 'the', 'nlp', 'class,', 'but', 'sometimes', 'it', 'feels', 'confusing']
['nlp', 'is', 'a', 'fascinating', 'field', 'it', 'deals', 'with', 'text', 'speech', 'and', 'language', 'understanding']


Step 3: Create vocabulary
-

In [65]:
 vocabulary = list(set(words1 + words2))
print(vocabulary)


['and', 'speech', 'fascinating', 'but', 'confusing', 'with', 'field', 'sometimes', 'loving', 'language', 'text', 'deals', 'class,', 'am', 'understanding', 'is', 'nlp', 'a', 'i', 'the', 'it', 'feels']


Step 4:  Make Bag of Words (BOW) for each sentence
-

In [66]:
#  Make Bag of Words (BOW) for each sentence
bow1 = [words1.count(word) for word in vocabulary]
bow2 = [words2.count(word) for word in vocabulary]      # For each word in the vocabulary, count how many times it appears in words1 and word2

print("BOW for sentence 1:", bow1)
print("BOW for sentence 2:", bow2)


BOW for sentence 1: [0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1]
BOW for sentence 2: [1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]


Term Frequency (TF)
-

In [67]:
# For each word in the vocabulary:
# - Count how many times the word appears in the sentence
# - Divide by total number of words in that sentence
# This gives a "relative frequency" (between 0 and 1)


tf1 = [words1.count(word) / len(words1) for word in vocabulary]
tf2 = [words2.count(word) / len(words2) for word in vocabulary]

print("\nTF for sentence 1:", tf1)
print("TF for sentence 2:", tf2)


TF for sentence 1: [0.0, 0.0, 0.0, 0.09090909090909091, 0.09090909090909091, 0.0, 0.0, 0.09090909090909091, 0.09090909090909091, 0.0, 0.0, 0.0, 0.09090909090909091, 0.09090909090909091, 0.0, 0.0, 0.09090909090909091, 0.0, 0.09090909090909091, 0.09090909090909091, 0.09090909090909091, 0.09090909090909091]
TF for sentence 2: [0.07692307692307693, 0.07692307692307693, 0.07692307692307693, 0.0, 0.0, 0.07692307692307693, 0.07692307692307693, 0.0, 0.0, 0.07692307692307693, 0.07692307692307693, 0.07692307692307693, 0.0, 0.0, 0.07692307692307693, 0.07692307692307693, 0.07692307692307693, 0.07692307692307693, 0.0, 0.0, 0.07692307692307693, 0.0]


IDF (Inverse Document Frequency)
-

In [68]:
import math

documents = [words1, words2]
idf = []

for word in vocabulary:
    doc_count = sum([1 for doc in documents if word in doc])
    idf.append(math.log(len(documents)/doc_count))

print("IDF:", idf)


IDF: [0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.0, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.0, 0.6931471805599453]


TF Ã— IDF
-

In [69]:
tfidf1 = [tf1[i] * idf[i] for i in range(len(vocabulary))]
tfidf2 = [tf2[i] * idf[i] for i in range(len(vocabulary))]

print("\nTF-IDF for sentence 1:", tfidf1)
print("TF-IDF for sentence 2:", tfidf2)



TF-IDF for sentence 1: [0.0, 0.0, 0.0, 0.06301338005090412, 0.06301338005090412, 0.0, 0.0, 0.06301338005090412, 0.06301338005090412, 0.0, 0.0, 0.0, 0.06301338005090412, 0.06301338005090412, 0.0, 0.0, 0.0, 0.0, 0.06301338005090412, 0.06301338005090412, 0.0, 0.06301338005090412]
TF-IDF for sentence 2: [0.053319013889226566, 0.053319013889226566, 0.053319013889226566, 0.0, 0.0, 0.053319013889226566, 0.053319013889226566, 0.0, 0.0, 0.053319013889226566, 0.053319013889226566, 0.053319013889226566, 0.0, 0.0, 0.053319013889226566, 0.053319013889226566, 0.0, 0.053319013889226566, 0.0, 0.0, 0.0, 0.0]


WORD EMBEDDING
-

In [36]:
from gensim.models import Word2Vec


In [74]:

sentences = [word1, word2]

model = Word2Vec(sentences, vector_size=50, window=3, min_count=1, sg=1)

print("Vocabulary:", list(model.wv.index_to_key))

print("Vector for 'nlp':", model.wv['nlp'][:5]) 

print("Most similar to 'nlp':", model.wv.most_similar('nlp'))


Vocabulary: ['nlp', 'understanding', 'language', 'speech', 'text', 'deals', 'field', 'fascinating', 'confusing', 'feels', 'sometimes', 'class', 'loving', 'i']
Vector for 'nlp': [-0.00107245  0.00047286  0.0102067   0.01801855 -0.0186059 ]
Most similar to 'nlp': [('class', 0.16704076528549194), ('i', 0.15019884705543518), ('fascinating', 0.13204392790794373), ('language', 0.1267007291316986), ('feels', 0.0998455360531807), ('understanding', 0.042373016476631165), ('loving', 0.04067763686180115), ('deals', 0.012442179024219513), ('sometimes', -0.01259106956422329), ('speech', -0.01447527389973402)]
