# **Pre-processing steps for NLP**




In [35]:
#importing necessary libraries for NLP pre-processing
import nltk
import pandas as pd
import re
# Download the NLTK data packages used by the tokenizers, the stop-word list
# and the lemmatizer below. Packages that are already present are reported as
# up-to-date instead of being fetched again.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# Load the test split of the dataset. Source:
# https://www.kaggle.com/datasets/datatattle/email-classification-nlp
# The file is decoded as cp1252 (Windows-1252).
# NOTE(review): "/content/..." is a hard-coded absolute path (presumably a
# Colab upload directory - confirm); the cell fails anywhere the file is not
# at exactly this location.
df_test= pd.read_csv("/content/SMS_test.csv",encoding='cp1252')

The email-classification dataset contains two files: a training set (`SMS_train.csv`) and a test set (`SMS_test.csv`).

In [37]:
# Preview the first rows of the test split to check the columns that were read
df_test.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,"UpgrdCentre Orange customer, you may now claim...",Spam
1,2,"Loan for any purpose £500 - £75,000. Homeowner...",Spam
2,3,Congrats! Nokia 3650 video camera phone is you...,Spam
3,4,URGENT! Your Mobile number has been awarded wi...,Spam
4,5,Someone has contacted our dating service and e...,Spam


In [38]:
# Load the training split, decoded as cp1252 like the test split.
# Every preprocessing step below is applied to this frame's 'Message_body' column.
# NOTE(review): hard-coded absolute path - the file must exist at exactly this location.
df_train= pd.read_csv("/content/SMS_train.csv",encoding='cp1252')

In [39]:
# Preview the first rows of the raw training split (before any preprocessing)
df_train.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl. Its true to its name,Non-Spam
1,2,The guy did some bitching but I acted like i'd...,Non-Spam
2,3,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,4,Will ü b going to esplanade fr home?,Non-Spam
4,5,This is the 2nd time we have tried 2 contact u...,Spam


Text Preprocessing for NLP:

To prepare the text data for model building, we perform text preprocessing. It is the very first step of an NLP project. Some of the preprocessing steps are:

1. Removing punctuation such as . , ! $ ( ) * % @
2. Removing URLs
3. Removing Stop words
4. Lower casing
5. Tokenization
6. Stemming
7. Lemmatization
8. Sentence Segmentation

In [40]:
# Punctuation adds little to the meaning of a message, so removing it reduces noise and simplifies later processing
# Function to remove punctuation marks using NLTK
def remove_punctuation_nltk(text):
    """Return ``text`` rebuilt from its purely alphanumeric tokens.

    The text is split with NLTK's ``word_tokenize`` and only the tokens for
    which ``str.isalnum()`` is true are kept; the survivors are joined with
    single spaces.

    Note that the filter works on whole tokens: a token is discarded as soon
    as it contains any non-alphanumeric character, so besides stand-alone
    punctuation marks this also drops tokens that mix letters or digits with
    symbols.
    """
    alphanumeric_only = filter(str.isalnum, word_tokenize(text))
    return ' '.join(alphanumeric_only)

# Apply the function to remove punctuation marks from the 'Message_body' column.
# The result overwrites the column in place, so the raw messages are no longer
# available in df_train after this cell.
df_train['Message_body'] = df_train['Message_body'].apply(remove_punctuation_nltk)

# Display the first rows of the DataFrame after removing punctuation marks
df_train.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl Its true to its name,Non-Spam
1,2,The guy did some bitching but I acted like i b...,Non-Spam
2,3,Pity was in mood for that So any other suggest...,Non-Spam
3,4,Will ü b going to esplanade fr home,Non-Spam
4,5,This is the 2nd time we have tried 2 contact U...,Spam


In [41]:
# Function to remove URLs using regular expressions
def remove_urls(text):
    """Delete URL-like substrings from ``text``.

    Two kinds of span are removed, each extending up to the next whitespace
    character:

    * anything starting with ``http`` (covers ``http://`` and ``https://``),
    * anything starting with ``www.`` at a word boundary, so links written
      without a scheme are caught as well.

    Matching is case-insensitive because scheme and host names are not case
    sensitive and messages often spell links in capitals. Whitespace around a
    removed link is left as it is.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matching span replaced by the empty string.
    """
    # The first alternative is the original pattern, so everything that used
    # to be removed still is; the second one adds scheme-less links.
    return re.sub(r'http\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)

# Apply the function to remove URLs from the 'Message_body' column.
# NOTE(review): punctuation was already stripped from this column in the
# previous cell, so the text no longer contains "://" or "." when this runs.
# To remove whole links, URL removal has to be applied to the raw messages,
# i.e. before punctuation removal.
df_train['Message_body'] = df_train['Message_body'].apply(remove_urls)

# Punctuation removal is not repeated here: the column already holds only
# alphanumeric tokens, and the next step tokenizes the text again anyway.

# Display the first rows of the DataFrame after removing URLs
df_train.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl Its true to its name,Non-Spam
1,2,The guy did some bitching but I acted like i b...,Non-Spam
2,3,Pity was in mood for that So any other suggest...,Non-Spam
3,4,Will ü b going to esplanade fr home,Non-Spam
4,5,This is the 2nd time we have tried 2 contact U...,Spam


In [42]:
# NLTK ships a predefined list of English stop words; that list is used here
# Function to remove stopwords using NLTK
# English stop-word set, built on the first call to remove_stopwords() and
# reused afterwards so the corpus is not re-read for every row.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text):
    """Return ``text`` without English stop words.

    The text is split with NLTK's ``word_tokenize``; a token is dropped when
    its lower-cased form appears in NLTK's English stop-word list. Tokens that
    are kept retain their original casing and are joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Loading the list is the expensive part; do it once, not once per row.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    kept_words = [
        word for word in word_tokenize(text)
        if word.lower() not in _ENGLISH_STOP_WORDS
    ]
    return ' '.join(kept_words)

# Apply the function to remove stopwords from the 'Message_body' column
# (the column is overwritten in place with the filtered text)
df_train['Message_body'] = df_train['Message_body'].apply(remove_stopwords)

# Display the first rows of the DataFrame after stop-word removal
df_train.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl true name,Non-Spam
1,2,guy bitching acted like interested buying some...,Non-Spam
2,3,Pity mood suggestions,Non-Spam
3,4,ü b going esplanade fr home,Non-Spam
4,5,2nd time tried 2 contact U Pound prize 2 claim...,Spam


In [43]:
# The same word is often written in both upper case and lower case; lowercasing the text removes this redundancy
# Function to lowercase the text
def lowercase_text(text):
    """Return a copy of ``text`` in which all cased characters are lower case."""
    lowered = text.lower()
    return lowered

# Apply the lowercase_text function to the 'Message_body' column
# (the column is overwritten in place with the lower-cased text)
df_train['Message_body'] = df_train['Message_body'].apply(lowercase_text)

# Display the first rows of the DataFrame after lowercasing
df_train.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,rofl true name,Non-Spam
1,2,guy bitching acted like interested buying some...,Non-Spam
2,3,pity mood suggestions,Non-Spam
3,4,ü b going esplanade fr home,Non-Spam
4,5,2nd time tried 2 contact u pound prize 2 claim...,Spam


In [44]:
# Tokenize the text
# Function to tokenize text
def tokenize_text(text):
    """Split ``text`` into word tokens.

    Thin wrapper around NLTK's ``word_tokenize``; its result is returned
    unchanged.
    """
    return word_tokenize(text)

# Apply tokenization to the 'Message_body' column.
# From this cell on the column holds a list of tokens per row instead of a
# string, so the string-based cells above cannot be re-run on it without
# reloading df_train first.
df_train['Message_body'] = df_train['Message_body'].apply(tokenize_text)

# Display the first rows of the DataFrame after tokenization
df_train.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,"[rofl, true, name]",Non-Spam
1,2,"[guy, bitching, acted, like, interested, buyin...",Non-Spam
2,3,"[pity, mood, suggestions]",Non-Spam
3,4,"[ü, b, going, esplanade, fr, home]",Non-Spam
4,5,"[2nd, time, tried, 2, contact, u, pound, prize...",Spam


In [45]:
# Stemming strips the suffixes from a word to reduce it to its stem.
# Compare the results before and after stemming:
# "bitching" is converted to "bitch" and "acted" to "act"
# Function to perform stemming
def stem_text(tokens):
    """Stem every token in ``tokens`` with NLTK's Porter stemmer.

    A new ``PorterStemmer`` is created for each call. The stems are returned
    as a new list in the same order as the input tokens.
    """
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

# Apply stemming to the 'Message_body' column
# (the token lists are overwritten in place with their stems)
df_train['Message_body'] = df_train['Message_body'].apply(stem_text)

# Display the first rows of the DataFrame after stemming
df_train.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,"[rofl, true, name]",Non-Spam
1,2,"[guy, bitch, act, like, interest, buy, someth,...",Non-Spam
2,3,"[piti, mood, suggest]",Non-Spam
3,4,"[ü, b, go, esplanad, fr, home]",Non-Spam
4,5,"[2nd, time, tri, 2, contact, u, pound, prize, ...",Spam


In [46]:
# Lemmatization analyzes a word and converts it to its root form in the English dictionary, unlike stemming, which only chops off suffixes
# Function to perform lemmatization
def lemmatize_text(tokens):
    """Lemmatize every token in ``tokens`` with NLTK's WordNet lemmatizer.

    A new ``WordNetLemmatizer`` is created for each call and ``lemmatize`` is
    invoked with the token only, i.e. without a part-of-speech argument. The
    lemmas are returned as a new list in input order.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas

# Apply lemmatization to the 'Message_body' column.
# NOTE(review): the column already holds the stemmed tokens from the previous
# cell, so the lemmatizer is given stems such as "piti" or "esplanad" rather
# than the original words; the rows displayed below are identical to the
# stemming output. Stemming and lemmatization are normally alternatives -
# to compare them, run each on the un-stemmed tokens.
df_train['Message_body'] = df_train['Message_body'].apply(lemmatize_text)

# Display the first rows of the DataFrame after lemmatization
df_train.head()


Unnamed: 0,S. No.,Message_body,Label
0,1,"[rofl, true, name]",Non-Spam
1,2,"[guy, bitch, act, like, interest, buy, someth,...",Non-Spam
2,3,"[piti, mood, suggest]",Non-Spam
3,4,"[ü, b, go, esplanad, fr, home]",Non-Spam
4,5,"[2nd, time, tri, 2, contact, u, pound, prize, ...",Spam


In [47]:
# The major pre-processing is done; now join the tokens of each message back into text and split that text into sentences
# Function to perform sentence segmentation
def segment_sentences(tokens):
    """Join ``tokens`` into one string and split that string into sentences.

    The tokens are concatenated with single spaces and the resulting text is
    handed to ``nltk.sent_tokenize``, whose result is returned unchanged.
    """
    return nltk.sent_tokenize(' '.join(tokens))

# Apply sentence segmentation to the 'Message_body' column.
# NOTE(review): punctuation was removed in an earlier cell, so the joined text
# has no sentence-ending marks left; every row displayed below ends up as a
# list holding a single string. To obtain real sentences, segmentation would
# have to run on the raw messages instead.
df_train['Message_body'] = df_train['Message_body'].apply(segment_sentences)

# Display the first rows of the DataFrame after sentence segmentation
df_train.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,[rofl true name],Non-Spam
1,2,[guy bitch act like interest buy someth el nex...,Non-Spam
2,3,[piti mood suggest],Non-Spam
3,4,[ü b go esplanad fr home],Non-Spam
4,5,[2nd time tri 2 contact u pound prize 2 claim ...,Spam
