In [3]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('all')

In [45]:
df = pd.read_csv("SMSSpamCollection.tsv", sep='\t', header=None, names=['label', 'SMS_text']);df

Unnamed: 0,label,SMS_text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [47]:
#some feature engineering
df['SMS_length'] = [len(text) for text in df['SMS_text']]
df['number_of_special_characters'] = [len(re.findall(r"[!@$#$%^&*()]", text))  for text in df['SMS_text']]
df['percentage_of_special_characters'] = round(df['number_of_special_characters']/df['SMS_length']*100,2);df

Unnamed: 0,label,SMS_text,SMS_length,number_of_special_characters,percentage_of_special_characters
0,ham,"Go until jurong point, crazy.. Available only ...",111,0,0.00
1,ham,Ok lar... Joking wif u oni...,29,0,0.00
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,3,1.94
3,ham,U dun say so early hor... U c already then say...,49,0,0.00
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,0,0.00
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,160,1,0.62
5568,ham,Will ü b going to esplanade fr home?,36,0,0.00
5569,ham,"Pity, * was in mood for that. So...any other s...",57,1,1.75
5570,ham,The guy did some bitching but I acted like i'd...,125,0,0.00


In [None]:
#Preprocess
# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    '''
    1. lowercase
    2. remove links & numbers
    3. tokenize
    4. remove stopwards
    5. lemmatize
    '''
    # Lowercasing
    text = text.lower()
    
    # Remove special characters, links, and numbers
    text = re.sub(r"https?://\S+|www\.\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-z\s]", "", text)  # Keep only alphabets and spaces
    
    # Tokenization
    words = word_tokenize(text)
    
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    
    # Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return " ".join(words)  # Convert list back to string

df['SMS_text'] = [preprocess(text) for text in df['SMS_text']];df

Unnamed: 0,label,SMS_text,SMS_length,number_of_special_characters,percentage_of_special_characters
0,ham,go jurong point crazy available bugis n great ...,20,0,0.00
1,ham,ok lar joking wif u oni,6,0,0.00
2,spam,free entry wkly comp win fa cup final tkts st ...,28,3,10.71
3,ham,u dun say early hor u c already say,11,0,0.00
4,ham,nah dont think go usf life around though,13,0,0.00
...,...,...,...,...,...
5567,spam,nd time tried contact u u pound prize claim ea...,30,1,3.33
5568,ham,b going esplanade fr home,8,0,0.00
5569,ham,pity mood soany suggestion,10,1,10.00
5570,ham,guy bitching acted like id interested buying s...,26,0,0.00
