In [4]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [8]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1000 non-null   object
 1   label   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


### Let's divide the training and test set into two partitions

In [12]:
# Your code
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

## Data Preprocessing

In [13]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [19]:
# Your code
from bs4 import BeautifulSoup
import string

def clean_html(raw_html: str) -> str:
    if not isinstance(raw_html, str):
        return ""

    # Parse HTML safely
    soup = BeautifulSoup(raw_html, "lxml")

    # Get visible text only (removes <script>, <style>, comments, tags)
    text = soup.get_text(separator=" ", strip=True)

    return text


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [20]:
# Your code
import re

def normalize_text(text: str) -> str:
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove prefixed 'b' (from byte strings like b'hello')
    text = re.sub(r"^b\s+", "", text)

    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", " ", text)

    # Remove all single characters (isolated letters)
    text = re.sub(r"\s+[a-zA-Z]\s+", " ", text)

    # Remove single characters from the start
    text = re.sub(r"^[a-zA-Z]\s+", "", text)

    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [21]:
def clean_text(raw_html: str) -> str:
    # Step 1: Strip HTML/JS/CSS
    text = clean_html(raw_html)

    # Step 2: Normalize plain text
    text = normalize_text(text)

    return text

## Now let's work on removing stopwords
Remove the stopwords.

In [22]:
# Your code
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words("english"))

def process_text_final(raw_html: str) -> str:
    
    #  Use the clean_html function
    text = clean_html(raw_html)

    # Use the normalize_text function
    text = normalize_text(text)

    # Tokenize, remove stopwords, and stem
    tokens = word_tokenize(text)
    stemmed_tokens = [snowball.stem(word) for word in tokens if word not in stop_words]

    return " ".join(stemmed_tokens)
    

print("Applying final processing to X_train and X_test...")
X_train_processed = X_train.apply(process_text_final)
X_test_processed = X_test.apply(process_text_final)
print("Processing complete!")
print("\nExample of processed text:")
print(X_train_processed.iloc[5])

Applying final processing to X_train and X_test...


  soup = BeautifulSoup(raw_html, "lxml")
  soup = BeautifulSoup(raw_html, "lxml")


Processing complete!

Example of processed text:
dear friend propos surpris person contact howev sincer seek confid transact propos person transpar honesti high calib let first start introduc proper name ron sinclear person assist haitian presid apolog infring privaci may interest know former presid haiti fight serious war unit democraci quit year back along foreign power countri quit long ago american govern order presid jean bertrand arist leav haiti forc power rabel forc know fulli well capabl america power member unit nation presid arist decid left seat power exil south africa countri seek asylum new govern place light sad happen deposit made secur compani europ year may took place still power presid haiti aid person assist ron sinclear loyalist suceed secret move sum usd nine million five hunder thousand unit state dollar privat secur compani vault europ need servic high reliabl foreign receiv fund bank account futur surviv famili arist present oper foreign bank account name fund 

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [26]:
# Your code
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(raw_html: str) -> str:
    
    # Step 1: Use your clean_html function
    text = clean_html(raw_html)

    # Step 2: Use your normalize_text function
    text = normalize_text(text)

    # Step 3: Tokenize the text
    tokens = word_tokenize(text)

    # Step 4: Remove stopwords and lemmatize tokens
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Step 5: Join tokens back into a string
    return " ".join(lemmatized_tokens)

# --- Apply the new lemmatization function to your data ---
print("Applying lemmatization to X_train and X_test...")
# We will use this lemmatized version for the rest of the notebook
X_train_processed = X_train.apply(lemmatize_text)
X_test_processed = X_test.apply(lemmatize_text)
print("Lemmatization complete!")

# Display an example to verify the output
print("\nExample of lemmatized text:")
print(X_train_processed.iloc[5])

Applying lemmatization to X_train and X_test...


  soup = BeautifulSoup(raw_html, "lxml")
  soup = BeautifulSoup(raw_html, "lxml")


Lemmatization complete!

Example of lemmatized text:
dear friend proposal surprising personal contact however sincerely seek confidence transaction propose person transparency honesty high caliber let first start introducing properly name ron sinclear personal assistance haitian president apologize infringed privacy may interest know former president haiti fighting serious war united democracy quite year backed along foreign powerful country quite long ago american government ordered president jean bertrand aristed leave haiti forced power rabel force knowing fully well capability america powerful member united nation president aristed decided left seat power exile south africa country seek asylum new government place light sad happening deposit made security company europe year may took place still power president haiti aid personal assistant ron sinclear loyalist suceeded secretely move sum usd nine million five hundered thousand united state dollar private security company vault eur

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [30]:
# Your code
from collections import Counter

# Combine the processed text with its label to do this analysis
train_df = pd.concat([X_train_processed, y_train], axis=1)

# Separate ham (0) and spam (1) messages
ham_messages = train_df[train_df['label'] == 0]['text']
spam_messages = train_df[train_df['label'] == 1]['text']

# Create a counter for ham words by joining all messages into one string
ham_word_counts = Counter(" ".join(ham_messages).split())

# Create a counter for spam words
spam_word_counts = Counter(" ".join(spam_messages).split())

print("Top 10 words in HAM messages")
print(ham_word_counts.most_common(10))

print("Top 10 words in SPAM messages")
print(spam_word_counts.most_common(10))

Top 10 words in HAM messages
[('state', 116), ('pm', 97), ('would', 93), ('president', 89), ('mr', 89), ('time', 81), ('percent', 80), ('obama', 77), ('call', 74), ('secretary', 74)]
Top 10 words in SPAM messages
[('money', 847), ('account', 742), ('bank', 645), ('u', 631), ('fund', 626), ('e', 510), ('transaction', 471), ('business', 424), ('mr', 423), ('country', 422)]


## Extra features

## How would work the Bag of Words with Count Vectorizer concept?

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [31]:
# Your code
from scipy.sparse import hstack, csr_matrix

# TF-IDF Vectorization
# Initialize the vectorizer to find the top 5000 word features
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Fit on the training data and transform both train and test data
X_train_tfidf = vectorizer.fit_transform(X_train_processed)
X_test_tfidf = vectorizer.transform(X_test_processed)

print(f"Shape of the TF-IDF training data: {X_train_tfidf.shape}")

Shape of the TF-IDF training data: (800, 5000)


## And the Train a Classifier?

In [32]:
# Your code
# Using TF-IDF
print("--- Training Model with TF-IDF only ---")
model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_tfidf))


--- Training Model with TF-IDF only ---
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       125
           1       0.97      0.97      0.97        75

    accuracy                           0.98       200
   macro avg       0.98      0.98      0.98       200
weighted avg       0.98      0.98      0.98       200



### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [None]:
# Your code