In [60]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [61]:
%pip install pandas matplotlib scikit-learn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

Note: you may need to restart the kernel to use updated packages.


- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [62]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

## Data Preprocessing

## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [63]:
import re

def clean_html(raw_html):
    # Step 1: remove inline JavaScript and CSS (between <script>...</script> and <style>...</style>)
    cleaned = re.sub(r'<(script|style).*?>.*?</\1>', '', raw_html, flags=re.DOTALL | re.IGNORECASE)

    # Step 2: remove HTML comments <!-- ... -->
    cleaned = re.sub(r'<!--.*?-->', '', cleaned, flags=re.DOTALL)

    # Step 3: remove remaining HTML tags <...>
    cleaned = re.sub(r'<[^>]+>', '', cleaned)

    return cleaned

# Combine both steps and overwrite the column
data["preprocessed_text"] = data["text"].apply(lambda t: clean_html(t))
print(data[["preprocessed_text", "label"]].head(3))

                                   preprocessed_text  label
0  DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...      1
1                                           Will do.      0
2  Nora--Cheryl has emailed dozens of memos about...      0


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [64]:
def text_cleaning(text):
    # Remove special characters
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove all single characters
    text = re.sub(r'\b\w\b', '', text)

    # Remove single characters from the start of the text
    text = re.sub(r'^\w\s+', '', text)

    # Substitute multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Remove prefixed 'b' (e.g. from byte strings like b'Text')
    text = re.sub(r'^b\s+', '', text)

    # Convert to lowercase
    return text.lower().strip()

data["preprocessed_text"] = data["preprocessed_text"].apply(lambda t: text_cleaning(t))

## Now let's work on removing stopwords
Remove the stopwords.

In [65]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_tokens)

data["preprocessed_text"] = data["preprocessed_text"].apply(lambda t: text_cleaning(t))

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [57]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
import nltk

# Eenmalige downloads (alleen als nog niet gedaan)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Initialiseer lemmatizer
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN


def lemmatize_text(text):
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged_tokens]
    return " ".join(lemmatized)

data["preprocessed_text"] = data["preprocessed_text"].apply(lemmatize_text)
print(data[["preprocessed_text", "label"]].head(3))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


                                   preprocessed_text  label
0  dear sir strictly private business proposal be...      1
1                                            will do      0
2  nora cheryl have email dozen of memo about hai...      0


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [58]:
from collections import Counter

ham_texts = data[data["label"] == 0]["preprocessed_text"]
spam_texts = data[data["label"] == 1]["preprocessed_text"]

def count_words(text_series):
    all_words = " ".join(text_series).split()
    return Counter(all_words).most_common(10)

top_ham = count_words(ham_texts)
top_spam = count_words(spam_texts)

print("Top 10 words in HAM messages:")
for word, count in top_ham:
    print(f"{word}: {count}")

print("\nTop 10 words in SPAM messages:")
for word, count in top_spam:
    print(f"{word}: {count}")



Top 10 words in HAM messages:
the: 1780
be: 1188
to: 1065
and: 838
of: 799
in: 617
that: 415
have: 395
for: 374
it: 338

Top 10 words in SPAM messages:
the: 7048
to: 5594
be: 5293
of: 4982
and: 3979
in: 3291
you: 3237
this: 2674
my: 2146
your: 2071


## Extra features

In [66]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

from sklearn.model_selection import train_test_split

# Verdeel de dataset in training (80%) en validatie (20%)
data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)


data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x)) 

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x)) 

data_train.head()

Unnamed: 0,text,label,preprocessed_text,money_mark,suspicious_words,text_len
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",1,regards mr nelson smith kindly reply me on my ...,1,0,89
535,I have not been able to reach oscar this am. W...,0,have not been able to reach oscar this am we a...,1,0,91
695,; Huma Abedin B6I'm checking with Pat on the 5...,0,huma abedin checking with pat on the will work...,1,0,125
557,I can have it announced here on Monday - can't...,0,can have it announced here on monday can today,1,0,46
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,1,bank of africaagence san pedro bp san pedro co...,1,1,1577


## How would work the Bag of Words with Count Vectorizer concept?

In [67]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(data_train["preprocessed_text"])


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Load the vectorizer
tfidf_vectorizer = TfidfVectorizer()

# 2. Fit + transform entire dataset
X_tfidf = tfidf_vectorizer.fit_transform(data["text"])

# 3. Show shape of the resulting TF-IDF matrix
print("TF-IDF matrix shape:", X_tfidf.shape)


TF-IDF matrix shape: (1000, 25592)


## And the Train a Classifier?

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression

# Splitsen
data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)

# TF-IDF vectorizer (train only on training set)
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(data_train["text"])
X_val = tfidf_vectorizer.transform(data_val["text"])

y_train = data_train["label"]
y_val = data_val["label"]

# Initialiseer en train model
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [74]:
# --- INSTALL AND IMPORT ---
%pip install nltk

import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer
# --- DOWNLOADS ---
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# --- CLEANING SETUP ---
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_html(text):
    text = re.sub(r'<(script|style).*?>.*?</\1>', '', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'&[a-z]+;', ' ', text)
    return text

def final_text_cleaning(text):
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\b\w\b', '', text)
    text = re.sub(r'^\w\s+', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'^b\s+', '', text)
    return text.lower().strip()

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in stop_words])

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

def lemmatize_text(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    return " ".join([lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged])

# --- LOAD DATA ---
data = pd.read_csv("../data/kg_train.csv", encoding="latin-1").head(1000)
data.fillna("", inplace=True)

# --- CLEAN TEXT ---
data["text"] = data["text"].apply(lambda t: lemmatize_text(remove_stopwords(final_text_cleaning(clean_html(t)))))

# --- SPLIT ---
from sklearn.model_selection import train_test_split
data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)

# --- ADD INDICATOR FEATURES ---
money_symbols = "|".join(["euro","dollar","pound","\u20ac","\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

for df in [data_train, data_val]:
    df["money_mark"] = df["text"].str.contains(money_symbols, case=False)*1
    df["suspicious_words"] = df["text"].str.contains(suspicious_words, case=False)*1
    df["text_len"] = df["text"].apply(len)

# --- TF-IDF VECTORIZER ---
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import scipy.sparse as sp

vectorizer = CountVectorizer(ngram_range=(1,2))
X_train_base = vectorizer.fit_transform(data_train["text"])
X_val_base = vectorizer.transform(data_val["text"])

X_train = sp.hstack([X_train_base, data_train[["money_mark", "suspicious_words", "text_len"]].values])
X_val = sp.hstack([X_val_base, data_val[["money_mark", "suspicious_words", "text_len"]].values])

# --- TRAIN CLASSIFIER ---
clf = MultinomialNB()
clf.fit(X_train, data_train["label"])
y_pred = clf.predict(X_val)

# --- REPORT ---
print("Accuracy:", accuracy_score(data_val["label"], y_pred))
print("\nClassification Report:")
print(classification_report(data_val["label"], y_pred, target_names=["ham", "spam"]))


Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Accuracy: 0.525

Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.24      0.39       125
        spam       0.44      1.00      0.61        75

    accuracy                           0.53       200
   macro avg       0.72      0.62      0.50       200
weighted avg       0.79      0.53      0.47       200

