In [4]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [11]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [None]:
X = data["text"] 
y = data["label"]  

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 800
Test size: 200


## Data Preprocessing

In [None]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [21]:
import re
from bs4 import BeautifulSoup, Comment
from html import unescape

def clean_html(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")  # or "html.parser"

    # 1) Remove inline JS/CSS:
    #    a) <script> and <style> blocks
    for tag in soup(["script", "style"]):
        tag.decompose()
    #    b) inline attributes like onclick, onload, style=...
    for tag in soup.find_all(True):
        # drop event handlers and inline styles
        for attr in list(tag.attrs):
            if attr.lower().startswith("on") or attr.lower() == "style":
                del tag.attrs[attr]

    # 2) Remove HTML comments (do this before stripping tags)
    for c in soup.find_all(string=lambda t: isinstance(t, Comment)):
        c.extract()

    # 3) Remove remaining tags -> keep only visible text
    text = soup.get_text(separator=" ")

    # Unescape entities and normalize whitespace
    text = unescape(text)
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [22]:
def normalize_text(text: str) -> str:
    # 1) Remove all special characters (keep only letters and spaces)
    text = re.sub(r"[^A-Za-z\s]", " ", text)

    # 2) Remove numbers
    text = re.sub(r"\d+", " ", text)

    # 3) Remove all single characters (isolated letters)
    text = re.sub(r"\b[a-zA-Z]\b", " ", text)

    # 4) Remove single characters from the start of words
    text = re.sub(r"\b[a-zA-Z]\s+", " ", text)

    # 5) Substitute multiple spaces with single space
    text = re.sub(r"\s+", " ", text)

    # 6) Remove prefixed 'b' (like b'word' when decoding bytes)
    text = re.sub(r"^b\s+", "", text)

    # 7) Convert to lowercase
    text = text.lower().strip()

    return text


In [24]:
sample = "123 trying! this.. thingy? I gUESS?!"
print(normalize_text(sample))


trying this thingy guess


## Now let's work on removing stopwords
Remove the stopwords.

In [25]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pktto\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
import re
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words("english"))

def normalize_text(text: str) -> str:
    # 1) Remove all special characters (keep only letters and spaces)
    text = re.sub(r"[^A-Za-z\s]", " ", text)

    # 2) Remove numbers
    text = re.sub(r"\d+", " ", text)

    # 3) Remove all single characters (isolated letters)
    text = re.sub(r"\b[a-zA-Z]\b", " ", text)

    # 4) Remove single characters from the start of words
    text = re.sub(r"\b[a-zA-Z]\s+", " ", text)

    # 5) Substitute multiple spaces with single space
    text = re.sub(r"\s+", " ", text)

    # 6) Remove prefixed 'b'
    text = re.sub(r"^b\s+", "", text)

    # 7) Convert to lowercase
    text = text.lower().strip()

    # 8) Remove stopwords
    tokens = [word for word in text.split() if word not in STOPWORDS]
    text = " ".join(tokens)

    return text


In [27]:
sample = "I'm trying to remove some stopwords to see if this works or not, so this is an example."
print(normalize_text(sample))


trying remove stopwords see works example


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [28]:
import nltk
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pktto\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\pktto\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [29]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

STOPWORDS = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def normalize_text(text: str, remove_stopwords: bool = True, do_lemmatize: bool = True) -> str:
    # 1) Remove all special characters
    text = re.sub(r"[^A-Za-z\s]", " ", text)

    # 2) Remove numbers
    text = re.sub(r"\d+", " ", text)

    # 3) Remove single characters
    text = re.sub(r"\b[a-zA-Z]\b", " ", text)

    # 4) Remove single characters from start
    text = re.sub(r"\b[a-zA-Z]\s+", " ", text)

    # 5) Collapse multiple spaces
    text = re.sub(r"\s+", " ", text)

    # 6) Remove prefixed 'b'
    text = re.sub(r"^b\s+", "", text)

    # 7) Lowercase
    text = text.lower().strip()

    # Tokenize
    tokens = text.split()

    # 8) Remove stopwords (optional)
    if remove_stopwords:
        tokens = [w for w in tokens if w not in STOPWORDS]

    # 9) Lemmatize (optional)
    if do_lemmatize:
        tokens = [lemmatizer.lemmatize(w) for w in tokens]

    return " ".join(tokens)


In [31]:
sample = "Cats are running faster than the dogs were these past days."
print(normalize_text(sample))

cat running faster dog past day


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [36]:
def get_top_words_clean(data, label, n=10):
    texts = data[data["label"] == label]["text"]
    tokens = []
    for msg in texts:
        clean_msg = normalize_text(msg)  # your cleaning fn
        tokens.extend(clean_msg.split())
    counter = Counter(tokens)
    return counter.most_common(n)

In [43]:
# Map 0 -> ham, 1 -> spam
label_map = {0: "ham", 1: "spam"}
data["label_name"] = data["label"].map(label_map)

# Check it worked
print(data[["label", "label_name"]].head())


   label label_name
0      1       spam
1      0        ham
2      0        ham
3      1       spam
4      0        ham


In [44]:
# Using the new column
ham_texts = data[data["label_name"] == "ham"]["text"]
spam_texts = data[data["label_name"] == "spam"]["text"]

from collections import Counter

def top_words(texts, n=10):
    all_words = " ".join(texts).lower().split()
    counter = Counter(all_words)
    return counter.most_common(n)

print("Top HAM words:", top_words(ham_texts))
print("Top SPAM words:", top_words(spam_texts))


Top HAM words: [('the', 1710), ('to', 1056), ('and', 808), ('of', 789), ('a', 610), ('in', 582), ('that', 385), ('is', 379), ('for', 357), ('on', 304)]
Top SPAM words: [('the', 6785), ('to', 5494), ('of', 4858), ('and', 3867), ('in', 3156), ('i', 2859), ('you', 2722), ('this', 2523), ('a', 2245), ('my', 2026)]


## Extra features

In [45]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x)) 

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x)) 

data_train.head()

NameError: name 'data_train' is not defined

## How would work the Bag of Words with Count Vectorizer concept?

In [None]:
# Your code

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [None]:
# Your code

## And the Train a Classifier?

In [None]:
# Your code

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [None]:
# Your code