In [8]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from sklearn.model_selection import train_test_split

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [9]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("kg_train.csv", encoding="latin-1")


# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [10]:
X = data["text"]        # features (the actual messages)
y = data["label"]       # target (spam or ham)

# Split into training (80%) and test (20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Data Preprocessing

In [11]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [14]:
import re

def clean_html(text):
    # 1. Remove inline <script> ... </script> and <style> ... </style> blocks
    text = re.sub(r"<script.*?>.*?</script>", "", text, flags=re.DOTALL)
    text = re.sub(r"<style.*?>.*?</style>", "", text, flags=re.DOTALL)
    
    # 2. Remove HTML comments <!-- ... -->
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    
    # 3. Remove any remaining HTML tags < ... >
    text = re.sub(r"<.*?>", "", text)
    
    # Return the clean text
    return text

# test to see if it works
sample_html = "<html><body><!-- comment -->Hello <b>World</b><script>var x=1;</script></body></html>"
print("Before:", sample_html)
print("After:", clean_html(sample_html))

Before: <html><body><!-- comment -->Hello <b>World</b><script>var x=1;</script></body></html>
After: Hello World


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [15]:
# Your code
def clean_text(text):
    # 1. Remove special characters (keep only letters and spaces)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    
    # 2. Remove numbers
    text = re.sub(r"\d+", " ", text)
    
    # 3. Remove single characters (like "a", "b")
    text = re.sub(r"\b[a-zA-Z]\b", " ", text)
    
    # 4. Remove single characters at the start of text
    text = re.sub(r"^[a-zA-Z]\s+", " ", text)
    
    # 5. Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text)
    
    # 6. Convert to lowercase
    text = text.lower()
    
    # Return the clean text
    return text.strip()


#Quick test
sample = "This is an Example! With NUMBERS 123, and some single letters: a b c."
print("Before:", sample)
print("After:", clean_text(sample))

Before: This is an Example! With NUMBERS 123, and some single letters: a b c.
After: this is an example with numbers and some single letters


## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    words = text.split()  # split sentence into words
    # keep only words that are NOT stopwords
    filtered = [w for w in words if w not in stop_words]
    return " ".join(filtered)

# Quick test
sample = "this is a very simple example of removing stopwords"
print("Before:", sample)
print("After:", remove_stopwords(sample))

Before: this is a very simple example of removing stopwords
After: simple example removing stopwords


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = text.split()
    lemmas = [lemmatizer.lemmatize(w) for w in words]
    return " ".join(lemmas)

# Quick test
sample = "cats are running faster and mice were better"
print("Before:", sample)
print("After:", lemmatize_text(sample))

Before: cats are running faster and mice were better
After: cat are running faster and mouse were better


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [23]:
print(data["label"].unique())


[1 0]


In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# ---- HAM (label = 0) ----
ham_text = " ".join(data[data["label"] == 0]["text"])
ham_words = ham_text.split()
ham_freq = Counter(ham_words).most_common(10)
print("Top 10 HAM words:", ham_freq)

# ---- SPAM (label = 1) ----
spam_text = " ".join(data[data["label"] == 1]["text"])
spam_words = spam_text.split()
spam_freq = Counter(spam_words).most_common(10)
print("Top 10 SPAM words:", spam_freq)


Top 10 HAM words: [('the', 1593), ('to', 1039), ('of', 782), ('and', 781), ('a', 591), ('in', 540), ('that', 371), ('is', 353), ('for', 344), ('on', 283)]
Top 10 SPAM words: [('the', 5676), ('to', 4688), ('of', 4118), ('and', 3234), ('in', 2672), ('I', 2633), ('you', 2273), ('this', 2010), ('a', 1939), ('for', 1685)]


## Extra features

In [29]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])
data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x)) 
data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x)) 
data_train.head()

NameError: name 'data_train' is not defined

## How would work the Bag of Words with Count Vectorizer concept?

In [None]:
# Your code

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [None]:
# Your code

## And the Train a Classifier?

In [None]:
# Your code

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [None]:
# Your code