In [None]:
from IPython.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))


# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development.

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
import nltk

In [None]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development.
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

### Let's divide the training and test set into two partitions

In [None]:
from sklearn.model_selection import train_test_split

# Assuming 'data' has columns 'text' for email content and 'label' for target
X = data['text']   # features
y = data['label']  # target

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=42)



## Data Preprocessing

In [None]:

import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:
import re

def clean_html(text):
    # Remove inline JavaScript/CSS
    cleaned_text = re.sub(r"<(style|script).*?>.*?</\s*\1>", "", text, flags=re.DOTALL)
    # Remove html comments
    cleaned_text = re.sub(r"<!--.*?-->", "", cleaned_text, flags=re.DOTALL)
    # Remove the remaining tags
    cleaned_text = re.sub(r"<.*?>", "", cleaned_text)
    return cleaned_text

X_train_cleaned = X_train.apply(clean_html)
X_test_cleaned = X_test.apply(clean_html)

print("HTML cleaning complete for training data:")
print(X_train_cleaned.head())
print("\nHTML cleaning complete for test data:")
print(X_test_cleaned.head())



HTML cleaning complete for training data:
29     ----------- REGARDS, MR NELSON SMITH.KINDLY RE...
535    I have not been able to reach oscar this am. W...
695    ; Huma Abedin B6I'm checking with Pat on the 5...
557    I can have it announced here on Monday - can't...
836        BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...
Name: text, dtype: object

HTML cleaning complete for test data:
521    Dear Sir=2C I wish you go through this offer t...
737    To take your mind off the Balkans for a second...
740                         Pls keep the updates coming!
660    CHRIST BETHEL HOSPITAL11 RUE ABOBOTE,ABIDJANIV...
411    sbwhoeopFriday February 5 2010 7:11 AMHRe: Bra...
Name: text, dtype: object


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters

- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
import re

def clean_text(text):
    # 1. Remove special characters (keep only letters and spaces)
    cleaned_text = re.sub(r'[^A-Za-z\s]', ' ', text)

    # 2. Remove numbers
    cleaned_text = re.sub(r'\d+', '', text)

    # 3. Remove all single characters (isolated letters)
    cleaned_text = re.sub(r'\b\w\b', '', text)

    # 4. Remove single characters from the start of the string
    cleaned_text = re.sub(r'^\w\s+', '', text)

    # 5. Substitute multiple spaces with single space
    cleaned_text = re.sub(r'\s+', ' ', text)

    # 6. Remove prefixed 'b' (in case of byte string artifacts like b'word')
    cleaned_text = re.sub(r"^b\s+", '', text)

    # 7. Convert to lowercase
    cleaned_text = text.lower()

    return cleaned_text.strip()

X_train_cleaned_text = X_train_cleaned.apply(clean_text)
X_test_cleaned_text = X_test_cleaned.apply(clean_text)

print("Text cleaning complete for training data:")
print(X_train_cleaned_text.head())
print("\nText cleaning complete for test data:")
print(X_test_cleaned_text.head())


Text cleaning complete for training data:
29     ----------- regards, mr nelson smith.kindly re...
535    i have not been able to reach oscar this am. w...
695    ; huma abedin b6i'm checking with pat on the 5...
557    i can have it announced here on monday - can't...
836    bank of africaagence san pedro14 bp 1210 san p...
Name: text, dtype: object

Text cleaning complete for test data:
521    dear sir=2c i wish you go through this offer t...
737    to take your mind off the balkans for a second...
740                         pls keep the updates coming!
660    christ bethel hospital11 rue abobote,abidjaniv...
411    sbwhoeopfriday february 5 2010 7:11 amhre: bra...
Name: text, dtype: object


## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
# Your code
from nltk.corpus import stopwords
import nltk
import sys

def remove_stopwords(text):
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        print("NLTK stopwords not found. Downloading...", file=sys.stderr)
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    cleaned_text = ' '.join(word for word in text.split() if word not in stop_words)
    return cleaned_text

X_train_no_stopwords = X_train_cleaned_text.apply(remove_stopwords)
X_test_no_stopwords = X_test_cleaned_text.apply(remove_stopwords)

print("Stopword removal complete for training data:")
print(X_train_no_stopwords.head())
print("\nStopword removal complete for test data:")
print(X_test_no_stopwords.head())

Stopword removal complete for training data:
29     ----------- regards, mr nelson smith.kindly re...
535    able reach oscar am. supposed send pdb 11. u r...
695    ; huma abedin b6i'm checking pat 50k work jack...
557                       announced monday - can't today
836    bank africaagence san pedro14 bp 1210 san pedr...
Name: text, dtype: object

Stopword removal complete for test data:
521    dear sir=2c wish go offer consider partner=2ei...
737    take mind balkans second see ã¢ââ great plug...
740                             pls keep updates coming!
660    christ bethel hospital11 rue abobote,abidjaniv...
411    sbwhoeopfriday february 5 2010 7:11 amhre: bra...
Name: text, dtype: object


In [None]:
# import nltk
# from nltk.corpus import stopwords

# # Make sure stopwords are available
# nltk.download('stopwords', quiet=True)
# stop_words = set(stopwords.words('english'))

# # Simple stopword removal function
# def remove_stopwords(text):
#     return ' '.join([w for w in text.split() if w not in stop_words])

# # Apply to training and test sets
# X_train_no_stopwords = X_train_cleaned_text.apply(remove_stopwords)
# X_test_no_stopwords = X_test_cleaned_text.apply(remove_stopwords)

# print(X_train_no_stopwords.head())
# print(X_test_no_stopwords.head())


29     ----------- regards, mr nelson smith.kindly re...
535    able reach oscar am. supposed send pdb 11. u r...
695    ; huma abedin b6i'm checking pat 50k work jack...
557                       announced monday - can't today
836    bank africaagence san pedro14 bp 1210 san pedr...
Name: text, dtype: object
521    dear sir=2c wish go offer consider partner=2ei...
737    take mind balkans second see ã¢ââ great plug...
740                             pls keep updates coming!
660    christ bethel hospital11 rue abobote,abidjaniv...
411    sbwhoeopfriday february 5 2010 7:11 amhre: bra...
Name: text, dtype: object


In [None]:

# import nltk
# import sys
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize

# # Download stopwords & tokenizer
# nltk.download('punkt')
# nltk.download('stopwords')

# stop_words = set(stopwords.words('english'))

# # Tokenize text
# tokenized_text = word_tokenize(text)

# # Remove stopwords
# no_stop_words_text = [word for word in tokenized_text if word.lower() not in stop_words]

# print(no_stop_words_text)


['striped', 'bats', 'hanging', 'feet', 'best']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
# import nltk
# from nltk.stem import WordNetLemmatizer

# # Download WordNet once
# nltk.download('wordnet', quiet=True)

# # Create the lemmatizer once
# lemmatizer = WordNetLemmatizer()

# # Define function
# def lemmatize_text(text):
#     return ' '.join(lemmatizer.lemmatize(w) for w in text.split())

# # Apply function
# X_train_lemmatized = X_train_no_stopwords.apply(lemmatize_text)
# X_test_lemmatized = X_test_no_stopwords.apply(lemmatize_text)

# print(X_train_lemmatized.head())
# print(X_test_lemmatized.head())




29     ----------- regards, mr nelson smith.kindly re...
535    able reach oscar am. supposed send pdb 11. u r...
695    ; huma abedin b6i'm checking pat 50k work jack...
557                       announced monday - can't today
836    bank africaagence san pedro14 bp 1210 san pedr...
Name: text, dtype: object
521    dear sir=2c wish go offer consider partner=2ei...
737    take mind balkan second see ã¢ââ great plug ...
740                              pls keep update coming!
660    christ bethel hospital11 rue abobote,abidjaniv...
411    sbwhoeopfriday february 5 2010 7:11 amhre: bra...
Name: text, dtype: object


In [None]:
# Your code
from nltk.stem import WordNetLemmatizer
import nltk
import sys

# Download wordnet before applying the function
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
     print("WordNet not found. Downloading...", file=sys.stderr)
     nltk.download('wordnet')


def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())


X_train_lemmatized = X_train_no_stopwords.apply(lemmatize_text)
X_test_lemmatized = X_test_no_stopwords.apply(lemmatize_text)

print("Lemmatization complete for training data:")
print(X_train_lemmatized.head())
print("\nLemmatization complete for test data:")
print(X_test_lemmatized.head())

WordNet not found. Downloading...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Lemmatization complete for training data:
29     ----------- regards, mr nelson smith.kindly re...
535    able reach oscar am. supposed send pdb 11. u r...
695    ; huma abedin b6i'm checking pat 50k work jack...
557                       announced monday - can't today
836    bank africaagence san pedro14 bp 1210 san pedr...
Name: text, dtype: object

Lemmatization complete for test data:
521    dear sir=2c wish go offer consider partner=2ei...
737    take mind balkan second see ã¢ââ great plug ...
740                              pls keep update coming!
660    christ bethel hospital11 rue abobote,abidjaniv...
411    sbwhoeopfriday february 5 2010 7:11 amhre: bra...
Name: text, dtype: object


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
# from collections import Counter

# # Separate ham and spam messages
# ham_texts = data[data['label'] == 'ham']['text']
# spam_texts = data[data['label'] == 'spam']['text']

# # Tokenize all ham words and spam words
# ham_words = ' '.join(ham_texts).lower().split()
# spam_words = ' '.join(spam_texts).lower().split()

# # Get top 10 most common words in ham and spam
# top_ham = Counter(ham_words).most_common(10)
# top_spam = Counter(spam_words).most_common(10)

# print("Top 10 Ham words:", top_ham)
# print("Top 10 Spam words:", top_spam)


Top 10 Ham words: []
Top 10 Spam words: []


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Build and fit Bag of Words on training text
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)

# Get indices for ham and spam
ham_indices = y_train[y_train == 0].index
spam_indices = y_train[y_train == 1].index

# Sum word counts per class
ham_word_counts = np.array(X_train[ham_indices].sum(axis=0)).flatten()
spam_word_counts = np.array(X_train[spam_indices].sum(axis=0)).flatten()

# Vocabulary
vocab = np.array(bow_vectorizer.get_feature_names_out())

# Top 10 Ham words
top_ham = sorted(zip(vocab, ham_word_counts), key=lambda x: x[1], reverse=True)[:10]
# Top 10 Spam words
top_spam = sorted(zip(vocab, spam_word_counts), key=lambda x: x[1], reverse=True)[:10]

print("Top 10 Ham words:")
for word, count in top_ham:
    print(f"{word}: {count}")

print("\nTop 10 Spam words:")
for word, count in top_spam:
    print(f"{word}: {count}")


Top 10 Ham words:

Top 10 Spam words:


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Combine the lemmatized training data with the labels for analysis
train_data_eda = pd.DataFrame({'text': X_train_lemmatized, 'label': y_train})

# Separate ham and spam messages
ham_messages = train_data_eda[train_data_eda['label'] == 0]['text']
spam_messages = train_data_eda[train_data_eda['label'] == 1]['text']

# Initialize CountVectorizer
count_vectorizer = CountVectorizer()

# Fit and transform ham messages
ham_counts = count_vectorizer.fit_transform(ham_messages)
ham_word_counts = pd.DataFrame(ham_counts.sum(axis=0), columns=count_vectorizer.get_feature_names_out()).T
top_ham_words = ham_word_counts.sort_values(by=0, ascending=False).head(10)

print("Top 10 Ham Words:")
display(top_ham_words)

# Fit and transform spam messages
spam_counts = count_vectorizer.fit_transform(spam_messages)
spam_word_counts = pd.DataFrame(spam_counts.sum(axis=0), columns=count_vectorizer.get_feature_names_out()).T
top_spam_words = spam_word_counts.sort_values(by=0, ascending=False).head(10)

print("\nTop 10 Spam Words:")
display(top_spam_words)

Top 10 Ham Words:


Unnamed: 0,0
state,106
would,94
mr,89
president,89
pm,85
obama,77
percent,76
call,73
secretary,72
one,71



Top 10 Spam Words:


Unnamed: 0,0
2e,1604
money,837
2c,769
account,730
bank,642
fund,568
transaction,460
business,424
country,403
you,390


## Extra features

In [None]:
# data_train = X_train
# data_val = X_test
# preprocessed_text = clean_text

In [None]:
import pandas as pd
import re

# Example: if you already have X_train, X_test, y_train, y_test
data_train = pd.DataFrame({'text': X_train, 'label': y_train})
data_val = pd.DataFrame({'text': X_test, 'label': y_test})

# STEP 1 — Create preprocessed_text column
def preprocess_text(text):
    text = text.lower()                # lowercase
    text = re.sub(r'\W+', ' ', text)  # remove punctuation & special chars
    return text

data_train['preprocessed_text'] = data_train['text'].apply(preprocess_text)
data_val['preprocessed_text'] = data_val['text'].apply(preprocess_text)

# STEP 2 — Add indicators
money_symbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund",
                             "transfer","transaction","win","deposit","password"])

for df in [data_train, data_val]:
    df['money_mark'] = df['preprocessed_text'].str.contains(money_symbol_list)*1
    df['suspicious_words'] = df['preprocessed_text'].str.contains(suspicious_words)*1
    df['text_len'] = df['preprocessed_text'].apply(len)

# Check
print(data_train.head())


                                                  text  label  \
29   ----------- REGARDS, MR NELSON SMITH.KINDLY RE...      1   
535  I have not been able to reach oscar this am. W...      0   
695  ; Huma Abedin B6I'm checking with Pat on the 5...      0   
557  I can have it announced here on Monday - can't...      0   
836      BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...      1   

                                     preprocessed_text  money_mark  \
29    regards mr nelson smith kindly reply me on my...           0   
535  i have not been able to reach oscar this am we...           0   
695   huma abedin b6i m checking with pat on the 50...           0   
557  i can have it announced here on monday can t t...           0   
836   bank of africaagence san pedro14 bp 1210 san ...           1   

     suspicious_words  text_len  
29                  0        94  
535                 0        99  
695                 0       138  
557                 0        50  
836               

In [None]:
# # We add to the original dataframe two additional indicators (money symbols and suspicious words).
# money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
# suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

# data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
# data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
# data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x))

# data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
# data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
# data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x))

# data_train.head()

## How would work the Bag of Words with Count Vectorizer concept?

In [None]:
# Your code
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train_lemmatized)
X_test_bow = bow_vectorizer.transform(X_test_lemmatized)

print("Shape of Bag of Words vectorized training data:", X_train_bow.shape)
print("Shape of Bag of Words vectorized testing data:", X_test_bow.shape)

Shape of Bag of Words vectorized training data: (800, 22926)
Shape of Bag of Words vectorized testing data: (200, 22926)


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Vectorize all dataset
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_lemmatized)
X_test_tfidf = tfidf_vectorizer.transform(X_test_lemmatized)

# print the shape of the vectorized dataset
print("Shape of TF-IDF vectorized training data:", X_train_tfidf.shape)
print("Shape of TF-IDF vectorized testing data:", X_test_tfidf.shape)

Shape of TF-IDF vectorized training data: (800, 22926)
Shape of TF-IDF vectorized testing data: (200, 22926)


## And the Train a Classifier?

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the Multinomial Naive Bayes classifier with default parameters
model_tfidf = MultinomialNB()

# Train the model using the TF-IDF vectorized training data
model_tfidf.fit(X_train_tfidf, y_train)

# Make predictions on the TF-IDF vectorized testing data
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

# Evaluate the model
print("Evaluation with TF-IDF features:")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_tfidf))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_tfidf))

Evaluation with TF-IDF features:
Accuracy: 0.945

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.91      0.95       125
           1       0.87      1.00      0.93        75

    accuracy                           0.94       200
   macro avg       0.94      0.96      0.94       200
weighted avg       0.95      0.94      0.95       200


Confusion Matrix:
[[114  11]
 [  0  75]]


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [79]:
# # Use the indices from the train/test split to align extra features correctly
# X_train_extra = pd.DataFrame({
#     'money_mark': data['text'].loc[X_train_lemmatized.index].apply(lambda x: int(any(m in x.lower() for m in ["euro","dollar","pound","€",r"\$"]))),
#     'suspicious_words': data['text'].loc[X_train_lemmatized.index].apply(lambda x: int(any(w in x.lower() for w in ["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"]))),
#     'text_len': data['text'].loc[X_train_lemmatized.index].apply(lambda x: len(str(x)))
# }).reset_index(drop=True) # Reset index for hstack compatibility

# X_test_extra = pd.DataFrame({
#     'money_mark': data['text'].loc[X_test_lemmatized.index].apply(lambda x: int(any(m in x.lower() for m in ["euro","dollar","pound","€",r"\$"]))),
#     'suspicious_words': data['text'].loc[X_test_lemmatized.index].apply(lambda x: int(any(w in x.lower() for w in ["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"]))),
#     'text_len': data['text'].loc[X_test_lemmatized.index].apply(lambda x: len(str(x)))
# }).reset_index(drop=True) # Reset index for hstack compatibility

# print("\nShape of extra features training data:", X_train_extra.shape)
# print("Shape of extra features testing data:", X_test_extra.shape)
# print("\nExtra features for training data (first 5 rows):")
# display(X_train_extra.head())
# print("\nExtra features for testing data (first 5 rows):")
# display(X_test_extra.head())

# # 4. Bag of Words + extra flags
# X_train_bow_extra = hstack([X_train_bow, X_train_extra.values])
# X_test_bow_extra = hstack([X_test_bow, X_test_extra.values])

# print("\nShape of Bag of Words + extra training data:", X_train_bow_extra.shape)
# print("Shape of Bag of Words + extra testing data:", X_test_bow_extra.shape)

# # 5. TF-IDF + extra flags
# X_train_tfidf_extra = hstack([X_train_tfidf, X_train_extra.values])
# X_test_tfidf_extra = hstack([X_test_tfidf, X_test_extra.values])

# print("\nShape of TF-IDF + extra training data:", X_train_tfidf_extra.shape)
# print("Shape of TF-IDF + extra testing data:", X_test_tfidf_extra.shape)