In [29]:
#from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [30]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development. 

In [31]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [32]:
from sklearn.model_selection import train_test_split

# Split data into features (X) and target (y)
X = data['text']  # Email text
y = data['label']  # Spam/Ham labels

# Split into 80% train, 20% test (adjust test_size if needed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

Train size: 800, Test size: 200


## Data Preprocessing

In [33]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [34]:
import re

def clean_html(text):
    # Remove inline JavaScript/CSS:
    text = re.sub(r'<(script|style).*?>.*?</\1>', '', text, flags=re.DOTALL)
    # Remove HTML comments:
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # Remove remaining HTML tags:
    text = re.sub(r'<[^>]+>', '', text)
    return text

In [35]:
X_train = X_train.apply(clean_html)
X_test = X_test.apply(clean_html)

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [37]:
def preprocess_text(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Remove single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Remove single characters from the start of the text
    text = re.sub(r'^[a-zA-Z]\s+', '', text)
    # Substitute multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # Remove the 'b' character prefix from byte conversions (if present)
    text = re.sub(r'^b\s+', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

In [38]:
X_train = X_train.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)

## Now let's work on removing stopwords
Remove the stopwords.

In [39]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = text.split()
    filtered = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered)

X_train = X_train.apply(remove_stopwords)
X_test = X_test.apply(remove_stopwords)



## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  # WordNet data for lemmatization
nltk.download('punkt')    # Tokenizer


from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized)

X_train = X_train.apply(lemmatize_text)
X_test = X_test.apply(lemmatize_text)




[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rospi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rospi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rospi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


442    dear good day hope fine cdear writting mail du...
962    mr henry kaborethe chief auditor inchargeforei...
971                                                     
190    desk dr adamu ismalerauditing accounting manag...
551    dear friend name loi estrada wife mr josephest...
495    amr ecollince addoattn ai need urgent assistan...
845                   pls send call sheet equadoran clue
821                                          yes arrange
409          fyi limit haiti email major fyi information
794    mr zuhair idris jordan kuwait bankcredit depar...
Name: text, dtype: object


In [52]:
print(y_train.unique())
print(X_train.sample(5))

[1 0]
847                                     rec printed desk
182                                                  fyi
372    depart private residence en route white house ...
619    devon drive independence lay enugu state urgen...
443    sbwhoeoptuesday december pmany truth account s...
Name: text, dtype: object


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [53]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Vectorize with Bag of Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X_train)

# Create a DataFrame from the Bag of Words
bow_df = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
bow_df['label'] = y_train.values

In [54]:
ham_words = bow_df[bow_df['label'] == 0].drop('label', axis=1).sum().sort_values(ascending=False)
spam_words = bow_df[bow_df['label'] == 1].drop('label', axis=1).sum().sort_values(ascending=False)

# Get top 10
top_ham = ham_words.head(10)
top_spam = spam_words.head(10)

print("Top 10 words in HAM messages:")
print(top_ham)

print("\nTop 10 words in SPAM messages:")
print(top_spam)


Top 10 words in HAM messages:
state        97
president    95
would        92
mr           90
obama        80
percent      80
call         77
work         72
time         70
one          69
dtype: int64

Top 10 words in SPAM messages:
money          756
account        670
bank           614
fund           600
transaction    432
business       412
country        397
nbsp           387
mr             383
million        361
dtype: int64


## Extra features

In [56]:
from sklearn.model_selection import train_test_split

# Split original data into train and validation sets
data_train, data_val = train_test_split(
    data, test_size=0.2, random_state=42, stratify=data['label']
)

# Apply preprocessing pipeline to create 'preprocessed_text'
data_train['preprocessed_text'] = (
    data_train['text']
    .apply(preprocess_text)        # your cleaning function
    .apply(remove_stopwords)       # stopwords removal
    .apply(lemmatize_text)         # lemmatization
)

data_val['preprocessed_text'] = (
    data_val['text']
    .apply(preprocess_text)
    .apply(remove_stopwords)
    .apply(lemmatize_text)
)


In [57]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x)) 

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x)) 

data_train.head()

Unnamed: 0,text,label,clean_text,preprocessed_text,money_mark,suspicious_words,text_len
442,Dear=2C Good day hope fine=2Cdear am writting ...,1,dear c good day hope fine cdear am writting ...,dear good day hope fine cdear writting mail du...,1,1,1002
962,FROM MR HENRY KABORETHE CHIEF AUDITOR INCHARGE...,1,from mr henry kaborethe chief auditor incharge...,mr henry kaborethe chief auditor inchargeforei...,1,1,1946
971,Will do.,0,will do,,1,0,0
190,FROM THE DESK OF DR.ADAMU ISMALERAUDITING AND...,1,from the desk of dr adamu ismalerauditing and ...,desk dr adamu ismalerauditing accounting manag...,1,1,383
551,"Dear Friend, My name is LOI C.ESTRADA,The wife...",1,dear friend my name is loi estrada the wife...,dear friend name loi estrada wife mr josephest...,1,1,1475


## How would work the Bag of Words with Count Vectorizer concept?

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer(max_features=5000)  # limit vocab size for speed

# Fit on training preprocessed texts and transform both train and test sets
X_train_bow = vectorizer.fit_transform(data_train['preprocessed_text'])
X_val_bow = vectorizer.transform(data_val['preprocessed_text'])

print(f"Vocabulary size: {len(vectorizer.get_feature_names_out())}")
print(f"Shape of train BoW matrix: {X_train_bow.shape}")
print(f"Shape of val BoW matrix: {X_val_bow.shape}")

Vocabulary size: 5000
Shape of train BoW matrix: (800, 5000)
Shape of val BoW matrix: (200, 5000)


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # limit vocab size for speed

# Fit on training data and transform both train and validation sets
X_train_tfidf = tfidf_vectorizer.fit_transform(data_train['preprocessed_text'])
X_val_tfidf = tfidf_vectorizer.transform(data_val['preprocessed_text'])

# Print shapes
print(f"TF-IDF train shape: {X_train_tfidf.shape}")
print(f"TF-IDF validation shape: {X_val_tfidf.shape}")


TF-IDF train shape: (800, 5000)
TF-IDF validation shape: (200, 5000)


## And the Train a Classifier?

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize the classifier
clf = LogisticRegression(max_iter=1000, random_state=42)

# Train on the TF-IDF features
clf.fit(X_train_tfidf, data_train['label'])

# Predict on validation set
y_pred = clf.predict(X_val_tfidf)

# Evaluate
print("Accuracy:", accuracy_score(data_val['label'], y_pred))
print("\nClassification Report:\n", classification_report(data_val['label'], y_pred))


Accuracy: 0.995

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       112
           1       1.00      0.99      0.99        88

    accuracy                           0.99       200
   macro avg       1.00      0.99      0.99       200
weighted avg       1.00      0.99      0.99       200



### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [61]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Initialize the classifier with default parameters
clf = MultinomialNB()

# Train on BoW features (or your chosen representation)
clf.fit(X_train_bow, data_train['label'])

# Predict on validation set
y_pred = clf.predict(X_val_bow)

# Evaluate
print("Accuracy:", accuracy_score(data_val['label'], y_pred))
print("\nClassification Report:\n", classification_report(data_val['label'], y_pred))


Accuracy: 0.98

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.96      0.98       112
           1       0.96      1.00      0.98        88

    accuracy                           0.98       200
   macro avg       0.98      0.98      0.98       200
weighted avg       0.98      0.98      0.98       200



In [62]:
from scipy.sparse import hstack

# Extract engineered features as numpy arrays or sparse matrices
engineered_train = data_train[['money_mark', 'suspicious_words', 'text_len']].values
engineered_val = data_val[['money_mark', 'suspicious_words', 'text_len']].values

# Combine with BoW or TF-IDF sparse matrices
X_train_combined = hstack([X_train_bow, engineered_train])
X_val_combined = hstack([X_val_bow, engineered_val])

# Train classifier on combined features
clf.fit(X_train_combined, data_train['label'])
y_pred_combined = clf.predict(X_val_combined)

print("Accuracy with engineered features:", accuracy_score(data_val['label'], y_pred_combined))
print("\nClassification Report with engineered features:\n", classification_report(data_val['label'], y_pred_combined))


Accuracy with engineered features: 0.99

Classification Report with engineered features:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       112
           1       0.98      1.00      0.99        88

    accuracy                           0.99       200
   macro avg       0.99      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200

