In [42]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import display, HTML


# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [43]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [44]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv", encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [45]:
X = data['text']            # Features (SMS content)
y = data['label']           # Labels (ham/spam)

# Split: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# checking split
print("Training size:", len(X_train))
print("Test size:", len(X_test))

Training size: 800
Test size: 200


## Data Preprocessing

In [46]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [47]:
import re

def clean_html(text):
    # Remove inline JavaScript and CSS
    text = re.sub(r'<(script|style).*?>.*?</\1>', '', text, flags=re.DOTALL)

    # Remove HTML comments
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Remove remaining HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    return text

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [48]:
def clean_text(text):
    text = re.sub(r'[^\w\s]', ' ', text)          # remove special characters
    text = re.sub(r'\d+', '', text)               # remove numbers
    text = re.sub(r'\b\w\b', '', text)            # remove single characters
    text = re.sub(r'^\s*\w\s+', '', text)         # remove single characters from start
    text = re.sub(r'\s+', ' ', text)              # multiple spaces to single
    text = re.sub(r'^b\s+', '', text)             # remove 'b' if prefixed
    text = text.lower().strip()                   # convert to lowercase
    return text

## Now let's work on removing stopwords
Remove the stopwords.

In [49]:
stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    words = text.split()  # Tokenize by whitespace
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [50]:
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def clean_encoded_chars(text):
    return re.sub(r'=2C', ',', text)

def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    return ' '.join(lemmatized_words)

# Clean HTML
X_train_clean = X_train.apply(clean_html)
X_test_clean = X_test.apply(clean_html)

# Clean text
X_train_clean = X_train_clean.apply(clean_text)
X_test_clean = X_test_clean.apply(clean_text)

# Remove stopwords
X_train_cleaned = X_train_clean.apply(remove_stopwords)
X_test_cleaned = X_test_clean.apply(remove_stopwords)

# Decode weird HTML artifacts like '=2C'
X_train_cleaned = X_train_cleaned.apply(clean_encoded_chars)
X_test_cleaned = X_test_cleaned.apply(clean_encoded_chars)

# Lemmatize
X_train_lemmatized = X_train_cleaned.apply(lemmatize_text)
X_test_lemmatized = X_test_cleaned.apply(lemmatize_text)

# Convert lemmatized version to lowercase
X_train_lemmatized = X_train_lemmatized.str.lower()
X_test_lemmatized = X_test_lemmatized.str.lower()

[nltk_data] Downloading package wordnet to /Users/affy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/affy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [51]:
print(y_train.unique())

[1 0]


In [53]:
# Separate cleaned messages based on label
ham_cleaned = X_train_cleaned[y_train == 0]   # ham is labeled as 0
spam_cleaned = X_train_cleaned[y_train == 1]  # spam is labeled as 1

# Tokenize into words
ham_words = ' '.join(ham_cleaned).lower().split()
spam_words = ' '.join(spam_cleaned).lower().split()

# Count frequencies
ham_freq = {}
for word in ham_words:
    ham_freq[word] = ham_freq.get(word, 0) + 1

spam_freq = {}
for word in spam_words:
    spam_freq[word] = spam_freq.get(word, 0) + 1

# Sort by frequency and get top 10 words
top_ham = sorted(ham_freq.items(), key=lambda item: item[1], reverse=True)[:10]
top_spam = sorted(spam_freq.items(), key=lambda item: item[1], reverse=True)[:10]

# Display results
print("Top 10 Words in HAM Messages:")
for word, count in top_ham:
    print(f"{word}: {count}")

print("\nTop 10 Words in SPAM Messages:")
for word, count in top_spam:
    print(f"{word}: {count}")

Top 10 Words in HAM Messages:
president: 94
would: 92
mr: 85
obama: 80
percent: 80
state: 77
pm: 68
work: 67
one: 67
call: 61

Top 10 Words in SPAM Messages:
money: 756
account: 630
bank: 598
us: 590
transaction: 422
business: 412
nbsp: 387
fund: 385
country: 370
million: 338


## Extra features

In [54]:
# Wrap up the preprocessed data into a DataFrame
data_train = pd.DataFrame({
    'preprocessed_text': X_train_lemmatized,
    'label': y_train
})

data_val = pd.DataFrame({
    'preprocessed_text': X_test_lemmatized,
    'label': y_test
})

In [55]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x)) 

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x)) 

data_train.head()

Unnamed: 0,preprocessed_text,label,money_mark,suspicious_words,text_len
442,dear good day hope fine cdear writting mail du...,1,1,1,980
962,mr henry kaborethe chief auditor inchargeforei...,1,1,1,1981
971,,0,1,0,0
190,desk dr adamu ismalerauditing account manager ...,1,1,1,372
551,dear friend name loi estrada wife mr josephest...,1,1,1,1458


## How would work the Bag of Words with Count Vectorizer concept?

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Load the vectorizer
bow_vectorizer = CountVectorizer()

# Fit training data then transform both train and validation sets
X_train_bow = bow_vectorizer.fit_transform(data_train['preprocessed_text'])
X_val_bow = bow_vectorizer.transform(data_val['preprocessed_text'])

# Print the shape
print("Bag of Words - Train shape:", X_train_bow.shape)
print("Bag of Words - Validation shape:", X_val_bow.shape)

Bag of Words - Train shape: (800, 19686)
Bag of Words - Validation shape: (200, 19686)


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform
X_train_tfidf = tfidf_vectorizer.fit_transform(data_train['preprocessed_text'])
X_val_tfidf = tfidf_vectorizer.transform(data_val['preprocessed_text'])

# Print shape
print("TF-IDF - Train shape:", X_train_tfidf.shape)
print("TF-IDF - Validation shape:", X_val_tfidf.shape)

TF-IDF - Train shape: (800, 19686)
TF-IDF - Validation shape: (200, 19686)


## And the Train a Classifier?

In [58]:
# Multinomial Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Predict on validation set
y_pred = nb_classifier.predict(X_val_tfidf)

# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.945
              precision    recall  f1-score   support

           0       1.00      0.90      0.95       112
           1       0.89      1.00      0.94        88

    accuracy                           0.94       200
   macro avg       0.94      0.95      0.94       200
weighted avg       0.95      0.94      0.95       200



### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [100]:
# Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [101]:
# download resources
nltk.download('stopwords')
nltk.download('wordnet')

# init tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /Users/affy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/affy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [102]:
def clean_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

def clean_text(text):
    text = re.sub(r'\n|\r', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

def clean_encoded_chars(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

def full_clean_pipeline(text):
    text = clean_html(text)
    text = clean_text(text)
    text = remove_stopwords(text)
    text = clean_encoded_chars(text)
    text = lemmatize_text(text)
    return text.lower()

In [103]:
# Load the data
kg_train = pd.read_csv('../data/kg_train.csv')
kg_test = pd.read_csv('../data/kg_test.csv')

X_train = kg_train['text']
y_train = kg_train['label']
X_test = kg_test['text']

In [104]:
# Apply the cleaning pipeline
X_train_cleaned = X_train.apply(full_clean_pipeline)
X_test_cleaned = X_test.apply(full_clean_pipeline)

  return BeautifulSoup(text, "html.parser").get_text()
  return BeautifulSoup(text, "html.parser").get_text()


In [105]:
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train_cleaned, y_train, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=5,
    max_df=0.9,
    max_features=5000
)

X_train_sub_tfidf = vectorizer.fit_transform(X_train_sub)
X_val_tfidf = vectorizer.transform(X_val)

model = MultinomialNB()
model.fit(X_train_sub_tfidf, y_train_sub)

val_predictions = model.predict(X_val_tfidf)

print("accuracy:", accuracy_score(y_val, val_predictions))
print(classification_report(y_val, val_predictions))

accuracy: 0.9681475272422464
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       674
           1       0.99      0.94      0.96       519

    accuracy                           0.97      1193
   macro avg       0.97      0.96      0.97      1193
weighted avg       0.97      0.97      0.97      1193



In [106]:
# Retrain on the full training set and predict on the test set
X_train_tfidf = vectorizer.fit_transform(X_train_cleaned)
X_test_tfidf = vectorizer.transform(X_test_cleaned)

final_model = MultinomialNB()
final_model.fit(X_train_tfidf, y_train)

test_predictions = final_model.predict(X_test_tfidf)

In [107]:
final_model_submission = pd.DataFrame({
    'text': kg_test['text'],
    'predicted_label': test_predictions
})

final_model_submission.head()

# save to file
final_model_submission.to_csv('spam_predictions.csv', index=False)