In [249]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import display, HTML


# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [250]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [251]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [252]:
# Separate features and target
X = data.drop('label', axis=1)
y = data['label']

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(X_train.shape, X_test.shape)

(800, 1) (200, 1)


## Data Preprocessing

In [253]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [254]:
import re

def clean_html(text):
    # Remove inline JavaScript/CSS:
    text = re.sub(r'<(script|style).*?>.*?</\1>', '', text, flags=re.DOTALL|re.IGNORECASE)
    # Remove HTML comments:
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # Remove remaining HTML tags:
    text = re.sub(r'<[^>]+>', '', text)
    return text


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [255]:

def clean_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Remove single characters
    text = re.sub(r'\b[a-zA-Z]\b', '', text)
    # Remove single characters from the start
    text = re.sub(r'^[a-zA-Z]\s+', '', text)
    # Substitute multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    # Remove prefixed 'b'
    text = re.sub(r'^b\s+', '', text)
    # Convert to lowercase
    text = text.lower()
    return text.strip()


## Now let's work on removing stopwords
Remove the stopwords.

In [256]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = text.split()
    filtered = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered)


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [257]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = text.split()
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized)


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [258]:
# Define ham_texts and spam_texts from the data DataFrame
ham_texts = data[data['label'] == 0]['text'].tolist()
spam_texts = data[data['label'] == 1]['text'].tolist()

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()


# Fit and transform ham messages
ham_matrix = vectorizer.fit_transform(ham_texts)
ham_word_counts = ham_matrix.sum(axis=0).A1
ham_vocab = vectorizer.get_feature_names_out()
ham_freq = dict(zip(ham_vocab, ham_word_counts))
top10_ham = sorted(ham_freq.items(), key=lambda x: x[1], reverse=True)[:10]
print("Top 10 words in ham messages:", top10_ham)

# Fit and transform spam messages
vectorizer = CountVectorizer()  # re-initialize to avoid mixing vocabs
spam_matrix = vectorizer.fit_transform(spam_texts)
spam_word_counts = spam_matrix.sum(axis=0).A1
spam_vocab = vectorizer.get_feature_names_out()
spam_freq = dict(zip(spam_vocab, spam_word_counts))
top10_spam = sorted(spam_freq.items(), key=lambda x: x[1], reverse=True)[:10]
print("Top 10 words in spam messages:", top10_spam)

Top 10 words in ham messages: [('the', 1773), ('to', 1065), ('and', 833), ('of', 791), ('in', 616), ('that', 414), ('is', 385), ('for', 369), ('on', 329), ('you', 311)]
Top 10 words in spam messages: [('the', 7046), ('to', 5593), ('of', 4984), ('and', 3985), ('in', 3289), ('you', 3229), ('this', 2675), ('my', 2143), ('your', 2078), ('for', 2030)]


## Extra features

In [259]:
# Combine X_train/X_test and y_train/y_test into DataFrames for feature engineering
data_train = X_train.copy()
data_train['label'] = y_train
data_val = X_test.copy()
data_val['label'] = y_test

# Preprocess text for feature engineering
def preprocess_pipeline(text):
    text = clean_html(text)
    text = clean_text(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    return text

data_train['preprocessed_text'] = data_train['text'].apply(preprocess_pipeline)
data_val['preprocessed_text'] = data_val['text'].apply(preprocess_pipeline)

In [260]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x)) 

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x)) 

data_train.head()

Unnamed: 0,text,label,preprocessed_text,money_mark,suspicious_words,text_len
442,Dear=2C Good day hope fine=2Cdear am writting ...,1,dear good day hope fine cdear writting mail du...,1,1,998
962,FROM MR HENRY KABORETHE CHIEF AUDITOR INCHARGE...,1,mr henry kaborethe chief auditor inchargeforei...,1,1,1946
971,Will do.,0,,1,0,0
190,FROM THE DESK OF DR.ADAMU ISMALERAUDITING AND...,1,desk dr adamu ismalerauditing accounting manag...,1,1,383
551,"Dear Friend, My name is LOI C.ESTRADA,The wife...",1,dear friend name loi estrada wife mr josephest...,1,1,1475


## How would work the Bag of Words with Count Vectorizer concept?

In [261]:
from sklearn.feature_extraction.text import CountVectorizer

# Example or can I use any csv document here?
#documents = pd.read_csv("../data/kg_train.csv")
documents = [
    "I love machine learning",
    "Machine learning is fun",
    "I love fun"
]

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the documents
X = vectorizer.fit_transform(documents)

# Show the feature names (vocabulary)
print("Vocabulary:", vectorizer.get_feature_names_out())

# Show the Bag of Words matrix
print("Bag of Words matrix:\n", X.toarray())

Vocabulary: ['fun' 'is' 'learning' 'love' 'machine']
Bag of Words matrix:
 [[0 0 1 1 1]
 [1 1 1 0 1]
 [1 0 0 1 0]]


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [262]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Vectorize the dataset using TF-IDF
X_train_tfidf = tfidf_vectorizer.fit_transform(data_train['preprocessed_text'])
X_test_tfidf = tfidf_vectorizer.transform(data_val['preprocessed_text'])

print("Train TF-IDF shape:", X_train_tfidf.shape)
print("Test TF-IDF shape:", X_test_tfidf.shape)


Train TF-IDF shape: (800, 28335)
Test TF-IDF shape: (200, 28335)


## And the Train a Classifier?

In [263]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Train the MultinomialNB classifier
clf = MultinomialNB()
clf.fit(X_train_tfidf, data_train['label'])

# Predict on the test set
y_pred = clf.predict(X_test_tfidf)

# Evaluate the classifier
print("Accuracy:", accuracy_score(data_val['label'], y_pred))
print(classification_report(data_val['label'], y_pred))

Accuracy: 0.95
              precision    recall  f1-score   support

           0       1.00      0.91      0.95       112
           1       0.90      1.00      0.95        88

    accuracy                           0.95       200
   macro avg       0.95      0.96      0.95       200
weighted avg       0.96      0.95      0.95       200



### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [264]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

clf = MultinomialNB()
clf.fit(X_train_tfidf, data_train['label'])

y_pred = clf.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(data_val['label'], y_pred))
print(classification_report(data_val['label'], y_pred))


Accuracy: 0.95
              precision    recall  f1-score   support

           0       1.00      0.91      0.95       112
           1       0.90      1.00      0.95        88

    accuracy                           0.95       200
   macro avg       0.95      0.96      0.95       200
weighted avg       0.96      0.95      0.95       200



In [265]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = text.split()
    filtered = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered)


In [266]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = text.split()
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized)


In [267]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()


# Fit and transform ham messages
ham_matrix = vectorizer.fit_transform(ham_texts)
ham_word_counts = ham_matrix.sum(axis=0).A1
ham_vocab = vectorizer.get_feature_names_out()
ham_freq = dict(zip(ham_vocab, ham_word_counts))
top10_ham = sorted(ham_freq.items(), key=lambda x: x[1], reverse=True)[:10]
print("Top 10 words in ham messages:", top10_ham)

# Fit and transform spam messages
vectorizer = CountVectorizer()  # re-initialize to avoid mixing vocabs
spam_matrix = vectorizer.fit_transform(spam_texts)
spam_word_counts = spam_matrix.sum(axis=0).A1
spam_vocab = vectorizer.get_feature_names_out()
spam_freq = dict(zip(spam_vocab, spam_word_counts))
top10_spam = sorted(spam_freq.items(), key=lambda x: x[1], reverse=True)[:10]
print("Top 10 words in spam messages:", top10_spam)

Top 10 words in ham messages: [('the', 1773), ('to', 1065), ('and', 833), ('of', 791), ('in', 616), ('that', 414), ('is', 385), ('for', 369), ('on', 329), ('you', 311)]
Top 10 words in spam messages: [('the', 7046), ('to', 5593), ('of', 4984), ('and', 3985), ('in', 3289), ('you', 3229), ('this', 2675), ('my', 2143), ('your', 2078), ('for', 2030)]
