In [265]:
import warnings
warnings.filterwarnings("ignore")

In [266]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [267]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [268]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


In [269]:
print(data.columns)

Index(['text', 'label'], dtype='object')


In [270]:
data.head()

Unnamed: 0,text,label
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1
1,Will do.,0
2,Nora--Cheryl has emailed dozens of memos about...,0
3,Dear Sir=2FMadam=2C I know that this proposal ...,1
4,fyi,0


### Let's divide the training and test set into two partitions

In [271]:
from sklearn.model_selection import train_test_split

In [272]:
# Your code

data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)

## Data Preprocessing

In [273]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [274]:
import re
from bs4 import BeautifulSoup, Comment

In [275]:
# Your code
def clean_html_text(text):

    if pd.isna(text) or not isinstance(text, str):
        return ""
    
    # 1.
    text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL|re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL|re.IGNORECASE)
    
    # 2.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    
    # 3.
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text(separator=' ')
    
    # 4.
    text = re.sub(r'[^\x00-\x7F]', '', text)
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()


data_train['cleaned_text'] = data_train['text'].apply(clean_html_text)
data_val['cleaned_text'] = data_val['text'].apply(clean_html_text)

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [276]:
# Your code here

def preprocess_text(text):
    
    if pd.isna(text) or not isinstance(text, str):
        return ""
    
    # Remove all special characters (keep only letters and spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove single characters from anywhere in the text
    text = re.sub(r'\b[a-zA-Z]\b', '', text)
    
    # Remove single characters from the start of the string
    text = re.sub(r'^\w\s+', '', text)
    
    # Substitute multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    
    # Remove prefixed 'b' (common when converting bytes to string)
    text = re.sub(r'^b\s+', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove leading and trailing spaces
    text = text.strip()
    
    return text


data_train['preprocessed_text'] = data_train['cleaned_text'].apply(preprocess_text)
data_val['preprocessed_text'] = data_val['cleaned_text'].apply(preprocess_text)

## Now let's work on removing stopwords
Remove the stopwords.

In [277]:
# Your code

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    
    if pd.isna(text) or not isinstance(text, str):
        return ""
    
    words = text.split()
    
    filtered_words = [word for word in words if word.lower() not in stop_words]
    
    return ' '.join(filtered_words)


data_train['no_stopwords'] = data_train['preprocessed_text'].apply(remove_stopwords)
data_val['no_stopwords'] = data_val['preprocessed_text'].apply(remove_stopwords)

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [278]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [279]:
# Your code
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    
    words = word_tokenize(text)
    
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    
    return " ".join(lemmatized_words)

In [280]:
data_train['lemmatized_text'] = data_train['no_stopwords'].apply(lemmatize_text)
data_val['lemmatized_text'] = data_val['no_stopwords'].apply(lemmatize_text)

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [281]:
# Your code

from collections import Counter

# Separate ham (label 0) and spam (label 1) messages
ham_texts_train = data_train[data_train['label'] == 0]['lemmatized_text']
spam_texts_train = data_train[data_train['label'] == 1]['lemmatized_text']

ham_texts_val = data_val[data_val['label'] == 0]['lemmatized_text']
spam_texts_val = data_val[data_val['label'] == 1]['lemmatized_text']

# Function to get all words from a series of texts
def get_all_words(texts):
    all_words = []
    for text in texts:
        all_words.extend(text.split())
    return all_words

# Get all words from each category
ham_words_train = get_all_words(ham_texts_train)
spam_words_train = get_all_words(spam_texts_train)

ham_words_val = get_all_words(ham_texts_val)
spam_words_val = get_all_words(spam_texts_val)

# Count the most common words
top_ham_train = Counter(ham_words_train).most_common(10)
top_spam_train = Counter(spam_words_train).most_common(10)

top_ham_val = Counter(ham_words_val).most_common(10)
top_spam_val = Counter(spam_words_val).most_common(10)

print("Top 10 words in ham messages:")
print(top_ham_train)
print(top_ham_val)

print("\nTop 10 words in spam messages:")
print(top_spam_train)
print(top_spam_val)

Top 10 words in ham messages:
[('u', 99), ('would', 93), ('state', 92), ('pm', 89), ('president', 84), ('percent', 76), ('call', 73), ('secretary', 71), ('time', 70), ('mr', 70)]
[('district', 38), ('pm', 26), ('call', 18), ('u', 16), ('time', 14), ('policy', 14), ('new', 13), ('would', 13), ('see', 12), ('middle', 12)]

Top 10 words in spam messages:
[('money', 800), ('account', 666), ('bank', 608), ('fund', 569), ('u', 446), ('business', 395), ('transaction', 354), ('country', 342), ('transfer', 332), ('company', 321)]
[('bank', 140), ('fund', 139), ('account', 135), ('money', 126), ('u', 107), ('business', 81), ('million', 73), ('country', 71), ('foreign', 70), ('transaction', 70)]


## Extra features

In [282]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['lemmatized_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['lemmatized_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['lemmatized_text'].apply(lambda x: len(x)) 

data_val['money_mark'] = data_val['lemmatized_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['lemmatized_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['lemmatized_text'].apply(lambda x: len(x)) 

data_train.head()

Unnamed: 0,text,label,cleaned_text,preprocessed_text,no_stopwords,lemmatized_text,money_mark,suspicious_words,text_len
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",1,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",regards mr nelson smithkindly reply me on my p...,regards mr nelson smithkindly reply private em...,regard mr nelson smithkindly reply private ema...,0,0,75
535,I have not been able to reach oscar this am. W...,0,I have not been able to reach oscar this am. W...,have not been able to reach oscar this am we a...,able reach oscar supposed send pdb receive,able reach oscar supposed send pdb receive,0,0,42
695,; Huma Abedin B6I'm checking with Pat on the 5...,0,; Huma Abedin B6I'm checking with Pat on the 5...,huma abedin bim checking with pat on the will ...,huma abedin bim checking pat work jack jake re...,huma abedin bim checking pat work jack jake re...,0,0,79
557,I can have it announced here on Monday - can't...,0,I can have it announced here on Monday - can't...,can have it announced here on monday cant today,announced monday cant today,announced monday cant today,0,0,27
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,1,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 San P...,bank of africaagence san pedro bp san pedro co...,bank africaagence san pedro bp san pedro cote ...,bank africaagence san pedro bp san pedro cote ...,1,1,1067


## How would work the Bag of Words with Count Vectorizer concept?

In [283]:
# Your code
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer(min_df=2, token_pattern=r'\b\w{4,}\b', stop_words='english')

# Fit and transform on training data
X_train_bow = vectorizer.fit_transform(data_train['lemmatized_text'])

# Transform validation data
X_val_bow = vectorizer.transform(data_val['lemmatized_text'])

# To inspect the vocabulary
print(vectorizer.get_feature_names_out()[:10])  # Show first 10 words

# To see the shape of the transformed matrices
print("Train shape:", X_train_bow.shape)
print("Validation shape:", X_val_bow.shape)


['abacha' 'abachac' 'abachae' 'abad' 'abandoned' 'abbas' 'abdul'
 'abdullah' 'abedin' 'abidjan']
Train shape: (800, 4889)
Validation shape: (200, 4889)


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [284]:
# Your code
tfidf_vectorizer = TfidfVectorizer()

X_train_tfidf = tfidf_vectorizer.fit_transform(data_train['lemmatized_text'])
X_val_tfidf = tfidf_vectorizer.transform(data_val['lemmatized_text'])

print("Train TF-IDF shape:", X_train_tfidf.shape)
print("Validation TF-IDF shape:", X_val_tfidf.shape)

Train TF-IDF shape: (800, 16689)
Validation TF-IDF shape: (200, 16689)


## And the Train a Classifier?

In [285]:
# Your code
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

clf = MultinomialNB()

clf.fit(X_train_tfidf, data_train['label'])

y_pred = clf.predict(X_val_tfidf)


In [286]:
print("Accuracy:", accuracy_score(data_val['label'], y_pred))

print("\nClassification Report:\n", classification_report(data_val['label'], y_pred))

print("\nConfusion Matrix:\n", confusion_matrix(data_val['label'], y_pred))


Accuracy: 0.94

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.90      0.95       125
           1       0.86      1.00      0.93        75

    accuracy                           0.94       200
   macro avg       0.93      0.95      0.94       200
weighted avg       0.95      0.94      0.94       200


Confusion Matrix:
 [[113  12]
 [  0  75]]


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [287]:
# Your code