In [None]:
# Widen the notebook container so wide DataFrames and outputs use the full browser width.
# `display` and `HTML` are imported from the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [None]:
## Read Data for the Fraudulent Email Kaggle Challenge
# Number of rows kept to speed up development.
# Set to None to use the whole training set for the final system.
N_ROWS = 1000

data = pd.read_csv("../data/kg_train.csv", encoding='latin-1')
if N_ROWS is not None:
    data = data.head(N_ROWS)
print(data.shape)

# Replace missing values with empty strings so the text-cleaning steps below always
# receive a str. fillna() returns a new frame here instead of mutating the slice
# produced by head() in place, which keeps this cell safe to re-run.
data = data.fillna("")

In [None]:
# Show the column names of the loaded frame (the cells below read 'text' and 'label').
data.columns

### Let's divide the training and test set into two partitions

In [9]:
from sklearn.model_selection import train_test_split

# Features are the message text, targets are the labels.
# NOTE(review): this uses the raw 'text' column; the cleaning cells further down write
# to 'cleaned_text', which is never fed to the vectorizers — confirm that is intended.
X, y = data['text'], data['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Data Preprocessing

In [None]:
import string
import nltk
# `stopwords` must be imported in this cell: it is used below, and the next import of it
# only happens in a later cell, so a fresh kernel would raise NameError here.
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Fetch the stopword corpus before reading from it.
nltk.download('stopwords')
print(string.punctuation)
print(stopwords.words("english")[100:110])
snowball = SnowballStemmer('english')

## Now we have to clean the text by removing the HTML code

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:

from bs4 import BeautifulSoup
import re

# Example function to clean HTML
def clean_html(text):
    """Strip HTML markup from ``text`` and return the remaining visible text.

    Steps, in order:
      1. remove ``<script>`` / ``<style>`` elements together with their contents,
      2. remove HTML comments (before tag removal, since comments may contain ``>``),
      3. drop the remaining tags with BeautifulSoup and keep only the text,
      4. collapse every run of whitespace into a single space and trim the ends.

    Parameters
    ----------
    text : str
        Raw message body, possibly containing HTML.

    Returns
    -------
    str
        The text content with markup removed and whitespace normalized.
    """
    # IGNORECASE so upper/mixed-case tags such as <SCRIPT> or <Style> are removed too;
    # \b stops the pattern from matching tags that merely start with those names.
    text = re.sub(
        r'<(script|style)\b.*?>.*?(<\/\1>)',
        '',
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # Comments can span several lines, hence DOTALL.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Extract only the text nodes; the separator keeps words from adjacent tags apart.
    soup = BeautifulSoup(text, "html.parser")
    cleaned_text = soup.get_text(separator=" ")

    # Normalize whitespace left behind by removed markup.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text

# Run the HTML cleaner over every message and keep the result in a new column.
data['cleaned_text'] = data['text'].map(clean_html)

# Show the raw and cleaned text side by side for the first rows.
print(data.loc[:, ['text', 'cleaned_text']].head())



- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
# Your code

## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
# Your code

from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

# Load the stopwords list once; a set gives constant-time membership checks.
stop_words = set(stopwords.words('english'))

def clean_html_and_remove_stopwords(text):
    """Clean HTML from ``text`` and drop English stopwords.

    The HTML cleaning (script/style removal, comment removal, tag stripping and
    whitespace normalization) is delegated to ``clean_html``, defined in the
    HTML-cleaning cell earlier in this notebook, so the logic lives in one place.

    Parameters
    ----------
    text : str
        Raw message body, possibly containing HTML.

    Returns
    -------
    str
        The remaining words joined by single spaces. Words keep their original
        casing; only the stopword comparison is case-insensitive.
    """
    cleaned_text = clean_html(text)

    # Compare lower-cased words against the stopword set, but keep the original token.
    kept_words = [word for word in cleaned_text.split() if word.lower() not in stop_words]

    return ' '.join(kept_words)

# Overwrite 'cleaned_text' with the HTML-free, stopword-free version of each message.
data['cleaned_text'] = data['text'].map(clean_html_and_remove_stopwords)

# Show the raw and cleaned text side by side for the first rows.
print(data.loc[:, ['text', 'cleaned_text']].head())


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
# Your code

import nltk

# WordNet data for the lemmatizer plus the Punkt models used by word_tokenize.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt',
# so both are fetched to keep this cell working across NLTK versions.
for resource in ('wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(resource)


from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re

# Initialize the lemmatizer and the stopword set once, outside the per-message function.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_html_and_lemmatize(text):
    """Clean HTML from ``text``, drop English stopwords and lemmatize the rest.

    The HTML cleaning is delegated to ``clean_html``, defined in the HTML-cleaning
    cell earlier in this notebook, instead of repeating the same regexes here.

    Parameters
    ----------
    text : str
        Raw message body, possibly containing HTML.

    Returns
    -------
    str
        Lower-cased, lemmatized tokens joined by single spaces. Tokens come from
        ``word_tokenize``, so punctuation tokens that are not stopwords are kept.
    """
    cleaned_text = clean_html(text)

    # Split the cleaned text into tokens.
    words = word_tokenize(cleaned_text)

    # Lower-case each token once, skip stopwords, lemmatize what remains.
    lemmatized_words = [
        lemmatizer.lemmatize(token)
        for token in (word.lower() for word in words)
        if token not in stop_words
    ]

    return ' '.join(lemmatized_words)

# Overwrite 'cleaned_text' with the cleaned, stopword-free, lemmatized version of each message.
data['cleaned_text'] = data['text'].map(clean_html_and_lemmatize)

# Show the raw and lemmatized text side by side for the first rows.
print(data.loc[:, ['text', 'cleaned_text']].head())



## Bag Of Words
Let's get the top 10 words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [25]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Assuming the cleaned and lemmatized text is stored in 'cleaned_text' column
# and the 'label' column is used to classify spam (1) or ham (0).

# Separate spam and ham messages
ham_messages = data.loc[data['label'] == 0, 'cleaned_text']
spam_messages = data.loc[data['label'] == 1, 'cleaned_text']

# Fit one vocabulary on the entire dataset so ham and spam counts share the same columns
vectorizer = CountVectorizer()
vectorizer.fit(data['cleaned_text'])

# Transform ham and spam messages using the same vocabulary
ham_word_counts = vectorizer.transform(ham_messages)
spam_word_counts = vectorizer.transform(spam_messages)

# Sum up word occurrences per vocabulary entry (column-wise)
ham_word_sum = ham_word_counts.sum(axis=0)
spam_word_sum = spam_word_counts.sum(axis=0)

# Get the words from the vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()

# Create a DataFrame for each class, sorted by descending frequency
ham_word_freq = pd.DataFrame(ham_word_sum.A.flatten(), index=words, columns=['frequency']).sort_values(by='frequency', ascending=False)
spam_word_freq = pd.DataFrame(spam_word_sum.A.flatten(), index=words, columns=['frequency']).sort_values(by='frequency', ascending=False)

# Get the top 10 words for both ham and spam messages
top_ham_words = ham_word_freq.head(10)
top_spam_words = spam_word_freq.head(10)

# Display the top 10 words
print("Top 10 Words in Ham Messages:")
print(top_ham_words)

print("\nTop 10 Words in Spam Messages:")
print(top_spam_words)


Top 10 Words in Ham Messages:
           frequency
state            125
pm               113
would            107
president         99
call              94
mr                91
time              88
obama             84
2010              82
30                81

Top 10 Words in Spam Messages:
             frequency
2e                1857
money              979
2c                 923
account            887
bank               799
fund               755
transaction        549
business           511
country            504
transfer           423


## Extra features

In [None]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words),
# plus the length of the preprocessed text.
# NOTE(review): data_train, data_val and the 'preprocessed_text' column are not created in any
# visible cell of this notebook — confirm they exist before running this cell.
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

# Both partitions get the same three derived columns, training frame first.
for frame in (data_train, data_val):
    preprocessed = frame['preprocessed_text']
    # str.contains treats the joined lists as regex alternations; *1 turns the booleans into 0/1.
    frame['money_mark'] = preprocessed.str.contains(money_simbol_list)*1
    frame['suspicious_words'] = preprocessed.str.contains(suspicious_words)*1
    frame['text_len'] = preprocessed.apply(len)

data_train.head()

## How would the Bag of Words concept work with Count Vectorizer?

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the training data and transform the training data into Bag of Words
X_train_bow = vectorizer.fit_transform(X_train)

# Transform the test data (using the same vocabulary from the training data)
X_test_bow = vectorizer.transform(X_test)

# Get the vocabulary (the unique words in the training data).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
# .tolist() yields plain Python strings so the printed vocabulary stays a regular list.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

# Print the shape of the matrices (documents, unique words)
print(f"Shape of Bag of Words matrix for training data: {X_train_bow.shape}")
print(f"Shape of Bag of Words matrix for test data: {X_test_bow.shape}")

# Show the first 10 words from the vocabulary
print(f"Vocabulary: {vocab[:10]}")  # Show only the first 10 words for brevity

# Convert the training Bag of Words matrix into a DataFrame to visualize.
# NOTE: toarray() densifies the whole sparse matrix, which is fine for the reduced
# development set but memory-hungry on the full dataset.
bow_df_train = pd.DataFrame(X_train_bow.toarray(), columns=vocab)

# Convert the test Bag of Words matrix into a DataFrame to visualize
bow_df_test = pd.DataFrame(X_test_bow.toarray(), columns=vocab)

# Show the first few rows of the training data Bag of Words matrix
print("Training Data BoW Representation:")
print(bow_df_train.head())

# Show the first few rows of the test data Bag of Words matrix
print("Test Data BoW Representation:")
print(bow_df_test.head())


Shape of Bag of Words matrix for training data: (800, 23374)
Shape of Bag of Words matrix for test data: (200, 23374)
Vocabulary: ['00', '000', '000000', '00000e25', '00000e251', '00000eur', '000066', '0000ff', '000m', '000million']
Training Data BoW Representation:
   00  000  000000  00000e25  00000e251  00000eur  000066  0000ff  000m  \
0   0    0       0         0          0         0       0       0     0   
1   0    0       0         0          0         0       0       0     0   
2   0    0       0         0          0         0       0       0     0   
3   0    0       0         0          0         0       0       0     0   
4   0    0       0         0          0         0       0       0     0   

   000million  ...  â½s  â½t  â½ta  â½te  â½tica  â½to  â½trangers  â½x60ã  \
0           0  ...    0    0     0     0       0     0           0       0   
1           0  ...    0    0     0     0       0     0           0       0   
2           0  ...    0    0     0     0       0

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [29]:
# Your code
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Build the TF-IDF vectorizer with its default settings.
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training split only, then encode it.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Encode the test split with the vocabulary and weights learned above (no re-fitting).
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Report both matrix shapes as (documents, vocabulary size).
print(
    f"Shape of TF-IDF matrix for training data: {X_train_tfidf.shape}",
    f"Shape of TF-IDF matrix for test data: {X_test_tfidf.shape}",
    sep="\n",
)

# Optionally, get the vocabulary (unique words in the dataset)



Shape of TF-IDF matrix for training data: (800, 23374)
Shape of TF-IDF matrix for test data: (200, 23374)


## And Then Train a Classifier

In [30]:
# Your code
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create a Multinomial Naive Bayes classifier with default parameters and fit it on
# the TF-IDF features of the training split (fit returns the estimator itself).
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out test split.
y_pred = nb_classifier.predict(X_test_tfidf)

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Confusion matrix: rows are true labels, columns are predicted labels.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Accuracy: 87.50%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.80      0.89       125
           1       0.75      1.00      0.86        75

    accuracy                           0.88       200
   macro avg       0.88      0.90      0.87       200
weighted avg       0.91      0.88      0.88       200

Confusion Matrix:
[[100  25]
 [  0  75]]


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed! It must be MultinomialNB with default parameters.

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work in teams of two people (recommended).

In [None]:
# Your code