In [None]:
# `display` and `HTML` are part of the public IPython.display API; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Widen the notebook container so long text columns are easier to read.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [None]:
# Load the Fraudulent Email Kaggle Challenge training file and keep only the
# first 1000 rows to speed up development (lift the limit for the final system).
data = pd.read_csv(r"../data/kg_train.csv", encoding='latin-1').head(1000)
print(data.shape)

# Missing cells become empty strings so the text-cleaning steps only see strings.
data = data.fillna("")

### Let's divide the training and test set into two partitions

In [None]:
from sklearn.model_selection import train_test_split


# Features are the message text, the target is its label.
X, y = data['text'], data['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Data Preprocessing

In [None]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# The stop-word corpus is not bundled with NLTK. Fetch it before the first
# stopwords.words() call so this cell also runs on a fresh environment.
nltk.download('stopwords', quiet=True)

# Peek at the punctuation characters and a sample of the English stop words.
print(string.punctuation)
print(stopwords.words("english")[100:110])

# English Snowball stemmer instance.
snowball = SnowballStemmer('english')

## Now we have to clean the text by removing the HTML code

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
import re
from bs4 import BeautifulSoup
import nltk

from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# Alternative kept for reference: only a 10-word sample of the list.
# english_stopwords = set(stopwords.words("english")[100:110])
# Full English stop-word list, stored as a set; passed to clean_text below.
english_stopwords = set(stopwords.words('english'))


# this function does all the cleaning, including removing stopwords as instructed 
def clean_text(text, stop_words=None):
    """Strip markup and noise from a raw message and return normalised text.

    Steps, in order: drop <script>/<style> elements together with their
    content, drop HTML comments, drop the remaining tags, keep only ASCII
    letters and whitespace, drop one-letter tokens, collapse whitespace,
    lowercase, and optionally filter stop words.

    Args:
        text: Raw message. ``None`` and NaN are treated as an empty message;
            any other non-string value is converted with ``str()``.
        stop_words: Optional collection of lowercase words to drop. When it is
            falsy, no stop-word filtering is performed.

    Returns:
        The cleaned, lowercase text with words separated by single spaces and
        no leading or trailing whitespace.
    """
    # Tolerate missing values so the function is safe to use with Series.apply.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    # Remove inline JavaScript/CSS. Tag names are matched case-insensitively so
    # that <SCRIPT>...</SCRIPT> is removed together with its content.
    text = re.sub(r'<(script|style)\b[^>]*>.*?</\1\s*>', ' ', text,
                  flags=re.DOTALL | re.IGNORECASE)
    # Remove HTML comments (before the tags: comments may contain '>').
    text = re.sub(r'<!--.*?-->', ' ', text, flags=re.DOTALL)
    # Remove the remaining tags. They are replaced by a space, not an empty
    # string, so that "Hello<br>World" does not collapse into "HelloWorld".
    text = re.sub(r'<[^>]+>', ' ', text)
    # Keep letters and whitespace only; this also removes every digit, so no
    # separate number-removal pass is needed.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Remove all single characters
    text = re.sub(r'\b\w\b', '', text)
    # Collapse runs of whitespace and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords if provided
    if stop_words:
        text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Clean every listed text column of `data`, dropping English stop words.
# Series.apply forwards extra keyword arguments to the applied function.
columns_to_clean = ['text']
for col in columns_to_clean:
    data[col] = data[col].apply(clean_text, stop_words=english_stopwords)




## Now let's work on removing stopwords


In [None]:
import re
from bs4 import BeautifulSoup
import nltk

from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# Alternative kept for reference: only a 10-word sample of the list.
# english_stopwords = set(stopwords.words("english")[100:110])
# Full English stop-word list, stored as a set; passed to clean_text below.
english_stopwords = set(stopwords.words('english'))


# this function does all the cleaning, including removing stopwords as instructed 
def clean_text(text, stop_words=None):
    """Strip markup and noise from a raw message and return normalised text.

    NOTE: the previous cell defines a function with the same name; this later
    definition is the one in effect for the rest of the notebook.

    Steps, in order: drop <script>/<style> elements together with their
    content, drop HTML comments, drop the remaining tags, keep only ASCII
    letters and whitespace, drop one-letter tokens, collapse whitespace,
    lowercase, and optionally filter stop words.

    Args:
        text: Raw message. ``None`` and NaN are treated as an empty message;
            any other non-string value is converted with ``str()``.
        stop_words: Optional collection of lowercase words to drop. When it is
            falsy, no stop-word filtering is performed.

    Returns:
        The cleaned, lowercase text with words separated by single spaces and
        no leading or trailing whitespace.
    """
    # Tolerate missing values so the function is safe to use with Series.apply.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    # Remove inline JavaScript/CSS. Tag names are matched case-insensitively so
    # that <SCRIPT>...</SCRIPT> is removed together with its content.
    text = re.sub(r'<(script|style)\b[^>]*>.*?</\1\s*>', ' ', text,
                  flags=re.DOTALL | re.IGNORECASE)
    # Remove HTML comments (before the tags: comments may contain '>').
    text = re.sub(r'<!--.*?-->', ' ', text, flags=re.DOTALL)
    # Remove the remaining tags. They are replaced by a space, not an empty
    # string, so that "Hello<br>World" does not collapse into "HelloWorld".
    text = re.sub(r'<[^>]+>', ' ', text)
    # Keep letters and whitespace only; this also removes every digit, so no
    # separate number-removal pass is needed.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Remove all single characters
    text = re.sub(r'\b\w\b', '', text)
    # Collapse runs of whitespace and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords if provided
    if stop_words:
        text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Clean every listed text column of `data`, dropping English stop words.
# Series.apply forwards extra keyword arguments to the applied function.
columns_to_clean = ['text']
for col in columns_to_clean:
    data[col] = data[col].apply(clean_text, stop_words=english_stopwords)




## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
from nltk.stem import WordNetLemmatizer

# Download the 'wordnet' and 'omw-1.4' NLTK resources before building the lemmatizer.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Module-level lemmatizer instance, used by tokenize_and_lemmatize below.
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text, stop_words=None):
    """Split ``text`` on whitespace, lemmatize each token and re-join with spaces.

    Args:
        text: Text to process; tokens are whatever ``str.split()`` yields.
        stop_words: Optional collection of words to skip. When it is falsy,
            every token is kept.

    Returns:
        The lemmatized tokens joined by single spaces.

    NOTE(review): no part-of-speech is passed to ``lemmatize()``; NLTK's
    default is the noun POS, so verb forms such as "running" may be left
    unchanged — confirm this matches the intent of the section.
    """
    tokens = text.split()
    if stop_words:
        # Filter first; lemmatization then only runs on the tokens that are kept.
        tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(map(lemmatizer.lemmatize, tokens))

# Lemmatize every listed text column of `data`, skipping English stop words.
# Series.apply forwards extra keyword arguments to the applied function.
columns_to_clean = ['text']
for col in columns_to_clean:
    data[col] = data[col].apply(tokenize_and_lemmatize, stop_words=english_stopwords)



In [12]:
from sklearn.model_selection import train_test_split

# Features stay a one-column DataFrame (so extra feature columns can be added
# later); the target is the label column.
X, y = data[['text']], data['label']

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
from collections import Counter

# Combine all rows in the 'text' column into a single string
all_text = ' '.join(data['text'])

# Split the combined text into individual words
words = all_text.split()

# Count the frequency of each word
word_counts = Counter(words)

# Get the 10 most common words over the whole corpus
top_10_words = word_counts.most_common(10)

# Print the results
print("Top 10 Words:", top_10_words)

# The section asks for the top words in ham and in spam, so also rank the
# words separately for each value of the 'label' column.
top_10_words_by_label = {
    label: Counter(' '.join(texts).split()).most_common(10)
    for label, texts in data.groupby('label')['text']
}
for label, top_words in top_10_words_by_label.items():
    print(f"Top 10 Words (label={label}):", top_words)

## Extra features

In [None]:
# Add three hand-crafted features next to the text: a money-reference flag, a
# suspicious-vocabulary flag and the message length.
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])
# NOTE(review): the text has already been reduced to letters by clean_text, so
# the "€" and "$" alternatives cannot match here; compute this flag on the raw
# text if currency symbols matter.


def add_indicator_features(frame):
    """Return a new frame with the indicator columns added.

    Args:
        frame: DataFrame with a string column named 'text'.

    Returns:
        A copy of ``frame`` with three extra columns: 'money_mark' and
        'suspicious_words' (1 when the corresponding pattern occurs in the
        text, else 0) and 'text_len' (number of characters in the text).
    """
    text = frame['text']
    # assign() builds a new DataFrame instead of writing into the slice that
    # train_test_split returned, which avoids SettingWithCopyWarning.
    return frame.assign(
        money_mark=text.str.contains(money_simbol_list).astype(int),
        suspicious_words=text.str.contains(suspicious_words).astype(int),
        text_len=text.str.len(),
    )


# Apply the same feature engineering to the training and the testing set
X_train = add_indicator_features(X_train)
X_test = add_indicator_features(X_test)

# Show the frame that actually received the new columns.
X_train.head()

## How does the Bag of Words concept work with CountVectorizer?

In [None]:
# Vectorization:

# Each text is transformed into a vector where element corresponds to a word in the vocabulary.
# The value represents the count of how many times the word appears in the text.

from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit on the cleaned messages so that each row is one message and each column
# one vocabulary term. (Passing the flat `words` list would treat every single
# token as its own document.)
X = vectorizer.fit_transform(data['text'])
print(X.shape)

# Show only a small dense corner: the full matrix is large and mostly zeros,
# so converting all of it with toarray() wastes memory.
print(X[:5, :10].toarray())

# A few entries of the vocabulary (mapping of words to column indices)
print(list(vectorizer.vocabulary_.items())[:10])



## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [None]:
# TF-IDF (Term Frequency-Inverse Document Frequency) assigns weights to words based on their importance. 

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Vectorize the whole dataset: one row per message. (Passing the flat `words`
# list would treat every single token as its own document.)
X = vectorizer.fit_transform(data['text'])

# (number of messages, vocabulary size)
print(X.shape)

## And then train a classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib


# Learn the TF-IDF vocabulary and weights from the training messages only,
# then project the held-out messages into that same feature space.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train['text'])
X_test_tfidf = vectorizer.transform(X_test['text'])

# Logistic regression baseline; fit() returns the fitted estimator itself.
classifier = LogisticRegression(random_state=42).fit(X_train_tfidf, y_train)

# Score the held-out messages and report the results.
y_pred = classifier.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Persist the fitted model and its vectorizer side by side.
joblib.dump(classifier, "classifier_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

In [None]:
# Trying GridSearchCV to optimize the logistic regression parameters

from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths and solvers, compared by 5-fold
# cross-validated accuracy on the TF-IDF training features.
param_grid = dict(C=[0.1, 1, 10], solver=['liblinear', 'lbfgs'])
grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)
print("Best parameters:", grid_search.best_params_)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score



# Train the model with the best parameters found by the grid search above,
# rather than hard-coding them, so this cell stays in sync when the grid or the
# data changes.
# The text is not re-cleaned here: the TF-IDF matrices were already built from
# the cleaned text, so cleaning `data` again would not affect this model.
best_classifier = LogisticRegression(random_state=42, **grid_search.best_params_)
best_classifier.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred_best = best_classifier.predict(X_test_tfidf)

# Print results
print("Optimized Model Accuracy:", accuracy_score(y_test, y_pred_best))
print("Optimized Model Classification Report:\n", classification_report(y_test, y_pred_best))


In [None]:
# what words are more triggering?

# Pair every vocabulary term with its logistic-regression weight and rank the
# pairs from the largest weight downwards.
feature_names = vectorizer.get_feature_names_out()
coefficients = classifier.coef_[0]
sorted_features = sorted(
    ((weight, term) for weight, term in zip(coefficients, feature_names)),
    reverse=True,
)
print("Top features:", sorted_features[:10])


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed! It must be MultinomialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work in teams of two (recommended).

In [None]:
# The evaluation metric for this competition is Mean F1-Score. 
# MultinomialNB = Multinomial Naive Bayes

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import pandas as pd



# Reload the Kaggle training file, keep the first 1000 rows for fast iteration
# (lift the limit for the final system) and turn missing cells into "".
data = pd.read_csv(r"../data/kg_train.csv", encoding='latin-1').head(1000).fillna("")

# Same two-stage preprocessing as earlier in the notebook: clean_text first,
# then lemmatization, both with the English stop-word list.
columns_to_clean = ['text']
for col in columns_to_clean:
    cleaned = data[col].apply(clean_text, stop_words=english_stopwords)
    data[col] = cleaned.apply(tokenize_and_lemmatize, stop_words=english_stopwords)


# Features are the cleaned text, the target is the label; hold out 20% for testing.
X, y = data['text'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bigram-only bag of words: the vocabulary is learned on the training split and
# reused unchanged for the test split.
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2))
X_train_ngram = ngram_vectorizer.fit_transform(X_train)
X_test_ngram = ngram_vectorizer.transform(X_test)

# MultinomialNB with default parameters; fit() returns the fitted estimator.
classifier = MultinomialNB().fit(X_train_ngram, y_train)
y_pred_ngram = classifier.predict(X_test_ngram)

# Bigram vocabulary, in the column order of the count matrices.
bigram_features = ngram_vectorizer.get_feature_names_out()

# Per-class feature log-probabilities from the fitted model, converted to
# plain probabilities.
feature_log_prob = classifier.feature_log_prob_
feature_prob = np.exp(feature_log_prob)

# Number of bigrams to keep per class.
top_n = 10

# Rank (probability, bigram) pairs from most to least probable, once for row 0
# and once for row 1 of the probability matrix.
top_bigrams_class_0, top_bigrams_class_1 = (
    sorted(zip(class_probs, bigram_features), reverse=True)[:top_n]
    for class_probs in (feature_prob[0], feature_prob[1])
)

for heading, ranked in (
    ("Top 10 Bigrams for Class 0 (HAM):", top_bigrams_class_0),
    ("\nTop 10 Bigrams for Class 1 (SPAM):", top_bigrams_class_1),
):
    print(heading)
    for prob, bigram in ranked:
        print(f"{bigram}: {prob:.4f}")

# List the bigrams present in one test message (here: the first test row).
sample_index = 0
# nonzero() on the single-row matrix returns (row indices, column indices);
# the column indices identify the bigrams that occur in the message.
bigram_indices = X_test_ngram[sample_index].nonzero()[1]
sample_bigrams = list(bigram_features[bigram_indices])
print(f"Bigrams in Sample {sample_index}: {sample_bigrams}")

# Held-out performance of the bigram model.
print("N-grams Representation (Bigrams)")
print("Accuracy:", accuracy_score(y_test, y_pred_ngram))
print("Classification Report:\n", classification_report(y_test, y_pred_ngram))



## Final Notes



The best model seems to be the optimized model from before.