In [1]:
# Widen the notebook container to the full browser width.
# display/HTML come from IPython.display; importing them from
# IPython.core.display is deprecated and emits a DeprecationWarning.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import display, HTML


# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [2]:
import re

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [4]:
## Read Data for the Fraudulent Email Kaggle Challenge
# Adjust the file path to the correct location
data = pd.read_csv("../data/kg_train.csv", encoding='latin-1')

# Keep only the first 1000 rows to speed up development (modify for the final
# system) and replace missing values with empty strings in the same step.
data = data.head(1000).fillna("")
print(data.shape)

(1000, 2)


### Let's divide the data into training and validation partitions

In [6]:
# divide the data into training and validation sets
from sklearn.model_selection import train_test_split

# Stratify on the label so both partitions keep the same spam ratio.
# Each partition is copied explicitly: later cells assign columns on
# data_train / data_val, and doing that on frames derived from `data`
# can raise SettingWithCopyWarning.
data_train, data_val = (
    partition.copy()
    for partition in train_test_split(
        data, test_size=0.2, random_state=42, stratify=data['label']
    )
)

print(f"Training set shape: {data_train.shape}")
print(f"Validation set shape: {data_val.shape}")
print(f"Spam distribution in training: {data_train['label'].mean():.2f}")
print(f"Spam distribution in validation: {data_val['label'].mean():.2f}")

Training set shape: (800, 2)
Validation set shape: (200, 2)
Spam distribution in training: 0.44
Spam distribution in validation: 0.44


## Data Preprocessing

In [None]:
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Show the punctuation characters and a ten-word sample of the English stopword list
print(string.punctuation)
print(stopwords.words("english")[100:110])

# Snowball stemmer configured for English
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now we have to clean the HTML code out of the text

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:

# remove html comments
def remove_html_comments(text):
    """Strip HTML comments (``<!-- ... -->``) from ``text``.

    Comments are matched as a whole, including any whitespace or line breaks
    inside them, so multi-word comments are removed too. Runs of whitespace
    in the result are collapsed to single spaces and the ends are trimmed.

    Args:
        text: Message text that may contain HTML comments.

    Returns:
        The text without HTML comments. An unterminated ``<!--`` is left as is.
    """
    # Non-greedy + DOTALL: stop at the first closing marker, even on another line.
    without_comments = re.sub(r'<!--.*?-->', ' ', text, flags=re.DOTALL)
    return ' '.join(without_comments.split())

# remove inline javascript/css and the remaining html tags
def remove_inline_js_css(text):
    """Remove inline JavaScript/CSS blocks and HTML tags from ``text``.

    Two passes are applied:

    1. ``<script>...</script>`` and ``<style>...</style>`` elements are removed
       together with their contents.
    2. Any remaining tag (``<b>``, ``</p>``, ``<a href="...">``, ``<!DOCTYPE ...>``)
       is removed while the text between tags is kept.

    Runs of whitespace in the result are collapsed to single spaces.

    Args:
        text: Message text that may contain HTML markup.

    Returns:
        The text with script/style blocks and tags removed.
    """
    # Drop whole script/style elements first so their code never reaches the text.
    without_blocks = re.sub(
        r'<(script|style)\b[^>]*>.*?</\1\s*>',
        ' ',
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    # A tag must start with a letter (optionally after "/" or "!"), so plain
    # comparisons such as "3 < 5 and 7 > 2" are left untouched.
    without_tags = re.sub(r'<[/!]?[A-Za-z][^<>]*>', ' ', without_blocks)
    return ' '.join(without_tags.split())

# clean html code removing words that are not useful
def clean_html(text):
    """Run the two HTML clean-up helpers on ``text``.

    ``remove_html_comments`` is applied first and its result is passed to
    ``remove_inline_js_css``; the output of that second call is returned.
    """
    return remove_inline_js_css(remove_html_comments(text))

# Strip HTML from the 'text' column of both partitions, message by message
data_train['text'] = data_train['text'].map(clean_html)
data_val['text'] = data_val['text'].map(clean_html)



- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
# replace all special characters and digits with a space
def remove_special_characters(text):
    """Blank out every character that is neither a letter nor whitespace.

    Each such character is replaced by a single space instead of being
    deleted, so words separated only by punctuation or digits stay separate
    tokens ("free!!!call" -> "free   call" rather than "freecall").

    Args:
        text: Message text.

    Returns:
        A string of the same length containing only letters and whitespace.
    """
    return ''.join(
        char if char.isalpha() or char.isspace() else ' '
        for char in text
    )

# remove numbers
def remove_numbers(text):
    """Return ``text`` with every digit character (per ``str.isdigit``) dropped."""
    kept = []
    for symbol in text:
        if symbol.isdigit():
            continue
        kept.append(symbol)
    return ''.join(kept)

# remove all single characters
def remove_single_characters(text):
    """Drop every whitespace-separated token that is one character long.

    The remaining tokens are joined back with single spaces.
    """
    tokens = text.split()
    longer_tokens = filter(lambda token: len(token) >= 2, tokens)
    return ' '.join(longer_tokens)

# remove a single character from the start of the text
def remove_single_characters_start(text):
    """Drop a lone alphabetic character that opens the text.

    Only the first token is inspected, and it is removed only when it is a
    single letter. All other words are kept intact. Tokens are re-joined with
    single spaces.

    Args:
        text: Message text.

    Returns:
        The text without a leading one-letter token.
    """
    words = text.split()
    if words and len(words[0]) == 1 and words[0].isalpha():
        words = words[1:]
    return ' '.join(words)

# remove multiple spaces
def remove_multiple_spaces(text):
    """Collapse every run of whitespace in ``text`` into one space.

    Leading and trailing whitespace disappears because the text is split into
    tokens and re-joined.
    """
    tokens = text.split()
    single_spaced = ' '.join(tokens)
    return single_spaced

# remove prefixed 'b'
def remove_prefixed_b(text):
    """Strip a leading bytes-literal prefix (``b'`` or ``b"``) from ``text``.

    Only a prefix at the very start of the text is removed, together with the
    matching closing quote when the text ends with it. A ``b'`` sequence
    inside the text (as in "bob's") is not treated as a prefix. Any remaining
    apostrophes are then dropped.

    Args:
        text: Message text, possibly the repr of a bytes object.

    Returns:
        The text without the bytes prefix and without apostrophes.
    """
    if text[:2] in ("b'", 'b"'):
        quote = text[1]
        text = text[2:]
        if text.endswith(quote):
            text = text[:-1]
    return text.replace("'", "")

# convert to lowercase
def convert_to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# apply all cleaning functions
def clean_text(text):
    """Run the full cleaning pipeline on ``text`` and return the result.

    The helpers are applied one after another, each receiving the output of
    the previous one, in the order listed in ``pipeline`` below.
    """
    pipeline = (
        remove_html_comments,
        remove_inline_js_css,
        remove_special_characters,
        remove_numbers,
        remove_single_characters,
        remove_single_characters_start,
        remove_multiple_spaces,
        # NOTE(review): remove_special_characters has already eliminated every
        # apostrophe by this point, so a b' prefix can no longer be detected
        # here - confirm whether this step should run earlier.
        remove_prefixed_b,
        convert_to_lowercase,
    )
    for step in pipeline:
        text = step(text)
    return text

# Run the full cleaning pipeline over the 'text' column of both partitions
data_train['text'] = data_train['text'].map(clean_text)
data_val['text'] = data_val['text'].map(clean_text)

## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
# remove stopwords
def remove_stopwords(text):
    """Drop English stopwords from ``text``.

    The stopword list is loaded from ``stopwords.words('english')`` on the
    first call only and cached on the function object, so applying this
    function to every row does not reload the list each time.

    Args:
        text: Whitespace-separated message text. The comparison is exact, so
            the text is expected to be lower-cased already.

    Returns:
        The remaining words joined with single spaces.
    """
    stop_words = getattr(remove_stopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words
    return ' '.join(word for word in text.split() if word not in stop_words)

# Filter stopwords out of the 'text' column of both partitions
data_train['text'] = data_train['text'].map(remove_stopwords)
data_val['text'] = data_val['text'].map(remove_stopwords)



## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
# break sentences into words
def break_into_words(text):
    """Split ``text`` on whitespace and return the resulting list of words."""
    words = text.split()
    return words

# use lemmatization
def lemmatize_text(text):
    """Lemmatize every whitespace-separated word of ``text``.

    Each word is passed to ``WordNetLemmatizer.lemmatize`` and the results
    are joined back together with single spaces.
    """
    from nltk.stem import WordNetLemmatizer

    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = [lemmatize(token) for token in text.split()]
    return ' '.join(lemmas)

# apply lemmatization
data_train['text'] = data_train['text'].apply(lemmatize_text)
data_val['text'] = data_val['text'].apply(lemmatize_text)

# see how this creates cleaner data
print("Sample cleaned text from training data:")
print(data_train['text'].iloc[0])
# visualize the distribution of labels
plt.figure(figsize=(8, 6))
# value_counts() orders rows by frequency, not by label value. Sorting by the
# label puts label 0 in the first bar and label 1 in the second, so the fixed
# 'Ham'/'Spam' tick labels below stay correct whichever class is larger.
data_train['label'].value_counts().sort_index().plot(kind='bar', color=['blue', 'orange'])
plt.title('Distribution of Labels in Training Data')
plt.xlabel('Label')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1], labels=['Ham', 'Spam'], rotation=0)
plt.show()

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
# get the top n words for one class of messages
def get_top_words(data, label, n=10):
    """Return the ``n`` terms with the largest summed TF-IDF weight for one label.

    The 'text' values of the rows whose 'label' equals ``label`` are vectorized
    with ``TfidfVectorizer(max_features=1000)``; each term's weights are summed
    over those rows and the ``n`` largest totals are returned.

    Args:
        data: DataFrame with 'text' and 'label' columns.
        label: Label value selecting the rows to analyse.
        n: Number of terms to return.

    Returns:
        DataFrame indexed by term with a single 'tfidf' column.
    """
    selected_rows = data[data['label'] == label]
    vectorizer = TfidfVectorizer(max_features=1000)
    weights = vectorizer.fit_transform(selected_rows['text'])
    terms = vectorizer.get_feature_names_out()
    # One row of per-term totals, transposed so that every term becomes a row
    totals = pd.DataFrame(weights.sum(axis=0), columns=terms).T
    totals.columns = ['tfidf']
    return totals.nlargest(n, 'tfidf')

# Rank the ten strongest terms for label 0 (reported as ham) and label 1 (reported as spam)
data_train_top_spam = get_top_words(data_train, 1, 10)
data_train_top_ham = get_top_words(data_train, 0, 10)
print("Top words in Ham messages:", data_train_top_ham, sep="\n")
print("Top words in Spam messages:", data_train_top_spam, sep="\n")

## Extra features

In [None]:
# We add to the original dataframe three extra features: a money-symbol flag,
# a suspicious-word flag and the text length.
# Every token is escaped before being joined into a regex alternation:
# an unescaped "$" is the end-of-string anchor and would match every message.
money_simbol_list = "|".join(re.escape(token) for token in ["euro","dollar","pound","€","$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

# The processed messages are stored in the 'text' column; this notebook never
# creates a 'preprocessed_text' column, so reading it raised a KeyError.
# NOTE(review): if 'text' has already had non-letter characters stripped, the
# "€" and "$" alternatives cannot match - consider computing money_mark on the
# raw text instead.
data_train['money_mark'] = data_train['text'].str.contains(money_simbol_list).astype(int)
data_train['suspicious_words'] = data_train['text'].str.contains(suspicious_words).astype(int)
data_train['text_len'] = data_train['text'].str.len()

data_val['money_mark'] = data_val['text'].str.contains(money_simbol_list).astype(int)
data_val['suspicious_words'] = data_val['text'].str.contains(suspicious_words).astype(int)
data_val['text_len'] = data_val['text'].str.len()

data_train.head()

## How would the Bag of Words concept work with Count Vectorizer?

In [None]:
# bag of words with count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary (capped at 1000 features) is fitted on the training texts
# and then reused unchanged to encode the validation texts.
count_vectorizer = CountVectorizer(max_features=1000)
X_train_counts = count_vectorizer.fit_transform(data_train['text'])
X_val_counts = count_vectorizer.transform(data_val['text'])

# see the resulting shape
print("Shape of training data after CountVectorizer: {}".format(X_train_counts.shape))
print("Shape of validation data after CountVectorizer: {}".format(X_val_counts.shape))



## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [None]:
# TF-IDF Vectorization
# Create the TF-IDF vectorizer, fit it on the training texts only and reuse
# the fitted vocabulary/IDF weights for the validation texts
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(data_train['text'])
X_val_tfidf = tfidf_vectorizer.transform(data_val['text'])

# Print the shape of the vectorized dataset
print(f"Shape of training data after TfidfVectorizer: {X_train_tfidf.shape}")
print(f"Shape of validation data after TfidfVectorizer: {X_val_tfidf.shape}")

# Save both fitted vectorizers for later use
import joblib
joblib.dump(count_vectorizer, '../data/count_vectorizer.pkl')
# The TF-IDF vectorizer must be written before it can be loaded below;
# previously only the CountVectorizer was saved, so the load failed on a
# fresh run.
joblib.dump(tfidf_vectorizer, '../data/tfidf_vectorizer.pkl')

# Load the TF-IDF vectorizer back from the file written above.
# joblib.load unpickles the file, so only load files you trust.
loaded_tfidf_vectorizer = joblib.load('../data/tfidf_vectorizer.pkl')

## And Then Train a Classifier

In [None]:
# train a classifier
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

# Fit a Multinomial Naive Bayes model with default parameters on the TF-IDF features
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, data_train['label'])

# Score the validation partition
predictions = classifier.predict(X_val_tfidf)
accuracy = accuracy_score(data_val['label'], predictions)
print("Accuracy of the Naive Bayes classifier: {:.2f}".format(accuracy))

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(data_val['label'], predictions, target_names=['Ham', 'Spam']))

# Confusion matrix rendered as an annotated heatmap
conf_matrix = confusion_matrix(data_val['label'], predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Ham', 'Spam'],
    yticklabels=['Ham', 'Spam'],
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Persist the trained classifier
joblib.dump(classifier, '../data/naive_bayes_classifier.pkl')

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed! It must be `MultinomialNB` with default parameters.

Your task is to find the **best feature representation**.

You can work in teams of two people (recommended).

In [None]:
# Your code