In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Inject CSS that stretches the `.container` element to the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [3]:
## Read Data for the Fraudulent Email Kaggle Challenge
# Row cap used to speed up development.
# Modify for final system: set to None to keep every row of the file.
N_DEV_ROWS = 1000

# Keep the frame exactly as read so it stays available under its own name.
data_raw = pd.read_csv("../data/kg_train.csv", encoding='latin-1')

# Reduce the training set to speed up development.
data = data_raw if N_DEV_ROWS is None else data_raw.head(N_DEV_ROWS)

# Missing values become empty strings (non-mutating, so re-running the cell
# always starts from `data_raw`); the regex-based cleaning applied to the
# "text" column later would fail on NaN.
data = data.fillna("")
print(data.shape)

(1000, 2)


### Let's divide the dataset into training and test partitions

In [28]:
from sklearn.model_selection import train_test_split

# Features are the "text" column; the target is the "label" column.
X, y = data["text"], data["label"]

# Hold out 20% of the rows for evaluation; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Data Preprocessing

In [29]:
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Show the punctuation characters and a sample of the English stop words
# that the cleaning step below works with.
punctuation_chars = string.punctuation
print(punctuation_chars)
english_stopwords = stopwords.words("english")
print(english_stopwords[100:110])

# English Snowball stemmer instance.
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now we have to clean the text by removing the HTML code

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [30]:
import re

def remove_html(text):
    """Strip HTML markup from ``text`` and return the remaining characters.

    The removal happens in three passes, in this order:

    1. ``<script>`` / ``<style>`` elements together with their content
       (inline JavaScript / CSS).
    2. HTML comments. This must run before the generic tag pass, because a
       comment may contain ``>`` and the tag pattern would otherwise stop at
       the first ``>`` and leave the rest of the comment in the text.
    3. Every remaining ``<...>`` tag.

    Parameters
    ----------
    text : str
        Raw message text, possibly containing HTML.

    Returns
    -------
    str
        ``text`` with the markup removed. Removed spans are replaced by the
        empty string, so no whitespace is inserted.
    """
    # 1. inline JS/CSS: opening tag, lazily-matched body, matching closing tag.
    #    DOTALL lets the body span several lines; IGNORECASE accepts <SCRIPT>.
    cleaned_text = re.sub(
        r'<(script|style)\b[^>]*>.*?</\1\s*>',
        '',
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )

    # 2. html comments (possibly multi-line, possibly containing '>')
    cleaned_text = re.sub(r'<!--.*?-->', '', cleaned_text, flags=re.DOTALL)

    # 3. remaining html tags
    return re.sub(r'<[^>]+>', '', cleaned_text)

# Run the same HTML stripping over the training and the test messages.
X_train_cleaned, X_test_cleaned = (
    split.apply(remove_html) for split in (X_train, X_test)
)

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [36]:
def clean_text(text, stop_words=None):
    """Normalise one message into a space-separated string of lowercase words.

    Steps, in order: drop a leading bytes-literal marker (``b'`` or ``b"``),
    lowercase, replace ASCII punctuation and digits with spaces, split on
    whitespace, discard one-character tokens, discard stop words.

    Parameters
    ----------
    text : str
        The message to clean.
    stop_words : iterable of str, optional
        Words to discard after tokenising. When omitted, NLTK's English stop
        word list is used.

    Returns
    -------
    str
        The surviving words joined by single spaces ("" if none survive).
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # A set gives constant-time membership tests for the per-token filter.
    stop_words = set(stop_words)

    # remove prefixed 'b': only the marker of a stringified bytes literal is
    # dropped. A plain lstrip("b") would also eat the first letter of real
    # words such as "bank".
    cleaned_text = text
    if cleaned_text[:2] in ("b'", 'b"'):
        cleaned_text = cleaned_text[1:]

    # convert to lower
    cleaned_text = cleaned_text.lower()

    # remove punctuation and digits: each such character becomes a space so
    # that the words on either side stay separate tokens
    unwanted = string.punctuation + string.digits
    cleaned_text = cleaned_text.translate(str.maketrans(unwanted, ' ' * len(unwanted)))

    # split sentences into words + remove multiple spaces, single characters
    # and stop words
    words = [
        word
        for word in cleaned_text.split()
        if len(word) > 1 and word not in stop_words
    ]

    return " ".join(words)


# Normalise the HTML-stripped messages of both splits with clean_text.
# The right-hand side reads the current X_*_cleaned values before rebinding them.
X_train_cleaned, X_test_cleaned = (
    split.apply(clean_text) for split in (X_train_cleaned, X_test_cleaned)
)

## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
# Done in previous step

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [48]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_sentences(sentences):
    """Tokenize every sentence and lemmatize each token.

    Returns a list with one list of lemmas per input sentence.
    """
    lemmatized = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        # NOTE(review): lemmatize() is called without a `pos` argument, so
        # NLTK's default part of speech applies — confirm that verb forms
        # are reduced the way the text above expects.
        lemmatized.append([lemmatizer.lemmatize(token) for token in tokens])
    return lemmatized


X_train_tokenized = _lemmatize_sentences(X_train_cleaned)
X_test_tokenized = _lemmatize_sentences(X_test_cleaned)

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [79]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(2, 2) restricts the vocabulary to bigrams.
count_vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=1)

# Re-join the lemmatized tokens so each message is a single string again.
X_train_sentences = [" ".join(tokens) for tokens in X_train_tokenized]
X_test_sentences = [" ".join(tokens) for tokens in X_test_tokenized]

# Rows are training messages, columns are bigrams.
word_freq_matrix = count_vectorizer.fit_transform(X_train_sentences)

# Summing over rows gives the total count of every bigram in the training set.
column_totals = word_freq_matrix.sum(axis=0)
frequencies = np.asarray(column_totals).flatten()

# vocabulary_ maps bigram -> column index; invert it to look up by column.
vocab_items = count_vectorizer.vocabulary_
reverse_vocab = {column: term for term, column in vocab_items.items()}

# Output frequency and bigram together
# Pairs are (count, bigram), so sorting in reverse puts the highest counts
# first and breaks ties by bigram in reverse alphabetical order.
bigrams_by_column = [reverse_vocab[column] for column in range(len(frequencies))]
ranked = sorted(zip(frequencies, bigrams_by_column), reverse=True)
top_10 = ranked[:10]
for freq, bigram in top_10:
    print(f"{bigram}: {freq}")

nbsp nbsp: 235
next kin: 221
united state: 115
security company: 110
bank account: 102
state dollar: 100
fax number: 100
south africa: 85
hundred thousand: 83
five hundred: 74


## Extra features

In [80]:
# We build one dataframe per split from the preprocessed sentences and add
# three extra features: money_mark, suspicious_words and text_len.
# Both patterns below are regex alternations, so they match as substrings.
# NOTE(review): clean_text replaces every string.punctuation character with a
# space, so the "$" alternative cannot match in the preprocessed text.
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])


def add_extra_features(frame):
    """Return a copy of ``frame`` with the three extra feature columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'preprocessed_text' column of strings.

    Returns
    -------
    pandas.DataFrame
        ``frame`` plus, in this order:
        - money_mark: 1 if the text matches ``money_simbol_list``, else 0
        - suspicious_words: 1 if the text matches ``suspicious_words``, else 0
        - text_len: number of characters in the text
    """
    text = frame['preprocessed_text']
    return frame.assign(
        # multiplying the boolean mask by 1 turns True/False into 1/0
        money_mark=text.str.contains(money_simbol_list, regex=True) * 1,
        suspicious_words=text.str.contains(suspicious_words, regex=True) * 1,
        text_len=text.str.len(),
    )


# One helper for both splits keeps the train and validation features identical.
data_train = add_extra_features(pd.DataFrame({'preprocessed_text': X_train_sentences}))
data_val = add_extra_features(pd.DataFrame({'preprocessed_text': X_test_sentences}))

data_train.head()

Unnamed: 0,preprocessed_text,money_mark,suspicious_words,text_len
0,thursday february pm sbwhoeopre urgent good ne...,1,0,85
1,fyimills cheryl monday april pmhfw decision up...,1,0,50
2,dear cgood day ei know message come suprise co...,1,1,865
3,like,1,0,4
4,goodday thanks response email today business i...,1,1,468


## How does the Bag of Words concept work with CountVectorizer?

In [None]:
# Bag of Words simply counts the frequency of each word in the text and creates a vector from it.

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [92]:
from sklearn.feature_extraction.text import TfidfTransformer

# Bigram-only vocabulary (ngram_range=(2, 2)) with TF-IDF weighting.
tfidf_params = {"ngram_range": (2, 2), "min_df": 1}
tf_idf_vectorizer = TfidfVectorizer(**tfidf_params)

# The vectorizer is fitted on the training sentences only.
tf_idf_matrix = tf_idf_vectorizer.fit_transform(X_train_sentences)

# (number of training messages, number of distinct bigrams)
tf_idf_matrix.shape


(800, 71586)

## And then train a classifier

In [97]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Fit a default-parameter Multinomial Naive Bayes on the TF-IDF training matrix.
model = MultinomialNB()
model.fit(tf_idf_matrix, y_train)

# The test sentences go through transform() only, so they are encoded with
# the vectorizer that was fitted on the training sentences.
X_test_tf_idf = tf_idf_vectorizer.transform(X_test_sentences)
y_pred = model.predict(X_test_tf_idf)

acc = accuracy_score(y_test, y_pred)

print(acc)

0.965


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed!!! It must be MultinomialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work in teams of two people (recommended).

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Candidate feature extractors, keyed by the label printed in the report.
# Every candidate drops terms that occur in fewer than two documents (min_df=2).
# NOTE(review): only "TF-IDF Uni+Bi" passes stop_words='english'; the other
# five do not — confirm this asymmetry is intended.
vectorizers = {
    "Count Unigram": CountVectorizer(ngram_range=(1, 1), min_df=2),
    "Count Bigram": CountVectorizer(ngram_range=(2, 2), min_df=2),
    "Count Uni+Bi": CountVectorizer(ngram_range=(1, 2), min_df=2),
    "TF-IDF Unigram": TfidfVectorizer(ngram_range=(1, 1), min_df=2),
    "TF-IDF Bigram": TfidfVectorizer(ngram_range=(2, 2), min_df=2),
    "TF-IDF Uni+Bi": TfidfVectorizer(ngram_range=(1, 2), min_df=2, stop_words='english'),
}

for name, vectorizer in vectorizers.items():
    # Fit the vectorizer on the training sentences; the test sentences are
    # only transformed with the fitted vocabulary.
    X_train_vec = vectorizer.fit_transform(X_train_sentences)
    X_test_vec = vectorizer.transform(X_test_sentences)

    # Same default-parameter classifier for every candidate.
    model = MultinomialNB()
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    acc = accuracy_score(y_test, y_pred)
    # Precision, recall and F1 all use the binary average.
    prec, rec, f1 = (
        metric(y_test, y_pred, average='binary')
        for metric in (precision_score, recall_score, f1_score)
    )

    # One print call, one report line per argument; the trailing "\n" in the
    # last line leaves a blank line between reports.
    print(
        f"=== {name} ===",
        f"Accuracy:  {acc:.4f}",
        f"Precision: {prec:.4f}",
        f"Recall:    {rec:.4f}",
        f"F1-Score:  {f1:.4f}\n",
        sep="\n",
    )

=== Count Unigram ===
Accuracy:  0.9700
Precision: 0.9432
Recall:    0.9881
F1-Score:  0.9651

=== Count Bigram ===
Accuracy:  0.9500
Precision: 0.9744
Recall:    0.9048
F1-Score:  0.9383

=== Count Uni+Bi ===
Accuracy:  0.9800
Precision: 0.9651
Recall:    0.9881
F1-Score:  0.9765

=== TF-IDF Unigram ===
Accuracy:  0.9750
Precision: 0.9540
Recall:    0.9881
F1-Score:  0.9708

=== TF-IDF Bigram ===
Accuracy:  0.9600
Precision: 0.9750
Recall:    0.9286
F1-Score:  0.9512

=== TF-IDF Uni+Bi ===
Accuracy:  0.9800
Precision: 0.9651
Recall:    0.9881
F1-Score:  0.9765

