In [None]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [1]:
import pandas as pd

## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


In [2]:
data.head()

Unnamed: 0,text,label
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1
1,Will do.,0
2,Nora--Cheryl has emailed dozens of memos about...,0
3,Dear Sir=2FMadam=2C I know that this proposal ...,1
4,fyi,0


### Let's divide the training and test set into two partitions

In [8]:
from sklearn.model_selection import train_test_split

# Sépare le DataFrame complet
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Affiche les formes
print(train_df.head())

                                                  text  label
29   ----------- REGARDS, MR NELSON SMITH.KINDLY RE...      1
535  I have not been able to reach oscar this am. W...      0
695  ; Huma Abedin B6I'm checking with Pat on the 5...      0
557  I can have it announced here on Monday - can't...      0
836      BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...      1


In [9]:
# On a besoin de retirer le label de test_df
test_df = test_df.drop(columns=["label"])

test_df.head()

Unnamed: 0,text
521,Dear Sir=2C I wish you go through this offer t...
737,To take your mind off the Balkans for a second...
740,Pls keep the updates coming!
660,</STRONG><STRONG>CHRIST BETHEL HOSPITAL<BR>11 ...
411,sbwhoeopFriday February 5 2010 7:11 AMHRe: Bra...


## Data Preprocessing

## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags
- Remove all the special characters
- Remove numbers
- Remove all single characters
- Remove single characters from the start
- Substitute multiple spaces with single space
- Remove prefixed 'b'
- Convert to Lowercase

In [10]:
import re

def clean_text(text):
    # 1. Remove inline JavaScript/CSS
    text = re.sub(r'<(script|style).*?>.*?</\1>', '', text, flags=re.DOTALL)

    # 2. Remove HTML comments
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # 3. Remove remaining HTML tags
    text = re.sub(r'<.*?>', '', text)

    # 4. Remove special characters (everything non-alphanumeric or whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # 5. Remove numbers
    text = re.sub(r'\d+', '', text)

    # 6. Remove all single characters
    text = re.sub(r'\b\w\b', '', text)

    # 7. Remove single characters from the start (ex: " a test" → " test")
    text = re.sub(r'^\w\s+', '', text)

    # 8. Substitute multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)

    # 9. Remove prefixed 'b' (as in b'string')
    text = re.sub(r'^b\s+', '', text)
    
    # 10. Convert to lowercase and trim
    return text.lower().strip()

In [13]:
# ------------------------------------------------------------------ #
#                         Nettoyage de texte                         #
# ------------------------------------------------------------------ #

train_df["clean_text"] = train_df["text"].apply(clean_text)
test_df["clean_text"] = test_df["text"].apply(clean_text)

train_df.head()

Unnamed: 0,text,label,clean_text
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",1,regards mr nelson smithkindly reply me on my p...
535,I have not been able to reach oscar this am. W...,0,have not been able to reach oscar this am we a...
695,; Huma Abedin B6I'm checking with Pat on the 5...,0,huma abedin bim checking with pat on the will ...
557,I can have it announced here on Monday - can't...,0,can have it announced here on monday cant today
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,1,bank of africaagence san pedro bp san pedro co...


Ici on va tokeniser les phrases nettoyées

In [14]:
# ------------------------------------------------------------------ #
#                 Tokenization de chaîne de caractère                #
# ------------------------------------------------------------------ #

from nltk.tokenize import word_tokenize # Import

def tokenize(row):
  tokens = word_tokenize(row["clean_text"])
  return [word.lower() for word in tokens if word.isalpha()]

train_df["tokenized"] = train_df.apply(tokenize, axis=1)
test_df["tokenized"] = test_df.apply(tokenize, axis=1)

train_df.head()

Unnamed: 0,text,label,clean_text,tokenized
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",1,regards mr nelson smithkindly reply me on my p...,"[regards, mr, nelson, smithkindly, reply, me, ..."
535,I have not been able to reach oscar this am. W...,0,have not been able to reach oscar this am we a...,"[have, not, been, able, to, reach, oscar, this..."
695,; Huma Abedin B6I'm checking with Pat on the 5...,0,huma abedin bim checking with pat on the will ...,"[huma, abedin, bim, checking, with, pat, on, t..."
557,I can have it announced here on Monday - can't...,0,can have it announced here on monday cant today,"[can, have, it, announced, here, on, monday, c..."
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,1,bank of africaagence san pedro bp san pedro co...,"[bank, of, africaagence, san, pedro, bp, san, ..."


## Now let's work on removing stopwords
Remove the stopwords.

In [15]:
# ------------------------------------------------------------------ #
#                    Mots de liaison + ponctuation                   #
# ------------------------------------------------------------------ #

# Import de la liste des stopwords en anglais
from nltk.corpus import stopwords

# Définir la liste des stopwords anglais une fois
stop_words = set(stopwords.words('english'))

# Conversion en lowercase + removal ponctuation
def to_lowercase_no_punct(tokens):
  return [word.lower() for word in tokens if word.isalpha()]

# Fonction pour enlever les stopwords sans perdre l’ordre
def remove_sw(tokens):
  return [word for word in tokens if word.lower() not in stop_words]

# Application sur tokenized_text
def apply_filter(row):
  tokenized_text_to_lowercase = to_lowercase_no_punct(row["tokenized"])
  filtered_tokens = remove_sw(tokenized_text_to_lowercase)
  return [word for word in filtered_tokens]

train_df["filtered_tokens"] = train_df.apply(apply_filter, axis=1)
test_df["filtered_tokens"] = test_df.apply(apply_filter, axis=1)

train_df.head()

Unnamed: 0,text,label,clean_text,tokenized,filtered_tokens
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",1,regards mr nelson smithkindly reply me on my p...,"[regards, mr, nelson, smithkindly, reply, me, ...","[regards, mr, nelson, smithkindly, reply, priv..."
535,I have not been able to reach oscar this am. W...,0,have not been able to reach oscar this am we a...,"[have, not, been, able, to, reach, oscar, this...","[able, reach, oscar, supposed, send, pdb, rece..."
695,; Huma Abedin B6I'm checking with Pat on the 5...,0,huma abedin bim checking with pat on the will ...,"[huma, abedin, bim, checking, with, pat, on, t...","[huma, abedin, bim, checking, pat, work, jack,..."
557,I can have it announced here on Monday - can't...,0,can have it announced here on monday cant today,"[can, have, it, announced, here, on, monday, c...","[announced, monday, cant, today]"
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,1,bank of africaagence san pedro bp san pedro co...,"[bank, of, africaagence, san, pedro, bp, san, ...","[bank, africaagence, san, pedro, bp, san, pedr..."


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [16]:
# ------------------------------------------------------------------ #
#                          Lemmatizing data                          #
# ------------------------------------------------------------------ #

# Import stemmer
from nltk.stem import WordNetLemmatizer

# Instanciate
lemmatizer = WordNetLemmatizer()

# Fonction pour stemmatiser
def lemmatize(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

# Application
def apply_lemmatization(row):
  lemmatized_tokens = lemmatize(row["filtered_tokens"])
  return [word for word in lemmatized_tokens]

train_df["lemmatized"] = train_df.apply(apply_lemmatization, axis=1)
test_df["lemmatized"] = test_df.apply(apply_lemmatization, axis=1)

train_df.head()

Unnamed: 0,text,label,clean_text,tokenized,filtered_tokens,lemmatized
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",1,regards mr nelson smithkindly reply me on my p...,"[regards, mr, nelson, smithkindly, reply, me, ...","[regards, mr, nelson, smithkindly, reply, priv...","[regard, mr, nelson, smithkindly, reply, priva..."
535,I have not been able to reach oscar this am. W...,0,have not been able to reach oscar this am we a...,"[have, not, been, able, to, reach, oscar, this...","[able, reach, oscar, supposed, send, pdb, rece...","[able, reach, oscar, supposed, send, pdb, rece..."
695,; Huma Abedin B6I'm checking with Pat on the 5...,0,huma abedin bim checking with pat on the will ...,"[huma, abedin, bim, checking, with, pat, on, t...","[huma, abedin, bim, checking, pat, work, jack,...","[huma, abedin, bim, checking, pat, work, jack,..."
557,I can have it announced here on Monday - can't...,0,can have it announced here on monday cant today,"[can, have, it, announced, here, on, monday, c...","[announced, monday, cant, today]","[announced, monday, cant, today]"
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,1,bank of africaagence san pedro bp san pedro co...,"[bank, of, africaagence, san, pedro, bp, san, ...","[bank, africaagence, san, pedro, bp, san, pedr...","[bank, africaagence, san, pedro, bp, san, pedr..."


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [41]:
# ------------------------------------------------------------------ #
#                      Création de Bag of Words                      #
# ------------------------------------------------------------------ #

from sklearn.feature_extraction.text import CountVectorizer

# Création des blobs
def join_elems(row):
  return " ".join(row['lemmatized'])

train_df["blobs"] = train_df.apply(join_elems, axis=1)
test_df["blobs"] = test_df.apply(join_elems, axis=1)

# Conversion en Bag of words
bow_vect = CountVectorizer()
X = bow_vect.fit_transform(train_df['blobs']).toarray()

print("Bag of Words:\n", X)
print("Vocabulary:", bow_vect.get_feature_names_out())

Bag of Words:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Vocabulary: ['aac' 'aaclocated' 'aae' ... 'ãââcombating' 'ãââizaibeth' 'ãââjust']


In [42]:
as_df = pd.DataFrame(X, columns=bow_vect.get_feature_names_out())
as_df.head()

Unnamed: 0,aac,aaclocated,aae,aag,aaronovitchon,abacha,abachabefore,abachac,abachace,abachaco,...,zumac,zumadirector,zumae,zurich,ãâ,ãâbarna,ãââ,ãââcombating,ãââizaibeth,ãââjust
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2,random_state=100)
kmeans.fit(X)
pred = kmeans.predict(X)

In [44]:
predict_df = pd.concat([train_df['text'], train_df['label'],pd.DataFrame(pred,columns=['class'])],axis=1)
predict_df.head()

Unnamed: 0,text,label,class
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",1.0,0.0
535,I have not been able to reach oscar this am. W...,0.0,1.0
695,; Huma Abedin B6I'm checking with Pat on the 5...,0.0,0.0
557,I can have it announced here on Monday - can't...,0.0,1.0
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,1.0,


## Extra features

In [None]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x)) 

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x)) 

data_train.head()

## How would work the Bag of Words with Count Vectorizer concept?

In [None]:
# Your code

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [None]:
# Your code

## And the Train a Classifier?

In [None]:
# Your code

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [None]:
# Your code