In [78]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [79]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [80]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

In [81]:
# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


In [82]:
test = pd.read_csv("../data/kg_test.csv",encoding='latin-1')

### Let's divide the training and test set into two partitions

## Data Preprocessing

In [83]:
from textblob.classifiers import NaiveBayesClassifier

In [84]:
# data_tuples = list(data.itertuples(index=False, name=None))

In [85]:
# cl = NaiveBayesClassifier(data_tuples)
# cl.show_informative_features(10)

In [86]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [87]:
import re
import string
import pandas as pd

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [88]:
def clean_text(text):
    # 1. Remove inline JavaScript/CSS
    text = re.sub(r'<script.*?>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style.*?>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
    
    # 2. Remove HTML comments
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    
    # 3. Remove remaining HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    
    # 4. Remove special characters (punctuation)
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    
    # 5. Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # 6. Remove all single characters (isolated)
    text = re.sub(r'\b[a-zA-Z]\b', '', text)
    
    # 7. Substitute multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    
    # 8. Remove prefixed 'b' (like from byte literals: b'text')
    text = re.sub(r"^b\s+", '', text)
    
    # 9. Convert to lowercase
    text = text.lower().strip()
    
    return text


In [89]:
# Apply to the dataframe
data['clean_text'] = data['text'].apply(clean_text)

In [90]:
data.clean_text

0      dear sir strictly private business proposal am...
1                                                will do
2      noracheryl has emailed dozens of memos about h...
3      dear sirfmadamc know that this proposal might ...
4                                                    fyi
                             ...                        
995    so whats the latest it sounds contradictory an...
996    transfer of million pounds to youraccountmy na...
997    barb will call to explain are you back in the ...
998       yang on travelnot free tonitemay work tomorrow
999    sbwhoeopsunday february pmhshaunh just talked ...
Name: clean_text, Length: 1000, dtype: object

## Now let's work on removing stopwords
Remove the stopwords.

In [91]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [92]:
def remove_stopwords(text, stopfile = 'english'):
    badwords = stopwords.words(stopfile)
    words = word_tokenize(text)  # tokenize & lowercase
    filtered_words = [w for w in words if w not in badwords]
    return filtered_words

In [93]:
# Your code
# Apply to the dataframe
data['no_stopwords'] = data['clean_text'].apply(remove_stopwords)

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [94]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

# unfortunately pos_tag and lemmatize use different codes for parts of speech
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper() # gets first letter of POS categorization
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN) # get returns second argument if first key does not exist

lemmatizer = WordNetLemmatizer()

def lemmatizer_with_pos(row):
  return [lemmatizer.lemmatize(word,get_wordnet_pos(word)) for word in row['no_stopwords']]

data['lemmatized'] = data.apply(lemmatizer_with_pos, axis=1)
data.head()

Unnamed: 0,text,label,clean_text,no_stopwords,lemmatized
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1,dear sir strictly private business proposal am...,"[dear, sir, strictly, private, business, propo...","[dear, sir, strictly, private, business, propo..."
1,Will do.,0,will do,[],[]
2,Nora--Cheryl has emailed dozens of memos about...,0,noracheryl has emailed dozens of memos about h...,"[noracheryl, emailed, dozens, memos, haiti, we...","[noracheryl, email, dozen, memo, haiti, weeken..."
3,Dear Sir=2FMadam=2C I know that this proposal ...,1,dear sirfmadamc know that this proposal might ...,"[dear, sirfmadamc, know, proposal, might, surp...","[dear, sirfmadamc, know, proposal, might, surp..."
4,fyi,0,fyi,[fyi],[fyi]


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [101]:
def re_blob(row):
  return " ".join(row['lemmatized'])

data['preprocessed_text'] = data.apply(re_blob,axis=1)
data.head()

Unnamed: 0,text,label,clean_text,no_stopwords,lemmatized,preprocessed_text,aac,aaclocated,aae,aag,...,â½t,â½ta,â½te,â½tica,â½to,â½trangers,â½v,â½x,â½xã,â½ã
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1,dear sir strictly private business proposal am...,"[dear, sir, strictly, private, business, propo...","[dear, sir, strictly, private, business, propo...",dear sir strictly private business proposal mi...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Will do.,0,will do,[],[],,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nora--Cheryl has emailed dozens of memos about...,0,noracheryl has emailed dozens of memos about h...,"[noracheryl, emailed, dozens, memos, haiti, we...","[noracheryl, email, dozen, memo, haiti, weeken...",noracheryl email dozen memo haiti weekend plea...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Dear Sir=2FMadam=2C I know that this proposal ...,1,dear sirfmadamc know that this proposal might ...,"[dear, sirfmadamc, know, proposal, might, surp...","[dear, sirfmadamc, know, proposal, might, surp...",dear sirfmadamc know proposal might surprise e...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,fyi,0,fyi,[fyi],[fyi],fyi,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the vectorizer
vectorizer = CountVectorizer()

# Fit and transform the clean_text column
X = vectorizer.fit_transform(data["preprocessed_text"])

# Get the feature names (the words)
bow_columns = vectorizer.get_feature_names_out()

# Convert to a DataFrame if you want to inspect
bow_df = pd.DataFrame(X.toarray(), columns=bow_columns)

# Optionally join with your original df
df_bow = pd.concat([data, bow_df], axis=1)


## Extra features

In [106]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

df_bow['money_mark'] = df_bow['preprocessed_text'].str.contains(money_simbol_list)*1
df_bow['suspicious_words'] = df_bow['preprocessed_text'].str.contains(suspicious_words)*1
df_bow['text_len'] = df_bow['preprocessed_text'].apply(lambda x: len(x)) 

# df_bow['money_mark'] = df_bow['preprocessed_text'].str.contains(money_simbol_list)*1
# df_bow['suspicious_words'] = df_bow['preprocessed_text'].str.contains(suspicious_words)*1
# df_bow['text_len'] = df_bow['preprocessed_text'].apply(lambda x: len(x)) 

df_bow.head()

Unnamed: 0,text,label,clean_text,no_stopwords,lemmatized,preprocessed_text,aac,aaclocated,aae,aag,...,â½te,â½tica,â½to,â½trangers,â½v,â½x,â½xã,â½ã,suspicious_words,text_len
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1,dear sir strictly private business proposal am...,"[dear, sir, strictly, private, business, propo...","[dear, sir, strictly, private, business, propo...",dear sir strictly private business proposal mi...,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1463
1,Will do.,0,will do,[],[],,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nora--Cheryl has emailed dozens of memos about...,0,noracheryl has emailed dozens of memos about h...,"[noracheryl, emailed, dozens, memos, haiti, we...","[noracheryl, email, dozen, memo, haiti, weeken...",noracheryl email dozen memo haiti weekend plea...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,108
3,Dear Sir=2FMadam=2C I know that this proposal ...,1,dear sirfmadamc know that this proposal might ...,"[dear, sirfmadamc, know, proposal, might, surp...","[dear, sirfmadamc, know, proposal, might, surp...",dear sirfmadamc know proposal might surprise e...,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1353
4,fyi,0,fyi,[fyi],[fyi],fyi,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


## How would work the Bag of Words with Count Vectorizer concept?

In [None]:
# Your code

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [111]:
import numpy as np

In [109]:
## Read Data for the Fraudulent Email Kaggle Challenge
df = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

In [114]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df = 1)
tfidf_matrix = tfidf_vectorizer.fit_transform(df.text.to_list())


for doc,tf_idf_doc in zip(df,tfidf_matrix.todense()):
    print("DOC:", doc)
    print(np.around(tf_idf_doc,3)) # El tres define los decimales
    print()

DOC: text
[[0.034 0.063 0.    ... 0.    0.    0.   ]]

DOC: label
[[0. 0. 0. ... 0. 0. 0.]]



## And the Train a Classifier?

In [None]:
# Your code

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [None]:
# Your code