In [None]:
# `display` and `HTML` are part of the public IPython.display API; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that stretches the notebook's `.container` element to the
# full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [None]:
# Read data for the Fraudulent Email Kaggle Challenge.
data = pd.read_csv("data/kg_train.csv", encoding='latin-1')

# Reduce the training set to speed up development.
# Modify for the final system.
data = data.head(1000)
print(data.shape)

# Replace missing values with empty strings so the regex-based cleaning steps
# always receive a str. Re-assigning (instead of inplace=True) avoids mutating
# the head() slice in place, which pandas may flag with SettingWithCopyWarning.
data = data.fillna("")

In [None]:
# Inspect the loaded frame (at most 1000 rows after the head() call above).
data

### Let's divide the training and test set into two partitions

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw message text; the target is the label column.
X, y = data['text'], data['label']

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# reproducible, and stratify=y keeps the class ratio the same in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

## Data Preprocessing

In [None]:
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Peek at the punctuation characters and at a small sample of the English
# stop-word list.
# NOTE(review): stopwords.words() needs the NLTK "stopwords" corpus to be
# available locally; confirm it is downloaded before running on a fresh setup.
print(string.punctuation)
print(stopwords.words("english")[100:110])

# English Snowball stemmer instance.
snowball = SnowballStemmer('english')

## Now we have to clean the text by removing the HTML code

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:
# we remove inline JavaScript/CSS

import re

def remove_inline_js_css(text):
    """Strip inline JavaScript and CSS elements from an HTML string.

    Every ``<script ...>...</script>`` and ``<style ...>...</style>`` element
    is removed together with its content. Matching is case-insensitive and may
    span several lines.

    Args:
        text: The HTML (or plain-text) string to clean.

    Returns:
        The input string without script/style elements.
    """
    flags = re.DOTALL | re.IGNORECASE

    # \b stops the tag name from also matching longer names such as <scripts>,
    # [^>]* keeps the attribute match inside the opening tag, and \s* accepts
    # closing tags written with trailing whitespace ("</script >").
    text = re.sub(r'<script\b[^>]*>.*?</script\s*>', '', text, flags=flags)
    text = re.sub(r'<style\b[^>]*>.*?</style\s*>', '', text, flags=flags)

    return text

# First cleaning step: start from the raw splits and drop script/style blocks.
X_train_clean, X_test_clean = (
    X_train.apply(remove_inline_js_css),
    X_test.apply(remove_inline_js_css),
)

# removing  html comments

def remove_html_comments(text):
    """Delete every ``<!-- ... -->`` HTML comment from *text*.

    DOTALL lets a comment span several lines; the non-greedy body stops at the
    first closing ``-->``.
    """
    comment_pattern = re.compile(r'<!--.*?-->', flags=re.DOTALL)
    return comment_pattern.sub('', text)

# Comments go next, once the inline JS/CSS blocks are gone.
X_train_clean, X_test_clean = (
    X_train_clean.apply(remove_html_comments),
    X_test_clean.apply(remove_html_comments),
)


# we can remove the remaining tags

def remove_html_tags(text):
    """Replace every remaining HTML tag (``<div>``, ``<p>``, ``<a ...>``, ...) with a space.

    A space is substituted rather than an empty string so that words separated
    only by markup stay separate tokens: ``hello<br>world`` must not collapse
    into ``helloworld``. Surplus whitespace is expected to be normalised by a
    later cleaning step.

    Args:
        text: The string to clean.

    Returns:
        The string with each tag replaced by a single space.
    """
    return re.sub(r'<[^>]+>', ' ', text)

# Finally strip whatever tags are still left in both splits.
X_train_clean, X_test_clean = (
    X_train_clean.apply(remove_html_tags),
    X_test_clean.apply(remove_html_tags),
)


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
# Removing all special characters

def remove_special_char(text):
    """Replace each run of special characters with a single space.

    Only ASCII letters, digits and whitespace are kept. A space is substituted
    rather than an empty string so that words joined only by punctuation stay
    separate tokens: ``hello,world`` must not collapse into ``helloworld``.
    Surplus whitespace is expected to be normalised by a later cleaning step.

    Args:
        text: The string to clean.

    Returns:
        The string containing only letters, digits and whitespace.
    """
    return re.sub(r'[^a-zA-Z0-9\s]+', ' ', text)
# Drop special characters from both splits.
X_train_clean, X_test_clean = (
    X_train_clean.apply(remove_special_char),
    X_test_clean.apply(remove_special_char),
)

# Remove numbers

def remove_num(text):
    """Return *text* with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)
# Drop digits from both splits.
X_train_clean, X_test_clean = (
    X_train_clean.apply(remove_num),
    X_test_clean.apply(remove_num),
)

# Remove all single characters

def remove_single_char(text):
    """Delete every stand-alone single word character from *text*.

    The pattern is one ``\\w`` character (letter, digit or underscore) with a
    word boundary on each side; surrounding whitespace is left untouched.
    """
    lone_char = re.compile(r'\b\w\b')
    return lone_char.sub('', text)
# Drop isolated single characters from both splits.
X_train_clean, X_test_clean = (
    X_train_clean.apply(remove_single_char),
    X_test_clean.apply(remove_single_char),
)

# Remove single characters from the start

def remove_single_char_start(text):
    """Drop a single leading word character and the whitespace that follows it.

    Only the very beginning of the string is examined; text that does not
    start with ``<one char><whitespace>`` is returned unchanged.
    """
    leading = re.match(r'^\w\s+', text)
    if leading is None:
        return text
    return text[leading.end():]
# Drop a leading single character from both splits.
X_train_clean, X_test_clean = (
    X_train_clean.apply(remove_single_char_start),
    X_test_clean.apply(remove_single_char_start),
)

# Substitute multiple spaces with single space

def remove_multi_space(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)
# Normalise whitespace in both splits.
X_train_clean, X_test_clean = (
    X_train_clean.apply(remove_multi_space),
    X_test_clean.apply(remove_multi_space),
)

# Remove prefixed 'b'

def remove_prefixed_b(text):
    """Strip a leading ``b'`` or ``b"`` (bytes-literal style prefix) from *text*.

    Strings that do not start with that two-character prefix are returned
    unchanged.
    """
    prefix = re.match(r"^b['\"]", text)
    if prefix is None:
        return text
    return text[prefix.end():]
# Strip a leading b' / b" prefix from both splits.
# NOTE(review): quote characters were already removed by the special-character
# step earlier in this cell, so this step looks like a no-op here; confirm
# whether it should run before that step instead.
X_train_clean, X_test_clean = (
    X_train_clean.apply(remove_prefixed_b),
    X_test_clean.apply(remove_prefixed_b),
)

# Convert to Lowercase

def to_lowercase(text):
    """Return a lower-cased copy of *text*."""
    lowered = text.lower()
    return lowered
# Lower-case both splits as the last character-level cleaning step.
X_train_clean, X_test_clean = (
    X_train_clean.apply(to_lowercase),
    X_test_clean.apply(to_lowercase),
)


In [None]:
# Inspect the training texts after the character-level cleaning steps.
X_train_clean

## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# English stop words as a set, used for the membership test in remove_stopwords.
# NOTE(review): requires the NLTK "stopwords" corpus to be available locally.
stop_words = set(stopwords.words('english'))

# Tokenization
def do_tokenization(text):
    """Split *text* into a list of tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# From here on each entry is a list of tokens instead of a string.
X_train_clean, X_test_clean = (
    X_train_clean.apply(do_tokenization),
    X_test_clean.apply(do_tokenization),
)

# Remove stopwords
def remove_stopwords(tokens):
    """Return the tokens whose lower-cased form is not in ``stop_words``.

    The original order and casing of the surviving tokens is preserved.
    """
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

# Filter stop words out of both tokenised splits, then show the training split.
X_train_clean, X_test_clean = (
    X_train_clean.apply(remove_stopwords),
    X_test_clean.apply(remove_stopwords),
)
X_train_clean


In [None]:
# Inspect the tokenised, stop-word-free test split.
X_test_clean

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the WordNet POS constant ``lemmatize()`` accepts.

    Tags starting with ``J``, ``V``, ``N`` or ``R`` map to adjective, verb,
    noun and adverb respectively; any other tag falls back to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # [:1] (rather than [0]) keeps an empty tag on the noun fallback.
    return pos_by_initial.get(treebank_tag[:1], wordnet.NOUN)

lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """Lemmatize a list of tokens, using each token's POS tag as a hint.

    An empty (falsy) input is returned as-is without calling the tagger.
    Otherwise the whole list is POS-tagged in one call and every token is
    lemmatized with the WordNet POS derived from its tag.
    """
    if not tokens:
        return tokens

    lemmas = []
    for token, tag in nltk.pos_tag(tokens):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return lemmas


# Reduce every token to its lemma in both splits.
X_train_lem, X_test_lem = (
    X_train_clean.apply(lemmatize_tokens),
    X_test_clean.apply(lemmatize_tokens),
)

# The vectorizers expect one string per document, so glue the tokens back
# together with single spaces.
X_train_ready, X_test_ready = (
    X_train_lem.apply(" ".join),
    X_test_lem.apply(" ".join),
)

In [None]:
# Inspect the lemmatized training documents (one space-joined string each).
X_train_ready

## Bag Of Words
Let's get the top 10 words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram counts only; min_df=2 ignores terms found in fewer than two
# training documents.
bow = CountVectorizer(ngram_range=(1, 1), min_df=2)

# The vocabulary is learned from the training split only and then reused,
# unchanged, to encode the test split.
X_train_bow = bow.fit_transform(X_train_ready)
X_test_bow = bow.transform(X_test_ready)

# Each shape is (n_docs, n_features).
print(X_train_bow.shape, X_test_bow.shape)


## Extra features

# We add three additional indicators per document: money symbols, suspicious
# words and text length.
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])


def _add_extra_features(frame, text_col='preprocessed_text'):
    """Add the ``money_mark``, ``suspicious_words`` and ``text_len`` columns.

    Args:
        frame: DataFrame holding the documents; it is modified in place.
        text_col: Name of the column that contains the text to inspect.

    Returns:
        The same DataFrame, for convenient chaining.
    """
    text = frame[text_col]
    # Both patterns are regex alternations; `* 1` turns the boolean match
    # result into a 0/1 flag.
    frame['money_mark'] = text.str.contains(money_simbol_list) * 1
    frame['suspicious_words'] = text.str.contains(suspicious_words) * 1
    frame['text_len'] = text.str.len()
    return frame


# `data_train` / `data_val` are not defined anywhere else in this notebook, so
# build them from the preprocessed train/test documents (original index kept).
# NOTE(review): the text was already stripped of special characters, so the
# "€" and "$" alternatives cannot match here; compute `money_mark` on the raw
# text if those symbols matter.
data_train = _add_extra_features(pd.DataFrame({'preprocessed_text': X_train_ready}))
data_val = _add_extra_features(pd.DataFrame({'preprocessed_text': X_test_ready}))

data_train.head()

## How does the Bag of Words concept work with CountVectorizer?

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Same configuration as the earlier Bag Of Words cell: unigram counts, keeping
# only terms that occur in at least two training documents.
bow = CountVectorizer(ngram_range=(1, 1), min_df=2)

# fit_transform learns the vocabulary on the training documents and encodes
# them; transform encodes the test documents with that same vocabulary.
X_train_bow = bow.fit_transform(X_train_ready)
X_test_bow = bow.transform(X_test_ready)

# Rows are documents, columns are vocabulary terms: (n_docs, n_features).
print(X_train_bow.shape, X_test_bow.shape)


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: configure the vectorizer (unigrams; terms must occur in at least two
# training documents). Tweak min_df / ngram_range as needed.
tfidf = TfidfVectorizer(ngram_range=(1, 1), min_df=2)

# Step 2: learn vocabulary and IDF weights on the training split only, then
# encode the test split with the fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train_ready)
X_test_tfidf = tfidf.transform(X_test_ready)

# Step 3: report the matrix shapes.
train_shape = X_train_tfidf.shape
test_shape = X_test_tfidf.shape
print("TF-IDF Train shape:", train_shape)
print("TF-IDF Test shape:", test_shape)


## And then Train a Classifier?

In [None]:
# Your code

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed!!! It must be MultinomialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work in teams of two people (recommended).

In [None]:
# Your code