In [29]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [30]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development.

In [31]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development.
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [32]:
# Your code
from sklearn.model_selection import train_test_split

data_train = data.text
data_val = data.drop(columns=['text'])
X_train,X_test,y_train,y_test=train_test_split(data_train,data_val,test_size=0.2,random_state=42)

## Data Preprocessing

In [33]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [34]:
print(X_test) #before cleaning

521    Dear Sir=2C I wish you go through this offer t...
737    To take your mind off the Balkans for a second...
740                         Pls keep the updates coming!
660    </STRONG><STRONG>CHRIST BETHEL HOSPITAL<BR>11 ...
411    sbwhoeopFriday February 5 2010 7:11 AMHRe: Bra...
                             ...                        
408    Sorry yes exactlyWe have shy tomorrow at 10am ...
332    DEAR=2CGOOD DAY=2EI KNOW THIS MESSAGE WILL COM...
208                                                  FYI
613    Greetings Dear Friend Please Permit me to cont...
78     No in car on way to airport. Can you talk? Cal...
Name: text, Length: 200, dtype: object


In [35]:
# Your code
import re

def clean_html_regex(html_content):
    html_content = re.sub(r'<(script|style).*?>.*?</\1>', '', html_content, flags=re.S)  # Remove scripts & styles
    html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.S)  # Remove comments
    html_content = re.sub(r'<[^>]+>', '', html_content)  # Remove tags
    return re.sub(r'\s+', ' ', html_content).strip()  # Remove extra spaces


X_train = X_train.apply(clean_html_regex)
X_test = X_test.apply(clean_html_regex)

print(X_test) #before cleaning

521    Dear Sir=2C I wish you go through this offer t...
737    To take your mind off the Balkans for a second...
740                         Pls keep the updates coming!
660    CHRIST BETHEL HOSPITAL11 RUE ABOBOTE,ABIDJANIV...
411    sbwhoeopFriday February 5 2010 7:11 AMHRe: Bra...
                             ...                        
408    Sorry yes exactlyWe have shy tomorrow at 10am ...
332    DEAR=2CGOOD DAY=2EI KNOW THIS MESSAGE WILL COM...
208                                                  FYI
613    Greetings Dear Friend Please Permit me to cont...
78     No in car on way to airport. Can you talk? Cal...
Name: text, Length: 200, dtype: object


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters

- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [36]:
# Your code

def preprocess_text(text):
    # Remove special characters
    text = re.sub(r'\W', ' ', text)

    # Remove single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Remove single characters from the start
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)

    # Substitute multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)

    # Remove prefixed 'b'
    text = re.sub(r'^b\s+', '', text)

    # Convert to lowercase
    text = text.lower()

    return text

# Apply the function to the text data
X_train = X_train.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)

print(X_test)

521    dear sir 2c wish you go through this offer to ...
737    to take your mind off the balkans for second s...
740                         pls keep the updates coming 
660    christ bethel hospital11 rue abobote abidjaniv...
411    sbwhoeopfriday february 5 2010 7 11 amhre brav...
                             ...                        
408    sorry yes exactlywe have shy tomorrow at 10am ...
332    dear 2cgood day 2ei know this message will com...
208                                                  fyi
613    greetings dear friend please permit me to cont...
78     no in car on way to airport can you talk call ...
Name: text, Length: 200, dtype: object


## Now let's work on removing stopwords
Remove the stopwords.

In [37]:
# Your code
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))  # Load NLTK's stopwords
    words = word_tokenize(text)  # Tokenize text into words
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

X_train = X_train.apply(remove_stopwords)
X_test = X_test.apply(remove_stopwords)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [38]:
# Your code
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

# Apply the function to the text data
X_train = X_train.apply(lemmatize_text)
X_test = X_test.apply(lemmatize_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [39]:
# Your code
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=5000)
X_train_bow = vectorizer.fit_transform(X_train).toarray()
X_test_bow = vectorizer.transform(X_test).toarray()

## Extra features

In [40]:
import pandas as pd

# Combine X_train and y_train into a single DataFrame for data_train
data_train = pd.DataFrame(X_train, columns=['preprocessed_text'])
data_train['label'] = y_train

# Combine X_test and y_test into a single DataFrame for data_val
data_val = pd.DataFrame(X_test, columns=['preprocessed_text'])
data_val['label'] = y_test

# Add new indicators to data_train
money_simbol_list = "|".join(["euro","dollar","pound","€","$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(str(x)) if isinstance(x, str) else 0)

# Add new indicators to data_val
data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(str(x)) if isinstance(x, str) else 0)

# Check the updated data_train
print(data_val.head())
print('_'*66)
print(data_train.head())


    preprocessed_text  label money_mark suspicious_words  text_len
521               NaN      1        NaN              NaN         0
737               NaN      0        NaN              NaN         0
740               NaN      0        NaN              NaN         0
660               NaN      1        NaN              NaN         0
411               NaN      0        NaN              NaN         0
__________________________________________________________________
    preprocessed_text  label money_mark suspicious_words  text_len
29                NaN      1        NaN              NaN         0
535               NaN      0        NaN              NaN         0
695               NaN      0        NaN              NaN         0
557               NaN      0        NaN              NaN         0
836               NaN      1        NaN              NaN         0


## How would work the Bag of Words with Count Vectorizer concept?

In [41]:
# Your code
# The Bag of Words (BoW) model is a way to represent text data in numerical form.
# It creates a vocabulary of all the unique words in the text corpus and then represents each document as a vector of word counts.


## TD-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [42]:
# Your code
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

print("Shape of TF-IDF vectorized training data:", X_train_tfidf.shape)
print("Shape of TF-IDF vectorized test data:", X_test_tfidf.shape)

Shape of TF-IDF vectorized training data: (800, 5000)
Shape of TF-IDF vectorized test data: (200, 5000)


## And the Train a Classifier?

In [43]:
# Your code
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train the classifier using BoW features
clf_bow = MultinomialNB()
clf_bow.fit(X_train_bow, y_train)
y_pred_bow = clf_bow.predict(X_test_bow)

print("BoW Accuracy:", accuracy_score(y_test, y_pred_bow))
print("BoW Classification Report:\n", classification_report(y_test, y_pred_bow))

# Train the classifier using TF-IDF features
clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

print("TF-IDF Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("TF-IDF Classification Report:\n", classification_report(y_test, y_pred_tfidf))

BoW Accuracy: 0.965
BoW Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97       125
           1       0.93      0.99      0.95        75

    accuracy                           0.96       200
   macro avg       0.96      0.97      0.96       200
weighted avg       0.97      0.96      0.97       200

TF-IDF Accuracy: 0.96
TF-IDF Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.94      0.97       125
           1       0.91      0.99      0.95        75

    accuracy                           0.96       200
   macro avg       0.95      0.97      0.96       200
weighted avg       0.96      0.96      0.96       200



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to find the **best feature representation**.

You can work with teams of two persons (recommended).

In [44]:
# Your code