In [49]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import nltk


- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development.

In [51]:
# Download required NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [102]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development.
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


In [103]:
data_shape = data.shape
data_head = data.head()

data_shape, data_head

((1000, 2),
                                                 text  label
 0  DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...      1
 1                                           Will do.      0
 2  Nora--Cheryl has emailed dozens of memos about...      0
 3  Dear Sir=2FMadam=2C I know that this proposal ...      1
 4                                                fyi      0)

# New Section

### Let's divide the training and test set into two partitions

In [105]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)
# Checking the sizes of the splits
train_size, test_size = X_train.shape[0], X_test.shape[0]

train_size, test_size

(800, 200)

## Data Preprocessing

In [58]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [113]:
def clean_html(text):
    """Remove HTML tags, inline JavaScript/CSS, and clean text"""
    # Remove inline JavaScript and CSS
    text = re.sub(r"<(script|style).*?>.*?</\1>", "", text, flags=re.DOTALL)
    # Remove HTML comments
    text = re.sub(r"<!--(.*?)-->", "", text, flags=re.DOTALL)
    # Remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text()  # This line removes the remaining tags
    return clean_text


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters

- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [110]:
 #re.sub() is a function that performs string substitution using regex patterns. It has the format:re.sub(pattern, replacement, string)

# Remove special characters
import re

def clean_text(text):
    """Removes special characters, numbers, single characters, and extra spaces."""
    # Remove special characters
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    # Remove numbers
    text = re.sub(r"\d+", "", text)
    # Remove single characters
    text = re.sub(r"\s+[a-zA-Z]\s+", " ", text)
    # Remove single characters from the start
    text = re.sub(r"^[a-zA-Z]\s+", "", text)
    # Substituting multiple spaces with single space
    text = re.sub(r"\s+", " ", text, flags=re.I)
    # Removing prefixed 'b'
    text = re.sub(r"^b\s+", "", text)
    # Converting to Lowercase
    text = text.lower()

    return text


## Now let's work on removing stopwords
Remove the stopwords.

In [114]:

nltk.download('stopwords') #  download the stopwords

stop_words = set(stopwords.words('english'))
text = cleaned_text

words = text.split()
text = " ".join([word for word in words if word not in stop_words])

print(text) # Print to see the result

interesting um guess


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [115]:
#tokenise first
tokens = word_tokenize(text)

lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(token) for token in tokens]
text = " ".join(tokens)

print(text)



interesting um guess


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [116]:
def analyze_top_words(data, column='preprocessed_text', label_column='label', n=10):
    """Analyze top words for ham and spam messages"""
    ham_texts = ' '.join(data[data[label_column] == 0][column])
    spam_texts = ' '.join(data[data[label_column] == 1][column])

    # Get word frequencies
    ham_words = word_tokenize(ham_texts)
    spam_words = word_tokenize(spam_texts)

    ham_freq = Counter(ham_words).most_common(n)
    spam_freq = Counter(spam_words).most_common(n)

    # Create visualization
    plt.figure(figsize=(15, 6))

    # Ham plot
    plt.subplot(1, 2, 1)
    words, counts = zip(*ham_freq)
    sns.barplot(x=list(counts), y=list(words), palette='Blues_r')
    plt.title('Top Words in HAM Messages')
    plt.xlabel('Frequency')

    # Spam plot
    plt.subplot(1, 2, 2)
    words, counts = zip(*spam_freq)
    sns.barplot(x=list(counts), y=list(words), palette='Reds_r')
    plt.title('Top Words in SPAM Messages')
    plt.xlabel('Frequency')

    plt.tight_layout()
    plt.show()

    return ham_freq, spam_freq

## Extra features

## How would work the Bag of Words with Count Vectorizer concept?

In [87]:
# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]
# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the vectorizer to the documents
vectorizer.fit(documents)

# Get the vocabulary
vocabulary = vectorizer.get_feature_names_out()
print("Vocabulary:", vocabulary)

# Transform the documents into a document-term matrix
document_term_matrix = vectorizer.transform(documents)

# Print the document-term matrix
print("Document-Term Matrix:")
print(document_term_matrix.toarray())

Vocabulary: ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
Document-Term Matrix:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


## TD-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [101]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf.fit_transform(data_train['preprocessed_text'])
X_val_tfidf = tfidf.transform(data_val['preprocessed_text'])

KeyError: 'preprocessed_text'

## And the Train a Classifier?

In [None]:
# Your code

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to find the **best feature representation**.

You can work with teams of two persons (recommended).

In [None]:
# Your code