In [None]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [None]:
## Read Data for the Fraudulent Email Kaggle Challenge
# Only the first 1000 rows are kept to speed up development
# (modify for the final system); missing values become empty strings.
data = (
    pd.read_csv("../data/kg_train.csv", encoding='latin-1')
    .head(1000)
    .fillna("")
)
print(data.shape)

In [None]:
# Display the first 5 rows of the dataset
print(data.head())

# Display a summary of the dataset.
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
data.info()

### Let's divide the dataset into training and test partitions

In [None]:
# Your code

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# Input feature is the 'text' column, target is the 'label' column.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)

# Report how many rows landed in each partition
print(
    f"Training set size: {len(X_train)}",
    f"Test set size: {len(X_test)}",
    sep="\n",
)

## Data Preprocessing

In [None]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# stopwords.words() raises LookupError when the corpus is not installed, so
# make sure it is available before the first use (needed for a clean
# Restart & Run All on a fresh environment).
nltk.download('stopwords', quiet=True)

# Quick look at the punctuation characters and a sample of English stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])

# English Snowball stemmer instance
snowball = SnowballStemmer('english')

## Now we have to clean the text by removing the HTML code

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:
# Your code

import re

def clean_html(text):
    """Strip HTML markup from ``text`` and return the remaining text.

    Removal happens in three passes, in this order:

    1. ``<script>`` / ``<style>`` elements together with their content
       (matched case-insensitively, so ``<SCRIPT>`` is handled too).
    2. HTML comments ``<!-- ... -->`` (before generic tags, because a comment
       may itself contain ``>`` characters).
    3. Every remaining ``<...>`` tag.

    Each removed span is replaced by a single space rather than an empty
    string, so words that were separated only by markup (``Hello<br>World``)
    do not get glued together. Surrounding whitespace is not normalised here.

    Args:
        text: The raw string, possibly containing HTML.

    Returns:
        The string with markup replaced by spaces.
    """
    # Inline JavaScript/CSS: the whole element, including what is inside it.
    # \b keeps the pattern from matching tags that merely start with
    # "script"/"style"; \1 ties the closing tag to the opening one.
    text = re.sub(r'<(script|style)\b.*?>.*?</\1\s*>', ' ', text, flags=re.S | re.I)

    # HTML comments: <!-- ... -->
    text = re.sub(r'<!--.*?-->', ' ', text, flags=re.S)

    # Any remaining HTML tag: <...>
    text = re.sub(r'<.*?>', ' ', text, flags=re.S)

    return text

# Run the HTML cleaner over both partitions (training first, then test)
X_train_cleaned, X_test_cleaned = X_train.apply(clean_html), X_test.apply(clean_html)

# Show the first 5 rows of each cleaned partition as a sanity check
print("Cleaned Training Data:", X_train_cleaned.head(), sep="\n")
print("\nCleaned Test Data:", X_test_cleaned.head(), sep="\n")

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
# Your code

def additional_cleaning(text):
    """Normalise ``text`` to lowercase, space-separated alphabetic words.

    Steps:

    1. Every character that is not an ASCII letter or whitespace (punctuation,
       digits, symbols) is replaced by a space. Using a space instead of
       deleting the character keeps neighbouring tokens apart, e.g.
       ``"hello,world"`` becomes ``"hello world"`` rather than ``"helloworld"``.
       Digits are covered by this step, so no separate number removal is needed.
    2. Stand-alone single letters are removed. This also drops a leading
       single character and a ``b`` prefix left over from a bytes literal such
       as ``b'...'`` (the quote was turned into a space in step 1).
    3. Runs of whitespace are collapsed to one space and the ends are trimmed.
    4. The result is lowercased.

    Args:
        text: Input string.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    # Special characters and numbers -> space
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Single characters anywhere in the text (start included)
    text = re.sub(r'\b[a-zA-Z]\b', '', text)

    # Multiple spaces -> single space, no leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Lowercase
    return text.lower()

# Second cleaning pass over both partitions (training first, then test)
X_train_cleaned, X_test_cleaned = (
    X_train_cleaned.apply(additional_cleaning),
    X_test_cleaned.apply(additional_cleaning),
)

# Show the first 5 rows of each partition after the extra cleaning
print("Further Cleaned Training Data:", X_train_cleaned.head(), sep="\n")
print("\nFurther Cleaned Test Data:", X_test_cleaned.head(), sep="\n")

## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
# Your code

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK resources used below if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' resource in word_tokenize instead
# of the pickled 'punkt' models; fetch it as well so tokenization works there.
nltk.download('punkt_tab')

# English stopwords, loaded once instead of on every call to remove_stopwords
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


# Define a function to remove stopwords
def remove_stopwords(text, stop_words=_ENGLISH_STOP_WORDS):
    """Return ``text`` with stopwords removed.

    The text is tokenized with ``word_tokenize``; tokens found in
    ``stop_words`` are dropped (exact, case-sensitive membership test) and the
    remaining tokens are joined with single spaces.

    Args:
        text: Input string.
        stop_words: Collection of tokens to drop. Defaults to the NLTK English
            stopword list, read once when this cell runs.

    Returns:
        The filtered text as a single space-joined string.
    """
    kept_words = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(kept_words)

# Drop stopwords from both partitions (training first, then test)
X_train_cleaned, X_test_cleaned = (
    X_train_cleaned.apply(remove_stopwords),
    X_test_cleaned.apply(remove_stopwords),
)

# Show the first 5 rows of each partition after stopword removal
print("Training Data after Stopwords Removal:", X_train_cleaned.head(), sep="\n")
print("\nTest Data after Stopwords Removal:", X_test_cleaned.head(), sep="\n")

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
# Your code

from nltk.stem import WordNetLemmatizer
# The lemmatizer below needs the WordNet corpus, so fetch it first
nltk.download('wordnet')

# Single lemmatizer instance shared by the cells that follow
lemmatizer = WordNetLemmatizer()

def _lemmatize_word(word):
    """Lemmatize one token: as a noun first, then as a verb if that changed nothing."""
    # WordNetLemmatizer.lemmatize defaults to pos='n', which leaves verb forms
    # such as "running" untouched.
    lemma = lemmatizer.lemmatize(word)
    if lemma == word:
        # Verb fallback so that e.g. "running" is reduced to "run".
        lemma = lemmatizer.lemmatize(word, pos='v')
    return lemma


# Define a function to perform lemmatization
def lemmatize_text(text):
    """Tokenize ``text`` and reduce every token to its base form.

    Each token is lemmatized as a noun; when the noun lemma equals the token,
    it is lemmatized as a verb instead. Tokens are re-joined with single
    spaces.

    Args:
        text: Input string.

    Returns:
        The lemmatized text as a single space-joined string.
    """
    return ' '.join(_lemmatize_word(word) for word in word_tokenize(text))

# Lemmatize both partitions (training first, then test)
X_train_cleaned, X_test_cleaned = (
    X_train_cleaned.apply(lemmatize_text),
    X_test_cleaned.apply(lemmatize_text),
)

# Show the first 5 rows of each partition after lemmatization
print("Training Data after Lemmatization:", X_train_cleaned.head(), sep="\n")
print("\nTest Data after Lemmatization:", X_test_cleaned.head(), sep="\n")

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
# Your code

from collections import Counter

# Partition the cleaned training messages by label: 1 -> spam, 0 -> ham
spam_messages = X_train_cleaned.loc[y_train.eq(1)]
ham_messages = X_train_cleaned.loc[y_train.eq(0)]

# Define a function to extract the top N words
def get_top_n_words(messages, n=10):
    """Return the ``n`` most frequent whitespace-separated words in ``messages``.

    Args:
        messages: Iterable of strings.
        n: How many (word, count) pairs to return.

    Returns:
        List of ``(word, count)`` tuples, most frequent first.
    """
    # Count words message by message; every message is split on whitespace
    frequencies = Counter(
        token
        for message in messages
        for token in message.split()
    )
    return frequencies.most_common(n)

# Ten most frequent words per class (spam first, then ham)
top_spam_words, top_ham_words = (
    get_top_n_words(spam_messages, n=10),
    get_top_n_words(ham_messages, n=10),
)

# Print each ranking under its own heading
print("Top 10 words in spam messages:", top_spam_words, sep="\n")
print("\nTop 10 words in ham messages:", top_ham_words, sep="\n")

## Extra features

In [None]:
# We add two additional indicators (money symbols and suspicious words) plus the
# text length to the preprocessed training/validation frames.

# Alternation patterns for Series.str.contains. Each money term is escaped
# because "$" is a regex metacharacter (end-of-string anchor): unescaped, it
# would match every row and make the indicator constant.
# Note: the cleaning step earlier in the notebook strips non-letter characters,
# so the "€"/"$" alternatives only fire on text that still contains them.
money_symbol_list = "|".join(re.escape(term) for term in ["euro","dollar","pound","€","$"])
money_simbol_list = money_symbol_list  # legacy (misspelled) name kept as an alias
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

# Convert Series to lists explicitly before creating DataFrames
# (this gives both frames a fresh 0..n-1 index).
data_train = pd.DataFrame({"preprocessed_text": X_train_cleaned.tolist()})
data_val = pd.DataFrame({"preprocessed_text": X_test_cleaned.tolist()})


def add_indicator_features(frame):
    """Add ``money_mark``, ``suspicious_words`` and ``text_len`` columns in place.

    Args:
        frame: DataFrame with a ``preprocessed_text`` string column.

    Returns:
        The same DataFrame, with the three feature columns added:
        ``money_mark`` / ``suspicious_words`` are 0/1 flags for a regex match
        of the corresponding pattern, ``text_len`` is the character count.
    """
    text = frame["preprocessed_text"]
    frame["money_mark"] = text.str.contains(money_symbol_list, regex=True).astype(int)
    frame["suspicious_words"] = text.str.contains(suspicious_words, regex=True).astype(int)
    frame["text_len"] = text.apply(len)
    return frame


# Same feature engineering for the training and the validation set
data_train = add_indicator_features(data_train)
data_val = add_indicator_features(data_val)

# Display the first rows of the data with the new features
print("Training set with additional features:")
print(data_train.head())

print("\nValidation set with additional features:")
print(data_val.head())

## How would the Bag of Words concept work with CountVectorizer?

In [None]:
# Your code

from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit the vocabulary on the training data and build its document-term matrix
X_train_counts = vectorizer.fit_transform(data_train["preprocessed_text"])

# Transform the validation set using the same (already fitted) vectorizer
X_val_counts = vectorizer.transform(data_val["preprocessed_text"])

# Total frequency of every word across all training documents.
# The column sum is taken on the sparse matrix directly; converting it to a
# dense array first would allocate documents x vocabulary cells just to add
# them up. The (1, n_words) result is flattened to a 1-D array.
word_counts_train = np.asarray(X_train_counts.sum(axis=0)).ravel()
word_frequencies_train = dict(zip(vectorizer.get_feature_names_out(), word_counts_train))

# Sort by frequency (descending) and keep the top 10 words of the training set
top_words_train = sorted(word_frequencies_train.items(), key=lambda x: x[1], reverse=True)[:10]

# Print the results
print("Top 10 words in training data:")
print(top_words_train)

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [None]:
# Your code

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn vocabulary and IDF weights from the training text and vectorize it;
# the validation text is only transformed with what was learned from training
X_train_tfidf = tfidf_vectorizer.fit_transform(data_train["preprocessed_text"])
X_val_tfidf = tfidf_vectorizer.transform(data_val["preprocessed_text"])

# Report the (rows, features) shape of both matrices
print("Shape of training data (TF-IDF):", X_train_tfidf.shape)
print("Shape of validation data (TF-IDF):", X_val_tfidf.shape)

## And Now, Train a Classifier

In [None]:
# Which columns does data_train currently have?
print("Columns in data_train:", data_train.columns, sep="\n")

# Peek at the first few rows to inspect the structure
print("\nFirst rows of data_train:", data_train.head(), sep="\n")

# Report whether the target column has been attached yet
if "label" in data_train.columns:
    print("\nThe column 'label' exists in data_train.")
else:
    print("\nThe column 'label' is missing in data_train!")

In [None]:
# Attach the targets positionally (raw values, ignoring the original index):
# y_train goes with data_train, y_test with data_val
data_train["label"], data_val["label"] = y_train.values, y_test.values

# Verify the structure of data_train
print("Updated columns in data_train:", data_train.columns, sep="\n")
print("\nFirst rows of data_train after adding label:", data_train.head(), sep="\n")

# Verify the structure of data_val
print("\nUpdated columns in data_val:", data_val.columns, sep="\n")
print("\nFirst rows of data_val after adding label:", data_val.head(), sep="\n")

In [None]:
# Your code

from sklearn.naive_bayes import MultinomialNB

# Create a default Multinomial Naive Bayes classifier and fit it on the
# TF-IDF training matrix (fit returns the estimator itself)
model = MultinomialNB().fit(X_train_tfidf, data_train["label"])

# Predict labels for the validation matrix
y_pred = model.predict(X_val_tfidf)

# Show the raw predictions (optional)
print("Predictions on the validation set:", y_pred, sep="\n")

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Fraction of validation messages classified correctly
accuracy = accuracy_score(data_val["label"], y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 summary
print("\nClassification Report:", classification_report(data_val["label"], y_pred), sep="\n")

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed!!! It must be MultinomialNB with default parameters!

Your task is to find the **best feature representation**.

You can work in teams of two (recommended).

In [None]:
# Your code
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

# Define function to train and evaluate the classifier
def train_and_evaluate(vectorizer):
    """Fit ``vectorizer`` + default MultinomialNB and score it on the validation set.

    Reads the notebook-level ``data_train`` and ``data_val`` frames (columns
    ``preprocessed_text`` and ``label``), prints the accuracy and the
    classification report, and returns the accuracy.

    Args:
        vectorizer: Unfitted text vectorizer used as the first pipeline step.

    Returns:
        Accuracy on ``data_val`` as computed by ``accuracy_score``.
    """
    train_text, train_labels = data_train["preprocessed_text"], data_train["label"]
    val_text, val_labels = data_val["preprocessed_text"], data_val["label"]

    # Vectorizer feeding a Multinomial Naive Bayes classifier
    model_pipeline = Pipeline(steps=[
        ('vectorizer', vectorizer),
        ('classifier', MultinomialNB()),
    ])

    # Fit on the training text, then predict the validation text
    predictions = model_pipeline.fit(train_text, train_labels).predict(val_text)

    # Report the metrics
    score = accuracy_score(val_labels, predictions)
    print(f"Accuracy: {score:.2f}")
    print("\nClassification Report:")
    print(classification_report(val_labels, predictions))
    return score

# Candidate feature representations to compare:
# TF-IDF over uni+bigrams with document-frequency cut-offs, and
# binary (presence/absence) counts over uni+bigrams
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, ngram_range=(1, 2))
count_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))

# Evaluate each one with the fixed MultinomialNB classifier
print("Using TfidfVectorizer:")
train_and_evaluate(tfidf_vectorizer)

print("\nUsing CountVectorizer:")
train_and_evaluate(count_vectorizer)