In [None]:
# Widen the notebook container so wide tables and plots can use the full page width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Let's prepare the environment

# Lab | Natural Language Processing
### SMS: SPAM or HAM

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [None]:

## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv", encoding='latin-1')

# Reduce the training set to speed up development.
# Modify for final system
# Missing values are replaced with empty strings through a chained assignment
# rather than `inplace=True`, so the cell never mutates a slice of the loaded
# frame in place and stays idempotent when re-run after the load above.
data = data.head(1000).fillna("")
print(data.shape)

### Let's divide the training and test set into two partitions

In [None]:
# Your code

from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the label.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition.
print(f"Training Features: {X_train.shape}")
print(f"Test Features: {X_test.shape}")
print(f"Training Labels: {y_train.shape}")
print(f"Test Labels: {y_test.shape}")


## Data Preprocessing

In [None]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# The stopwords corpus has to be available before `stopwords.words` is called;
# without this download the cell fails on a fresh environment because the
# corpus is only fetched by a later cell.
nltk.download('stopwords', quiet=True)

# Peek at the punctuation characters and at a sample of the English stopwords.
print(string.punctuation)
print(stopwords.words("english")[100:110])

# English Snowball stemmer instance.
snowball = SnowballStemmer('english')

## Now we have to clean the text by removing the HTML markup

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [10]:
# Your code

import re

def clean_html(html_content):
    """Strip HTML markup from a string and return the remaining text.

    The removal order matters:
      1. ``<script>`` / ``<style>`` elements are dropped together with their
         contents, so inline JavaScript/CSS never ends up in the text.
      2. HTML comments are dropped before the generic tag pass, because a
         comment body may itself contain ``>`` characters.
      3. Every remaining tag is removed; the text between tags is kept.

    Tag names are matched case-insensitively (``<SCRIPT>`` is handled like
    ``<script>``) and optional whitespace is accepted before the ``>`` of a
    closing tag (``</script >``).

    Args:
        html_content (str): Raw text that may contain HTML markup.

    Returns:
        str: The input with scripts, styles, comments and tags removed.
    """
    # DOTALL lets '.' span newlines inside multi-line script/style/comment
    # bodies; IGNORECASE is needed because HTML tag names are case-insensitive.
    flags = re.DOTALL | re.IGNORECASE

    # Step 1: Remove inline JavaScript and CSS (element and contents)
    html_content = re.sub(r'<script\b[^>]*>.*?</script\s*>', '', html_content, flags=flags)
    html_content = re.sub(r'<style\b[^>]*>.*?</style\s*>', '', html_content, flags=flags)

    # Step 2: Remove HTML comments
    html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)

    # Step 3: Remove remaining HTML tags
    html_content = re.sub(r'<[^>]+>', '', html_content)

    return html_content

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [11]:
# Your code

import re

def clean_text(text):
    """Normalise raw text to lowercase alphabetic words separated by single spaces.

    Processing steps:
      * every character that is neither an ASCII letter nor whitespace
        (punctuation, digits, symbols) is replaced by a space;
      * stand-alone single letters are removed, which also covers a single
        letter at the start of the text and a stray ``b`` prefix;
      * runs of whitespace are collapsed and the result is stripped;
      * the text is lower-cased.

    Args:
        text (str): Text to clean.

    Returns:
        str: The cleaned text; an empty string if nothing is left.
    """
    # Special characters and numbers: substitute a space instead of deleting,
    # so neighbouring words are not glued together ("hello,world").
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Single characters anywhere in the text. Only whole one-letter tokens are
    # matched, so the first letter of a longer word is never removed.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Substitute multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()

    # Convert to lowercase
    return text.lower()

## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
# Your code
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

# Lazily-filled cache so the NLTK stopword list is loaded once, not on every call.
_STOPWORDS_CACHE = {}


def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stopwords removed.

    The text is split on whitespace, so punctuation attached to a word
    (e.g. ``"the,"``) is part of the token and is compared as-is.

    Args:
        text (str): Text to filter.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stopword list is used (loaded on first
            use and cached).

    Returns:
        str: The remaining words joined by single spaces.
    """
    if stop_words is None:
        if 'english' not in _STOPWORDS_CACHE:
            _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORDS_CACHE['english']

    # Compare in lowercase so capitalised stopwords such as "This" are dropped
    # too; the surviving words keep their original casing.
    kept_words = [word for word in text.split() if word.lower() not in stop_words]

    return ' '.join(kept_words)

# Demo sentence containing several common English stopwords.
text = "This is a sample text like the, is, and in it."
cleaned_text = remove_stopwords(text)

# Show the header and the filtered sentence on separate lines.
print("Text without stopwords:", cleaned_text, sep="\n")

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
# Your code

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK datasets
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models used by `word_tokenize` from
# the 'punkt_tab' package instead of 'punkt'; request both so the cell works
# with old and new NLTK versions.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

def lemmatize_text(text):
    """Tokenize ``text`` and return its lemmatized tokens joined by spaces.

    Each token produced by ``word_tokenize`` is passed to
    ``WordNetLemmatizer.lemmatize`` without a part-of-speech argument.
    NOTE(review): with no POS given, NLTK presumably treats every token as a
    noun, so verb forms may be left unchanged - confirm against the NLTK docs.

    Args:
        text (str): Sentence or document to lemmatize.

    Returns:
        str: The lemmatized tokens separated by single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    return ' '.join(map(wordnet_lemmatizer.lemmatize, tokens))


# Demo sentence with plural and inflected word forms.
text = "The bats are hanging on their feet for best."
lemmatized_text = lemmatize_text(text)

# Show the header and the lemmatized sentence on separate lines.
print("Lemmatized Text:", lemmatized_text, sep="\n")

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
# Your code

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Example dataset: (message, label) pairs
data = pd.DataFrame(
    [
        ("Win money now! Exclusive offer just for you.", 'spam'),
        ("Meeting at 3 PM tomorrow, don't forget.", 'ham'),
        ("Congratulations! You've been selected for a prize.", 'spam'),
        ("Can we reschedule our meeting to next week?", 'ham'),
        ("Limited time offer, click here to claim your reward!", 'spam'),
    ],
    columns=['text', 'label'],
)

# One Series of message texts per class
ham_messages = data.loc[data['label'] == 'ham', 'text']
spam_messages = data.loc[data['label'] == 'spam', 'text']

# Define a function to get the top 10 words
def get_top_words(messages, top_n=10):
    """
    Return the ``top_n`` most frequent terms found in ``messages``.

    The messages are vectorized with a ``CountVectorizer`` that is given
    NLTK's English stopword list, the per-term counts are summed over all
    messages, and the terms are ranked by that total.

    Args:
        messages: Iterable of message strings.
        top_n (int): Number of rows to return.

    Returns:
        pandas.DataFrame: Columns ``word`` and ``count``, sorted by ``count``
        in descending order and limited to the first ``top_n`` rows.
    """
    bow = CountVectorizer(stop_words=stopwords.words('english'))
    term_matrix = bow.fit_transform(messages)

    # Column sums give the total occurrences of each term; .A1 flattens the
    # resulting 1 x n_terms matrix into a 1-D array.
    totals = term_matrix.sum(axis=0).A1

    frequencies = pd.DataFrame({'word': bow.get_feature_names_out(), 'count': totals})
    ranked = frequencies.sort_values(by='count', ascending=False)
    return ranked.head(top_n)

# Rank the words of each class with the same helper.
top_ham_words, top_spam_words = (
    get_top_words(messages, top_n=10) for messages in (ham_messages, spam_messages)
)

# Print each table under its heading.
print("Top 10 Words in Ham Messages:", top_ham_words, sep="\n")
print("\nTop 10 Words in Spam Messages:", top_spam_words, sep="\n")



## Extra features

In [None]:
import pandas as pd

# Small in-memory examples standing in for the preprocessed training and
# validation sets (replace with the real preprocessed frames).
data_train = pd.DataFrame({
    'preprocessed_text': [
        "win money now exclusive offer",
        "meeting scheduled for tomorrow at noon",
        "free account upgrade available",
        "deposit money to secure your funds",
        "bank transaction alert from your account"
    ]
})

data_val = pd.DataFrame({
    'preprocessed_text': [
        "click here to claim your prize",
        "meeting rescheduled to next week",
        "transfer money immediately for rewards",
        "password reset required for security",
        "free trial offer for premium access"
    ]
})


# Regex alternations for the two indicator features (money symbols and suspicious words).
# They are matched as substrings, so e.g. "fund" also matches "funds".
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])


def add_extra_features(frame, text_col='preprocessed_text'):
    """Return a copy of ``frame`` with the three hand-crafted features added.

    Args:
        frame (pandas.DataFrame): Frame holding the text column.
        text_col (str): Name of the column containing the preprocessed text.

    Returns:
        pandas.DataFrame: ``frame`` plus the columns
        ``money_mark`` (1 if the text matches ``money_simbol_list``, else 0),
        ``suspicious_words`` (1 if the text matches ``suspicious_words``, else 0)
        and ``text_len`` (number of characters in the text).
    """
    texts = frame[text_col]
    return frame.assign(
        # na=False: a missing text simply counts as "no match".
        money_mark=texts.str.contains(money_simbol_list, regex=True, na=False).astype('int64'),
        suspicious_words=texts.str.contains(suspicious_words, regex=True, na=False).astype('int64'),
        text_len=texts.str.len(),
    )


# The same feature logic is applied to both splits through one helper.
data_train = add_extra_features(data_train)
data_val = add_extra_features(data_val)

data_train.head()

## How does the Bag of Words concept work with CountVectorizer?

In [None]:
# Your code

from sklearn.feature_extraction.text import CountVectorizer

# Example corpus
corpus = [
    "Win money now! Exclusive offer for you.",
    "Meeting scheduled tomorrow. Don’t forget!",
    "Congratulations! You’ve won a free prize.",
    "Reschedule meeting to next week.",
    "Limited time offer: claim your reward.",
]

# Learn the vocabulary and build the document-term count matrix in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Vocabulary: one feature per distinct token.
print("Vocabulary (Features):", vectorizer.get_feature_names_out(), sep="\n")

# Dense view of the sparse count matrix (one row per document).
print("\nBag of Words Matrix (Word Counts):", X.toarray(), sep="\n")


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [None]:
# Your code

from sklearn.feature_extraction.text import TfidfVectorizer

# Example corpus (replace with your dataset)
corpus = [
    "Win money now! Exclusive offer for you.",
    "Meeting scheduled tomorrow. Don’t forget!",
    "Congratulations! You’ve won a free prize.",
    "Reschedule meeting to next week.",
    "Limited time offer: claim your reward.",
]

# Steps 1-2: create the TF-IDF vectorizer, then fit it on the corpus and transform it.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Step 3: shape is (number of documents, vocabulary size).
print(f"Shape of the vectorized dataset: {tfidf_matrix.shape}")

# Optional: feature names and the sparse matrix itself.
print("\nFeature Names (Vocabulary):", tfidf_vectorizer.get_feature_names_out(), sep="\n")
print("\nTF-IDF Matrix (Sparse Representation):", tfidf_matrix, sep="\n")


## And then: train a classifier

In [None]:
# Your code

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Example dataset (replace with your dataset): (message, label) pairs
data = pd.DataFrame(
    [
        ("Win money now! Exclusive offer for you.", 1),
        ("Meeting scheduled tomorrow. Don’t forget!", 0),
        ("Congratulations! You’ve won a free prize.", 1),
        ("Reschedule meeting to next week.", 0),
        ("Limited time offer: claim your reward.", 1),
    ],
    columns=['text', 'label'],
)

# Step 1: hold out 20% of the messages for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# Step 2: fit TF-IDF on the training texts only, then reuse it for the test texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 3: train the classifier (fit() returns the fitted estimator itself).
classifier = LogisticRegression().fit(X_train_tfidf, y_train)

# Step 4: predict the held-out messages.
y_pred = classifier.predict(X_test_tfidf)

# Step 5: report per-class metrics and overall accuracy.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed! It must be MultinomialNB with default parameters.

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work in teams of two people (recommended).

In [None]:
# Your code

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import nltk

# Make sure the NLTK stopwords corpus is available.
nltk.download('stopwords')

# Read the Kaggle training file.
data = pd.read_csv(
    "../data/kg_train.csv",
    encoding='latin-1',
)

# Preprocessing function
# Lazily-built NLTK resources shared by every preprocess_text call, so the
# stopword list and the stemmer are created once instead of once per row.
_PREPROCESS_RESOURCES = {}


def preprocess_text(text, stop_words=None, stemmer=None):
    """
    Cleans and preprocesses the input text.

    Steps: lowercase -> replace every character that is not a lowercase
    letter or whitespace with a space -> drop stopwords -> stem each
    remaining word.

    Args:
        text: Raw message. Non-string values (for example the NaN pandas uses
            for a missing cell, or None) are treated as an empty message.
        stop_words: Optional collection of lowercase words to drop. Defaults
            to NLTK's English stopword list, loaded on first use and reused.
        stemmer: Optional object exposing ``stem(word)``. Defaults to a shared
            ``SnowballStemmer('english')`` created on first use.

    Returns:
        str: The stemmed, stopword-free words joined by single spaces.
    """
    # Missing cells reach this function as float NaN / None; `.lower()` would
    # raise on them, so map them to an empty document instead.
    if not isinstance(text, str):
        return ""

    if stop_words is None:
        if 'stop_words' not in _PREPROCESS_RESOURCES:
            _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        stop_words = _PREPROCESS_RESOURCES['stop_words']

    if stemmer is None:
        if 'stemmer' not in _PREPROCESS_RESOURCES:
            _PREPROCESS_RESOURCES['stemmer'] = SnowballStemmer('english')
        stemmer = _PREPROCESS_RESOURCES['stemmer']

    # Replace (rather than delete) special characters and numbers so that
    # words separated only by punctuation are not merged into one token.
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())

    # Drop stopwords, then stem what is left.
    return ' '.join(
        stemmer.stem(word) for word in letters_only.split() if word not in stop_words
    )

# Run the text cleaner over every message.
data['processed_text'] = data['text'].map(preprocess_text)

# Candidate feature representations, keyed by the name shown in the report.
vectorizers = dict([
    ("Bag of Words", CountVectorizer()),
    ("TF-IDF", TfidfVectorizer()),
])

# Features / labels, then an 80/20 split with a fixed seed.
X, y = data['processed_text'], data['label']  # Adjust column name as necessary
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Compare the feature representations using MultinomialNB with default parameters.
best_score = 0
best_vectorizer = None
for name, vectorizer in vectorizers.items():
    # The vectorizer is fitted on the training split only and reused for the test split.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Default-parameter classifier; fit() returns the estimator itself.
    model = MultinomialNB().fit(X_train_vec, y_train)

    # Score the held-out split.
    y_pred = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\nFeature Representation: {name}")
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

    # Strict '>' keeps the earlier representation when accuracies tie.
    if accuracy > best_score:
        best_score, best_vectorizer = accuracy, name

# Name and accuracy of the winning representation.
print(f"\nBest Feature Representation: {best_vectorizer} with Accuracy: {best_score}")

