In [None]:
from IPython.display import display, HTML
# Stretch the notebook container to the full width of the browser window.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [None]:
## Read Data for the Fraudulent Email Kaggle Challenge
DATA_PATH = "data/kg_train.csv"

# Number of rows kept to speed up development.
# Set to None to use the whole file for the final system.
N_DEV_ROWS = 1000

data = pd.read_csv(DATA_PATH, encoding='latin-1')
if N_DEV_ROWS is not None:
    data = data.head(N_DEV_ROWS)
print(data.shape)

# Replace missing values with empty strings so the text functions below always
# receive a str. fillna() is used without inplace=True: it returns a new frame
# instead of mutating the slice produced by head(), which pandas may otherwise
# flag with SettingWithCopyWarning when columns are added in later cells.
data = data.fillna("")

### Let's divide the training and test set into two partitions

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows of 'data' (the dataframe read in the previous
# steps); the fixed random_state makes the partition reproducible.
# NOTE(review): train_data / test_data are not referenced again in the cells
# visible here, and they are created before any preprocessing columns exist -
# confirm whether this partition is still needed.
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)
print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)


## Data Preprocessing

In [None]:
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Quick look at the raw material for cleaning: the ASCII punctuation
# characters and a ten-word sample of the English stopword list.
print(string.punctuation)
print(stopwords.words("english")[100:110])

# English Snowball stemmer instance.
# NOTE(review): 'snowball' is not used by any of the cells visible here.
snowball = SnowballStemmer('english')

## Now we have to clean the text by removing the HTML code

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:
import re
from bs4 import BeautifulSoup

def clean_html(text):
    """Strip HTML markup from ``text`` and return the remaining plain text.

    The steps run in a fixed order: inline JavaScript/CSS elements are dropped
    together with their contents, then HTML comments (which may contain '>'
    characters), and finally BeautifulSoup removes whatever tags remain.

    Args:
        text: Message body as a string, possibly containing HTML.

    Returns:
        The text with scripts, styles, comments and tags removed.
    """
    # Remove JavaScript and CSS. HTML tag names are case-insensitive, so
    # re.I makes <SCRIPT>/<Style> match as well; \b stops a longer tag name
    # that merely starts with "script"/"style" from being treated as one.
    text = re.sub(r'<(script|style)\b[^>]*>.*?</\1\s*>', '', text,
                  flags=re.S | re.I)
    # Remove HTML comments
    text = re.sub(r'<!--.*?-->', '', text, flags=re.S)
    # Remove remaining HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    return text

# Apply to the data column that contains HTML; the markup-free result is
# stored in a new 'cleaned_text' column.
# NOTE(review): 'text_column' looks like a placeholder name - confirm it is
# the actual text column of data/kg_train.csv, otherwise this raises KeyError.
data['cleaned_text'] = data['text_column'].apply(clean_html)


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
def preprocess_text(text):
    """Normalize ``text`` to lowercase alphabetic words separated by single spaces.

    Every character that is not an ASCII letter or whitespace (digits,
    punctuation, symbols) is replaced by a space, so words separated only by
    punctuation stay separate instead of being fused ("Free!!!Money" becomes
    "free money"). Stand-alone single characters are then dropped and runs of
    whitespace are collapsed.

    Args:
        text: Input string.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    # Replace special characters and numbers with a space (digits are covered
    # by the same character class, so no separate pass is needed)
    text = re.sub(r'[^A-Za-z\s]', ' ', text)
    # Remove single characters (one-letter tokens, including a leading one)
    text = re.sub(r'\b\w\b', '', text)
    # Substitute multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

# Normalize every HTML-free message and keep the result in a new
# 'processed_text' column.
data['processed_text'] = data['cleaned_text'].map(preprocess_text)


## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
from nltk.corpus import stopwords

# English stopword vocabulary, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the tokens that appear in ``stop_words``.

    The text is split on whitespace, tokens found in the stopword set are
    dropped, and the remaining tokens are re-joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Overwrite 'processed_text' with its stopword-free version
data['processed_text'] = data['processed_text'].map(remove_stopwords)


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
from nltk.stem import WordNetLemmatizer

# Single shared lemmatizer instance reused for every message.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Tokens are passed one by one to ``lemmatizer.lemmatize`` and the results
    are re-joined with single spaces.

    NOTE(review): lemmatize() is called without a part-of-speech argument, so
    NLTK's default applies - confirm that verbs such as "running" are reduced
    the way the accompanying text expects.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

# Overwrite 'processed_text' with its lemmatized version
data['processed_text'] = data['processed_text'].map(lemmatize_text)


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Partition the messages by class.
# NOTE(review): assumes the 'label' column holds the strings 'spam' / 'ham' -
# confirm against the CSV; if it does not, both frames are empty.
spam_messages = data.loc[data['label'].eq('spam')]
ham_messages = data.loc[data['label'].eq('ham')]

# One vectorizer limited to a 10-term vocabulary; it is refitted per class.
vectorizer = CountVectorizer(max_features=10)

# Spam: learn the vocabulary and show it
spam_bow = vectorizer.fit_transform(spam_messages['processed_text'])
print("Top 10 words in spam messages:", vectorizer.get_feature_names_out())

# Ham: refit the same vectorizer (the spam vocabulary is discarded)
ham_bow = vectorizer.fit_transform(ham_messages['processed_text'])
print("Top 10 words in ham messages:", vectorizer.get_feature_names_out())


## Extra features

In [None]:
# We add to the dataframe two additional indicators (money symbols and
# suspicious words) plus the length of the processed text.
import re  # local to this cell: needed to escape regex-special characters

# Each term is escaped before joining: a bare "$" is the regex end-of-string
# anchor and would make the pattern match every single message.
money_simbol_list = "|".join(re.escape(term) for term in ["euro","dollar","pound","€","$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

# Operate on 'data' / 'processed_text', the frame and column built by the
# preprocessing cells above.
data['money_mark'] = data['processed_text'].str.contains(money_simbol_list).astype(int)
data['suspicious_words'] = data['processed_text'].str.contains(suspicious_words).astype(int)
data['text_len'] = data['processed_text'].apply(len)

data.head()

## How does the Bag of Words concept work with Count Vectorizer?

In [None]:
# Regex alternations used as binary indicators ("\$" is escaped so it is a
# literal dollar sign rather than the end-of-string anchor).
# NOTE(review): preprocess_text strips every non-letter character, so "€" and
# "$" can no longer occur in 'processed_text'; only the spelled-out currency
# names can match here.
money_symbols = r"euro|dollar|pound|€|\$"
suspicious_words = r"free|cheap|sex|money|account|bank|win|fund"

# 1 when the pattern occurs anywhere in the processed text, else 0
data['money_mark'] = data['processed_text'].str.contains(money_symbols).astype(int)
data['suspicious_words'] = data['processed_text'].str.contains(suspicious_words).astype(int)
# Character count of the processed text
data['text_len'] = data['processed_text'].map(len)

data.head()



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words over the whole corpus with the vectorizer's default settings:
# learn the vocabulary and build the document-term count matrix in one step.
count_vectorizer = CountVectorizer()
bow_data = count_vectorizer.fit_transform(data['processed_text'])

# Report the dimensions of the resulting matrix
print("Shape of the Bag of Words dataset:", bow_data.shape)


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the whole corpus with the vectorizer's default settings:
# learn the vocabulary/weights and build the weighted matrix in one step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_data = tfidf_vectorizer.fit_transform(data['processed_text'])

# Report the dimensions of the resulting matrix
print("Shape of the TF-IDF dataset:", tfidf_data.shape)


## And Then Train a Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Split the processed text and the labels BEFORE vectorizing, so the TF-IDF
# vocabulary and weights are learned from the training rows only. Fitting the
# vectorizer on the full dataset and splitting afterwards lets test-set
# statistics leak into the training features and inflates the score.
text_train, text_test, y_train, y_test = train_test_split(
    data['processed_text'], data['label'], test_size=0.2, random_state=42
)

# Fit on the training text, then apply the same mapping to the test text
split_vectorizer = TfidfVectorizer()
X_train = split_vectorizer.fit_transform(text_train)
X_test = split_vectorizer.transform(text_test)

# Initialize and train classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict on test data
y_pred = classifier.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the classifier:", accuracy)


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed! It must be MultinomialNB with default parameters.

Your task is to find the **best feature representation**.

You can work in teams of two people (recommended).

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# 'data' is expected to hold the cleaned dataset with the 'processed_text'
# and 'label' columns. Split the raw text first so each vectorizer is fitted
# on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    data['processed_text'], data['label'], test_size=0.2, random_state=42
)


def fit_and_report(vectorizer, accuracy_title, report_title):
    """Evaluate one feature representation with a default MultinomialNB.

    The vectorizer is fitted on X_train and applied to X_test, a fresh
    MultinomialNB is trained on the training matrix, and the accuracy and
    classification report on the test split are printed under the given
    titles.

    Returns:
        Tuple of (train matrix, test matrix, fitted classifier,
        test predictions, test accuracy).
    """
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)

    model = MultinomialNB()
    model.fit(train_matrix, y_train)

    predictions = model.predict(test_matrix)
    score = accuracy_score(y_test, predictions)
    print(accuracy_title, score)
    print(report_title, classification_report(y_test, predictions))
    return train_matrix, test_matrix, model, predictions, score


# Trying both TF-IDF and Count Vectorizer to see which yields the best accuracy

# TF-IDF representation
tfidf_vectorizer = TfidfVectorizer()
(X_train_tfidf, X_test_tfidf, tfidf_classifier,
 y_pred_tfidf, tfidf_accuracy) = fit_and_report(
    tfidf_vectorizer,
    "TF-IDF Vectorizer Accuracy:",
    "Classification Report (TF-IDF):\n",
)

# Raw count representation
count_vectorizer = CountVectorizer()
(X_train_count, X_test_count, count_classifier,
 y_pred_count, count_accuracy) = fit_and_report(
    count_vectorizer,
    "Count Vectorizer Accuracy:",
    "Classification Report (Count Vectorizer):\n",
)

# Determine the best feature representation (a tie goes to Count Vectorizer)
best_message, best_accuracy = (
    ("Best feature representation: TF-IDF Vectorizer with accuracy", tfidf_accuracy)
    if tfidf_accuracy > count_accuracy
    else ("Best feature representation: Count Vectorizer with accuracy", count_accuracy)
)
print(best_message, best_accuracy)
