In [None]:
# Widen the notebook container to the full browser width.
# `display` is imported from the public `IPython.display` module; importing it
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [None]:
## Read Data for the Fraudulent Email Kaggle Challenge
# Only the first 1000 rows are kept to speed up development.
# Modify for final system
data = pd.read_csv("kg_train.csv", encoding='latin-1').head(1000)
print(data.shape)

# Replace missing values with empty strings so every cell can be handled as text.
data = data.fillna("")

### Let's divide the training and test set into two partitions

In [None]:
# Your code
# The import must come before the first call: on a fresh kernel
# (Restart & Run All) calling train_test_split before importing it raises
# NameError.
from sklearn.model_selection import train_test_split

# Load training data
data_train = pd.read_csv("kg_train.csv", encoding='latin-1')

# Splitting train and test sets (80% / 20%, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    data_train['text'], data_train['label'], test_size=0.2, random_state=42
)

# Divide training set into two equally sized partitions
X_train_part1, X_train_part2, y_train_part1, y_train_part2 = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42
)

## Data Preprocessing

In [None]:
# Everything the preprocessing cells need, imported in one place.
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Peek at the punctuation characters and at a slice of the English stopword list.
print(string.punctuation)
print(stopwords.words("english")[100:110])

# English Snowball stemmer instance.
snowball = SnowballStemmer('english')

## Now we have to clean the text by removing the HTML code

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:
# Your code
import re
from bs4 import BeautifulSoup
# Function to clean HTML content
def clean_html(text):
    """Strip HTML markup from one message and return the remaining plain text.

    Steps, in order:
      1. drop ``<script>...</script>`` and ``<style>...</style>`` blocks,
      2. drop HTML comments (before tag removal, since comments may contain
         '>' characters),
      3. let BeautifulSoup remove every remaining tag.

    Parameters
    ----------
    text : str
        Raw message. Any non-string value (e.g. ``None`` or a NaN coming from
        a missing CSV cell) is treated as an empty message.

    Returns
    -------
    str
        The message text without markup.
    """
    # re.sub raises TypeError on non-strings, so missing values are mapped to
    # an empty message instead of aborting a whole Series.apply run.
    if not isinstance(text, str):
        return ''

    # HTML tag names are case-insensitive, so <SCRIPT> / <Style> must match too.
    block_flags = re.DOTALL | re.IGNORECASE

    # Remove inline JavaScript/CSS
    text = re.sub(r'<script.*?>.*?</script>', '', text, flags=block_flags)
    text = re.sub(r'<style.*?>.*?</style>', '', text, flags=block_flags)

    # Remove HTML comments
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Remove remaining HTML tags. A space separator keeps the text of adjacent
    # elements apart ("<p>Hello</p><p>World</p>" -> "Hello World" rather than
    # "HelloWorld").
    return BeautifulSoup(text, "html.parser").get_text(separator=' ')

# Strip the HTML markup from both training partitions and from the test split.
X_train_part1, X_train_part2, X_test = (
    messages.apply(clean_html)
    for messages in (X_train_part1, X_train_part2, X_test)
)

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
# Your code
# Function to clean text
def clean_text_content(text):
    """Normalize a message to lowercase ASCII words separated by single spaces.

    Steps, in order:
      1. every character that is neither an ASCII letter nor whitespace
         (punctuation, digits, symbols) is replaced by a space,
      2. words consisting of a single letter are dropped (this also removes a
         leading ``b`` left over from a ``b'...'`` bytes prefix),
      3. runs of whitespace are collapsed to one space and the ends trimmed,
      4. the result is lowercased.

    Parameters
    ----------
    text : str
        Message text (typically already stripped of HTML).

    Returns
    -------
    str
        The cleaned text; an empty string if nothing is left.
    """
    # Replace rather than delete: deleting glues neighbouring tokens together
    # ("free!!!win" -> "freewin", "b'hello'" -> "bhello"). This single step
    # also removes all digits, so no separate number-removal pass is needed.
    text = re.sub(r'[^A-Za-z\s]', ' ', text)

    # Remove all single characters. After the step above only letters and
    # whitespace remain, so this one pattern covers single letters at the
    # start, in the middle and at the end of the text.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Substitute multiple spaces with single space and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()

    # Convert to lowercase
    return text.lower()


## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
# Your code
# Remove stopwords
_ENGLISH_STOP_WORDS = None  # NLTK English stopwords, filled on first use


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(text, stop_words=None):
    """Drop stopwords from a whitespace-separated text.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.
    stop_words : container of str, optional
        Words to remove. Defaults to NLTK's English stopword list, which is
        loaded once and reused, so applying this function row by row does not
        rebuild the set for every message.

    Returns
    -------
    str
        The remaining words joined by single spaces. Words are compared
        exactly as written, so matching is case-sensitive.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
# Your code
from nltk.stem import WordNetLemmatizer
# Lemmatization
def lemmatize_text(text, pos='n', lemmatizer=None):
    """Lemmatize every whitespace-separated word of a text.

    Parameters
    ----------
    text : str
        Text to process; it is split on whitespace.
    pos : str, optional
        Part-of-speech tag passed to the lemmatizer for every word. The
        default ``'n'`` (noun) reproduces a plain ``lemmatize(word)`` call;
        pass ``'v'`` to reduce verb forms such as "running" to "run".
    lemmatizer : object with a ``lemmatize(word, pos=...)`` method, optional
        Lemmatizer to use. A new ``WordNetLemmatizer`` is created when omitted.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word, pos=pos) for word in text.split())

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
# Your code
from collections import Counter
# Bag of Words: Top 10 words in ham and spam messages
def get_top_words(dataframe, label, n=10, text_column='text'):
    """Return the ``n`` most frequent words among the messages of one class.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``'label'`` column and a text column.
    label : object
        Value of ``'label'`` selecting the messages to count.
    n : int, optional
        Number of (word, count) pairs to return. Defaults to 10.
    text_column : str, optional
        Name of the column holding the message text. Defaults to ``'text'``.

    Returns
    -------
    list of (str, int)
        Most common words with their counts, most frequent first; empty when
        no message carries ``label``.
    """
    # Missing messages are skipped: joining or splitting a NaN would raise.
    messages = dataframe.loc[dataframe['label'] == label, text_column].dropna()

    word_counts = Counter()
    for message in messages:
        word_counts.update(str(message).split())
    return word_counts.most_common(n)

## Extra features

In [None]:
# We add to the original dataframe the preprocessed text plus additional
# indicators (money symbols, suspicious words and text length).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])


def preprocess_text(text):
    """Clean HTML-free text: letters only, stopwords removed, lemmatized."""
    return lemmatize_text(remove_stopwords(clean_text_content(text)))


def add_extra_features(df):
    """Return a copy of ``df`` with the text-derived feature columns added.

    Added columns:
      - ``preprocessed_text``: ``df['text']`` run through clean_html and
        preprocess_text (missing messages count as empty text),
      - ``money_mark``: 1 if the HTML-free text mentions a money word/symbol,
      - ``suspicious_words``: 1 if the preprocessed text contains one of the
        suspicious terms (substring match),
      - ``text_len``: number of characters of the preprocessed text.
    """
    html_free = df['text'].fillna("").astype(str).apply(clean_html)
    preprocessed = html_free.apply(preprocess_text)
    return df.assign(
        preprocessed_text=preprocessed,
        # Matched before the letter-only cleaning, because that step strips
        # every non-letter character and "€" / "$" could then never be found.
        money_mark=html_free.str.contains(money_simbol_list, case=False).astype(int),
        suspicious_words=preprocessed.str.contains(suspicious_words).astype(int),
        text_len=preprocessed.str.len(),
    )


data_train = add_extra_features(data_train)

# NOTE(review): assumes kg_test.csv has a 'text' column like kg_train.csv — confirm.
data_val = add_extra_features(pd.read_csv("kg_test.csv", encoding='latin-1'))

data_train.head()

## How would the Bag of Words concept work with Count Vectorizer?

In [None]:
# Your code
# Imported here because no earlier cell brings CountVectorizer into scope;
# without it this cell raises NameError.
from sklearn.feature_extraction.text import CountVectorizer

# Count Vectorizer to implement Bag of Words: the vocabulary is learned on the
# training text only and then reused to encode the validation text.
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(data_train['preprocessed_text'])
X_val_bow = vectorizer.transform(data_val['preprocessed_text'])

# Display feature names
vectorizer.get_feature_names_out()

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [None]:
# Your code
from sklearn.feature_extraction.text import TfidfVectorizer

# Stack the training and validation texts into one corpus, then learn the
# TF-IDF vocabulary and weights on it and vectorize it in a single pass.
combined_text = pd.concat(
    [data_train['preprocessed_text'], data_val['preprocessed_text']]
)
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(combined_text)

# Print the shape of the TF-IDF vectorized dataset
print("Shape of TF-IDF vectorized dataset:", X_tfidf.shape)

## And then train a classifier?

In [None]:
# Your code
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Encode both splits with the already fitted TF-IDF vectorizer.
X_train_tfidf = tfidf_vectorizer.transform(data_train['preprocessed_text'])
X_val_tfidf = tfidf_vectorizer.transform(data_val['preprocessed_text'])

# Fit a Multinomial Naive Bayes model (default parameters) on the training labels.
classifier = MultinomialNB().fit(X_train_tfidf, data_train['label'])

# Predict the validation split and score the predictions.
# NOTE(review): assumes kg_test.csv provides a 'label' column — confirm.
y_pred = classifier.predict(X_val_tfidf)
accuracy = accuracy_score(y_true=data_val['label'], y_pred=y_pred)

# Print results
print("Shape of TF-IDF vectorized dataset:", X_tfidf.shape)
print("Validation Accuracy:", accuracy)

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed!!! It must be MultinomialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work in teams of two people (recommended).

In [None]:
# Your code
# TF-IDF configuration for the extra task:
#   - unigrams and bigrams
#   - built-in English stopword removal
#   - at most 5000 features
#   - sublinear term-frequency scaling
#   - terms must appear in at least 5 documents
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=5000,
    sublinear_tf=True,
    min_df=5,
)

# Learn the vocabulary on the stacked train + validation text and vectorize it.
combined_text = pd.concat(
    [data_train['preprocessed_text'], data_val['preprocessed_text']]
)
X_tfidf = tfidf_vectorizer.fit_transform(combined_text)

# Encode each split separately with the fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.transform(data_train['preprocessed_text'])
X_val_tfidf = tfidf_vectorizer.transform(data_val['preprocessed_text'])

# Multinomial Naive Bayes with default parameters, as the task requires.
classifier = MultinomialNB().fit(X_train_tfidf, data_train['label'])

# Predict the validation split and score the predictions.
# NOTE(review): assumes kg_test.csv provides a 'label' column — confirm.
y_pred = classifier.predict(X_val_tfidf)
accuracy = accuracy_score(y_true=data_val['label'], y_pred=y_pred)

# Print results
print("Shape of TF-IDF vectorized dataset:", X_tfidf.shape)
print("Validation Accuracy:", accuracy)
