In [None]:
# Widen the notebook container so wide outputs can use the full browser width.
# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [40]:
import matplotlib.pyplot as plt
import pandas as pd
import re
import string

from collections import Counter

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [None]:
## Read Data for the Fraudulent Email Kaggle Challenge
DATA_PATH = "kg_train.csv"

# Number of rows kept to speed up development.
# Set to None to use the whole file for the final system.
N_DEV_ROWS = 1000

data_raw = pd.read_csv(DATA_PATH, encoding='latin-1')
if N_DEV_ROWS is not None:
    data_raw = data_raw.head(N_DEV_ROWS)

# Replace missing values with empty strings so the text-cleaning functions
# always receive a str. fillna returns a new frame here instead of mutating
# in place, which keeps the cell safe to re-run.
data = data_raw.fillna("")

print(data.shape)

# Show a preview only; the full frame adds nothing to the narrative.
display(data.head())

X = data['text']
y = data['label']

### Let's divide the dataset into training and test partitions

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Report the size of each partition (features and labels side by side).
for title, features, target in (('Train set:', X_train, y_train),
                                ('Test set:', X_test, y_test)):
    print(title, features.shape, target.shape)

## Data Preprocessing

In [None]:
# Peek at the punctuation characters and a slice of the English stop-word list.
# NOTE(review): presumably the NLTK 'stopwords' corpus is already downloaded — confirm.
english_stopwords = stopwords.words("english")
print(string.punctuation)
print(english_stopwords[100:110])

# English Snowball stemmer instance.
snowball = SnowballStemmer('english')

## Now we have to clean the text by removing the HTML code

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:
# Your code

def remove_inline_js_css(text):
    """Strip inline JavaScript and CSS blocks from an HTML string.

    Every ``<script>...</script>`` and ``<style>...</style>`` element is
    removed together with its content.

    Args:
        text: Message text, possibly containing HTML.

    Returns:
        The text with all script and style elements removed.
    """
    # HTML tag names are case-insensitive, so <SCRIPT> and <Style> must match too.
    # DOTALL lets the element body span several lines.
    flags = re.DOTALL | re.IGNORECASE
    # \b keeps the opening tag from matching longer names that merely start
    # with "script"/"style"; \s* tolerates whitespace in the closing tag.
    text = re.sub(r'<script\b[^>]*>.*?</script\s*>', '', text, flags=flags)
    text = re.sub(r'<style\b[^>]*>.*?</style\s*>', '', text, flags=flags)
    return text

def remove_html_comments(text):
    """Return ``text`` with every ``<!-- ... -->`` HTML comment deleted.

    The match is non-greedy, so each comment ends at its first ``-->``, and
    DOTALL allows a comment to span several lines.
    """
    comment_pattern = re.compile(r'<!--.*?-->', re.DOTALL)
    return comment_pattern.sub('', text)

# Matches any remaining opening, closing or declaration tag (<br>, </p>, <!DOCTYPE html>).
# A letter must follow '<', '</' or '<!', so plain text such as "3 < 5" is left alone.
_HTML_TAG = re.compile(r'<[/!]?[A-Za-z][^>]*>')


def remove_html_tags(text):
    """Replace every remaining HTML tag with a space so neighbouring words are not glued together."""
    return _HTML_TAG.sub(' ', text)


def strip_html(text):
    """Run the three HTML clean-up steps in the required order.

    Inline JS/CSS goes first, then comments (they may contain '>' characters),
    and only then the remaining tags.
    """
    return remove_html_tags(remove_html_comments(remove_inline_js_css(text)))


# NOTE(review): both series are rebuilt from the full data['text'] column, so
# X_train and X_test hold the same rows here rather than the partitions made by
# train_test_split. Downstream cells (feature flags, model fitting) are written
# against that shape, so restoring the split here needs them updated at the
# same time.
X_train = data['text'].apply(strip_html)
X_test = data['text'].apply(strip_html)

# A bare `X_train.head()` in the middle of a cell is discarded; display both previews explicitly.
display(X_train.head(), X_test.head())

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
# Your code
def remove_others(text):
    """Normalise text to lower-case words made of ASCII letters only.

    Steps:
      1. Replace special characters and digits with a space, so words separated
         only by punctuation ("free!!!win") stay separate.
      2. Drop a stray ``b`` prefix at the very start of the text, as left
         behind by a stringified bytes literal (``b'...'``).
      3. Remove single-character words.
      4. Collapse runs of whitespace into a single space.
      5. Lower-case and trim the result.

    Args:
        text: Text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    text = re.sub(r'[^A-Za-z\s]+', ' ', text)  # special characters and numbers -> space
    # Anchored at the start of the text: an unanchored r'\bb' would strip the first
    # letter of every word beginning with "b" ("bank" -> "ank").
    text = re.sub(r'^\s*b\s+', '', text)
    text = re.sub(r'\b\w\b', '', text)  # single characters
    text = re.sub(r'\s+', ' ', text)  # multiple whitespace -> single space

    return text.strip().lower()

# Apply the character-level clean-up (remove_others) to both text series
X_train, X_test = X_train.apply(remove_others), X_test.apply(remove_others)

X_train.head()
X_test.head()

## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
# Your code
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading the list only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def removal(text, stop_words=None): # Alex Code
    """Drop stop words from a whitespace-separated string.

    Args:
        text: Text to filter. Words are compared case-sensitively.
        stop_words: Optional collection of words to drop. Defaults to the
            NLTK English stop-word list.

    Returns:
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        # Reuse one cached set; rebuilding it on every call means reloading
        # the word list once per row when used with Series.apply.
        stop_words = _english_stop_words()

    return ' '.join(word for word in text.split() if word not in stop_words)

# Filter the stop words out of both text series
X_train, X_test = X_train.apply(removal), X_test.apply(removal)

X_train.head()
X_test.head()

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
# Your code
def lemmatization(text): # Alex Code
    """Lemmatize every whitespace-separated word of ``text``.

    Each word goes through ``WordNetLemmatizer.lemmatize`` without a ``pos``
    argument, so NLTK's default part of speech applies. The results are
    joined back with single spaces.
    """
    wnl = WordNetLemmatizer()
    return ' '.join(map(wnl.lemmatize, text.split()))

# Reduce the words of both text series to their lemmas
X_train, X_test = X_train.apply(lemmatization), X_test.apply(lemmatization)

X_train.head()
X_test.head()

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
# Word frequencies over the cleaned corpus, overall and per class.
# NOTE(review): the original comment states ham = 0 and spam = 1 — confirm against the dataset.

all_words = ' '.join(X_train)

# Tokenize words
word_list = all_words.split()

word_counts = Counter(word_list)
# Printing the whole Counter floods the output; report its size instead.
print("Distinct words:", len(word_counts))

print("\nTop 10 words:")
for entry in word_counts.most_common(10):
    print(entry)

# Top words per label. Labels are looked up by index so that every text is
# paired with its own label, whichever rows X_train currently holds.
for label, texts in X_train.groupby(y.loc[X_train.index]):
    class_counts = Counter(' '.join(texts).split())
    print(f"\nTop 10 words for label {label}:")
    for entry in class_counts.most_common(10):
        print(entry)

## Extra features

In [None]:

# We add to the original dataframe three additional indicators
# (money symbols, suspicious words and text length).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

# The clean-up in remove_others keeps letters only, so "€" and "$" can never
# match the cleaned text. The money flag is therefore computed on the raw
# text; case=False also catches "Dollar", "EURO", ...
data['money_mark'] = data['text'].str.contains(money_simbol_list, case=False) * 1

# NOTE(review): assumes X_train holds one cleaned row for every row of `data`
# (it is rebuilt from data['text'] in the HTML-cleaning cell); otherwise the
# index-aligned assignment leaves NaN for the missing rows.
data['suspicious_words'] = X_train.str.contains(suspicious_words) * 1
data['text_len'] = X_train.str.len()

# Template for a validation frame, kept as a comment.
# NOTE(review): `data_val` is not defined in the cells shown here.
# data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
# data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
# data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x))

data.head()


## How does the Bag of Words concept work with CountVectorizer?

In [None]:
# Bag of Words with CountVectorizer

# Create a Vectorizer Object, learn the vocabulary and encode the documents in one pass
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform(X_train)

# Every unique word is mapped to a column index. Show the size and a few
# entries rather than the whole dictionary.
print("Vocabulary size:", len(vectorizer.vocabulary_))
print("Vocabulary sample: ", dict(list(vectorizer.vocabulary_.items())[:10]))

# `vector` is a sparse document-term matrix. Only a small slice is converted
# to a dense array: densifying everything grows with documents x vocabulary.
print("Encoded Document shape:", vector.shape)
print("Encoded Document is:")
print(vector[:5].toarray())

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [38]:
# Load the vectorizer
tf_idf_vectorizer = TfidfVectorizer() # alternative: TfidfVectorizer(ngram_range = (1, 2), max_features = 10000)

# Learn the vocabulary and IDF weights from X_train, then reuse them unchanged for X_test
X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)

X_test_tf_idf = tf_idf_vectorizer.transform(X_test)

# Shape of the vectorized dataset: (documents, vocabulary size)
print("Train TF-IDF shape:", X_train_tf_idf.shape)
print("Test TF-IDF shape:", X_test_tf_idf.shape)

## And then train a classifier

In [None]:
# Train the classifier (MultinomialNB, default parameters) on the training
# partition only, then predict the held-out partition.
#
# The HTML-cleaning cell rebuilds X_train / X_test from every row of
# data['text'], so their TF-IDF matrices can contain all rows. Fitting those
# against the full `y` trains on the very rows that are predicted afterwards.
# Rows are therefore selected with the indices produced by train_test_split
# (y_train / y_test); if the series already hold only their own partition,
# the masks are all True and nothing changes.
train_rows = X_train.index.isin(y_train.index)
test_rows = X_test.index.isin(y_test.index)

model = MultinomialNB()
# Labels are looked up by index so they follow the row order of the selected matrix rows.
model.fit(X_train_tf_idf[train_rows], y.loc[X_train.index[train_rows]])

# Make predictions on the validation set
y_pred = model.predict(X_test_tf_idf[test_rows])

print("Predictions:", y_pred)

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed!!! It must be MultinomialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work in teams of two people (recommended).

In [None]:
# Your code