In [None]:
# Widen the notebook container so wide outputs are not clipped.
# `display` is imported from the public `IPython.display` module: importing it
# from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [17]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development. 

In [None]:
## Read Data for the Fraudulent Email Kaggle Challenge
# Number of rows kept while developing; set to None to use the full training set.
N_DEV_ROWS = 1000

data = pd.read_csv("../data/kg_train.csv", encoding = 'latin-1')

# Reduce the training set to speed up development.
# Modify for final system
if N_DEV_ROWS is not None:
    data = data.head(N_DEV_ROWS)
print(data.shape)

# Replace missing values with empty strings so the later string-cleaning steps
# never receive NaN. Re-binding (instead of `inplace = True`) avoids mutating
# the slice returned by `head()` and keeps the cell safe to re-run.
data = data.fillna("")

### Let's divide the training and test set into two partitions

In [19]:
# Load the provided train / validation files.
# The training file is decoded as latin-1 here too, so it is read the same way
# as the `data` frame loaded from the same path.
# NOTE(review): kg_test.csv is assumed to use the same encoding - confirm.
# Missing values become empty strings so that later string operations on the
# `text` column (pattern matching, length) do not hit NaN.
data_train = pd.read_csv('../data/kg_train.csv', encoding = 'latin-1').fillna("")
data_val = pd.read_csv('../data/kg_test.csv', encoding = 'latin-1').fillna("")

## Data Preprocessing

In [None]:
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Peek at the punctuation characters and at a ten-item slice of the English
# stopword list.
print(string.punctuation)
print(stopwords.words("english")[100:110])

# English Snowball stemmer instance.
snowball = SnowballStemmer(language = 'english')

## Now we have to strip the HTML markup from the text

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:
# Show the frame's dimensions followed by its first ten rows.
for preview in (data.shape, data.head(10)):
    print(preview)

In [None]:
from bs4 import BeautifulSoup, Comment

def clean_text(text_line):
    """Strip HTML markup from ``text_line`` and return the remaining text.

    The input is parsed with the ``html.parser`` backend. ``<script>`` and
    ``<style>`` elements are removed together with their contents, HTML
    comments are removed next, and the text that is left is extracted with
    each fragment stripped and fragments joined by a single space.
    """
    parsed = BeautifulSoup(text_line, 'html.parser')

    # Inline JavaScript and CSS carry no message text: drop the whole element.
    for element in parsed.find_all(['script', 'style']):
        element.decompose()

    # Comments go before the final text extraction.
    def _is_comment(node):
        return isinstance(node, Comment)

    for remark in parsed.find_all(string = _is_comment):
        remark.extract()

    # Whatever text remains is the visible content.
    return parsed.get_text(separator = ' ', strip = True)

# Replace each raw message with its HTML-free version, then preview the frame.
data['text'] = data['text'].map(clean_text)

print(data.head(n = 10))

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
import re

def additional_cleaning(text):
    """Reduce ``text`` to lowercase alphabetic words separated by single spaces.

    Steps, in order:
      1. Drop a leading ``b'`` / ``b"`` (residue of a stringified bytes literal).
      2. Remove every character that is not an ASCII letter or whitespace.
         This also removes digits, so no separate number-removal pass is needed.
      3. Remove single-letter words (anywhere, including at the start).
      4. Collapse whitespace runs to one space.
      5. Lowercase and trim.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # The bytes-literal prefix has to be handled first: once step 2 has
    # stripped the quote character, the prefix can no longer be told apart
    # from a word that simply starts with "b".
    text = re.sub(r"^b['\"]", "", text)
    # Remove all special characters (and digits)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Remove all single characters; only letters and whitespace remain here
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # Substitute multiple spaces with a single space
    text = re.sub(r"\s+", " ", text)
    # Convert to lowercase
    return text.lower().strip()

# Run the character-level clean-up on every message, then preview the frame.
data['text'] = data['text'].map(additional_cleaning)

print(data.head(n = 10))

## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
# English stopwords, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the tokens that appear in ``stop_words``.

    The text is split on whitespace; surviving tokens are re-joined with a
    single space.
    """
    kept_tokens = [token for token in text.split() if token not in stop_words]
    return " ".join(kept_tokens)

# Drop the stopwords from every message, then preview the frame.
data['text'] = data['text'].map(remove_stopwords)

print(data.head(n = 10))

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Tokens are passed one by one to ``lemmatizer.lemmatize`` and the results
    are re-joined with a single space.

    NOTE(review): no part-of-speech argument is passed to ``lemmatize`` -
    confirm that verb forms such as "running" are reduced as intended.
    """
    return " ".join(map(lemmatizer.lemmatize, text.split()))

# Lemmatize every message, then preview the first rows of the frame.
data['text'] = data['text'].map(lemmatize_text)

print(data.head())

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
from collections import Counter

def _count_words(texts):
    """Count whitespace-separated tokens across an iterable of strings."""
    return Counter(word for text in texts for word in text.split())

# Token frequencies over the whole (cleaned) dataset.
word_frequency = _count_words(data['text'])

# (word, count) pairs ordered from most to least frequent.
ranked_words = word_frequency.most_common()

print("Top 10 words:", ranked_words[:10])

# The exploratory question is about ham vs. spam, so also report the most
# frequent words separately for each value of the `label` column.
for label, texts in data.groupby('label')['text']:
    print(f"Top 10 words for label {label!r}:", _count_words(texts).most_common(10))

## Extra features

In [None]:
import re

# We add to the original dataframe two additional indicators (money symbols and suspicious words).
# `str.contains` interprets its pattern as a regular expression, so every token
# is escaped first: a bare "$" would match the end of any string and mark every
# row as containing a money symbol.
money_symbol_list = "|".join(re.escape(token) for token in ["euro","dollar","pound","€","$"])
suspicious_words = "|".join(re.escape(token) for token in ["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

def _add_extra_features(frame):
    """Add the `money_mark`, `suspicious_words` and `text_len` columns to `frame`.

    The frame is modified in place and also returned. Missing texts are treated
    as empty strings so that the indicators are 0 and the length is 0 for them.
    """
    text = frame['text'].fillna("").astype(str)
    # Multiplying the boolean mask by 1 yields 0/1 integer indicators.
    frame['money_mark'] = text.str.contains(money_symbol_list) * 1
    frame['suspicious_words'] = text.str.contains(suspicious_words) * 1
    frame['text_len'] = text.str.len()
    return frame

# Same feature engineering for the training and the validation frame.
for frame in (data_train, data_val):
    _add_extra_features(frame)

data_train.head()

## How would the Bag of Words concept work with Count Vectorizer?

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned messages and build the
# document-term count matrix in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['text'].tolist())

# Mapping of each vocabulary term to its column index in X.
print('Bag of Words:', vectorizer.vocabulary_)

# Dense view of the count matrix.
print('Bag of Words matrix:\n', X.toarray())

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [None]:
# Fit a TF-IDF vectorizer on the cleaned messages and transform them in one step.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(data['text'].tolist())

print('Shape of the vectorized dataset:', X_tfidf.shape)

## And then train a classifier?

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Hold out 20% of the bag-of-words rows for evaluation; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    data['label'],
    test_size = 0.2,
    random_state = 42,
)

# Fit a logistic regression with default settings on the training part.
classifier = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)

print("Accuracy of the classifier:", accuracy)

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed! It must be MultinomialNB with default parameters.

Your task is to find the **best feature representation**.

You can work in teams of two people (recommended).