In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development.

In [4]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development.
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [6]:
from sklearn.model_selection import train_test_split

data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)

## Data Preprocessing

In [9]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [10]:
import re
# For Javasript
def remove_script_code(data):
    clean = re.compile('<script>.*?</script>')
    return [re.sub(clean, '', data)]
# For CSS Style
def remove_style_code2(data):
    clean = re.compile('<style>.*?</style>')
    return [re.sub(clean, '', data)]
data_train['clean_data']=data_train['text'].apply(remove_script_code)
data_train['clean_data']=data_train['text'].apply(remove_style_code2)
data_val['clean_data2']=data_val['text'].apply(remove_script_code)
data_val['clean_data2']=data_val['text'].apply(remove_style_code2)

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters

- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [11]:
def clean_text(text):
# Remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove all single characters
    text = re.sub(r'\b[a-zA-Z]\b', '', text)

    # Remove single characters from the start
    text = re.sub(r'^[a-zA-Z]\s+', '', text)

    # Substitute multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove prefixed 'b' (sometimes appears from byte encoding issues)
    text = re.sub(r'^b\s+', '', text)

    # Convert to lowercase
    text = text.lower()

    return text
data_train['clean_data']=data_train['text'].apply(clean_text)
data_val['clean_data2']=data_val['text'].apply(clean_text)

## Now let's work on removing stopwords
Remove the stopwords.

In [12]:
import string

# Define a set of common English stopwords
common_stopwords = set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
    "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have",
    "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
    "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against",
    "between", "into", "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once"
])

def remove_stopwords(text):
    words = text.split()  # Tokenize by splitting on spaces
    filtered_words = [word for word in words if word not in common_stopwords]
    return " ".join(filtered_words)

# Apply the function to remove stopwords from the 'cleaned_text' column
data_train['remove_stopwords'] = data_train['clean_data'].apply(remove_stopwords)
data_val['remove_stopwords2'] = data_val['clean_data2'].apply(remove_stopwords)

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [13]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
data_train['cleaned_text'] = data_train['remove_stopwords'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))
data_val['cleaned_text'] = data_val['remove_stopwords2'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [20]:
from collections import Counter

spam_words = " ".join(data_train[data_train["label"] == 1]["cleaned_text"]).split()
ham_words = " ".join(data_train[data_train["label"] == 0]["cleaned_text"]).split()
top_spam_words = Counter(spam_words).most_common(10)
top_ham_words = Counter(ham_words).most_common(10)
top_spam_df = pd.DataFrame(top_spam_words, columns=["Word", "Count"])
top_ham_df = pd.DataFrame(top_ham_words, columns=["Word", "Count"])
print("Top 10 Spam Words:")
print(top_spam_df)

print("\nTop 10 Ham Words:")
print(top_ham_df)

spam_words2 = " ".join(data_val[data_val["label"] == 1]["cleaned_text"]).split()
ham_words2 = " ".join(data_val[data_val["label"] == 0]["cleaned_text"]).split()
top_spam_words2 = Counter(spam_words).most_common(10)
top_ham_words2 = Counter(ham_words).most_common(10)
top_spam_df2 = pd.DataFrame(top_spam_words, columns=["Word", "Count"])
top_ham_df2 = pd.DataFrame(top_ham_words, columns=["Word", "Count"])
print("Top 10 Spam Words:")
print(top_spam_df2)

print("\nTop 10 Ham Words:")
print(top_ham_df2)

Top 10 Spam Words:
       Word  Count
0      will   1449
1     money    795
2   account    661
3      bank    606
4      fund    564
5       not    480
6         u    443
7       all    406
8  business    391
9       any    347

Top 10 Ham Words:
        Word  Count
0       will    155
1        not    118
2          u     99
3      would     93
4      state     92
5        can     90
6         pm     89
7  president     84
8    percent     76
9       call     73
Top 10 Spam Words:
       Word  Count
0      will   1449
1     money    795
2   account    661
3      bank    606
4      fund    564
5       not    480
6         u    443
7       all    406
8  business    391
9       any    347

Top 10 Ham Words:
        Word  Count
0       will    155
1        not    118
2          u     99
3      would     93
4      state     92
5        can     90
6         pm     89
7  president     84
8    percent     76
9       call     73


## Extra features

In [16]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€","$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['cleaned_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['cleaned_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['cleaned_text'].apply(lambda x: len(x))

data_val['money_mark'] = data_val['cleaned_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['cleaned_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['cleaned_text'].apply(lambda x: len(x))

data_train.head()

Unnamed: 0,text,label,clean_data,remove_stopwords,cleaned_text,money_mark,suspicious_words,text_len
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",1,regards mr nelson smithkindly reply me on my p...,regards mr nelson smithkindly reply private em...,regard mr nelson smithkindly reply private ema...,1,0,75
535,I have not been able to reach oscar this am. W...,0,have not been able to reach oscar this am we a...,not able reach oscar supposed send pdb can rec...,not able reach oscar supposed send pdb can rec...,1,0,50
695,; Huma Abedin B6I'm checking with Pat on the 5...,0,huma abedin bim checking with pat on the will ...,huma abedin bim checking pat will work jack ja...,huma abedin bim checking pat will work jack ja...,1,0,84
557,I can have it announced here on Monday - can't...,0,can have it announced here on monday cant today,can announced here monday cant today,can announced here monday cant today,1,0,36
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,1,bank of africaagence san pedro bp san pedro co...,bank africaagence san pedro bp san pedro cote ...,bank africaagence san pedro bp san pedro cote ...,1,1,1097


## How would work the Bag of Words with Count Vectorizer concept?

## TD-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(data_train["clean_data"])
X_val_tfidf = vectorizer.transform(data_val["clean_data2"])

print("Shape of TF-IDF vectorized training data:", X_train_tfidf.shape)
print("Shape of TF-IDF vectorized validation data:", X_val_tfidf.shape)

Shape of TF-IDF vectorized training data: (800, 18348)
Shape of TF-IDF vectorized validation data: (200, 18348)


## And the Train a Classifier?

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(n_estimators=100, random_state=42)

model.fit(X_train_tfidf, data_train["label"])

y_pred = model.predict(X_val_tfidf)

accuracy = accuracy_score(data_val["label"], y_pred)

print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.955


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to find the **best feature representation**.

You can work with teams of two persons (recommended).

In [None]:
# Your code