In [374]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import display, HTML


# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [375]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [376]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("./kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [377]:
# Your code
from sklearn.model_selection import train_test_split

X = data.iloc[:, 0]  # Text messages
y = data.iloc[:, 1]  # Labels (spam/ham)

# Split the data: 80% training, 20% validation (during development)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

In [378]:
X_train

228           Ok that's when I will go. Is that certain?
45     H <hrod17@clintonemail.com>Wednesday October 7...
373                                 I'm on the way back.
813    MRS. MARIA SOCORROCREDIT ACCOUNTS OFFICERHEAD ...
448                                                  FY1
                             ...                        
950          FYIWe have not announced the amount raised.
969    Interesting approach towards influencing popul...
40                                                    Jm
742    U.K.Dear Friend,I am Mr Mark Boland the Bank M...
595    CALL ME ON MY MOBILE PHONE NUMBER-234-80332545...
Name: text, Length: 800, dtype: object

## Data Preprocessing

In [379]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [380]:
# Your code
import re

def clean_html(text):
    if not isinstance(text, str):
        return ""

    # Remove inline JavaScript and CSS
    text = re.sub(r'<(script|style).*?>.*?</\1>', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Remove HTML comments
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Remove all remaining HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    return text


X_train_cleaned = X_train.apply(clean_html)
X_test_cleaned = X_test.apply(clean_html)


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [381]:
# Your code
import re
import unicodedata

def clean_text(text):
    if not isinstance(text, str):
        return ""

    # Normalize unicode to remove accents, etc.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('latin-1', 'ignore')

    # Remove email addresses and URLs
    text = re.sub(r'\S+@\S+', '', text)               # remove emails
    text = re.sub(r'http\S+|www\S+', '', text)        # remove URLs

    # Remove alphanumeric tokens (like 0000e2511c8print)
    text = re.sub(r'\b[a-zA-Z0-9]*\d+[a-zA-Z]+\w*\b', '', text)

    # Remove all numbers
    text = re.sub(r'\d+', '', text)

    # Remove special characters
    text = re.sub(r'[^\w\s]', ' ', text)

    # Remove single characters
    text = re.sub(r'\b\w\b', '', text)

    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)

    # Convert to lowercase and strip spaces
    return text.lower().strip()

X_train_cleaned = X_train.apply(clean_text)  # apply your cleaning function here


In [382]:
X_train_cleaned

228                 ok that when will go is that certain
45     october rose thx for passing on the feedback w...
373                                      on the way back
813    mrs maria socorrocredit accounts officerhead o...
448                                                   fy
                             ...                        
950           fyiwe have not announced the amount raised
969    interesting approach towards influencing popul...
40                                                    jm
742    dear friend am mr mark boland the bank manager...
595    call me on my mobile phone number ceo presiden...
Name: text, Length: 800, dtype: object

## Now let's work on removing stopwords
Remove the stopwords.

In [383]:
# Your code
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = text.split()
    filtered = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered)

X_train_cleaned = X_train_cleaned.apply(remove_stopwords)
X_test_cleaned = X_test_cleaned.apply(remove_stopwords)

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [384]:
# Your code

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized)

# Apply to cleaned data
X_train_cleaned = X_train_cleaned.apply(lemmatize_text)
X_test_cleaned = X_test_cleaned.apply(lemmatize_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [385]:
# Final cleaned version
print(X_train_cleaned.head())


228                                        ok go certain
45     october rose thx passing feedback share let pl...
373                                             way back
813    mr maria socorrocredit account officerhead off...
448                                                   fy
Name: text, dtype: object


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [386]:
# Your code
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Vectorize the entire cleaned dataset
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X_train_cleaned)

# Convert to DataFrame for analysis
bow_df = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
bow_df['label'] = y_train.values  # Add back numeric labels (0 for ham, 1 for spam)

# Split by label using numeric labels
ham_words = bow_df[bow_df['label'] == 0].drop('label', axis=1).sum()
spam_words = bow_df[bow_df['label'] == 1].drop('label', axis=1).sum()

# Top 10 words per class
top_ham = ham_words.sort_values(ascending=False).head(10)
top_spam = spam_words.sort_values(ascending=False).head(10)

print("Top 10 words in HAM messages:")
print(top_ham)

print("\nTop 10 words in SPAM messages:")
print(top_spam)



Top 10 words in HAM messages:
would        93
mr           90
pm           84
time         83
president    82
state        80
percent      77
call         77
aaa          72
work         69
dtype: int64

Top 10 words in SPAM messages:
br             864
money          783
account        727
bank           652
fund           636
transaction    474
country        412
business       410
mr             369
nbsp           345
dtype: int64


## Extra features

In [399]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
# Define lists of money symbols and suspicious words
money_symbol_list = "|".join(["euro", "dollar", "pound", "€", r"\$"])
suspicious_words = "|".join([
    "free", "cheap", "sex", "money", "account", "bank",
    "fund", "transfer", "transaction", "win", "deposit", "password"
])

# Create a DataFrame with extra features for training data
train_features = pd.DataFrame({
    'money_mark': X_train_cleaned.str.contains(money_symbol_list) * 1,
    'suspicious_words': X_train_cleaned.str.contains(suspicious_words) * 1,
    'text_len': X_train_cleaned.apply(len)
})

# Create a DataFrame with extra features for test data
test_features = pd.DataFrame({
    'money_mark': X_test_cleaned.str.contains(money_symbol_list) * 1,
    'suspicious_words': X_test_cleaned.str.contains(suspicious_words) * 1,
    'text_len': X_test_cleaned.apply(len)
})

train_features.head()


Unnamed: 0,money_mark,suspicious_words,text_len
228,0,0,13
45,0,0,106
373,0,0,8
813,1,1,1230
448,0,0,2


## How would work the Bag of Words with Count Vectorizer concept?

In [400]:
# Your code
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train_cleaned)
X_test_counts = count_vect.transform(X_test_cleaned)

print("Shape of CountVectorizer train data:", X_train_counts.shape)
print("Shape of CountVectorizer test data:", X_test_counts.shape)


Shape of CountVectorizer train data: (800, 14199)
Shape of CountVectorizer test data: (200, 14199)


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [401]:
# Your code
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train_cleaned)
X_test_tfidf = tfidf_vect.transform(X_test_cleaned)

print("Shape of TF-IDF train data:", X_train_tfidf.shape)
print("Shape of TF-IDF test data:", X_test_tfidf.shape)


Shape of TF-IDF train data: (800, 14199)
Shape of TF-IDF test data: (200, 14199)


## And the Train a Classifier?

In [402]:
# Your code
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)  # or use X_train_counts

y_pred = clf.predict(X_test_tfidf)  # or use X_test_counts

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.92
              precision    recall  f1-score   support

           0       0.88      1.00      0.93       113
           1       1.00      0.82      0.90        87

    accuracy                           0.92       200
   macro avg       0.94      0.91      0.92       200
weighted avg       0.93      0.92      0.92       200



### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [None]:
# Your code