In [None]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [3]:
## Read Data for the Fraudulent Email Kaggle Challenge
# Number of rows kept while developing; set to None to use the whole file
# for the final system.
N_DEV_ROWS = 1000

data = pd.read_csv("../data/kg_train.csv", encoding="latin-1")

# Reduce the training set to speed up development.
if N_DEV_ROWS is not None:
    data = data.head(N_DEV_ROWS)
print(data.shape)

# Replace missing values with empty strings. The result is assigned instead of
# using inplace=True so the `.head()` slice is never mutated in place and the
# cell gives the same result every time it is re-run.
data = data.fillna("")

(1000, 2)


### Let's divide the training and test set into two partitions

In [4]:
# Preview the first five rows of the loaded sample.
data.head(n=5)

Unnamed: 0,text,label
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1
1,Will do.,0
2,Nora--Cheryl has emailed dozens of memos about...,0
3,Dear Sir=2FMadam=2C I know that this proposal ...,1
4,fyi,0


In [6]:
from sklearn.model_selection import train_test_split

# Features are the message texts; targets are the labels.
X, y = data["text"], data["label"]

# Hold out 20% of the rows for testing (test_size=0.2 is set explicitly).
# Stratifying on y keeps the spam/ham ratio the same in both partitions, and
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Sanity check: partition sizes and share of label 1 (spam) in each partition.
print("Train set:", X_train.shape, "Test set:", X_test.shape)
print("Spam ratio in train:", y_train.mean(), "Spam ratio in test:", y_test.mean())

Train set: (800,) Test set: (200,)
Spam ratio in train: 0.4425 Spam ratio in test: 0.44


## Data Preprocessing

In [9]:
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Peek at the raw material used for cleaning: the ASCII punctuation characters
# and a ten-item slice of NLTK's English stopword list.
print(string.punctuation)
print(stopwords.words("english")[100:110])

# English Snowball stemmer instance.
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now we have to clean the text by removing the HTML code

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [10]:
import re, html

# Pre-compiled patterns for the three kinds of markup removed below.
# (?is): case-insensitive, and '.' also matches newlines.
_HTML_SCRIPTS_STYLES = re.compile(r'(?is)<(script|style)\b.*?>.*?</\1>')
_HTML_COMMENTS       = re.compile(r'(?is)<!--.*?-->')
_HTML_TAGS           = re.compile(r'(?is)<[^>]+>')

def strip_html_min(s: str) -> str:
    """Return the visible text of an HTML fragment as a single-spaced string.

    Markup is removed first and character references are decoded afterwards.
    Decoding first would turn escaped text such as ``5 &lt; 6 and 7 &gt; 3``
    into ``5 < 6 and 7 > 3`` *before* tag stripping, and everything between
    the ``<`` and ``>`` would then be deleted as if it were a tag.

    Args:
        s: Raw message text; non-string values are converted with ``str()``.

    Returns:
        The text without scripts, styles, comments and tags, with entities
        decoded, whitespace runs collapsed to one space, and ends trimmed.
    """
    s = str(s)
    s = _HTML_SCRIPTS_STYLES.sub(" ", s)   # 1) scripts/styles, including their content
    s = _HTML_COMMENTS.sub(" ", s)         # 2) comments (before tags: they may contain '>')
    s = _HTML_TAGS.sub(" ", s)             # 3) remaining tags
    s = html.unescape(s)                   # 4) decode entities only once markup is gone
    return re.sub(r"\s+", " ", s).strip()  # 5) tidy spaces (also covers decoded &nbsp;)

# Strip HTML from every message of both partitions (element-wise).
X_train_clean = X_train.map(strip_html_min)
X_test_clean = X_test.map(strip_html_min)

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [11]:
def normalize_text(s: str) -> str:
    """Reduce a message to lowercase ASCII words separated by single spaces.

    The substitutions run in a fixed order; the value is converted with
    ``str()`` first, and the result is lowercased and trimmed at the end.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'[^A-Za-z0-9\s]', ' '),     # special characters -> space
        (r'\d+', ' '),                # digit runs -> space
        (r'\b[a-zA-Z]\b', ' '),       # stand-alone single letters -> space
        (r'^\s*[a-zA-Z]\s+', ' '),    # single letter at the very start
        (r'\s+', ' '),                # collapse whitespace runs
        (r'^b\s+', ''),               # leading 'b' left over from b'...' reprs
    )

    text = str(s)
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.lower().strip()

In [13]:
# Normalize every HTML-stripped message of both partitions (element-wise).
X_train_clean = X_train_clean.map(normalize_text)
X_test_clean = X_test_clean.map(normalize_text)

## Now let's work on removing stopwords
Remove the stopwords.

In [14]:
# NLTK's English stopword list, stored as a set for fast membership tests.
STOPWORDS = set(stopwords.words("english"))

def remove_stopwords(s: str) -> str:
    """Return `s` without the whitespace-separated tokens found in STOPWORDS.

    The comparison is exact, so the text is expected to be lowercased already.
    """
    return " ".join(word for word in s.split() if word not in STOPWORDS)

In [15]:
X_train_clean = X_train_clean.apply(remove_stopwords)
X_test_clean  = X_test_clean.apply(remove_stopwords)

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [17]:
from nltk.stem import WordNetLemmatizer

# One WordNet lemmatizer instance shared by every call below.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(s: str) -> str:
    """Lemmatize each whitespace-separated token of `s` and re-join with spaces.

    NOTE(review): lemmatize() is called without a part-of-speech argument, so
    it uses the library default (noun, per the NLTK docs); verb forms such as
    "running" are presumably left unchanged -- confirm if verbs matter.
    """
    return " ".join(map(lemmatizer.lemmatize, s.split()))

In [18]:
# Lemmatize every cleaned message of both partitions (element-wise).
X_train_clean = X_train_clean.map(lemmatize_text)
X_test_clean = X_test_clean.map(lemmatize_text)

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [19]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer turns text into a Bag-of-Words matrix:
# one row per document, one column per vocabulary word, values = word counts.
cv = CountVectorizer()

# Fit on the cleaned training text only.
X_bow = cv.fit_transform(X_train_clean)
vocabulary = cv.get_feature_names_out()

# The per-class totals below select rows of X_bow by position, so the labels
# must be in the same row order as the vectorized texts.
assert X_train_clean.index.equals(y_train.index), "texts and labels are misaligned"
train_labels = y_train.to_numpy()


def class_word_totals(label_value):
    """Total count of every vocabulary word over the training docs of one class.

    The sums are taken directly on the sparse matrix. The previous version
    built a dense docs x vocabulary DataFrame (`bow_df`) and stored the class
    in a "label" column, which (a) needs memory proportional to
    docs * vocabulary and (b) silently overwrote the counts of any vocabulary
    word that is itself spelled "label".

    Args:
        label_value: Class to aggregate (0 = ham, 1 = spam).

    Returns:
        pandas Series indexed by word, sorted by descending total count.
    """
    class_rows = X_bow[train_labels == label_value]
    # sum(axis=0) yields a 1 x vocab result; flatten it to a plain 1-D array.
    totals = np.asarray(class_rows.sum(axis=0)).ravel()
    return pd.Series(totals, index=vocabulary).sort_values(ascending=False)


# Separate ham (0) and spam (1)
ham_words = class_word_totals(0)
spam_words = class_word_totals(1)

print("Top 10 Ham Words:")
print(ham_words.head(10))

print("\nTop 10 Spam Words:")
print(spam_words.head(10))

Top 10 Ham Words:
state        97
president    95
would        92
mr           90
percent      80
obama        80
call         77
work         72
time         70
one          69
dtype: int64

Top 10 Spam Words:
money          761
account        674
bank           615
fund           600
transaction    435
business       412
country        401
mr             384
million        364
company        340
dtype: int64


## Extra features

In [20]:
# Put each partition's cleaned text next to its labels in one DataFrame.
# The held-out test partition is the one called "val" from here on.
data_train = pd.concat(
    [X_train_clean.rename("preprocessed_text"), y_train.rename("label")],
    axis=1,
)
data_val = pd.concat(
    [X_test_clean.rename("preprocessed_text"), y_test.rename("label")],
    axis=1,
)

In [21]:
# We add to the original dataframe additional indicators (money marks,
# suspicious words) plus the length of the preprocessed text.
# Each string is a regex alternation passed to Series.str.contains, so matches
# are substring matches (e.g. "win" also matches inside longer words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

# The currency symbols cannot appear in `preprocessed_text`, because
# normalize_text replaces every non-alphanumeric character with a space.
# They are therefore looked up in the raw text.
money_symbol_pattern = "|".join(["€", r"\$"])


def add_extra_features(frame, raw_text):
    """Add the `money_mark`, `suspicious_words` and `text_len` columns to `frame`.

    Args:
        frame: DataFrame with a `preprocessed_text` column; modified in place.
        raw_text: Series with the unprocessed messages, sharing `frame`'s
            index; used only to detect currency symbols.

    Returns:
        The same `frame`, for convenience.
    """
    cleaned = frame['preprocessed_text']
    has_money_word = cleaned.str.contains(money_simbol_list)
    has_money_symbol = raw_text.str.contains(money_symbol_pattern, na=False)

    # Boolean masks are turned into 0/1 integer indicators.
    frame['money_mark'] = (has_money_word | has_money_symbol) * 1
    frame['suspicious_words'] = cleaned.str.contains(suspicious_words) * 1
    frame['text_len'] = cleaned.str.len()
    return frame


add_extra_features(data_train, X_train)
add_extra_features(data_val, X_test)

data_train.head()

Unnamed: 0,preprocessed_text,label,money_mark,suspicious_words,text_len
442,dear good day hope fine cdear writting mail du...,1,1,1,998
962,mr henry kaborethe chief auditor inchargeforei...,1,0,1,1946
971,,0,0,0,0
190,desk dr adamu ismalerauditing accounting manag...,1,1,1,383
551,dear friend name loi estrada wife mr josephest...,1,1,1,1475


## How does the Bag of Words concept work with CountVectorizer?

In [None]:
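# Bag of Words with CountVectorizer: the vectorizer learns a vocabulary from
# the training texts and represents every document as a vector with one entry
# per vocabulary word, holding how many times that word occurs in the document.
# Word order is ignored; only the counts are kept.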

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [25]:
# 1) Load the vectorizer
tfidf = TfidfVectorizer(stop_words="english")

# 2) Fit on the training set only, then transform both train and test with
#    that same fitted vocabulary so their columns line up.
X_train_tfidf = tfidf.fit_transform(X_train_clean)
X_test_tfidf  = tfidf.transform(X_test_clean)

# 3) Print the shape of the vectorized dataset
print("TF-IDF train shape:", X_train_tfidf.shape)
print("TF-IDF test shape :", X_test_tfidf.shape)

# (optional) Full dataset if you want to vectorize all at once.
# A separate vectorizer is used on purpose: calling fit_transform again on
# `tfidf` would replace the vocabulary learned from the training set, after
# which `tfidf.transform(...)` would no longer produce columns matching
# X_train_tfidf / X_test_tfidf.
# Note this demo vectorizes the raw `data["text"]`, not the cleaned text.
tfidf_all = TfidfVectorizer(stop_words="english")
X_all_tfidf = tfidf_all.fit_transform(data["text"])
print("TF-IDF all data shape:", X_all_tfidf.shape)

TF-IDF train shape: (800, 28009)
TF-IDF test shape : (200, 28009)
TF-IDF all data shape: (1000, 25301)


## And then: Train a Classifier

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Inputs produced by the TF-IDF cell:
# X_train_tfidf, X_test_tfidf, y_train, y_test

# Logistic regression on the TF-IDF features. max_iter is raised so the solver
# has room to converge. `n_jobs` is no longer passed: the original passed None,
# which the original comment treated as the value to fall back on, and noted
# that the argument may be rejected by recent scikit-learn versions.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

# Evaluate on the held-out test partition.
y_pred = clf.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
# Confusion matrix: rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

Accuracy: 0.975
[[112   0]
 [  5  83]]
              precision    recall  f1-score   support

           0      0.957     1.000     0.978       112
           1      1.000     0.943     0.971        88

    accuracy                          0.975       200
   macro avg      0.979     0.972     0.974       200
weighted avg      0.976     0.975     0.975       200

