In [587]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [588]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [589]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


In [590]:
data

Unnamed: 0,text,label
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1
1,Will do.,0
2,Nora--Cheryl has emailed dozens of memos about...,0
3,Dear Sir=2FMadam=2C I know that this proposal ...,1
4,fyi,0
...,...,...
995,So what's the latest? It sounds contradictory ...,0
996,"TRANSFER OF 36,759,000.00 MILLION POUNDS TO YO...",1
997,Barb I will call to explain. Are you back in t...,0
998,Yang on travelNot free tonite.May work tomorrow,0


In [591]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1000 non-null   object
 1   label   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [592]:
data["preprocessed_text"] = data["text"]

### Let's divide the training and test set into two partitions

In [593]:
from sklearn.model_selection import train_test_split

data_train, data_val = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
    stratify=data['label']  # чтобы баланс классов сохранился
)

print(f"Shape train: {data_train.shape}")
print(f"Shape val: {data_val.shape}")

Shape train: (750, 3)
Shape val: (250, 3)


## Data Preprocessing

In [594]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [595]:
# Your code
import re
import html


def clean_html(html_text: str) -> str:

    html_text = html.unescape(html_text)

    # 1. Remove inline JavaScript and CSS, replacing with a space
    html_text = re.sub(r'<script.*?>.*?</script>', ' ', html_text, flags=re.DOTALL | re.IGNORECASE)
    html_text = re.sub(r'<style.*?>.*?</style>', ' ', html_text, flags=re.DOTALL | re.IGNORECASE)
    
    # 2. Remove HTML comments, replacing with a space
    html_text = re.sub(r'<!--.*?-->', ' ', html_text, flags=re.DOTALL)
    
    # 3. Remove all remaining HTML tags, replacing with a space
    html_text = re.sub(r'<[^>]+>', ' ', html_text)
    
    # 4. Collapse multiple spaces into one
    html_text = re.sub(r'\s+', ' ', html_text).strip()
    
    return html_text


data_train["text"] = data_train["text"].apply(clean_html)
data_val["text"] = data_val["text"].apply(clean_html)

In [596]:
data_val

Unnamed: 0,text,label,preprocessed_text
400,FROM THE DESK OF DR AMAMAN LAZARE AUDITING DEP...,1,FROM THE DESK OF DR AMAMAN LAZARE AUDITING DEP...
371,Looks very much like there will be a vote today.,0,Looks very much like there will be a vote today.
533,WALTERS COMPANY FROM THE OFFICE OF THE MANAGER...,1,WALTERS COMPANY FROM THE OFFICE OF THE MANAGER...
641,"DEAR BELOVED,=20I am Sussan Adams=20PLEASE END...",1,"DEAR BELOVED,=20I am Sussan Adams=20PLEASE END..."
145,FOR SECURITY REASONS.DR. MRS. MARIAM ABACHA AB...,1,FOR SECURITY REASONS.DR. MRS. MARIAM ABACHA AB...
...,...,...,...
505,REQUEST FOR BUSINESS PARTNERSHIPI am PATRICK A...,1,REQUEST FOR BUSINESS PARTNERSHIPI am PATRICK A...
924,Received. Just out of a SVTC on the Europe iss...,0,Received. Just out of a SVTC on the Europe iss...
632,Madame Secretary:On flight home from Kampala w...,0,Madame Secretary:On flight home from Kampala w...
612,I'm up and would like to talk about this RCH a...,0,I'm up and would like to talk about this RCH a...


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [597]:
# Your code
'''

def clean_text(text: str) -> str:
    text = str(text)

    # 1. Remove prefixed b' or b"
    text = re.sub(r"^b['\"]", ' ', text)

    # 2. Remove all special characters (keep only letters and spaces)
    text = re.sub(r'[^A-Za-z\s]', ' ', text)

    # 3. Remove numbers
    text = re.sub(r'\d+', ' ', text)

    # 4. Remove single characters (isolated letters)
    text = re.sub(r'\b[A-Za-z]\b', ' ', text)

    # 5. Remove single character from start of string (if still there)
    text = re.sub(r'^[A-Za-z]\s+', ' ', text)

    # 6. Collapse multiple spaces into one
    text = re.sub(r'\s+', ' ', text)

    # 7. Strip leading/trailing spaces
    return text.strip()
'''

'''
def clean_text(text: str) -> str:
    text = str(text)

    # 1. Remove prefixed b' or b"
    text = re.sub(r"^b['\"]", ' ', text)

    # 2. Remove all special characters (keep only letters (latin/cyrillic) and spaces)
    text = re.sub(r'[^A-Za-zА-Яа-яЁё\s]', ' ', text)

    # 3. Remove numbers
    text = re.sub(r'\d+', ' ', text)

    # 4. Collapse multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # 5. Tokenize and remove one-letter tokens (можно оставить нужные)
    tokens = text.split()
    tokens = [t for t in tokens if len(t) > 1 or t.lower() in {"i", "я"}]

    return " ".join(tokens)

'''
import re

STOP_WORDS = {"u", "mr"}

def clean_text(text: str) -> str:
    text = str(text)

    # Remove prefixed b' or b"
    text = re.sub(r"^b['\"]", " ", text)

    # Lowercase
    text = text.lower()

    # Remove all special characters
    text = re.sub(r"[^a-z\s]", " ", text)

    # Remove numbers
    text = re.sub(r"\d+", " ", text)

    # Collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenize
    tokens = text.split()

    # Remove single characters and stop words
    tokens = [t for t in tokens if len(t) > 1 and t not in STOP_WORDS]

    return " ".join(tokens)

data_train["text"] = data_train["text"].apply(clean_text)
data_val["text"] = data_val["text"].apply(clean_text)


In [598]:
data_train["text"]

718    heard on the radio that there is cabinet mtg t...
891    does this mean bill should not plan on going a...
698    we hadn gathered people yet only cause jake an...
83     from amrs sese sekodear friend ci am mrs maria...
825    transfer of million pounds to youraccount my n...
                             ...                        
949    aaqskzjrgabaqealgcwaad qawrxhpzgaasukqaagaaaaa...
789    from the desk of dr hassan dikon eemail hassan...
728    will do think we are in good shape immelt indr...
664    with your permission we are changing the state...
798                       yes we will thank you courtney
Name: text, Length: 750, dtype: object

In [599]:
def split_stuck_words(text):
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
    text = re.sub(r'(?<=[a-zA-Z])(?=\d)', ' ', text)
    return text

data_val["text"] = data_val["text"].apply(split_stuck_words)
data_train["text"] = data_train["text"].apply(split_stuck_words)

data_val["text"] = data_val["text"].apply(lambda x: x.lower())
data_train["text"] = data_train["text"].apply(lambda x: x.lower())

## Now let's work on removing stopwords
Remove the stopwords.

In [600]:
# Your code
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = text.split()
    filtered_tokens = [
        word for word in tokens 
        if word.lower() not in stop_words and word not in string.punctuation
    ]
    return " ".join(filtered_tokens)

data_train["text"] = data_train["text"].apply(remove_stopwords)
data_val["text"] = data_val["text"].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sergej/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [601]:
data_train["text"] = data_train["text"].replace(r'^\s*$', np.nan, regex=True)
data_val["text"] = data_val["text"].replace(r'^\s*$', np.nan, regex=True)
data_train = data_train.dropna(subset=["text"])
data_val   = data_val.dropna(subset=["text"])
data_val["text"]


400    desk dr amaman lazare auditing department bank...
371                           looks much like vote today
533    walters company office manager walters gate co...
641    dear beloved sussan adams please endeavour use...
145    security reasons dr mrs mariam abacha abachac ...
                             ...                        
505    request business partnershipi patrick atike di...
924          received svtc europe issue sat oct pm wrote
632    madame secretary flight home kampala exhaustin...
612                            would like talk rch stuff
541    dear sir got contact cause seriouse search rel...
Name: text, Length: 247, dtype: object

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [602]:
# Your code
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_with_pos(text):
    if not isinstance(text, str):
        return ""
    tokens = nltk.word_tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)
    lemmatized_tokens = [
        lemmatizer.lemmatize(word.lower(), get_wordnet_pos(pos_tag))
        for word, pos_tag in tagged_tokens
    ]
    return " ".join(lemmatized_tokens)
data_train["text"] = data_train["text"].apply(lemmatize_text)
data_val["text"] = data_val["text"].apply(lemmatize_text)

data_train["text"]

[nltk_data] Downloading package punkt to /Users/sergej/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sergej/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/sergej/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/sergej/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_val["text"] = data_val["text"].apply(lemmatize_text)


718                   heard radio cabinet mtg go sending
891    mean bill plan going wait public memorial service
698    gathered people yet cause jake cdm said move f...
83     amrs sese sekodear friend ci mr mariam sese se...
825    transfer million pound youraccount name jason ...
                             ...                        
949    aaqskzjrgabaqealgcwaad qawrxhpzgaasukqaagaaaaa...
789    desk dr hassan dikon eemail hassandiko fsmail ...
728    think good shape immelt indra huntsman making ...
664    permission changing statement spokesman name r...
798                                   yes thank courtney
Name: text, Length: 736, dtype: object

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [603]:
# Your code
from collections import Counter

def get_top_n_words(corpus, n=None):
    all_words = " ".join(corpus).split()
    counter = Counter(all_words)
    return counter.most_common(n)

df = pd.concat([data_train, data_val])

top_ham = get_top_n_words(df[df['label'] == 0]['text'], 10)

top_spam = get_top_n_words(df[df['label'] == 1]['text'], 10)

top_df = pd.DataFrame({
    'Ham': [w for w, _ in top_ham],
    'Ham_Freq': [c for _, c in top_ham],
    'Spam': [w for w, _ in top_spam],
    'Spam_Freq': [c for _, c in top_spam]
})

top_df


Unnamed: 0,Ham,Ham_Freq,Spam,Spam_Freq
0,state,136,money,987
1,pm,127,account,899
2,would,107,bank,801
3,president,99,fund,782
4,time,95,u,737
5,call,94,transaction,555
6,obama,84,business,514
7,percent,81,country,513
8,secretary,79,million,463
9,u,76,transfer,426


In [604]:
data_train[data_train["text"].str.contains(r"\bu\b", regex=True, case=False, na=False)]


Unnamed: 0,text,label,preprocessed_text
83,amrs sese sekodear friend ci mr mariam sese se...,1,FROM=3AMRS=2E M SESE-SEKODEAR FRIEND=2CI AM MR...
255,good day may surprise receive letter mesince k...,1,Good day You may be surprise to receive this l...
137,dear friendthis letter must come big surprise ...,1,Dear FriendThis letter must come to you as a b...
776,one hundred twenty six million dollar dear sir...,1,{ONE HUNDRED AND TWENTY SIX MILLION DOLLARS)De...
3,dear sir fmadam know proposal might surprise e...,1,Dear Sir=2FMadam=2C I know that this proposal ...
...,...,...,...
862,regard adnan harmoudi ismailttp www cbsnews co...,1,"=C2=A0Regards,=20Mr.Adnan Harmoudi Ismailttp:/..."
241,greetingstransfer u eighteen million united st...,1,"GreetingsTRANSFER OF US$18,000,000.00(EIGHTEEN..."
245,didi sithole johannesburg south africa tell de...,1,"FROM DIDI SITHOLE,JOHANNESBURG,SOUTH AFRICA.TE..."
949,aaqskzjrgabaqealgcwaad qawrxhpzgaasukqaagaaaaa...,1,/9j/4AAQSkZJRgABAQEAlgCWAAD/4QAWRXhpZgAASUkqAA...


## Extra features

In [605]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x)) 

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x)) 

data_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_val['text_len'] =

Unnamed: 0,text,label,preprocessed_text,money_mark,suspicious_words,text_len
718,heard radio cabinet mtg go sending,0,I heard on the radio that there is a Cabinet m...,0,0,104
891,mean bill plan going wait public memorial service,0,Does this mean Bill should not plan on going a...,0,0,94
698,gathered people yet cause jake cdm said move f...,0,We hadn't gathered people yet only cause jake ...,0,0,101
83,amrs sese sekodear friend ci mr mariam sese se...,1,FROM=3AMRS=2E M SESE-SEKODEAR FRIEND=2CI AM MR...,1,0,2855
825,transfer million pound youraccount name jason ...,1,"TRANSFER OF 36,759,000.00 MILLION POUNDS TO YO...",0,1,2777


## How would work the Bag of Words with Count Vectorizer concept?

In [606]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)

# Fit and transform the training data
X_train_counts = count_vectorizer.fit_transform(data_train['preprocessed_text'])

# Optionally, transform the validation data
X_val_counts = count_vectorizer.transform(data_val['preprocessed_text'])

# Print the shape to understand the dimensions of the transformed data
print(X_train_counts.shape)
print(X_val_counts.shape)

(736, 5000)
(247, 5000)


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [607]:
# Your code

from sklearn.feature_extraction.text import TfidfVectorizer

# -----------------------------
# 1. Create the TF-IDF vectorizer
# -----------------------------
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,      # limit the vocabulary
    stop_words='english',   # remove English stop words
    lowercase=True          # convert text to lowercase
)

# -----------------------------
# 2. Prepare the corpora
# -----------------------------
corpus_train = data_train["text"].tolist()
corpus_val = data_val["text"].tolist()
corpus_all = data["preprocessed_text"].tolist()  # full dataset

# -----------------------------
# 3. Vectorization
# -----------------------------
# Fit the vectorizer on the training corpus
X_train = tfidf_vectorizer.fit_transform(corpus_train)

# Transform the validation corpus
X_val = tfidf_vectorizer.transform(corpus_val)

# Transform the entire dataset
X_all = tfidf_vectorizer.transform(corpus_all)

# -----------------------------
# 4. Print the shape of the matrices
# -----------------------------
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_all shape:", X_all.shape)

X_train shape: (736, 5000)
X_val shape: (247, 5000)
X_all shape: (1000, 5000)


## And the Train a Classifier?

In [609]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Initialize the MultinomialNB classifier
nb_classifier = MultinomialNB()

# Train the classifier on the TF-IDF data
nb_classifier.fit(X_train, data_train['label'])

# Predict on the validation set
predictions = nb_classifier.predict(X_val)

# Evaluate the performance
accuracy = accuracy_score(data_val['label'], predictions)
report = classification_report(data_val['label'], predictions)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

Accuracy: 0.979757085020243
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.96      0.98       136
           1       0.96      1.00      0.98       111

    accuracy                           0.98       247
   macro avg       0.98      0.98      0.98       247
weighted avg       0.98      0.98      0.98       247



### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [610]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# -----------------------------
# 1. Define X and y
# -----------------------------
X_train_text = data_train["text"]
y_train = data_train["label"]

X_val_text = data_val["text"]
y_val = data_val["label"]

# -----------------------------
# 2. Vectorization (TF-IDF)
# -----------------------------
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
    lowercase=True
)

X_train = vectorizer.fit_transform(X_train_text)
X_val = vectorizer.transform(X_val_text)

print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)

# -----------------------------
# 3. Train the classifier
# -----------------------------
clf = MultinomialNB()   # default parameters
clf.fit(X_train, y_train)

# -----------------------------
# 4. Predictions
# -----------------------------
y_pred = clf.predict(X_val)

# -----------------------------
# 5. Evaluation
# -----------------------------
print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))

X_train shape: (736, 5000)
X_val shape: (247, 5000)
Accuracy: 0.979757085020243

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.96      0.98       136
           1       0.96      1.00      0.98       111

    accuracy                           0.98       247
   macro avg       0.98      0.98      0.98       247
weighted avg       0.98      0.98      0.98       247



In [611]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# --- Train classifier ---
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train) 

# --- Extract feature importance ---
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())

ham_log_probs = nb_classifier.feature_log_prob_[0]
spam_log_probs = nb_classifier.feature_log_prob_[1]

spam_diff = spam_log_probs - ham_log_probs
ham_diff = ham_log_probs - spam_log_probs

# Top SPAM-indicative words
top_spam_idx = np.argsort(spam_diff)[-20:]
print("Top SPAM words:\n", feature_names[top_spam_idx])

# Top HAM-indicative words
top_ham_idx = np.argsort(ham_diff)[-20:]
print("\nTop HAM words:\n", feature_names[top_ham_idx])

Top SPAM words:
 ['assist' 'regard' 'deceased' 'contact' 'assistance' 'deposit' 'claim'
 'business' 'company' 'sum' 'dollar' 'father' 'million' 'kin' 'transfer'
 'transaction' 'fund' 'account' 'money' 'bank']

Top HAM words:
 ['schedule' 'hillary' 'today' 'jake' 'monday' 'sheet' 'wednesday'
 'blackberry' 'huma' 'print' 'thx' 'pls' 'talk' 'gov' 'cheryl' 'pm'
 'tomorrow' 'ok' 'yes' 'fyi']


In [612]:
# --- твои слова по частотам ---
top_df = pd.DataFrame({
    'Ham': [w for w, _ in top_ham[:10]],   # топ-10
    'Ham_Freq': [c for _, c in top_ham[:10]],
    'Spam': [w for w, _ in top_spam[:10]],
    'Spam_Freq': [c for _, c in top_spam[:10]]
})

# --- слова от MultinomialNB ---
nb_top_spam = feature_names[top_spam_idx][::-1]  # переворачиваем, чтобы важные были сверху
nb_top_ham = feature_names[top_ham_idx][::-1]

# --- добавим новые колонки ---
top_df['Ham_NB'] = pd.Series(nb_top_ham)
top_df['Spam_NB'] = pd.Series(nb_top_spam)
top_df

Unnamed: 0,Ham,Ham_Freq,Spam,Spam_Freq,Ham_NB,Spam_NB
0,state,136,money,987,fyi,bank
1,pm,127,account,899,yes,money
2,would,107,bank,801,ok,account
3,president,99,fund,782,tomorrow,fund
4,time,95,u,737,pm,transaction
5,call,94,transaction,555,cheryl,transfer
6,obama,84,business,514,gov,kin
7,percent,81,country,513,talk,million
8,secretary,79,million,463,pls,father
9,u,76,transfer,426,thx,dollar


In [613]:
from IPython.display import display

top_df = pd.DataFrame({
    'Ham': [w for w, _ in top_ham[:10]],   # топ-10
    'Ham_Freq': [c for _, c in top_ham[:10]],
    'Spam': [w for w, _ in top_spam[:10]],
    'Spam_Freq': [c for _, c in top_spam[:10]]
})

# --- MultinomialNB ---
nb_top_spam = feature_names[top_spam_idx][::-1]  
nb_top_ham = feature_names[top_ham_idx][::-1]

top_df['Ham_NB'] = pd.Series(nb_top_ham)
top_df['Spam_NB'] = pd.Series(nb_top_spam)

ham_manual = set(top_df['Ham'])
spam_manual = set(top_df['Spam'])

def highlight_words(val, word_set):
    if val in word_set:
        return "font-weight: bold; color: green"
    return ""

styled_df = top_df.style.applymap(lambda v: highlight_words(v, ham_manual), subset=['Ham_NB'])
styled_df = styled_df.applymap(lambda v: highlight_words(v, spam_manual), subset=['Spam_NB'])

display(styled_df)

  styled_df = top_df.style.applymap(lambda v: highlight_words(v, ham_manual), subset=['Ham_NB'])
  styled_df = styled_df.applymap(lambda v: highlight_words(v, spam_manual), subset=['Spam_NB'])


Unnamed: 0,Ham,Ham_Freq,Spam,Spam_Freq,Ham_NB,Spam_NB
0,state,136,money,987,fyi,bank
1,pm,127,account,899,yes,money
2,would,107,bank,801,ok,account
3,president,99,fund,782,tomorrow,fund
4,time,95,u,737,pm,transaction
5,call,94,transaction,555,cheryl,transfer
6,obama,84,business,514,gov,kin
7,percent,81,country,513,talk,million
8,secretary,79,million,463,pls,father
9,u,76,transfer,426,thx,dollar


## Ham (non-spam)

- **Matching words:** `state`, `would`, `president`  
- **Non-matching words:** `pm`, `time`, `call`, `obama`, `percent`, `secretary`, `u`, etc.  
- **Reason:** NB selects words most **discriminative** for HAM, not just frequent words.

## Spam

- **Matching words:** `money`, `bank`, `fund`  
- **Non-matching words:** `account`, `transaction`, `business`, `country`, `million`, etc.  
- **Reason:** NB highlights words that **best separate SPAM from HAM**; common words in both classes may not appear in top NB features.

## Overall

- NB identifies words that are more **informative for classification** rather than purely frequent.