# 10. Основы обработки естественного языка (NLP)

## Импорт библиотек

In [1]:
import glob
import math
import random
import re
import tarfile
import warnings
from collections import Counter, defaultdict
from io import BytesIO
from typing import Iterable, NamedTuple

import requests
import spacy
import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from spacy.lookups import Lookups

In [2]:
warnings.filterwarnings("ignore")

## Работа с данными

In [3]:
BASE_URL = "https://spamassassin.apache.org/old/publiccorpus"

In [4]:
FILES = [
    "20021010_easy_ham.tar.bz2",
    "20021010_hard_ham.tar.bz2",
    "20021010_spam.tar.bz2",
]

In [5]:
OUTPUT_DIR = "../data/spam_data"

In [6]:
# for filename in FILES:
#     content = requests.get(f"{BASE_URL}/{filename}").content
#     fin = BytesIO(content)
#     with tarfile.open(fileobj=fin, mode="r:bz2") as tf:
#         tf.extractall(OUTPUT_DIR)

In [7]:
path = "../data/spam_data/*/*"

In [8]:
class Message(NamedTuple):
    text: str
    is_spam: bool

In [9]:
raw_subjects: list[Message] = []

In [10]:
for filename in glob.glob(path):
    with open(filename, errors="ignore") as email_file:
        for line in email_file:
            if line.startswith("Subject:"):
                subject = line.lstrip("Subject: ")
                raw_subjects.append(Message(subject, is_spam="ham" not in filename))
                break

## Работа с текстом

In [11]:
nlp = spacy.load("en_core_web_sm")

#### Токенизация

In [12]:
tokens = []

for subject in tqdm.tqdm(raw_subjects, bar_format="[+] Tokenizing: |{bar:50}|"):
    tokens.extend([token for token in nlp(subject.text)])

[+] Tokenizing: |██████████████████████████████████████████████████|


In [13]:
print(f"[+] Lenght of tokens list: {len(tokens)}")

[+] Lenght of tokens list: 29861


#### Удаление стоп-слов

In [14]:
stopwords = nlp.Defaults.stop_words
stopwords.add("re")

In [15]:
cleared_tokens = []

for token in tqdm.tqdm(tokens, bar_format="[+] Removing stop-words: |{bar:50}|"):
    if token.lower_ not in stopwords:
        cleared_tokens.append(token)

[+] Removing stop-words: |██████████████████████████████████████████████████|


In [16]:
print(f"[+] Lenght of cleared tokens list: {len(cleared_tokens)}")

[+] Lenght of cleared tokens list: 23773


#### Лемматизация

In [17]:
lemmatized_tokens = []

for token in tqdm.tqdm(cleared_tokens, bar_format="[+] Lemmatizing: |{bar:50}|"):
    lemmatized_tokens.append(token.lemma_)

[+] Lemmatizing: |██████████████████████████████████████████████████|


In [18]:
subjects_tokens = lemmatized_tokens[:]

#### Выделение тем писем

In [19]:
subject_texts = [subject.text for subject in raw_subjects]

## Bag of Words

In [20]:
count_vectorizer = CountVectorizer(min_df=2)

In [21]:
bow = count_vectorizer.fit_transform(subject_texts)

In [22]:
list(count_vectorizer.vocabulary_.keys())[:5]

['re', 'the', 'curse', 'of', 'india']

In [23]:
def bag_of_words(texts: list[str]):
    vocabulary = {}
    for text in texts:
        for word in text.split():
            if word not in vocabulary:
                vocabulary[word] = len(vocabulary)
    
    feature_matrix = []
    for text in texts:
        vector = [0] * len(vocabulary)
        for word in text.split():
            if word in vocabulary:
                index = vocabulary[word]
                vector[index] += 1
        feature_matrix.append(vector)
    
    return feature_matrix, vocabulary

In [24]:
feature_matrix, vocabulary = bag_of_words(subject_texts)

## TF-IDF

In [25]:
tfidf_vectorizer = TfidfVectorizer()

In [26]:
tfidf = tfidf_vectorizer.fit_transform(subject_texts)

In [27]:
list(tfidf_vectorizer.vocabulary_.keys())[:5]

['re', 'the', 'curse', 'of', 'india']

In [28]:
def compute_tf(text):
    tf_dict = {}
    words = text.split()
    for word in words:
        tf_dict[word] = tf_dict.get(word, 0) + 1
    # Нормализация TF
    for word in tf_dict:
        tf_dict[word] /= len(words)
    return tf_dict


def compute_idf(texts):
    idf_dict = {}
    total_documents = len(texts)
    
    for text in texts:
        words = set(text.split())
        for word in words:
            idf_dict[word] = idf_dict.get(word, 0) + 1
    
    for word in idf_dict:
        idf_dict[word] = math.log(total_documents / idf_dict[word])
    
    return idf_dict


def compute_tf_idf(texts):
    tf_idf_matrix = []
    idf_dict = compute_idf(texts)
    
    for text in texts:
        tf_dict = compute_tf(text)
        tf_idf_dict = {}
        for word, tf_value in tf_dict.items():
            tf_idf_dict[word] = tf_value * idf_dict.get(word, 0)
        tf_idf_matrix.append(tf_idf_dict)
    
    return tf_idf_matrix

In [29]:
tf_idf_matrix = compute_tf_idf(subject_texts)

## Классификация спама с помощью наивного Байеса

#### Модель

In [30]:
def tokenize(text: str) -> set[str]:
    text = text.lower()
    all_words = re.findall("[a-z0-9']+", text)
    return set(all_words)

In [31]:
class NaiveBayesClassifier:
    def __init__(self, k: float = 0.5) -> None:
        self.k = k
        self.tokens: set[str] = set()
        self.token_spam_counts: dict[str, int] = defaultdict(int)
        self.token_ham_counts: dict[str, int] = defaultdict(int)
        self.spam_messages = 0
        self.ham_messages = 0

    def train(self, messages: Iterable[Message]) -> None:
        for message in messages:
            if message.is_spam:
                self.spam_messages += 1
            else:
                self.ham_messages += 1

            for token in tokenize(message.text):
                self.tokens.add(token)
                if message.is_spam:
                    self.token_spam_counts[token] += 1
                else:
                    self.token_ham_counts[token] += 1

    def _probabilities(self, token: str) -> tuple[float, float]:
        spam = self.token_spam_counts[token]
        ham = self.token_ham_counts[token]

        p_token_spam = (spam + self.k) / (self.spam_messages + 2 * self.k)
        p_token_ham = (ham + self.k) / (self.ham_messages + 2 * self.k)

        return p_token_spam, p_token_ham

    def predict(self, text: str) -> float:
        text_tokens = tokenize(text)
        log_prob_if_spam = 0.0
        log_prob_if_ham = 0.0

        for token in self.tokens:
            prob_if_spam, prob_if_ham = self._probabilities(token)
            if token in text_tokens:
                log_prob_if_spam += math.log(prob_if_spam)
                log_prob_if_ham += math.log(prob_if_ham)
            else:
                log_prob_if_spam += math.log(1.0 - prob_if_spam)
                log_prob_if_ham += math.log(1.0 - prob_if_ham)

        prob_if_spam = math.exp(log_prob_if_spam)
        prob_if_ham = math.exp(log_prob_if_ham)

        return prob_if_spam / (prob_if_spam + prob_if_ham)

#### Работа с данными

In [32]:
data: list[Message] = []

In [33]:
for filename in glob.glob(path):
    is_spam = "ham" not in filename

    with open(filename, errors="ignore") as email_file:
        for line in email_file:
            if line.startswith("Subject:"):
                subject = line.lstrip("Subject: ")
                data.append(Message(subject, is_spam))
                break

In [34]:
def split_data(data: list[str], prob: float) -> tuple[list[str], list[str]]:
    data = data[:]
    random.shuffle(data)
    cut = int(len(data) * prob)
    return data[:cut], data[cut:]

In [35]:
train_messages, test_messages = split_data(data, 0.75)

#### Обучение

In [36]:
model = NaiveBayesClassifier()
model.train(train_messages)

#### Оценивание классификации

In [37]:
predictions = [(message, model.predict(message.text)) for message in test_messages]

In [38]:
confusion_matrix = Counter((message.is_spam, spam_probability > 0.5) for message, spam_probability in predictions)

In [39]:
print(f"[+] Confusion matrix: {confusion_matrix}")

[+] Confusion matrix: Counter({(False, False): 675, (True, True): 83, (True, False): 44, (False, True): 23})


In [40]:
tp = confusion_matrix[(True, True)]
tn = confusion_matrix[(False, False)]
fp = confusion_matrix[(True, False)]
fn = confusion_matrix[(False, True)]

In [41]:
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = (2 * precision * recall) / (precision + recall)

In [42]:
print(f"[+] Accuracy: {accuracy:.4f}")
print(f"[+] Precision: {precision:.4f}")
print(f"[+] Recall: {recall:.4f}")
print(f"[+] F1-score: {f1:.4f}")

[+] Accuracy: 0.9188
[+] Precision: 0.6535
[+] Recall: 0.7830
[+] F1-score: 0.7124


#### Вероятностный анализ тем писем

In [43]:
def p_spam_given_token(token: str, model: NaiveBayesClassifier) -> float:
    prob_if_spam, prob_if_ham = model._probabilities(token)
    return prob_if_spam / (prob_if_spam + prob_if_ham)

In [44]:
words = sorted(model.tokens, key=lambda t: p_spam_given_token(t, model))

In [45]:
print(f"[+] Most spammy words: {words[-10:]}")
print(f"[+] Least spam words: {words[:10]}")

[+] Most spammy words: ['account', 'attn', 'zzzz', 'clearance', 'b', 'sale', 'rates', 'adv', 'systemworks', 'money']
[+] Least spam words: ['satalk', 'spambayes', 'users', 'razor', 'zzzzteana', 'sadev', 'perl', 'apt', 'spamassassin', 'ouch']


## LDA

In [46]:
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

In [47]:
def print_top_words(model, feature_names, n_top_words):
    for i, topic in enumerate(model.components_):
        data = " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(f"[+] Topic {i}: {data}")

In [48]:
tf_vectorizer = CountVectorizer(min_df=2, max_df=0.95, max_features=n_features)
tf = tf_vectorizer.fit_transform(subject_texts)

In [49]:
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=20, learning_method="online", learning_offset=50).fit(tf)

In [50]:
print_top_words(lda, tf_vectorizer.get_feature_names_out(), 10)

[+] Topic 0: re for is from the spam sadev java kiddies bug
[+] Topic 1: on the re spambayes zzzzteana are you new test low
[+] Topic 2: re the was of ouch in bliss selling wedded message
[+] Topic 3: re and razor users with satalk of problem it this
[+] Topic 4: for 2002 use perl no headlines 10 09 dvd stories
[+] Topic 5: alsa easy made almost spambayes more insurance life to 500
[+] Topic 6: your re lockergnome tech sed daily united roman states empire
[+] Topic 7: to the in re you news get at of can
[+] Topic 8: re new my window me best sequences help please package
[+] Topic 9: ilug re the cvs has internet rates not in spamassassin
