<a href="https://colab.research.google.com/github/iuliivasilev/SpamFilter/blob/main/EDA/SpamFilterParser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Примеры внешних фильтров в CommuniGate

1. [Исходная документация (UI)](https://www.communigate.ru/CommuniGatePro/russian/VirusScan.html#Launch)
2. [Пример реализации Spam Filter для CommuniGate Pro (Perl, но можно посмотреть как встраивать)](https://github.com/TFF-Enterprises/CGPSA)
3. [Внутренняя кухня CommuniGate](https://habr.com/ru/companies/communigatepro/articles/197720/)
4. [Поддерживаемый пример детектора спама (C/C++)](https://github.com/freiz/terminator)

In [1]:
import pandas as pd
import numpy as np
import os
import re

### Набор данных

1. [Источник](https://www.kaggle.com/datasets/maharshipandya/email-spam-dataset-extended/code)
2. [Пример обработки](https://www.kaggle.com/code/maharshipandya/email-spam-classification-98)

In [2]:
!pip install opendatasets
import opendatasets as od

od.download("https://www.kaggle.com/datasets/maharshipandya/email-spam-dataset-extended")

# user: iuliivasilievmsurus
# key: d2b6c216f2b747e4ebadb039821aa9ae

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22
Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: iuliivasilievmsurus
Your Kaggle Key: ··········
Downloading email-spam-dataset-extended.zip to ./email-spam-dataset-extended


100%|██████████| 21.3M/21.3M [00:00<00:00, 91.6MB/s]





In [3]:
f = open('/content/email-spam-dataset-extended/ham_zipped/main_ham/00001.1a31cc283af0060967a233d26548a6ce', 'rb+')
s = f.read()

In [4]:
### Документация парсера: https://docs.python.org/3/library/email.parser.html
import email
msg = email.message_from_bytes(s)
msg.__dict__

{'policy': Compat32(),
 '_headers': [('Return-Path', '<exmh-workers-admin@spamassassin.taint.org>'),
  ('Delivered-To', 'yyyy@localhost.netnoteinc.com'),
  ('Received',
   'from localhost (localhost [127.0.0.1])\n\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id 7106643C34\n\tfor <jm@localhost>; Wed, 21 Aug 2002 08:33:03 -0400 (EDT)'),
  ('Received',
   'from phobos [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor jm@localhost (single-drop); Wed, 21 Aug 2002 13:33:03 +0100 (IST)'),
  ('Received',
   'from listman.spamassassin.taint.org (listman.spamassassin.taint.org [66.187.233.211]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7LCXvZ24654 for\n    <jm-exmh@jmason.org>; Wed, 21 Aug 2002 13:33:57 +0100'),
  ('Received',
   'from listman.spamassassin.taint.org (localhost.localdomain [127.0.0.1]) by\n    listman.redhat.com (Postfix) with ESMTP id F12A13EA25; Wed, 21 Aug 2002\n    08:34:00 -0400 (EDT)'),
  ('Delivered-To', 'exmh-workers@listman.spamassass

In [6]:
def set_prefix(l, prefix):
    return [f"{prefix}_{e}" for e in l]

def get_content(msg):
    if msg.is_multipart():
        for part in msg.walk():
            ctype = part.get_content_type()
            cdispo = str(part.get('Content-Disposition'))

            # skip text/plain type
            if ctype == 'text/plain' and 'attachment' not in cdispo:
                body = part.get_payload(decode=True)  # decode
                break
    else:
        body = msg.get_payload(decode=True)
    content = body.decode().lower()
    content = content.replace("=\n", "")
    content = re.sub(r"\s+", " ", content)
    return {"Content": content, "Subject": msg["Subject"].lower()}

def get_context(msg):
    return {}

def collect_by_folder(path, type_folder=None):
    l = []
    for file in os.listdir(path):
        try:
            with open(os.path.join(path, file), 'rb+') as f:
                s = f.read()
            msg = email.message_from_bytes(s)
            d_msg = {"id": file}
            d_msg.update(get_content(msg))
            d_msg.update(get_context(msg))
            if not(type_folder is None):
                d_msg["type"] = type_folder
            l.append(d_msg)
        except Exception as e:
            print(f"Error ({e}) in file: {file}")
    return l

In [None]:
l_msg = collect_by_folder("/content/email-spam-dataset-extended/ham_zipped/main_ham", type_folder="ham")
l_msg += collect_by_folder("/content/email-spam-dataset-extended/spam_zipped/main_spam", type_folder="spam")

In [8]:
df = pd.DataFrame(l_msg)

In [9]:
content = df["Content"]
subject = df["Subject"]

## Обработка

In [10]:
### Использование кастомного словаря

# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

# from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.corpus import stopwords

# data = df["Content"]
# stopWords = set(stopwords.words('english') + stopwords.words('russian'))
# words = word_tokenize(data)
# wordsFiltered = [w for w in words if w not in stopWords]

# filtered_body = " ".join(wordsFiltered)

#### CountVectorizer

1. [Документация](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.fit)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words="english", ngram_range=(2, 2), max_features=5000)
X_ngramm = vectorizer.fit_transform(content)
features = set_prefix(vectorizer.get_feature_names_out(), "ngramm")
df_ngramm1 = pd.DataFrame(X_ngramm.toarray(), columns=features)

X_ngramm = vectorizer.transform(subject)
features_subj = set_prefix(vectorizer.get_feature_names_out(), "subj_ngramm")
df_ngramm2 = pd.DataFrame(X_ngramm.toarray(), columns=features_subj)

df_ngramm = pd.concat([df_ngramm1, df_ngramm2], axis=1)
df_ngramm

Unnamed: 0,ngramm_00 00,ngramm_00 000,ngramm_00 01,ngramm_00 12,ngramm_00 23,ngramm_00 asp,ngramm_00 br,ngramm_00 font,ngramm_00 html,ngramm_00 img,...,subj_ngramm_zdnet spamassassin,subj_ngramm_zdnet td,subj_ngramm_zdnetshopper cnet,subj_ngramm_zero length,subj_ngramm_zero sum,subj_ngramm_zip code,subj_ngramm_zoom br,subj_ngramm_zoom font,subj_ngramm_zzzz example,subj_ngramm_zzzz spamassassin
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8382,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8383,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8384,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8385,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### TfidfVectorizer

1. [Документация](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(content)
features = set_prefix(tfidf_vectorizer.get_feature_names_out(), "tfidf")
df_tfidf1 = pd.DataFrame(X_tfidf.toarray(), columns=features)

X_tfidf = tfidf_vectorizer.transform(subject)
features_subj = set_prefix(tfidf_vectorizer.get_feature_names_out(), "subj_tfidf")
df_tfidf2 = pd.DataFrame(X_tfidf.toarray(), columns=features_subj)

df_tfidf = pd.concat([df_tfidf1, df_tfidf2], axis=1)
df_tfidf

Unnamed: 0,tfidf_00,tfidf_000,tfidf_0000,tfidf_000000,tfidf_000033,tfidf_000066,tfidf_000080,tfidf_000099,tfidf_0000a0,tfidf_0000cc,...,subj_tfidf_zawodny,subj_tfidf_zdnet,subj_tfidf_zero,subj_tfidf_ziggy,subj_tfidf_zimbabwe,subj_tfidf_zip,subj_tfidf_zone,subj_tfidf_zoom,subj_tfidf_zope,subj_tfidf_zzzz
0,0.046552,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8382,0.008351,0.0,0.0,0.022204,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8383,0.008309,0.0,0.0,0.022090,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8384,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8385,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Word2vec

1. [Теория + базовые возможности](https://builtin.com/machine-learning/nlp-word2vec-python)
2. [Документация](https://radimrehurek.com/gensim/models/word2vec.html)
3. [Базовый MeanEmbeddingVectorizer](https://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/)

In [13]:
from gensim.models import Word2Vec

class MeanEmbeddingVectorizer(object):
    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def prepare_text(self, X):
        return list(map(lambda x: x.split(), X))

    def fit(self, X):
        X_by_word = self.prepare_text(X)
        model = Word2Vec(sentences=X_by_word, **self.kwargs)
        self.word2vec_ki = model.wv.key_to_index
        self.word2vec_v = model.wv.vectors
        self.word2vec_s = model.wv.index_to_key

        self.dim = model.vector_size
        return self

    def transform(self, X):
        X_by_word = self.prepare_text(X)
        return np.array([
            np.mean([self.word2vec_v[self.word2vec_ki[w]] for w in words if w in self.word2vec_s]
                    or [np.zeros(self.dim)], axis=0)
            for words in X_by_word
        ])

    def get_feature_names_out(self):
        return list(map(str, np.arange(self.dim)))

In [14]:
w2v_vectorizer = MeanEmbeddingVectorizer(vector_size=100, window=5, min_count=1, max_final_vocab=1000)
w2v_vectorizer.fit(content)

X_w2v = w2v_vectorizer.transform(content)
features = set_prefix(w2v_vectorizer.get_feature_names_out(), "w2v")
df_w2v1 = pd.DataFrame(X_w2v, columns=features)

X_w2v = w2v_vectorizer.transform(subject)
features_subj = set_prefix(w2v_vectorizer.get_feature_names_out(), "subj_w2v")
df_w2v2 = pd.DataFrame(X_w2v, columns=features_subj)

df_w2v = pd.concat([df_w2v1, df_w2v2], axis=1)
df_w2v

Unnamed: 0,w2v_0,w2v_1,w2v_2,w2v_3,w2v_4,w2v_5,w2v_6,w2v_7,w2v_8,w2v_9,...,subj_w2v_90,subj_w2v_91,subj_w2v_92,subj_w2v_93,subj_w2v_94,subj_w2v_95,subj_w2v_96,subj_w2v_97,subj_w2v_98,subj_w2v_99
0,-0.114616,0.137498,0.473242,0.028500,0.102352,-0.260491,0.055050,0.841154,0.051077,-0.169695,...,-0.289776,-1.024432,-0.731371,-0.252932,0.153949,-0.261508,-0.781899,-0.447173,-0.245203,0.151086
1,-0.302309,0.309293,0.485510,0.037970,0.671640,-0.102367,-0.031245,0.517556,-0.353060,-0.093903,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.319194,0.060433,0.450682,0.231017,0.295960,-0.131992,0.222295,0.509933,-0.304811,-0.204740,...,0.439870,1.766639,1.205280,-0.076285,-0.001209,1.181263,0.486506,-0.336269,1.188333,0.478243
3,0.018269,0.135673,0.407543,0.421317,0.520428,-0.404987,0.089381,0.492576,-0.278471,-0.258726,...,-0.380232,0.986664,0.860156,-0.296264,0.684170,0.453590,-0.530095,-0.861108,0.132631,0.489790
4,-0.102129,0.092357,0.311782,0.480979,0.372843,-0.143593,0.267161,0.471764,-0.059930,-0.289713,...,0.120013,-0.562223,-0.091336,-0.761406,1.230297,0.035621,0.093241,-0.362471,-0.101502,-0.606766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8382,0.222295,-0.604859,-0.409071,-0.273425,-0.225709,0.694403,-0.021962,-0.507799,0.716464,-0.136809,...,0.066243,0.369947,0.020814,-0.544434,0.809012,0.244901,-0.264427,0.175292,-0.653504,0.309936
8383,0.165631,-0.469783,-0.326535,-0.170899,-0.212949,0.597984,0.028667,-0.446117,0.611744,-0.179700,...,-0.217332,0.050134,0.401169,-0.338728,-0.381921,-0.285241,-0.182557,0.254774,0.086733,-0.160322
8384,0.441471,-0.186505,-0.078434,0.466437,0.234970,0.383239,-0.071461,0.034347,0.032162,-0.304754,...,-0.032549,-0.917930,-0.046407,0.217303,-0.018953,-0.183098,0.407883,0.213448,-0.600386,0.086042
8385,0.002987,0.090406,0.464931,0.542456,0.608738,-0.096946,-0.141118,0.574968,-0.295417,-0.480726,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [15]:
X = pd.concat([df_ngramm, df_tfidf, df_w2v], axis=1)
y = df["type"] == "spam"

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

print("Training set size: ", X_train.shape, y_train.shape)
print("Testing set size: ", X_test.shape, y_test.shape)

Training set size:  (6709, 20200) (6709,)
Testing set size:  (1678, 20200) (1678,)


In [22]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_pred_prob = rf.predict_proba(X_test)[:, 1]

In [25]:
dict(zip(X_train.columns, rf.feature_importances_))

{'ngramm_00 00': 0.0,
 'ngramm_00 000': 0.0,
 'ngramm_00 01': 2.4660100421265976e-05,
 'ngramm_00 12': 0.0,
 'ngramm_00 23': 0.0,
 'ngramm_00 asp': 0.0,
 'ngramm_00 br': 0.0,
 'ngramm_00 font': 0.0,
 'ngramm_00 html': 1.2407356310275264e-05,
 'ngramm_00 img': 0.0,
 'ngramm_00 million': 0.0,
 'ngramm_000 00': 0.0004893821473853995,
 'ngramm_000 000': 0.00013345355242527185,
 'ngramm_000 50': 6.0841478149096525e-05,
 'ngramm_000 cash': 0.0,
 'ngramm_000 font': 0.0,
 'ngramm_000 free': 0.0,
 'ngramm_000 mails': 3.8175479526782505e-05,
 'ngramm_000 option': 0.0,
 'ngramm_000 orders': 1.5508628702353254e-05,
 'ngramm_000 people': 4.060881525461231e-05,
 'ngramm_000 strong': 0.0,
 'ngramm_000 tied': 0.0,
 'ngramm_000 year': 0.0,
 'ngramm_000000 br': 0.0,
 'ngramm_000000 cellpadding': 3.847693279827671e-05,
 'ngramm_000000 click': 0.0,
 'ngramm_000000 cnet': 3.348515865844813e-05,
 'ngramm_000000 colspan': 0.0,
 'ngramm_000000 downloads': 5.103651776140019e-05,
 'ngramm_000000 face': 0.0,
 'n

In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

print("ROC-AUC score: ", roc_auc_score(y_test, y_pred_prob))
print("Accuracy score: ", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred, digits=6))

ROC-AUC score:  0.9992161678178562
Accuracy score:  0.9892729439809297
Classification report:
               precision    recall  f1-score   support

       False   0.988593  0.997698  0.993125      1303
        True   0.991736  0.960000  0.975610       375

    accuracy                       0.989273      1678
   macro avg   0.990164  0.978849  0.984367      1678
weighted avg   0.989295  0.989273  0.989210      1678



In [None]:
0.99046 # ngrams
0.98986 # + tfdif
0.99225 # + word2vec