# 1. Настройка окружения

### 1.1 Установка необходимых Python-пакетов

In [1]:
# Install (or upgrade to the latest release of) the third-party packages used below.
# NOTE(review): versions are not pinned, so a rerun may pick up newer releases.
%pip install -U spacy pandas scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### 1.2 Импорт необходимых зависимостей

In [2]:
import locale
import re

import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
from spacy.tokens import DocBin
from spacy.util import minibatch, compounding



### 1.3 Фикс для возникающих ошибок локализации

In [3]:
# Workaround for locale-related errors: make `locale.getpreferredencoding`
# always report UTF-8.
# The replacement keeps the stdlib signature (`do_setlocale=True`), so callers
# that pass the argument, e.g. `locale.getpreferredencoding(False)`, keep
# working instead of failing with a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

### 1.4 Загрузка предобученного spaCy-пайплайна среднего размера для английского языка, оптимизированного для CPU

In [4]:
# Download the pretrained English pipeline `en_core_web_md`;
# it is loaded by name with spacy.load(...) further down.
!python3 -m spacy download en_core_web_md

2023-03-16 03:39:34.890320: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-16 03:39:34.890444: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-16 03:39:36.651644: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download

### 1.5 Инициализация констант

In [5]:
# --- Input data and sampling -------------------------------------------------
RAW_DATASET_FILENAME = "two_datasets.csv"  # CSV read into the raw DataFrame
SAMPLE_SIZE = 6000                         # number of rows drawn from the raw data
RANDOM_STATE = 42                          # seed shared by sampling and the train/test split
TEST_SIZE = 0.2                            # fraction of the sample held out from training

# --- spaCy binary corpora (DocBin files written by this notebook) -------------
TRAIN_DATA_FILENAME = "train.spacy"
TEST_DATA_FILENAME = "test.spacy"
VALIDATE_DATA_FILENAME = "validate.spacy"

# --- Trained model -------------------------------------------------------------
# The training cell writes to ./output6 and every later cell reads
# output6/model-best; the previous value ("output7/model-best") named a
# directory that no cell in this notebook creates.
MODEL_PATH = "output6/model-best"

# 2. Предобработка данных

### 2.1 Информация о первоначальном датасете

In [6]:
# Load the raw dataset; a comma is read_csv's default separator.
data = pd.read_csv(RAW_DATASET_FILENAME)

In [7]:
# Preview the first rows of the raw dataset.
data.head()

Unnamed: 0.1,Unnamed: 0,Message,Spam
0,0,"date:may 16,2002. email:edum@hkem.com. dear si...",1
1,1,<html> <body> own your very own free casino an...,1
2,2,"hi, you can make $50,000 or more in the next 9...",1
3,3,special situation trading advisory dear valued...,1
4,4,dear dr. schaefer: i would greatly appreciate ...,1


In [8]:
#data['Spam/Ham'].value_counts(normalize=True)*100

### 2.2 Удаление ненужных колонок из датасета

In [9]:
#data["Spam"] = pd.get_dummies(data["Spam/Ham"])["spam"]
#data = data.drop(columns=["Message ID", "Date", "Spam/Ham"])

In [10]:
# Column names, non-null counts and dtypes of the raw dataset.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6082 entries, 0 to 6081
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6082 non-null   int64 
 1   Message     6082 non-null   object
 2   Spam        6082 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 142.7+ KB


### 2.3 Извлечение части датасета (при необходимости)

In [11]:
# Draw a reproducible random subset of the raw data.
# - min(...) keeps the cell working when the dataset holds fewer than
#   SAMPLE_SIZE rows (sampling more rows than exist, without replacement, raises).
# - drop=True discards the shuffled original index instead of inserting it
#   as an extra "index" column that nothing downstream uses.
data_sample = data.sample(
    n=min(SAMPLE_SIZE, len(data)),
    random_state=RANDOM_STATE,
).reset_index(drop=True)

### 2.4 Очистка датасета от пропущенных значений и выделение датасета messages (выделение subjects отключено)

In [12]:
#subjects_sample = data_sample[["Subject", "Spam"]].dropna()
# Keep only the text and label columns and discard rows where either is missing.
message_columns = ["Message", "Spam"]
messages_sample = data_sample[message_columns].dropna()

In [13]:
# Normalise the message text: trim surrounding whitespace, then delete every
# run of characters other than ASCII letters, digits and spaces.
# Vectorised .str operations replace the former row-by-row iterrows()/.loc
# loop, and .assign builds a new frame instead of mutating the existing one.
messages_sample = messages_sample.assign(
    Message=messages_sample["Message"]
    .str.strip()
    .str.replace(r"[^A-Za-z0-9 ]+", "", regex=True)
)
messages_sample.head()

Unnamed: 0,Message,Spam
0,on sun 8 sep 2002 cdale wrote i agree w ya to...,0
1,are you ready to reach new prospects without t...,1
2,john benjamins publishing would like to call y...,0
3,on mon oct 07 2002 at 094311am 0100 ciaran joh...,0
4,hello i was recently browsing the internet and...,1


### 2.5 Сохранение датасета messages в файл (сохранение subjects отключено)

In [14]:
#subjects_sample.to_csv("subjects_sample.csv")

In [15]:
# Persist the cleaned messages to CSV.
# NOTE(review): to_csv also writes the index by default, so the file gains an
# unnamed leading column — confirm that is wanted.
messages_sample.to_csv("messages_sample.csv")

In [16]:
# Class balance of the sample: number of rows per "Spam" label value.
messages_sample["Spam"].value_counts()

0    4859
1    1141
Name: Spam, dtype: int64

# 3. Обучение модели

### 3.1 Загрузка предобученного spacy пайплайна en_core_web_md, скачанного ранее

In [17]:
# Load the pretrained English pipeline; below it is only used through
# nlp.make_doc(...) to turn message text into Doc objects.
nlp = spacy.load("en_core_web_md")

### 3.2 Разбиение датасета messages на тренировочную и тестовую выборки

In [18]:
# Hold out TEST_SIZE of the cleaned messages; RANDOM_STATE makes the split repeatable.
train_df, test_df = train_test_split(
    messages_sample,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

### 3.3 Конвертация тренировочной выборки в бинарный формат spaCy

In [19]:
# Collect the training messages into a DocBin, each doc tagged with two
# complementary category values, and write it to TRAIN_DATA_FILENAME.
train_bin = DocBin()
for _, train_row in train_df.iterrows():
    spam_flag = train_row["Spam"]
    train_doc = nlp.make_doc(train_row["Message"])
    train_doc.cats = {"SPAM": spam_flag, "NOT SPAM": 1 - spam_flag}
    train_bin.add(train_doc)
train_bin.to_disk(TRAIN_DATA_FILENAME)

### 3.4 Конвертация тестовой выборки в бинарный формат spaCy

In [20]:
# Same conversion for the held-out split: one doc per message with
# complementary "SPAM" / "NOT SPAM" values, written to TEST_DATA_FILENAME.
test_bin = DocBin()
for _, test_row in test_df.iterrows():
    spam_flag = test_row["Spam"]
    test_doc = nlp.make_doc(test_row["Message"])
    test_doc.cats = {"SPAM": spam_flag, "NOT SPAM": 1 - spam_flag}
    test_bin.add(test_doc)
test_bin.to_disk(TEST_DATA_FILENAME)

### 3.5 Формирование конфиг-файла для обучения модели

In [21]:
# Expand ./base_config.cfg into the complete training config ./textcat_config.cfg.
# NOTE(review): base_config.cfg is not created by any visible cell — it
# presumably has to be uploaded to the working directory beforehand.
!python -m spacy init fill-config ./base_config.cfg ./textcat_config.cfg

2023-03-16 03:40:40.177792: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-16 03:40:40.177941: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-16 03:40:41.947236: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
textcat_config.cfg
You can now add your data and train your pipeline:
python -m spacy train textcat_config.cfg --paths.train ./

### 3.6 Обучение модели на основе сформированного конфиг-файла

In [22]:
!python -m spacy train ./textcat_config.cfg --output ./output6 --paths.train ./train.spacy --paths.dev ./test.spacy

2023-03-16 03:40:47.109981: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-16 03:40:47.110126: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-16 03:40:49.688874: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;2m✔ Created output directory: output6[0m
[38;5;4mℹ Saving to output directory: output6[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-03-16 03:40:50,708] [INFO] Set up nlp object from config
[2023-03-16 

### 3.7 Формирование zip-архива с моделью (для скачивания с Google Colab)

In [23]:
# Recursively archive the training output directory so it can be downloaded as one file.
!zip -r output_md_sample6000_two_datasets.zip output6

  adding: output6/ (stored 0%)
  adding: output6/model-best/ (stored 0%)
  adding: output6/model-best/config.cfg (deflated 61%)
  adding: output6/model-best/vocab/ (stored 0%)
  adding: output6/model-best/vocab/strings.json (deflated 77%)
  adding: output6/model-best/vocab/vectors.cfg (stored 0%)
  adding: output6/model-best/vocab/lookups.bin (stored 0%)
  adding: output6/model-best/vocab/vectors (deflated 10%)
  adding: output6/model-best/vocab/key2row (deflated 8%)
  adding: output6/model-best/textcat/ (stored 0%)
  adding: output6/model-best/textcat/cfg (deflated 20%)
  adding: output6/model-best/textcat/model (deflated 48%)
  adding: output6/model-best/meta.json (deflated 58%)
  adding: output6/model-best/tokenizer (deflated 81%)
  adding: output6/model-last/ (stored 0%)
  adding: output6/model-last/config.cfg (deflated 61%)
  adding: output6/model-last/vocab/ (stored 0%)
  adding: output6/model-last/vocab/strings.json (deflated 77%)
  adding: output6/model-last/vocab/vectors.cfg (

# 4. Проверка модели на примере

### 4.1 Загрузка обученной модели

In [24]:
# Load the best checkpoint from the training output directory.
# Rebinding `nlp` replaces the pretrained pipeline loaded earlier in the notebook.
nlp = spacy.load('output6/model-best')

### 4.2 Проверка модели на отдельном примере и на валидационной выборке

In [25]:
# Smoke test: run the trained pipeline on one sample message and display
# the per-category scores stored in doc.cats.
test_text = "dear consumers, increase your business sales!"
doc = nlp(test_text)
doc.cats

{'SPAM': 0.9999572038650513, 'NOT SPAM': 4.2756848415592685e-05}

In [26]:
# Load the validation set, keeping only the text and label columns.
valid_data = pd.read_csv("result.csv")[["Message", "Spam"]]
# Write the trimmed frame back without the DataFrame index; previously each
# run added an unnamed index column to the very file it had just read.
valid_data.to_csv("result.csv", index=False)
valid_data

Unnamed: 0,Message,Spam
0,the internet's online pharmacy viagra - xenica...,1
1,"dear consumers, increase your business sales! ...",1
2,1) fight the risk of cancer! http://www.adclic...,1
3,we are offering you quality marketing lists wh...,1
4,"<html> <body> <font face=""ms sans serif""> <fon...",1
...,...,...
911,">>>>> ""g"" == geege schuman <geege@barrera.org>...",0
912,"> i'm not sure what you mean by ""let's you and...",0
913,">>>robert elz said: > date: wed, 28 aug 2002 0...",0
914,"chuck murcko wrote: > heh, ten years ago sayin...",0


In [27]:
# Convert the validation frame into a DocBin (one doc per message with
# complementary "SPAM" / "NOT SPAM" values) and write it to VALIDATE_DATA_FILENAME.
validate_bin = DocBin()
for _, valid_row in valid_data.iterrows():
    spam_flag = valid_row["Spam"]
    validate_doc = nlp.make_doc(valid_row["Message"])
    validate_doc.cats = {"SPAM": spam_flag, "NOT SPAM": 1 - spam_flag}
    validate_bin.add(validate_doc)
validate_bin.to_disk(VALIDATE_DATA_FILENAME)

In [28]:
# Accuracy (in percent) of the loaded pipeline on the validation frame.
# NOTE(review): messages are scored as-is, whereas the training messages had
# non-alphanumeric characters stripped during preprocessing — confirm that
# evaluating on uncleaned text is intended.
validation_texts = valid_data["Message"].tolist()
validation_labels = valid_data["Spam"].tolist()

counter = len(validation_texts)
# nlp.pipe processes the texts in batches and yields the docs in input order,
# which avoids one full pipeline call per row.
# int(score + 0.5) rounds the "SPAM" score to 0 or 1 (cut-off at 0.5).
valid = sum(
    int(scored_doc.cats["SPAM"] + 0.5) == label
    for scored_doc, label in zip(nlp.pipe(validation_texts), validation_labels)
)

# Report 0.0 for an empty validation set instead of dividing by zero.
valid_acc = valid / counter * 100 if counter else 0.0
print(valid_acc)

94.32314410480349


In [29]:
!python -m spacy benchmark accuracy output6/model-best ./validate.spacy

2023-03-16 03:52:43.840887: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-16 03:52:43.841051: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-16 03:52:45.684173: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;4mℹ Using CPU[0m
[1m

TOK                 100.00
TEXTCAT (macro F)   93.98 
SPEED               39458 

[1m

               P       R       F
SPAM       96.12   89.20   92.53
NOT SPAM   93.29   9

In [30]:
!python -m spacy benchmark speed output6/model-best ./validate.spacy

2023-03-16 03:53:06.609339: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-16 03:53:06.609491: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-16 03:53:09.155493: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;4mℹ Using CPU[0m
Warming up for 3 epochs...
100% 2748/2748 [00:12<00:00, 223.11doc/s]

Benchmarking 50 batches...
100% 50000/50000 [04:45<00:00, 175.17doc/s]

Outliers: 0.0%, extreme outliers: 0.0%