In [119]:
import pandas as pd

In [120]:
data = pd.read_csv('data/train_spam.csv')
data.head()

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...


# Анализ и предобработка данных

In [121]:
# заменим spam на 1 и ham на 0
data.loc[data['text_type'] == 'spam', 'text_type'] = 1
data.loc[data['text_type'] == 'ham', 'text_type'] = 0
data.head()

Unnamed: 0,text_type,text
0,0,make sure alex knows his birthday is over in f...
1,0,a resume for john lavorato thanks vince i will...
2,1,plzz visit my website moviesgodml to get all m...
3,1,urgent your mobile number has been awarded wit...
4,0,overview of hr associates analyst project per ...


In [122]:
# посмотрим на сбаллансированность данных
data.groupby(by='text_type').count()

Unnamed: 0_level_0,text
text_type,Unnamed: 1_level_1
0,11469
1,4809


In [123]:
# посмотрим на пропуски
data.isna().sum()

text_type    0
text         0
dtype: int64

In [124]:
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kudr.max/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kudr.max/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [125]:
# предобработаем текста

import string
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


def clear_text(text: str, method=None):
    """
    Функция для предобработки текста

    Parameters
    ----------
    text
    method:
        - 'lemmatize': использовать WordNetLemmatizer()
        - 'stem': использовать PorterStemmer()
        - None: не использовать обработку слов

    Returns
    -------
    Текст, готовый к векторизации
    """
    stopwords_set = set(stopwords.words('english'))

    text = text.strip().lower()
    text = text.translate(str.maketrans('', '', string.punctuation))  # удаление пунктуации
    text = re.sub(r'\d+', '', text)  # удаление чисел
    text_list = text.split()
    text_list = [word for word in text_list if word not in stopwords_set]
    if method == 'lemmatize':
        lemmatizer = WordNetLemmatizer()
        text_list = [lemmatizer.lemmatize(word) for word in text_list]
    elif method == 'stem':
        stemmer = PorterStemmer()
        text_list = [stemmer.stem(word) for word in text_list]
    text = ' '.join(text_list)
    return text


data['cleared_text'] = data['text'].apply(clear_text)
data['cleared_text_stem'] = data['text'].apply(clear_text, method='stem')
data['cleared_text_lemmatize'] = data['text'].apply(clear_text, method='lemmatize')
data.head()

Unnamed: 0,text_type,text,cleared_text,cleared_text_stem,cleared_text_lemmatize
0,0,make sure alex knows his birthday is over in f...,make sure alex knows birthday fifteen minutes ...,make sure alex know birthday fifteen minut far...,make sure alex know birthday fifteen minute fa...
1,0,a resume for john lavorato thanks vince i will...,resume john lavorato thanks vince get moving r...,resum john lavorato thank vinc get move right ...,resume john lavorato thanks vince get moving r...
2,1,plzz visit my website moviesgodml to get all m...,plzz visit website moviesgodml get movies free...,plzz visit websit moviesgodml get movi free al...,plzz visit website moviesgodml get movie free ...
3,1,urgent your mobile number has been awarded wit...,urgent mobile number awarded £ prize guarantee...,urgent mobil number award £ prize guarante cal...,urgent mobile number awarded £ prize guarantee...
4,0,overview of hr associates analyst project per ...,overview hr associates analyst project per dav...,overview hr associ analyst project per david r...,overview hr associate analyst project per davi...


In [24]:
# векторизируем текст

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

X_tfidf = TfidfVectorizer().fit_transform(data['text'])
X_count = CountVectorizer().fit_transform(data['text'])

In [27]:
y = data['text_type']

# Обучение модели

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

svm_model = SVC(kernel='linear', probability=True)
svm_model.fit(X_train, y_train)

In [29]:
y_pred = svm_model.predict(X_test)
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")

ROC-AUC Score: 0.9146726827666559
Classification report:              precision    recall  f1-score   support

           0       0.94      0.97      0.96      3479
           1       0.93      0.86      0.89      1405

    accuracy                           0.94      4884
   macro avg       0.94      0.91      0.92      4884
weighted avg       0.94      0.94      0.94      4884



In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

gbc_model = GradientBoostingClassifier()
gbc_model.fit(X_train, y_train)

In [33]:
y_pred = svm_model.predict(X_test)
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")

ROC-AUC Score: 0.7987269831495326
Classification report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92      3479
           1       0.95      0.61      0.74      1405

    accuracy                           0.88      4884
   macro avg       0.91      0.80      0.83      4884
weighted avg       0.89      0.88      0.87      4884



In [34]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

catboost_model = CatBoostClassifier(iterations=1000,
                                    depth=6,
                                    learning_rate=0.1,
                                    loss_function='Logloss',
                                    eval_metric='AUC',
                                    random_seed=42)
catboost_model.fit(train_pool, eval_set=test_pool)

0:	test: 0.6614279	best: 0.6614279 (0)	total: 312ms	remaining: 5m 11s
1:	test: 0.7749314	best: 0.7749314 (1)	total: 496ms	remaining: 4m 7s
2:	test: 0.8103817	best: 0.8103817 (2)	total: 604ms	remaining: 3m 20s
3:	test: 0.8230492	best: 0.8230492 (3)	total: 697ms	remaining: 2m 53s
4:	test: 0.8420913	best: 0.8420913 (4)	total: 780ms	remaining: 2m 35s
5:	test: 0.8540856	best: 0.8540856 (5)	total: 858ms	remaining: 2m 22s
6:	test: 0.8639873	best: 0.8639873 (6)	total: 934ms	remaining: 2m 12s
7:	test: 0.8724689	best: 0.8724689 (7)	total: 1s	remaining: 2m 4s
8:	test: 0.8796216	best: 0.8796216 (8)	total: 1.07s	remaining: 1m 57s
9:	test: 0.8845285	best: 0.8845285 (9)	total: 1.14s	remaining: 1m 53s
10:	test: 0.8881134	best: 0.8881134 (10)	total: 1.21s	remaining: 1m 48s
11:	test: 0.8925016	best: 0.8925016 (11)	total: 1.28s	remaining: 1m 45s
12:	test: 0.8976211	best: 0.8976211 (12)	total: 1.35s	remaining: 1m 42s
13:	test: 0.9021128	best: 0.9021128 (13)	total: 1.42s	remaining: 1m 40s
14:	test: 0.90302

<catboost.core.CatBoostClassifier at 0x17f81d010>

In [36]:
y_pred = catboost_model.predict(X_test)
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")

ROC-AUC: 0.9035517630439475
Classification report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      3479
           1       0.95      0.82      0.88      1405

    accuracy                           0.94      4884
   macro avg       0.94      0.90      0.92      4884
weighted avg       0.94      0.94      0.94      4884

