In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('data/train_spam.csv')
data.head()

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...


# Анализ и предобработка данных

In [3]:
# заменим spam на 1 и ham на 0
data.loc[data['text_type'] == 'spam', 'text_type'] = 1
data.loc[data['text_type'] == 'ham', 'text_type'] = 0
data['text_type'] = data['text_type'].astype(int)

In [4]:
# посмотрим на пропуски
data.isna().sum()

text_type    0
text         0
dtype: int64

In [5]:
# посмотрим на сбаллансированность данных
data.groupby(by='text_type').count()

Unnamed: 0_level_0,text
text_type,Unnamed: 1_level_1
0,11469
1,4809


In [6]:
# предобработаем текста

import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')


def clear_text(text: str, method=None):
    """
    Функция для предобработки текста

    Parameters
    ----------
    text
    method:
        - 'lemmatize': использовать WordNetLemmatizer()
        - 'stem': использовать PorterStemmer()
        - None: не использовать обработку слов

    Returns
    -------
    Текст, готовый к векторизации
    """
    stopwords_set = set(stopwords.words('english'))

    text = text.strip().lower()
    text = text.translate(str.maketrans('', '', string.punctuation))  # удаление пунктуации
    text = re.sub(r'\d+', '', text)  # удаление чисел
    text_list = text.split()
    text_list = [word for word in text_list if word not in stopwords_set]
    if method == 'lemmatize':
        lemmatizer = WordNetLemmatizer()
        text_list = [lemmatizer.lemmatize(word) for word in text_list]
    elif method == 'stem':
        stemmer = PorterStemmer()
        text_list = [stemmer.stem(word) for word in text_list]
    text = ' '.join(text_list)
    return text


data['cleared_text'] = data['text'].apply(clear_text)
data['cleared_text_stem'] = data['text'].apply(clear_text, method='stem')
data['cleared_text_lemmatize'] = data['text'].apply(clear_text, method='lemmatize')
data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kudr.max/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kudr.max/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text_type,text,cleared_text,cleared_text_stem,cleared_text_lemmatize
0,0,make sure alex knows his birthday is over in f...,make sure alex knows birthday fifteen minutes ...,make sure alex know birthday fifteen minut far...,make sure alex know birthday fifteen minute fa...
1,0,a resume for john lavorato thanks vince i will...,resume john lavorato thanks vince get moving r...,resum john lavorato thank vinc get move right ...,resume john lavorato thanks vince get moving r...
2,1,plzz visit my website moviesgodml to get all m...,plzz visit website moviesgodml get movies free...,plzz visit websit moviesgodml get movi free al...,plzz visit website moviesgodml get movie free ...
3,1,urgent your mobile number has been awarded wit...,urgent mobile number awarded £ prize guarantee...,urgent mobil number award £ prize guarante cal...,urgent mobile number awarded £ prize guarantee...
4,0,overview of hr associates analyst project per ...,overview hr associates analyst project per dav...,overview hr associ analyst project per david r...,overview hr associate analyst project per davi...


In [7]:
X = data['cleared_text_stem']
y = data['text_type']

In [8]:
# векторизируем текст

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf_vect = TfidfVectorizer()
# count_vect = CountVectorizer()

tfidf_vect.fit(X_train)
# count_vect.fit(X_train)

X_train_vect = tfidf_vect.transform(X_train)
X_test_vect = tfidf_vect.transform(X_test)
# X_train_vect = count_vect.transform(X_train)
# X_test_vect = count_vect.transform(X_test)

# Обучение модели

Рассмотрим несколько моделей и для каждой из них подберем оптимальные гиперпараметры

In [9]:
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
import numpy as np

### Логистическая регрессия

In [10]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='roc_auc',
    cv=5)

grid_search.fit(X_train_vect, y_train)

print(f'Лучший roc_auc: {grid_search.best_score_}')
print(f'При параметрах: {grid_search.best_params_}')

Лучший roc_auc: 0.9776756178251041
При параметрах: {'C': 10}


In [11]:
model = grid_search.best_estimator_
model.fit(X_train_vect, y_train)

y_pred = model.predict(X_test_vect)
y_pred_proba = model.predict_proba(X_test_vect)[:, 1]
print(f"ROC-AUC on test: {roc_auc_score(y_test, y_pred_proba)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")

ROC-AUC on test: 0.9819255484105828
Classification report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      2321
           1       0.91      0.88      0.89       935

    accuracy                           0.94      3256
   macro avg       0.93      0.92      0.93      3256
weighted avg       0.94      0.94      0.94      3256



### SVM

In [13]:
from sklearn.svm import SVC

model = SVC()

parameters = {
    'C': [1, 10]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='roc_auc',
    cv=5)

grid_search.fit(X_train_vect, y_train)

print(f'Лучший roc_auc: {grid_search.best_score_}')
print(f'При параметрах: {grid_search.best_params_}')

Лучший roc_auc: 0.9807214073960498
При параметрах: {'C': 10}


In [16]:
model = grid_search.best_estimator_
model.fit(X_train_vect, y_train)

y_pred = model.predict(X_test_vect)
print(f"ROC-AUC on test: {roc_auc_score(y_test, y_pred)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")

ROC-AUC on test: 0.9242998707453683
Classification report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      2321
           1       0.94      0.87      0.90       935

    accuracy                           0.95      3256
   macro avg       0.94      0.92      0.93      3256
weighted avg       0.95      0.95      0.95      3256



### Градиентный бустинг (CatBoost)

In [30]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score, classification_report

train_pool = Pool(
    data=X_train_vect,
    label=y_train,
)

test_pool = Pool(
    data=X_test_vect,
    label=y_test,
)

parameters = {
    'depth': [4, 6, 10],
}

catboost_model = CatBoostClassifier(
    metric_period=100,
    iterations=1200,
    loss_function='Logloss',
    eval_metric='AUC',
)

grid_search_result = catboost_model.grid_search(
    parameters,
    X=X_train_vect,
    y=y_train,
    cv=3
)

# catboost_model = CatBoostClassifier(
#     metric_period=50,
#     use_best_model=True,
#     iterations=1100,
#     depth=7,
#     learning_rate=0.1,
#     loss_function='Logloss',
#     eval_metric='AUC',
#     random_seed=42,
# )

# catboost_model = CatBoostClassifier(
#     metric_period=50,
#     iterations=1200,
#     loss_function='Logloss',
#     eval_metric='AUC',
# )
# catboost_model.fit(train_pool, eval_set=test_pool)

0:	test: 0.6586458	best: 0.6586458 (0)	total: 929ms	remaining: 18m 34s
100:	test: 0.9065830	best: 0.9065830 (100)	total: 7.8s	remaining: 1m 24s
200:	test: 0.9315541	best: 0.9315541 (200)	total: 11.3s	remaining: 56.3s
300:	test: 0.9451815	best: 0.9451815 (300)	total: 14.8s	remaining: 44.2s
400:	test: 0.9521601	best: 0.9521601 (400)	total: 18.2s	remaining: 36.2s
500:	test: 0.9560932	best: 0.9560932 (500)	total: 21.6s	remaining: 30.2s
600:	test: 0.9598429	best: 0.9598429 (600)	total: 25.1s	remaining: 25s
700:	test: 0.9619161	best: 0.9619161 (700)	total: 28.5s	remaining: 20.3s
800:	test: 0.9632587	best: 0.9632587 (800)	total: 31.9s	remaining: 15.9s
900:	test: 0.9641415	best: 0.9641415 (900)	total: 35.3s	remaining: 11.7s
1000:	test: 0.9646061	best: 0.9646061 (1000)	total: 39s	remaining: 7.75s
1100:	test: 0.9657778	best: 0.9657778 (1100)	total: 42.7s	remaining: 3.84s
1199:	test: 0.9664009	best: 0.9664009 (1199)	total: 46.1s	remaining: 0us

bestTest = 0.9664008527
bestIteration = 1199

Metric

KeyboardInterrupt: 

In [29]:
grid_search_result

{'params': {'depth': 10, 'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0, 50, 99],
              'test-AUC-mean': [0.7056238760568929,
               0.9507888414933449,
               0.9623946642269613],
              'test-AUC-std': [0.005768529533307851,
               0.0005150479080499211,
               0.0004180761924597694],
              'test-Logloss-mean': [0.6353816381029032,
               0.3126873722953252,
               0.2655740261953718],
              'test-Logloss-std': [0.0019881238488726714,
               0.002497119532633967,
               0.0010855222083853428],
              'train-Logloss-mean': [0.6345874345444429,
               0.2917693735239006,
               0.230149201964164],
              'train-Logloss-std': [0.0019379718172498218,
               0.005118150556778108,
               0.002869586378770749]})}

In [28]:
y_pred = catboost_model.predict(X_test_vect)
y_pred_proba = catboost_model.predict_proba(X_test_vect)[:, 1]
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")

ROC-AUC: 0.9688881106474942
Classification report:
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      2321
           1       0.94      0.75      0.83       935

    accuracy                           0.91      3256
   macro avg       0.92      0.86      0.89      3256
weighted avg       0.92      0.91      0.91      3256

