<a href="https://colab.research.google.com/github/klordo/nlp_homeworks/blob/hw2/nlp_hw2_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Библиотеки и установки

In [None]:
import pandas as pd
import numpy as np
import spacy

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import ComplementNB

In [None]:
!python3 -m spacy download en_core_web_sm

In [None]:
RANDOM_STATE = 1000 - 7

# Подготовка датасета, train и test данных

In [45]:
data = pd.read_csv('spam_or_not_spam.csv', encoding='iso-8859-1').rename(columns={'email': 'text'})
data.sample(5)

Unnamed: 0,text,label
2273,url URL date NUMBER NUMBER NUMBERtNUMBER NUMBE...,0
1695,skip montanaro i m listed as a developer on s...,0
2673,hyperlink hyperlink hyperlink hyperlink fist ...,1
1677,URL this is the binary pickle of my classifie...,0
1108,URL there s a realy nasty shortage on sequenc...,0


In [46]:
nlp = spacy.load("en_core_web_sm")

In [47]:
%%time

data['cleaned_text'] = data['text'].apply(
    lambda x: ' '.join(
        token.lemma_.lower() for token in nlp(str(x)) if
        not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
    )
)
data.sample(5)

CPU times: user 2min 12s, sys: 730 ms, total: 2min 13s
Wall time: 2min 14s


Unnamed: 0,text,label,cleaned_text
2565,hyperlink hyperlink hyperlink hyperlink you a...,1,hyperlink hyperlink hyperlink hyperlink receiv...
2388,url URL date NUMBER NUMBER NUMBERtNUMBER NUMBE...,0,url url date number number numbertnumber numbe...
1385,urban boquist wrote hi matt and thanks for you...,0,urban boquist write hi matt thank quick reply ...
2648,unlimited web conferencing subscribe to the w...,1,unlimited web conferencing subscribe web confe...
1735,because i get mail through several different ...,0,mail different email address frequently duplic...


In [48]:
data.loc[data.isnull().any(axis=1)]

Unnamed: 0,text,label,cleaned_text
2966,,1,


In [None]:
data = data.dropna()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['label'], random_state=RANDOM_STATE, train_size=0.66)

# Задание пайплайнов

## LogisticRegression


In [None]:
pipe_cntr_logreg = Pipeline(
    steps=[
        ('counter', CountVectorizer()),
        ('clf', LogisticRegression())
    ]
)

pipe_tfidf_logreg = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression())
    ]
)

## DecisionTreeClassifier


In [None]:
pipe_cntr_dectree = Pipeline(
    steps=[
        ('counter', CountVectorizer()),
        ('clf', DecisionTreeClassifier(random_state=RANDOM_STATE))
    ]
)

pipe_tfidf_dectree = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', DecisionTreeClassifier(random_state=RANDOM_STATE))
    ]
)

## ComplementNB


In [None]:
pipe_cntr_nb = Pipeline(
    steps=[
        ('counter', CountVectorizer()),
        ('clf', ComplementNB())
    ]
)

pipe_tfidf_nb = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', ComplementNB())
    ]
)

# Задание параметров для перебора

In [None]:
splitter = StratifiedShuffleSplit(n_splits=5, random_state=RANDOM_STATE)

## CountVectorizer, TfidfVectorizer


In [None]:
cntr_param_grid = {
    'counter__max_df': [0.6, 0.65, 0.7, 0.75, 0.8],
    'counter__min_df': [0.001, 0.005, 0.01, 0.03, 0.05, 0.1],
}
tfidf_param_grid = {
    'tfidf__max_df': [0.6, 0.65, 0.7, 0.75, 0.8],
    'tfidf__min_df': [0.001, 0.005, 0.01, 0.03, 0.05, 0.1],
}

## LogisticRegression, DecisionTree, ComplementNB


In [None]:
logreg_param_grid = {
    'clf__penalty': ['l2', None],
    'clf__fit_intercept': (False, True),
    'clf__C': [0.01, 0.1, 0.2, 0.4, 0.6, 0.8, 1],
}

dectree_param_grid = {
    'clf__criterion': ['gini', 'entropy', 'log_loss'],
    'clf__max_features': ['auto', 'sqrt', 'log2'],
    'clf__min_impurity_decrease': [0, 0.01, 0.1, 0.5, 1],
}

nb_param_grid = {
    'clf__alpha': [0, 0.2, 0.5, 0.8, 1],
    'clf__fit_prior': (False, True),
    'clf__norm': (False, True)
}

# Перебор с помощью HalvingGridSearchCV

## CountVectorizer + LogisticRegression


In [None]:
%%time
grid_search_cntr_logreg = HalvingGridSearchCV(
    pipe_cntr_logreg,
    param_grid=cntr_param_grid | logreg_param_grid,
    n_jobs=-1,
    cv=splitter,
    scoring='f1',
    random_state=RANDOM_STATE,
)
grid_search_cntr_logreg.fit(X_train, y_train)
grid_search_cntr_logreg.best_estimator_

CPU times: user 16.8 s, sys: 1.58 s, total: 18.4 s
Wall time: 3min 46s




## TfidfVectorizer + LogisticRegression


In [None]:
%%time
grid_search_tfidf_logreg = HalvingGridSearchCV(
    pipe_tfidf_logreg,
    param_grid=tfidf_param_grid | logreg_param_grid,
    n_jobs=-1,
    cv=splitter,
    scoring='f1',
    random_state=RANDOM_STATE,
)
grid_search_tfidf_logreg.fit(X_train, y_train)
grid_search_tfidf_logreg.best_estimator_

CPU times: user 13.3 s, sys: 1.06 s, total: 14.3 s
Wall time: 3min 30s




## CountVectorizer + DecisionTree



In [None]:
%%time
grid_search_cntr_dectree = HalvingGridSearchCV(
    pipe_cntr_dectree,
    param_grid=cntr_param_grid | dectree_param_grid,
    n_jobs=-1,
    cv=splitter,
    scoring='f1',
    random_state=RANDOM_STATE,
)
grid_search_cntr_dectree.fit(X_train, y_train)
grid_search_cntr_dectree.best_estimator_

CPU times: user 19.2 s, sys: 1.61 s, total: 20.8 s
Wall time: 4min 17s




## TfidfVectorizer + DecisionTree


In [None]:
%%time
grid_search_tfidf_dectree = HalvingGridSearchCV(
    pipe_tfidf_dectree,
    param_grid=tfidf_param_grid | dectree_param_grid,
    n_jobs=-1,
    cv=splitter,
    scoring='f1',
    random_state=RANDOM_STATE,
)
grid_search_tfidf_dectree.fit(X_train, y_train)
grid_search_tfidf_dectree.best_estimator_

CPU times: user 19.6 s, sys: 1.48 s, total: 21.1 s
Wall time: 4min 38s




## CountVectorizer + ComplementNB


In [None]:
%%time
grid_search_cntr_nb = HalvingGridSearchCV(
    pipe_cntr_nb,
    param_grid=cntr_param_grid | nb_param_grid,
    n_jobs=-1,
    cv=splitter,
    scoring='f1',
    random_state=RANDOM_STATE,
)
grid_search_cntr_nb.fit(X_train, y_train)
grid_search_cntr_nb.best_estimator_

CPU times: user 7.82 s, sys: 743 ms, total: 8.56 s
Wall time: 1min 55s


## TfidfVectorizer + ComplementNB


In [None]:
%%time
grid_search_tfidf_nb = HalvingGridSearchCV(
    pipe_tfidf_nb,
    param_grid=tfidf_param_grid | nb_param_grid,
    n_jobs=-1,
    cv=splitter,
    scoring='f1',
    random_state=RANDOM_STATE,
)
grid_search_tfidf_nb.fit(X_train, y_train)
grid_search_tfidf_nb.best_estimator_

CPU times: user 8.21 s, sys: 685 ms, total: 8.9 s
Wall time: 2min


# Итоговые точности моделей

## CountVectorizer + LogisticRegression

In [None]:
print(classification_report(y_test, grid_search_cntr_logreg.best_estimator_.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       862
           1       0.94      0.96      0.95       158

    accuracy                           0.99      1020
   macro avg       0.97      0.98      0.97      1020
weighted avg       0.99      0.99      0.99      1020



## TfidfVectorizer + LogisticRegression

In [None]:
print(classification_report(y_test, grid_search_tfidf_logreg.best_estimator_.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       862
           1       1.00      0.97      0.98       158

    accuracy                           1.00      1020
   macro avg       1.00      0.98      0.99      1020
weighted avg       1.00      1.00      1.00      1020



## CountVectorizer + DecisionTree

In [None]:
print(classification_report(y_test, grid_search_cntr_dectree.best_estimator_.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.96      0.97       862
           1       0.80      0.86      0.83       158

    accuracy                           0.95      1020
   macro avg       0.89      0.91      0.90      1020
weighted avg       0.95      0.95      0.95      1020



## TfidfVectorizer + DecisionTree

In [None]:
print(classification_report(y_test, grid_search_tfidf_dectree.best_estimator_.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       862
           1       0.76      0.82      0.79       158

    accuracy                           0.93      1020
   macro avg       0.86      0.89      0.87      1020
weighted avg       0.93      0.93      0.93      1020



## CountVectorizer + ComplementNB

In [None]:
print(classification_report(y_test, grid_search_cntr_nb.best_estimator_.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       862
           1       0.97      0.97      0.97       158

    accuracy                           0.99      1020
   macro avg       0.99      0.99      0.99      1020
weighted avg       0.99      0.99      0.99      1020



## TfidfVectorizer + ComplementNB

In [None]:
print(classification_report(y_test, grid_search_tfidf_nb.best_estimator_.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       862
           1       0.84      0.96      0.90       158

    accuracy                           0.97      1020
   macro avg       0.92      0.96      0.94      1020
weighted avg       0.97      0.97      0.97      1020



# Вывод
Наилучший результат показала модель:

TfidfVectorizer + LogisticRegression

С результатом по f1:

Не спам - 1.00

Спам - 0.98

Возможно, если подкрутить параметры точнее или увеличить объем обучающей воборки, то можно получить результаты лучше.