<a href="https://colab.research.google.com/github/klordo/nlp_homeworks/blob/hw3/nlp_hw3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Предустановки

In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!pip install fasttext

In [2]:
import pandas as pd
import numpy as np
import spacy
import gensim
import fasttext
import warnings

# from sklearn.experimental import enable_halving_search_cv # разрешаем использование эксперементальных функций
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
RANDOM_STATE = 1000 - 7

# Подготовка датасета

In [5]:
df = pd.read_csv('spam_or_not_spam.csv', encoding='iso-8859-1').rename(columns={'email': 'text'})

In [37]:
df.sample(5)

Unnamed: 0,text,label,cleaned_text
2848,dear free member you were gifted a free member...,1,dear free member gift free membership cash wav...
317,i think that this and other articles confuse s...,0,think article confuse socialism bureaucracy li...
1721,greg ward case of headers is definitely helpf...,0,greg ward case header definitely helpful spama...
1598,i m getting no servers available about half th...,0,m get server available half time day razor num...
2758,dear sir madam i got this email id from your ...,1,dear sir madam get email d website organisatio...


In [None]:
nlp = spacy.load("en_core_web_sm")

In [8]:
%%time

df['cleaned_text'] = df['text'].apply(
    lambda x: ' '.join(
        token.lemma_.lower() for token in nlp(str(x)) if
        not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
    )
)

CPU times: user 2min 42s, sys: 1.33 s, total: 2min 44s
Wall time: 2min 56s


In [9]:
df.sample(5)

Unnamed: 0,text,label,cleaned_text
1496,URL jm URL changed what removed added status ...,0,url jm url change remove add status new resolv...
1497,URL additional comments from jm URL NUMBER NU...,0,url additional comment jm url number number nu...
2478,i don t know how one can expect better and mo...,0,don t know expect well secure code community s...
705,reminds me of cheney during the vp debates whe...,0,remind cheney vp debate declare wealth product...
711,begin pgp signed message hash shaNUMBER at NU...,0,begin pgp sign message hash shanumber number n...


In [10]:
df['cleaned_text'].replace('', str(np.nan), inplace=True)

In [11]:
dataset = [text.split() for text in df.cleaned_text.values]

In [34]:
dataset[111][:10]

['have',
 'great',
 'fun',
 'try',
 'find',
 'dumb',
 'adsl',
 'modem',
 'ethernet',
 'presentation']

# Создание моделей и их сравнение с помощью внутренней оценки

## Word2Vec

### SkipGram

In [12]:
%%time

model_skipGram = gensim.models.Word2Vec(
    sentences=dataset,
    vector_size=256,
    window=3,
    min_count=10,
    sg=1, # 1 for skip-gram
    hs=0,
    negative=5,
    epochs=25,
    seed=RANDOM_STATE,
)

CPU times: user 1min 17s, sys: 286 ms, total: 1min 17s
Wall time: 48.3 s


In [13]:
model_skipGram.wv.most_similar(positive=['january'])

[('christmas', 0.4056288003921509),
 ('guardian', 0.3988199830055237),
 ('t_html_consec_imgsnumber', 0.3905406594276428),
 ('japan', 0.39043235778808594),
 ('t_html_image_areanumber', 0.3893399238586426),
 ('carolina', 0.38749414682388306),
 ('council', 0.38148215413093567),
 ('rafael', 0.3795776069164276),
 ('sheep', 0.3770703971385956),
 ('cabinet', 0.37689781188964844)]

In [14]:
model_skipGram.wv.most_similar(positive=['computer'])

[('picasso', 0.4979810416698456),
 ('pablo', 0.4911339282989502),
 ('hazardous', 0.4523615539073944),
 ('useless', 0.4129263758659363),
 ('biz', 0.4026617407798767),
 ('science', 0.3634575307369232),
 ('communications', 0.36008220911026),
 ('teledynamic', 0.35942184925079346),
 ('unwanted', 0.3520834147930145),
 ('academic', 0.3517932891845703)]

In [15]:
model_skipGram.wv.most_similar(positive=['winter'])

[('hill', 0.5493069291114807),
 ('altitude', 0.5397081971168518),
 ('statue', 0.5253927111625671),
 ('warm', 0.5026311278343201),
 ('marriott', 0.4996064305305481),
 ('dry', 0.49276405572891235),
 ('helium', 0.48235368728637695),
 ('casualty', 0.4789895713329315),
 ('season', 0.4772513508796692),
 ('beach', 0.4772185683250427)]

In [16]:
model_skipGram.wv.doesnt_match(['winter', 'january', 'computer'])

'computer'

### CBOW

In [17]:
%%time

model_cbow = gensim.models.Word2Vec(
    sentences=dataset,
    vector_size=256,
    window=3,
    min_count=10,
    sg=0, # 0 for CBOW
    hs=0,
    negative=5,
    epochs=25,
    seed=RANDOM_STATE,
)

CPU times: user 27.9 s, sys: 237 ms, total: 28.2 s
Wall time: 17.3 s


In [18]:
model_cbow.wv.most_similar(positive=['winter'])

[('southern', 0.7143707871437073),
 ('italian', 0.6795506477355957),
 ('west', 0.6783661246299744),
 ('helium', 0.6709034442901611),
 ('era', 0.6695430874824524),
 ('asia', 0.6629147529602051),
 ('wind', 0.6604577302932739),
 ('sea', 0.6580536961555481),
 ('warm', 0.6575800776481628),
 ('northern', 0.6552695035934448)]

In [19]:
model_cbow.wv.most_similar(positive=['computer'])

[('virus', 0.4828771650791168),
 ('useless', 0.4752280116081238),
 ('vulnerable', 0.46516263484954834),
 ('science', 0.44851550459861755),
 ('crack', 0.4475710988044739),
 ('hacker', 0.42553094029426575),
 ('hazardous', 0.423048734664917),
 ('biz', 0.41948097944259644),
 ('whatsoever', 0.4137769341468811),
 ('unwanted', 0.4134383797645569)]

## Fasttext

In [20]:
with open('data.txt', 'w') as f:
    f.writelines(df.cleaned_text.values)

In [21]:
%%time

model_fasttext = fasttext.train_unsupervised('data.txt', wordNgrams=3, dim=256)

CPU times: user 2min 27s, sys: 1.74 s, total: 2min 28s
Wall time: 2min 30s


In [22]:
model_fasttext.get_nearest_neighbors('computer')

[(0.9223428964614868, 'compute'),
 (0.8154644966125488, 'computing'),
 (0.8139657378196716, 'wireless'),
 (0.8046521544456482, 'answer'),
 (0.7977580428123474, 'collaborate'),
 (0.7914772629737854, 'useless'),
 (0.7876890301704407, 'storage'),
 (0.7873976230621338, 'scalable'),
 (0.7794970870018005, 'dynamic'),
 (0.7704171538352966, 'nonetheless')]

Вывод:

Модели показывают неплохие результаты.

В Word2Vec для CBOW заметен более хорошие результат.


# Сравнение качества на полученных векторах

In [23]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['label'], random_state=RANDOM_STATE)

In [24]:
class MyTransformer():
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self # модель уже обучена

    def transform(self, X):
        X_copy = X.apply(self.text2vector)
        return pd.DataFrame(X_copy.to_list())

    def text2vector(self, text):
        pass

## Word2Vec

In [27]:
class Word2VecTransformer(MyTransformer):
    def __init__(self, model):
        super().__init__(model)

    def text2vector(self, text):
        vec = []
        for w in text.split():
            try:
                emb_for_word = self.model.wv[w]
            except KeyError:
                emb_for_word = np.zeros(256)
            vec.append(np.array(emb_for_word))
        text_vec = np.sum(vec, axis=0)
        return text_vec

In [None]:
pipe_skipgram = Pipeline([
    ('ft', Word2VecTransformer(model_skipGram)),
    ('clf', LogisticRegression(random_state=RANDOM_STATE)),
]).fit(X_train, y_train)

In [None]:
pipe_cbow = Pipeline([
    ('ft', Word2VecTransformer(model_cbow)),
    ('clf', LogisticRegression(random_state=RANDOM_STATE)),
]).fit(X_train, y_train)

## Fasttext

In [25]:
class FastTextTransformer(MyTransformer):
    def __init__(self, model):
        super().__init__(model)

    def text2vector(self, text):
        vec = [np.array(self.model.get_word_vector(w)) for w in text.split()]
        text_vec = np.sum(vec, axis=0)
        return text_vec

In [None]:
pipe_fasttext = Pipeline([
    ('ft', FastTextTransformer(model_fasttext)),
    ('clf', LogisticRegression(random_state=RANDOM_STATE)),
]).fit(X_train, y_train)

## Итог

In [30]:
print(classification_report(y_test, pipe_skipgram.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       642
           1       0.94      0.95      0.94       108

    accuracy                           0.98       750
   macro avg       0.96      0.97      0.97       750
weighted avg       0.98      0.98      0.98       750



In [31]:
print(classification_report(y_test, pipe_cbow.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       642
           1       0.92      0.96      0.94       108

    accuracy                           0.98       750
   macro avg       0.96      0.97      0.97       750
weighted avg       0.98      0.98      0.98       750



In [32]:
print(classification_report(y_test, pipe_fasttext.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       642
           1       0.73      0.81      0.77       108

    accuracy                           0.93       750
   macro avg       0.85      0.88      0.86       750
weighted avg       0.93      0.93      0.93       750



Вывод:

В задаче определения спама по моему мнению стоит выбрать CBOW из-за наибольшего значения recall по классу 1.