<a href="https://colab.research.google.com/github/klordo/nlp_homeworks/blob/hw3/nlp_hw3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Предустановки

In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!pip install fasttext

In [2]:
import pandas as pd
import numpy as np
import spacy
import gensim
import fasttext
import warnings

# from sklearn.experimental import enable_halving_search_cv # разрешаем использование эксперементальных функций
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
RANDOM_STATE = 1000 - 7

# Подготовка датасета

In [5]:
df = pd.read_csv('spam_or_not_spam.csv', encoding='iso-8859-1').rename(columns={'email': 'text'})

In [6]:
df.sample(5)

Unnamed: 0,text,label
1094,once upon a time harri wrote on wed feb NUMBER...,0
2879,hyperlink never pay retail unleash your pc s ...,1
610,at fermi yes i m back there long story we re b...,0
965,from brent welch welch panasas com date wed N...,0
202,philip reynolds wrote does anyone know how to ...,0


In [None]:
nlp = spacy.load("en_core_web_sm")

In [8]:
%%time

df['cleaned_text'] = df['text'].apply(
    lambda x: ' '.join(
        token.lemma_.lower() for token in nlp(str(x)) if
        not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
    )
)

CPU times: user 1min 28s, sys: 529 ms, total: 1min 28s
Wall time: 1min 30s


In [9]:
df.sample(5)

Unnamed: 0,text,label,cleaned_text
21,on thu aug NUMBER NUMBER at NUMBER NUMBER NUMB...,0,thu aug number number number number numberpm n...
2864,friend now you can copy dvd s and games URL ba...,1,friend copy dvd s game url backup dvd video s ...
1634,on NUMBER september NUMBER tim peters said gre...,0,number september number tim peters say greg wa...
2556,lowest rates available for term life insurance...,1,low rate available term life insurance moment ...
1348,justin mason wrote phil r lawrence said someth...,0,justin mason write phil r lawrence say watch u...


In [10]:
df['cleaned_text'].replace('', str(np.nan), inplace=True)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['label'], random_state=RANDOM_STATE)

In [40]:
X_train_list = [text.split() for text in X_train.values]

In [41]:
X_train_list[1][:10]

['view',
 'newsletter',
 'color',
 'url',
 'media',
 'unspun',
 'press',
 'report',
 'www',
 'mediaunspun']

# Создание моделей и их сравнение с помощью внутренней оценки

## Word2Vec

### SkipGram

In [19]:
%%time

model_skipGram = gensim.models.Word2Vec(
    sentences=X_train_list, # изменил на train
    vector_size=256,
    window=3,
    min_count=10,
    sg=1, # 1 for skip-gram
    hs=0,
    negative=5,
    epochs=25,
    seed=RANDOM_STATE,
)

CPU times: user 30 s, sys: 118 ms, total: 30.1 s
Wall time: 20 s


In [20]:
model_skipGram.wv.most_similar(positive=['january'])

[('hawaii', 0.5169601440429688),
 ('december', 0.45577940344810486),
 ('japan', 0.444751113653183),
 ('ii', 0.44286930561065674),
 ('porter', 0.44119882583618164),
 ('guardian', 0.43025562167167664),
 ('rafael', 0.4269408881664276),
 ('council', 0.4259926378726959),
 ('urge', 0.42067092657089233),
 ('convergence', 0.42037853598594666)]

In [21]:
model_skipGram.wv.most_similar(positive=['computer'])

[('picasso', 0.4886655807495117),
 ('pablo', 0.47680971026420593),
 ('hazardous', 0.46309053897857666),
 ('ref', 0.4338524043560028),
 ('science', 0.4269062578678131),
 ('biz', 0.4258567690849304),
 ('useless', 0.4197920560836792),
 ('resell', 0.39436814188957214),
 ('hacker', 0.37690114974975586),
 ('accessible', 0.37124428153038025)]

In [22]:
model_skipGram.wv.most_similar(positive=['winter'])

[('warm', 0.629181444644928),
 ('hill', 0.573944628238678),
 ('cold', 0.5700160264968872),
 ('peru', 0.5248795747756958),
 ('metro', 0.5120118260383606),
 ('gibbon', 0.5107038021087646),
 ('summer', 0.4937472641468048),
 ('qaeda', 0.48532822728157043),
 ('wet', 0.47882452607154846),
 ('wind', 0.4783901274204254)]

In [23]:
model_skipGram.wv.doesnt_match(['winter', 'january', 'computer'])

'computer'

### CBOW

In [24]:
%%time

model_cbow = gensim.models.Word2Vec(
    sentences=X_train_list, # изменил на train
    vector_size=256,
    window=3,
    min_count=10,
    sg=0, # 0 for CBOW
    hs=0,
    negative=5,
    epochs=25,
    seed=RANDOM_STATE,
)

CPU times: user 11.1 s, sys: 137 ms, total: 11.3 s
Wall time: 5.95 s


In [25]:
model_cbow.wv.most_similar(positive=['winter'])

[('south', 0.7863070368766785),
 ('district', 0.765735387802124),
 ('warm', 0.7490406036376953),
 ('southern', 0.7410219311714172),
 ('italian', 0.7398777604103088),
 ('thug', 0.7312845587730408),
 ('north', 0.7178866863250732),
 ('northern', 0.7132603526115417),
 ('gibbon', 0.7119941711425781),
 ('german', 0.7113614678382874)]

In [26]:
model_cbow.wv.most_similar(positive=['computer'])

[('hacker', 0.5715683698654175),
 ('useless', 0.5585527420043945),
 ('virus', 0.5357949137687683),
 ('hazardous', 0.5313625931739807),
 ('science', 0.5090427398681641),
 ('vulnerable', 0.47992753982543945),
 ('apple', 0.4777365028858185),
 ('biz', 0.46867677569389343),
 ('laptop', 0.46193721890449524),
 ('picasso', 0.44450458884239197)]

## Fasttext

In [42]:
with open('data.txt', 'w') as f:
    f.writelines(X_train) # поменял на train

In [28]:
%%time

model_fasttext = fasttext.train_unsupervised('data.txt', wordNgrams=3, dim=256)

CPU times: user 56.5 s, sys: 518 ms, total: 57 s
Wall time: 57.1 s


In [29]:
model_fasttext.get_nearest_neighbors('computer')

[(0.9073851704597473, 'compute'),
 (0.7971869111061096, 'pablo'),
 (0.794200599193573, 'answer'),
 (0.7888842225074768, 'dynamic'),
 (0.7828190326690674, 'software'),
 (0.7800711393356323, 'useless'),
 (0.7711679339408875, 'nonetheless'),
 (0.7700068354606628, 'computing'),
 (0.7598996758460999, 'teledynamic'),
 (0.7550809979438782, 'teledynamics')]

Вывод:

Модели показывают неплохие результаты.

В Word2Vec для CBOW заметен более хорошие результат.


# Сравнение качества на полученных векторах

In [30]:
class MyTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self # модель уже обучена

    def transform(self, X):
        X_copy = X.apply(self.text2vector)
        return pd.DataFrame(X_copy.to_list())

    def text2vector(self, text):
        pass

## Word2Vec

In [31]:
class Word2VecTransformer(MyTransformer):
    def __init__(self, model):
        super().__init__(model)

    def text2vector(self, text):
        vec = []
        for w in text.split():
            try:
                emb_for_word = self.model.wv[w]
            except KeyError:
                emb_for_word = np.zeros(256)
            vec.append(np.array(emb_for_word))
        text_vec = np.sum(vec, axis=0)
        return text_vec

In [None]:
pipe_skipgram = Pipeline([
    ('ft', Word2VecTransformer(model_skipGram)),
    ('clf', LogisticRegression(random_state=RANDOM_STATE)),
]).fit(X_train, y_train)

In [None]:
pipe_cbow = Pipeline([
    ('ft', Word2VecTransformer(model_cbow)),
    ('clf', LogisticRegression(random_state=RANDOM_STATE)),
]).fit(X_train, y_train)

## Fasttext

In [45]:
class FastTextTransformer(MyTransformer):
    def __init__(self, model):
        super().__init__(model)

    def text2vector(self, text):
        vec = [np.array(self.model.get_word_vector(w)) for w in text.split()]
        text_vec = np.sum(vec, axis=0)
        return text_vec

In [None]:
pipe_fasttext = Pipeline([
    ('ft', FastTextTransformer(model_fasttext)),
    ('clf', LogisticRegression(random_state=RANDOM_STATE)),
]).fit(X_train, y_train)

## Итог

SkipGram

In [47]:
print(classification_report(y_test, pipe_skipgram.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       642
           1       1.00      0.95      0.98       108

    accuracy                           0.99       750
   macro avg       1.00      0.98      0.99       750
weighted avg       0.99      0.99      0.99       750



CBOW

In [48]:
print(classification_report(y_test, pipe_cbow.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       642
           1       0.95      0.97      0.96       108

    accuracy                           0.99       750
   macro avg       0.97      0.98      0.98       750
weighted avg       0.99      0.99      0.99       750



Fasttext

In [49]:
print(classification_report(y_test, pipe_fasttext.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       642
           1       0.95      0.94      0.95       108

    accuracy                           0.99       750
   macro avg       0.97      0.97      0.97       750
weighted avg       0.99      0.99      0.99       750



Вывод:

В задаче определения спама по моему мнению стоит выбрать CBOW из-за наибольшего значения recall по классу 1.