In [2]:
import pandas as pd
from lxml import html
from string import punctuation
import os
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
%matplotlib inline

import gensim
import numpy as np
import adagram
from sklearn.metrics.pairwise import cosine_distances
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer



In [3]:
morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0].normal_form for word in words if word and word not in stops]

    return ' '.join(words)

def tokenize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]

    return ' '.join(words)

In [4]:
PATH_TO_TRAIN_DATA = './data'

In [5]:
files = [os.path.join(PATH_TO_TRAIN_DATA, file) for file in os.listdir(PATH_TO_TRAIN_DATA)]

In [6]:
train_set = pd.concat([pd.read_json(file, lines=True, encoding='UTF-8') for file in files], axis=0, ignore_index=True)

In [7]:
train_set['norm'] = train_set['content'].apply(normalize)

In [8]:
train_set['tok'] = train_set['content'].apply(tokenize)

In [9]:
corpus_xml = html.fromstring(open('paraphraser/paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [10]:
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

#### SVD, NMF, Word2Vec, Fastext, Adagram

In [11]:
cv = CountVectorizer(min_df=3, max_df=0.5)#, max_features=1000)
common = cv.fit_transform(train_set['norm'])

In [12]:
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.5) #, max_features=1000)
common_tfidf = tfidf.fit_transform(train_set['norm'])

In [13]:
def most_similar(word, id2vec):
    similar = [id2word[i] for i in cosine_distances(id2vec[word2id[word]].reshape(1, -1), id2vec).argsort()[0][:10]]
    return similar

In [14]:
id2word = {i:w for i,w in enumerate(cv.get_feature_names())}
word2id = {w:i for i,w in id2word.items()}

### SVD

In [15]:
svd = TruncatedSVD(200)
svd.fit(common)

TruncatedSVD(algorithm='randomized', n_components=200, n_iter=5,
       random_state=None, tol=0.0)

In [16]:
id2vec_svd = svd.components_.T

In [17]:
most_similar('авария', id2vec_svd)

['авария',
 'дтп',
 'кювета',
 'салагай',
 'кювет',
 'бьянки',
 'водитель',
 'автомобиль',
 'аварийность',
 'неаварийный']

### NMF

In [18]:
nmf = NMF(200)
nmf.fit(common)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=200, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [19]:
id2vec_nmf = nmf.components_.T

In [20]:
most_similar('авария', id2vec_nmf)

['авария',
 'разлагаться',
 'метан',
 'маг',
 'безмолвие',
 'каменистый',
 'arco',
 'скуп',
 'поворот',
 'микронный']

### Word2Vec

In [21]:
w2v = gensim.models.Word2Vec([text.split() for text in train_set['norm']], size=300, window=5, min_count=5, sg=1)

In [22]:
w2v.most_similar('авария')

  """Entry point for launching an IPython kernel.


[('неполадка', 0.6342647075653076),
 ('гололёд', 0.6298475861549377),
 ('бьянки', 0.624162495136261),
 ('неисправность', 0.6012779474258423),
 ('поломка', 0.5839831829071045),
 ('возгорание', 0.5814708471298218),
 ('врезаться', 0.5809741020202637),
 ('дтп', 0.5802427530288696),
 ('фукусима-1', 0.5732443332672119),
 ('жюля', 0.562548041343689)]

### FastText

In [23]:
fast_text = gensim.models.FastText([text.split() for text in train_set['tok']], size=100, min_n=3, max_n=6, sg=1)

In [24]:
fast_text.most_similar('авария')

  """Entry point for launching an IPython kernel.


[('аварии', 0.7561020851135254),
 ('аварий', 0.7436084151268005),
 ('бавария', 0.7136021852493286),
 ('посадка', 0.6971679329872131),
 ('мрия', 0.6920362710952759),
 ('эпилепсия', 0.6874281167984009),
 ('аварийность', 0.6747534275054932),
 ('аварию', 0.6707984805107117),
 ('осадка', 0.6693997383117676),
 ('подземка', 0.6646732091903687)]

### Adagram

In [25]:
common_data_list = train_set["norm"].tolist()

In [26]:
with open("paraphraser/corpus_ada.txt", "w", encoding="utf-8") as f:
    for i in common_data_list:
        f.write(i+"\n")

In [None]:
!adagram-train paraphraser\corpus_ada.txt paraphraser\out.pkl --dim 100 --window 5 --epochs 5 --workers 3

При первой же попытке это счастье зависло и погубило мне все тут. Больше я в ipython notebook так не развлекаюсь, потому запустила обучение из командной строки.
А еще я хотела запустить на 5 эпохах, но мне выдали ошибку -- запускать больше чем на 1 эпохе -- это еще TODO :)

In [61]:
vm = adagram.VectorModel.load("paraphraser/out2.pkl")

In [65]:
vm.word_sense_probs('полиция')

[(0, 0.3705956947904682),
 (1, 0.31139444234191604),
 (2, 0.2665665698114904),
 (3, 0.051378498126314134)]

In [67]:
vm.sense_neighbors('полиция', 0)

[('сведение', 1, 0.5179818),
 ('доставать', 0, 0.47486418),
 ('молодой', 1, 0.46848193),
 ('корреспондент', 1, 0.44809744),
 ('сокрытие', 0, 0.43769744),
 ('вокзал', 0, 0.43360248),
 ('человек', 4, 0.43116698),
 ('происшествие', 0, 0.41979373),
 ('рассчитать', 0, 0.4103935),
 ('бессмысленно', 0, 0.3845089)]

In [66]:
vm.sense_neighbors('полиция', 1)

[('противодействовать', 0, 0.49866498),
 ('слабо', 0, 0.4482285),
 ('отток', 0, 0.43033817),
 ('алма-ата', 0, 0.42459294),
 ('обнаружить', 0, 0.37844193),
 ('фактор', 1, 0.37360293),
 ('молл', 0, 0.3576057),
 ('finabudapest2017', 0, 0.35505432),
 ('прошлое', 0, 0.34196287),
 ('усугубить', 0, 0.3387936)]

### Давайте проверим наши модельки

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import svm
from collections import Counter
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

In [33]:
def get_embedding(text, model, dim):
    text = text.split()
    
    # чтобы не доставать одно слово несколько раз
    # сделаем счетчик, а потом векторы домножим на частоту
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i,word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v*(words[word]/total) # просто умножаем вектор на частоту
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [34]:
def get_adagram_embedding(text, model, window, dim):
    text = text.split()
    
    
    word2context = []
    for i in range(len(text)-1):
        left = max(0, i-window)
        word = text[i]
        left_context = text[left:i]
        right_context = text[i+1:i+window]
        context = left_context + right_context
        word2context.append((word, context))
    
    
    
    vectors = np.zeros((len(word2context), dim))
    
    for i,word in enumerate(word2context):
        word, context = word
        try:
            sense = model.disambiguate(word, context).argmax()
            v = model.sense_vector(word, sense)
            vectors[i] = v # просто умножаем вектор на частоту
        
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [35]:
y = data['label'].values

In [36]:
def run_kfolds(X_text, y, n_splits=5):
    kf = KFold(n_splits)
    f1_scores = []
    for train_index, test_index in kf.split(X_text):
        X_train, X_test = X_text[train_index], X_text[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = svm.LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
                            intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
                            penalty='l2', random_state=None, tol=0.0001, verbose=0)
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        f1_scores.append(f1_score(y_test, preds, average='micro'))
    print(np.mean(f1_scores))

In [37]:
def predict_on_random_state(X_text, y, random_state=1):
    train_X, valid_X, train_y, valid_y = train_test_split(X_text, y, random_state=1)
    clf = svm.LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
                        intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
                        penalty='l2', random_state=None, tol=0.0001, verbose=0)
    clf.fit(train_X, train_y)
    preds = clf.predict(valid_X)
    print(classification_report(valid_y, preds))

In [38]:
def generate_distances(X_1, X_2):
    distances = []
    for i in range(data.shape[0]):
        distance = 1 - cosine_distances([X_1[i]], [X_2[i]])[0]
        distances.append(distance)
    return distances

### SVD

In [40]:
X_svd_text_1 = svd.transform(cv.transform(data['text_1_norm']))
X_svd_text_2 = svd.transform(cv.transform(data['text_2_norm']))

X_svd_text = np.concatenate([X_svd_text_1, X_svd_text_2], axis=1)


In [41]:
svd_distance = generate_distances(X_svd_text_1, X_svd_text_2)

In [42]:
len(svd_distance)

7227

### NMF

In [44]:
X_nmf_text_1 = nmf.transform(cv.transform(data['text_1_norm']))
X_nmf_text_2 = nmf.transform(cv.transform(data['text_2_norm']))

X_nmf_text = np.concatenate([X_nmf_text_1, X_nmf_text_2], axis=1)


In [45]:
nmf_distance = generate_distances(X_nmf_text_1, X_nmf_text_2)

In [46]:
len(nmf_distance)

7227

### Word2Vec

In [47]:
dim = 300
X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = get_embedding(text, w2v, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = get_embedding(text, w2v, dim)

  if sys.path[0] == '':


In [48]:
X_text_w2v = np.concatenate([X_text_1_w2v, X_text_2_w2v], axis=1)

In [49]:
w2v_distance = generate_distances(X_text_1_w2v, X_text_2_w2v)

In [50]:
len(w2v_distance)

7227

### FastText

In [51]:
dim = 300
data['text_1_notnorm'] = data['text_1'].apply(tokenize)
data['text_2_notnorm'] = data['text_2'].apply(tokenize)

X_text_1_ft = np.zeros((len(data['text_1_notnorm']), dim))
X_text_2_ft = np.zeros((len(data['text_2_notnorm']), dim))

for i, text in enumerate(data['text_1_notnorm'].values):
    X_text_1_ft[i] = get_embedding(text, fast_text, dim)
    
for i, text in enumerate(data['text_2_notnorm'].values):
    X_text_2_ft[i] = get_embedding(text, fast_text, dim)

  if sys.path[0] == '':


In [52]:
X_text_ft = np.concatenate([X_text_1_ft, X_text_2_ft], axis=1)

In [53]:
fasttext_distance = generate_distances(X_text_1_ft, X_text_2_ft)

In [54]:
len(fasttext_distance)

7227

### Adagram

In [68]:
dim = 100
X_text_ada_1 = np.zeros((len(data['text_1_norm']), dim))
X_text_ada_2 = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_ada_1[i] = get_adagram_embedding(text, vm, 5, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_ada_2[i] = get_adagram_embedding(text, vm, 5, dim)

X_ada_text = np.concatenate([X_text_ada_1, X_text_ada_2], axis=1)

  z = np.log(z)


In [69]:
ada_distance = generate_distances(X_text_ada_1, X_text_ada_2)

In [70]:
len(ada_distance)

7227

### Обучим модель

In [98]:
X_distances = np.concatenate([svd_distance, nmf_distance, w2v_distance, fasttext_distance, ada_distance], axis=-1)

In [99]:
run_kfolds(X_distances, y, n_splits=10)

0.49039110661563273


In [100]:
predict_on_random_state(X_distances, y, random_state=1)

             precision    recall  f1-score   support

         -1       0.52      0.83      0.64       629
          0       0.50      0.28      0.35       737
          1       0.49      0.43      0.46       441

avg / total       0.50      0.51      0.48      1807



### Давайте что-нибудь улучшим

С фантазией у меня туговато, поэтому давайте попробуем 2 вещи:
* попробуем добавить чатси речи к леммам для нашего w2v
* попробуем использовать дополнительно w2v, обученный на новостных статьях из RusVectores

### Word2Vec + POS

In [118]:
def define_pos(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0] for word in words if word and word not in stops]
    words = [word.normal_form+"_"+str(word.tag.POS) for word in words if word]

    return ' '.join(words)

In [119]:
train_set['pos'] = train_set['content'].apply(define_pos)

In [120]:
data['text_1_pos'] = data['text_1'].apply(define_pos)
data['text_2_pos'] = data['text_2'].apply(define_pos)

In [121]:
w2v_pos = gensim.models.Word2Vec([text.split() for text in train_set['pos']], size=300, window=5, min_count=5, sg=1)

In [123]:
w2v_pos.most_similar('авария_NOUN')

  """Entry point for launching an IPython kernel.


[('гололёд_NOUN', 0.6457265019416809),
 ('бьянки_NOUN', 0.6411608457565308),
 ('неполадка_NOUN', 0.6307287216186523),
 ('дтп_NOUN', 0.6277387142181396),
 ('неисправность_NOUN', 0.6102908849716187),
 ('врезаться_VERB', 0.5891246795654297),
 ('жюля_NOUN', 0.5794494152069092),
 ('обледенение_NOUN', 0.5678970813751221),
 ('караченцов_NOUN', 0.5654643774032593),
 ('пит-лейн_NOUN', 0.5645472407341003)]

In [125]:
dim = 300
X_text_1_w2v_pos = np.zeros((len(data['text_1_pos']), dim))
X_text_2_w2v_pos = np.zeros((len(data['text_2_pos']), dim))

for i, text in enumerate(data['text_1_pos'].values):
    X_text_1_w2v_pos[i] = get_embedding(text, w2v_pos, dim)
    
for i, text in enumerate(data['text_2_pos'].values):
    X_text_2_w2v_pos[i] = get_embedding(text, w2v_pos, dim)

  if sys.path[0] == '':


In [126]:
w2v_pos_distance = generate_distances(X_text_1_w2v_pos, X_text_2_w2v_pos)

Наконец-то мы добрались до самого интересного

In [127]:
X_distances = np.concatenate([svd_distance, nmf_distance, w2v_distance, fasttext_distance, w2v_pos_distance, ada_distance], axis=-1)

In [130]:
run_kfolds(X_distances, y, n_splits=10)

0.5045099098477795


In [131]:
predict_on_random_state(X_distances, y, random_state=1)

             precision    recall  f1-score   support

         -1       0.52      0.82      0.63       629
          0       0.47      0.49      0.48       737
          1       0.42      0.03      0.06       441

avg / total       0.47      0.49      0.43      1807



Посмотрим, что будет, если использовать только один w2v_pos

In [132]:
X_distances = np.concatenate([svd_distance, nmf_distance, fasttext_distance, w2v_pos_distance, ada_distance], axis=-1)

In [133]:
run_kfolds(X_distances, y, n_splits=10)

0.5035271625230361


In [134]:
predict_on_random_state(X_distances, y, random_state=1)

             precision    recall  f1-score   support

         -1       0.52      0.83      0.64       629
          0       0.50      0.23      0.32       737
          1       0.46      0.48      0.47       441

avg / total       0.50      0.50      0.47      1807



Мне кажется, это очень даже неплохо =/

### Word2Vec news_upos_cbow_600_2_2018 from RusVectores

In [135]:
modelfile = 'news_upos_cbow_600_2_2018.vec.gz'
big_model = gensim.models.KeyedVectors.load_word2vec_format(modelfile, binary=False)

In [154]:
big_model.most_similar('авария_NOUN')

[('дтп_NOUN', 0.7848729491233826),
 ('автоавария_NOUN', 0.7186106443405151),
 ('автокатастрофа_NOUN', 0.6146150827407837),
 ('чп_NOUN', 0.608892560005188),
 ('происшествие_NOUN', 0.6018447875976562),
 ('инцидент_NOUN', 0.5716761946678162),
 ('автопроисшествие_NOUN', 0.5431926250457764),
 ('авиакатастрофа_NOUN', 0.5024130344390869),
 ('трагедия_NOUN', 0.5010612607002258),
 ('авиапроисшествие_NOUN', 0.4828478693962097)]

In [156]:
big_model.most_similar('аварийный_ADJ')

[('ветхий_ADJ', 0.5710058212280273),
 ('нештатный_ADJ', 0.4783197343349457),
 ('аварийна_NOUN', 0.4630982577800751),
 ('аварийний_ADJ', 0.442495733499527),
 ('аварийщик_NOUN', 0.4103148877620697),
 ('ветхоаварийный_ADJ', 0.4031134247779846),
 ('аварийно-ремонтный_ADJ', 0.38955214619636536),
 ('ветхость_NOUN', 0.3825417160987854),
 ('экстренный_ADJ', 0.3801092803478241),
 ('негодный_ADJ', 0.37957727909088135)]

Так как теги pymorphy и Universal Tags не совпадают (а это влияет на качество, я проверила :( ), то давайте исправим теги =/

In [190]:
def correct_tags(text):
    return text.replace('ADJF', 'ADJ').replace('ADJS', "ADJ").replace('NPRO', 'PROPN').replace('NUMR', 'NUM').replace('ADVB', 'ADV')

In [191]:
data['text_1_pos_norm'] = data['text_1_pos'].apply(correct_tags)
data['text_2_pos_norm'] = data['text_2_pos'].apply(correct_tags)

In [192]:
dim = 600
X_text_1_w2v_big = np.zeros((len(data['text_1_pos_norm']), dim))
X_text_2_w2v_big = np.zeros((len(data['text_2_pos_norm']), dim))

for i, text in enumerate(data['text_1_pos_norm'].values):
    X_text_1_w2v_big[i] = get_embedding(text, big_model, dim)
    
for i, text in enumerate(data['text_2_pos_norm'].values):
    X_text_2_w2v_big[i] = get_embedding(text, big_model, dim)

In [193]:
w2v_big_distance = generate_distances(X_text_1_w2v_big, X_text_2_w2v_big)

Проверим, улучшится ли качество

In [194]:
X_distances = np.concatenate([svd_distance, nmf_distance, w2v_distance, fasttext_distance, w2v_big_distance, ada_distance], axis=-1)

In [195]:
run_kfolds(X_distances, y, n_splits=10)

0.51626533028356


In [196]:
predict_on_random_state(X_distances, y, random_state=1)

             precision    recall  f1-score   support

         -1       0.52      0.83      0.64       629
          0       0.47      0.38      0.42       737
          1       0.49      0.23      0.31       441

avg / total       0.49      0.50      0.47      1807



А что будет, если использовать только RusVectores

In [197]:
X_distances = np.concatenate([svd_distance, nmf_distance, fasttext_distance, w2v_big_distance, ada_distance], axis=-1)

In [198]:
run_kfolds(X_distances, y, n_splits=10)

0.4848515151166845


In [199]:
predict_on_random_state(X_distances, y, random_state=1)

             precision    recall  f1-score   support

         -1       0.51      0.82      0.63       629
          0       0.48      0.31      0.37       737
          1       0.52      0.38      0.44       441

avg / total       0.50      0.50      0.48      1807



Или вообще только его

In [201]:
X_distances = np.concatenate([w2v_big_distance], axis=-1)

In [202]:
run_kfolds(X_distances, y, n_splits=10)

0.5025664455964106


In [203]:
predict_on_random_state(X_distances, y, random_state=1)

             precision    recall  f1-score   support

         -1       0.51      0.83      0.63       629
          0       0.47      0.03      0.06       737
          1       0.42      0.70      0.53       441

avg / total       0.47      0.47      0.37      1807



Вывод: не могу сказать, что мне очень нравится результат, но, в любом случае, эмбеддинги -- это любовь