In [109]:
import pandas as pd
import numpy as np
import gensim
import sklearn.metrics
from lxml import html
from collections import Counter
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from string import punctuation
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer

punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))
morph = MorphAnalyzer()

In [2]:
data = open('embeddings/corpus_hum.txt').read().splitlines()

In [4]:
def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0].normal_form for word in words if word and word not in stops]

    return ' '.join(words)

In [5]:
data_norm = [normalize(text) for text in data]

In [6]:
data_norm = [text for text in data_norm if text]

In [7]:
w2v = Word2Vec([text.split() for text in data_norm], size=50, sg=1)

In [10]:
taiga = gensim.models.KeyedVectors.load_word2vec_format('185/model.bin', binary=True)

In [11]:
w2v.most_similar('мода')

  """Entry point for launching an IPython kernel.


[('модный', 0.8695891499519348),
 ('дизайнер', 0.7002891898155212),
 ('индустрия', 0.6708942651748657),
 ('костюм', 0.6677462458610535),
 ('сегодня', 0.6563939452171326),
 ('брючный', 0.6519395709037781),
 ('прижиться', 0.6437793374061584),
 ('трендсеттер', 0.6367740631103516),
 ('ворт', 0.6363341808319092),
 ('новинка', 0.6333620548248291)]

In [36]:
taiga.most_similar('мода_NOUN')

[('мода_PROPN', 0.7142900228500366),
 ('модный_ADJ', 0.6649705171585083),
 ('моде_VERB', 0.6315901875495911),
 ('тренд_NOUN', 0.6045886278152466),
 ('мода_ADV', 0.5420243144035339),
 ('супермодный_ADJ', 0.5170178413391113),
 ('модный_NOUN', 0.5159582495689392),
 ('новинка_NOUN', 0.5103620290756226),
 ('джов_NOUN', 0.5080969929695129),
 ('майнкрафть_NOUN', 0.4937824606895447)]

In [17]:
corpus_xml = html.fromstring(open('paraphraser/paraphrases.xml', 'rb').read())

In [18]:
texts_1 = []
texts_2 = []
classes = []

In [19]:
for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])

In [20]:
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [44]:
data.head()

Unnamed: 0,text_1,text_2,label,text_1_norm,text_2_norm
0,Полицейским разрешат стрелять на поражение по ...,Полиции могут разрешить стрелять по хулиганам ...,0,полицейский разрешить стрелять поражение гражд...,полиция мочь разрешить стрелять хулиган травма...
1,Право полицейских на проникновение в жилище ре...,Правила внесудебного проникновения полицейских...,0,право полицейский проникновение жилища решить ...,правило внесудебный проникновение полицейский ...
2,Президент Египта ввел чрезвычайное положение в...,Власти Египта угрожают ввести в стране чрезвыч...,0,президент египет ввести чрезвычайный положение...,власть египет угрожать ввести страна чрезвычай...
3,Вернувшихся из Сирии россиян волнует вопрос тр...,Самолеты МЧС вывезут россиян из разрушенной Си...,-1,вернуться сирия россиянин волновать вопрос тру...,самолёт мчс вывезти россиянин разрушить сирия
4,В Москву из Сирии вернулись 2 самолета МЧС с р...,Самолеты МЧС вывезут россиян из разрушенной Си...,0,москва сирия вернуться 2 самолёт мчс россиянин...,самолёт мчс вывезти россиянин разрушить сирия


In [21]:
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

In [22]:
def get_embedding(text, model, dim):
    text = text.split()
    
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i,word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v*(words[word]/total)
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [25]:
dim = 50
X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

In [28]:
for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = get_embedding(text, w2v, dim)

  # Remove the CWD from sys.path while we load stuff.


In [29]:
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = get_embedding(text, w2v, dim)

  # Remove the CWD from sys.path while we load stuff.


In [30]:
X_text_w2v = np.concatenate([X_text_1_w2v, X_text_2_w2v], axis=1)

In [31]:
dim = 50
X_text_1_taiga = np.zeros((len(data['text_1_norm']), dim))
X_text_2_taiga = np.zeros((len(data['text_2_norm']), dim))

In [32]:
for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_taiga[i] = get_embedding(text, taiga, dim)

In [37]:
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_taiga[i] = get_embedding(text, taiga, dim)

In [38]:
X_text_taiga = np.concatenate([X_text_1_taiga, X_text_2_taiga], axis=1)

In [40]:
y = data['label'].values

In [43]:
train_X, valid_X, train_y, valid_y = train_test_split(X_text_w2v, y,random_state=1)

In [53]:
logreg = LogisticRegression().fit(train_X, train_y)
pred = logreg.predict(valid_X)

In [72]:
w2v_score = cross_val_score(logreg, X_text_w2v, y, cv=10, scoring='f1_micro')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [73]:
np.mean(w2v_score)

0.4567481982965713

In [61]:
train_X, valid_X, train_y, valid_y = train_test_split(X_text_taiga, y,random_state=1)

In [62]:
logreg = LogisticRegression().fit(train_X, train_y)
pred = logreg.predict(valid_X)

In [74]:
taiga_score = cross_val_score(logreg, X_text_taiga, y, cv=10, scoring='f1_micro')

In [75]:
np.mean(taiga_score)

0.4091598564001181

Лучше справилась Word2Vec-модель.

In [100]:
tfidf = TfidfVectorizer(min_df=3, max_df=0.4, max_features=1000)
tfidf.fit(pd.concat([data['text_1_norm'], data['text_2_norm']]))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.4, max_features=1000,
                min_df=3, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [96]:
svd = TruncatedSVD(200)

In [101]:
X_text_1 = svd.fit_transform(tfidf.transform(data['text_1_norm']))
X_text_2 = svd.fit_transform(tfidf.transform(data['text_2_norm']))

In [149]:
X_text_1.shape[0]

7227

In [88]:
nmf = NMF(50)

In [116]:
X_nmf_1 = svd.fit_transform(tfidf.transform(data['text_1_norm']))
X_nmf_2 = svd.fit_transform(tfidf.transform(data['text_2_norm']))

In [129]:
def tokenize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]

    return ' '.join(words)

In [130]:
data['text_1_notnorm'] = data['text_1'].apply(tokenize)
data['text_2_notnorm'] = data['text_2'].apply(tokenize)

In [131]:
X_text_1_ft = np.zeros((len(data['text_1_notnorm']), dim))
X_text_2_ft = np.zeros((len(data['text_2_notnorm']), dim))

In [133]:
fast_text = gensim.models.FastText([text.split() for text in data_norm], size=50, 
                                   min_n=4, max_n=8) 

In [134]:
for i, text in enumerate(data['text_1_notnorm'].values):
    X_text_1_ft[i] = get_embedding(text, fast_text, dim)
    
for i, text in enumerate(data['text_2_notnorm'].values):
    X_text_2_ft[i] = get_embedding(text, fast_text, dim)

  # Remove the CWD from sys.path while we load stuff.


In [143]:
all5 = np.zeros((7227, 5))

In [None]:
for i in range(7227):
    cs = sklearn.metrics.pairwise.cosine_distances([X_text_1[i]], [X_text_2[i]])
    all[i][0] = cs

In [159]:
for i in range(7227):
    cs = sklearn.metrics.pairwise.cosine_distances([X_nmf_1[i]], [X_nmf_2[i]])
    all[i][1] = cs

In [160]:
for i in range(7227):
    cs = sklearn.metrics.pairwise.cosine_distances([X_text_1_w2v[i]], [X_text_2_w2v[i]])
    all[i][2] = cs

In [161]:
for i in range(7227):
    cs = sklearn.metrics.pairwise.cosine_distances([X_text_1_taiga[i]], [X_text_2_taiga[i]])
    all[i][3] = cs

In [162]:
for i in range(7227):
    cs = sklearn.metrics.pairwise.cosine_distances([X_text_1_ft[i]], [X_text_2_ft[i]])
    all[i][4] = cs

In [163]:
logreg = LogisticRegression()

In [164]:
np.mean(cross_val_score(logreg, all, y, cv=10, scoring='f1_micro'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.5421073704133669