In [28]:
Author: Ilia Kabanov

In [None]:
Task: to predict whether product review is positive 

In [1]:
# Import packages
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import string

from sklearn.metrics import make_scorer, roc_auc_score
import json
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from skopt.space import Integer, Real, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize
from scipy.sparse import csr_matrix, csc_matrix
from tqdm import tqdm
from gensim.models import Word2Vec
import re
from pymystem3 import Mystem
from pymorphy2 import MorphAnalyzer
from copy import deepcopy
from sklearn.svm import SVC

import gc
import os
import sys
import time
import itertools
from tqdm.notebook import tqdm
import pickle
import json
import joblib
import collections
import requests 
from urllib.parse import urlencode 


import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings

warnings.filterwarnings("ignore")

In [3]:
# Read input data
train = pd.read_csv('train.csv', sep=',', engine='python')
test = pd.read_csv('test.csv', sep=',', engine='python' )
print(train.head())

                                              review  sentiment
0  КАТЕГОРИЧЕСКИ НЕ СОВЕТУЮ брать продукцию texet...        0.0
1  первый и последний раз приобрел продукт данног...        0.0
2  Если Вы например ходите на разничные тусовки и...        0.0
3  Ни кому бы не стала советовать покупать этот т...        0.0
4  Из явных глюков, которые не устранились обновл...        0.0


In [4]:
print(test.head())

  id                                             review
0  0  До этого был Хонор 10 и пользовался только анд...
1  1                                         не советую
2  2  Больше года использования не подвел ни разу. М...
3  3  Телефон держит заряд 3-4 дня при 15-20 минут р...
4  4  Была раньше только нокия, но по работе пришлос...


In [5]:
# Drop lines in X_train with empty 'sentiment' and lower all the words
train = train[train['sentiment'].notnull()]
train.reset_index(inplace=True)
train.drop(['index'], axis=1, inplace=True)
train.iloc[:, 0] = train.iloc[:, 0].apply(lambda x: x.lower())

# Drop lines with no id in X_train, and lower all the words 
test = test[test['id'].notnull()]
test = test[test['id'].apply(lambda x: x.isnumeric())]
test.reset_index(inplace=True)
test.drop(['index'], axis=1, inplace=True)
test.iloc[:, 1] = test.iloc[:, 1].apply(lambda x: x.lower())
test_ids = test['id']
X_test = test['review']

X_train = train.iloc[:, 0]
y_train = train['sentiment']

In [None]:
Then let's use lemmatization procedure with both X_train and X_test

In [6]:
# Lemmatization
morph = MorphAnalyzer()
train_lemmatized_descriptions = []
for i, description in tqdm(enumerate(X_train.values), total=len(X_train)):
    try:
        lemmatized_description = [
            morph.parse(token)[0].normal_form for token in
            re.findall(r'\w+', description)
        ]
        train_lemmatized_descriptions.append(lemmatized_description)
    except Exception as e:
        print(f'Не удалось распарсить description с индексом={i}:')
        print("descrition:")
        print(description, end='\n\n')
        print(e, end='\n\n')
        train_lemmatized_descriptions.append([])

  0%|          | 0/22892 [00:00<?, ?it/s]

In [7]:
morph = MorphAnalyzer()
test_lemmatized_descriptions = []
for i, description in tqdm(enumerate(test['review'].values), total=len(test['review'])):
    try:
        lemmatized_description = [
            morph.parse(token)[0].normal_form for token in
            re.findall(r'\w+', description)
        ]
        test_lemmatized_descriptions.append(lemmatized_description)
    except Exception as e:
        print(f'Не удалось распарсить description с индексом={i}:')
        print("descrition:")
        print(description, end='\n\n')
        print(e, end='\n\n')
        train_lemmatized_descriptions.append([])

  0%|          | 0/21747 [00:00<?, ?it/s]

In [None]:
Transform list with lemmatized words into dataset with previous form 

In [33]:
X_train = pd.DataFrame(train_lemmatized_descriptions).fillna(value=' ', inplace=False)
X_train = X_train.apply(' '.join, axis=1)

In [34]:
X_test = pd.DataFrame(test_lemmatized_descriptions).fillna(value=' ', inplace=False)
X_test = X_test.apply(' '.join, axis=1)

In [None]:
Lets have a look at our lemmatized train and test datasets

In [35]:
X_train.head(5)

0    категорически не советовать брать продукция te...
1    первый и последний раз приобрести продукт данн...
2    если вы например ходить на разничный тусовка и...
3    ни кто бы не стать советовать покупать этот те...
4    из явный глюк который не устраниться обновлени...
dtype: object

In [36]:
X_test.head(5)

0    до это быть хонора 10 и пользоваться только ан...
1    не советовать                                 ...
2    большой год использование не подвести ни раз м...
3    телефон держать заряд 3 4 день при 15 20 минут...
4    быть ранний только нокия но по работа прийтись...
dtype: object

In [None]:
We got exactly what we need, our dataset are ready for embedding.
Let's count tf_idf of every word in text_corpus

In [37]:
# Implementing td-idf and getting our X_train, y_train
russian_stop_words = list(
    pd.read_csv('https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt', header=None)[0]
)
text_corpus = pd.concat([X_train, X_test], axis=0).reset_index().drop(['index'], axis=1, inplace=False).iloc[:, 0]
tf_idf = TfidfVectorizer(stop_words=russian_stop_words,
                             ngram_range=(1, 2),
                             min_df=5,
                             max_df=0.99
                            )

In [38]:
%%time
tf_idf.fit([' '.join(str(word) for word in sentence) for sentence in train_lemmatized_descriptions])
X_tr = tf_idf.transform(
    [' '.join(str(word) for word in sentence) for sentence in train_lemmatized_descriptions]
)
X_te = tf_idf.transform(
    [' '.join(str(word) for word in sentence) for sentence in test_lemmatized_descriptions]
)
print(f"X_tr: {X_tr.shape[0]:,} x {X_tr.shape[1]:,}")
print(f"X_te: {X_te.shape[0]:,} x {X_te.shape[1]:,}")

X_tr: 22,892 x 21,901
X_te: 21,747 x 21,901
CPU times: total: 6.28 s
Wall time: 6.33 s


In [39]:
# Creating Word2Vec vocabulary, based on our lemmatized X_train
text_corpus = train_lemmatized_descriptions
d = 50 # embedding dimension
w2v_model = Word2Vec(sentences=text_corpus,
                     min_count=10,
                     window=10,
                     vector_size=d)

In [40]:
# Translate words from the dataset into word2vec vectors
copy_X_train = deepcopy(X_train)
X_train = pd.DataFrame(train_lemmatized_descriptions).fillna(value=0, inplace=False)
X_train = X_train.applymap(lambda x: w2v_model.wv[x] if x in w2v_model.wv else 0)

In [43]:
X_train.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,393,394,395,396,397,398,399,400,401,402
0,"[0.044200912, -0.008069042, -0.30456, -0.02325...","[-0.61658484, 0.99378943, 0.669043, -1.5624499...","[-0.38431868, -2.865498, -3.3376913, 1.2617782...","[2.4528542, -1.6575896, -3.5384734, 2.6203744,...","[-0.010254101, -0.28837365, -1.1872854, 0.0344...","[-0.023676293, -0.1382317, 0.0022645644, 0.037...","[-0.55358464, 0.2415753, -0.39307374, 1.591386...","[0.73056346, 1.5591023, -0.77380407, -0.267105...","[-0.55358464, 0.2415753, -0.39307374, 1.591386...","[-0.009174255, 1.0757917, 1.0208045, -0.333414...",...,"[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940..."
1,"[0.1060523, -0.17561156, -0.5548251, -0.071880...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-0.76158327, -1.7124096, -0.5286558, 0.856061...","[-0.88319707, -1.4063718, 1.593841, 2.488779, ...","[1.1973634, -0.8015441, -2.6771297, 2.8275838,...","[0.18506458, -0.27211612, -1.2946948, 0.298019...","[-0.9703825, 1.7481861, -1.1898727, -1.808933,...","[0.72331065, -0.56463075, -0.323733, -1.384814...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...",...,"[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940...","[-1.7984205, -1.0761415, 0.80871004, 0.1154940..."


In [None]:
In every cell we have 50-dimensional vector of an embedded word. Let's check: the first vector in the first line should be
the word 'категорически'

In [45]:
(X_train.iloc[0, 0] == w2v_model.wv['категорически']).all()

True

In [None]:
Good, everything is allright
Lets take embedding of a sentnence as weighted mean of every word in the sentence.
I am going to use idf of a word divided by sum of idf's in this sentence. It allows us to give more weight to the rarest words.
They are what makes each review unique and defines it.

In [46]:
# Create mapping: (token, idf вес)
word2idf = {}
mean_idf_weight, n = 0, 0
for word, idx in tqdm(tf_idf.vocabulary_.items()):
    word2idf[word] = tf_idf.idf_[idx]
    mean_idf_weight += tf_idf.idf_[idx]
    n += 1
mean_idf_weight /= n

X_tr_emb = []
emb_size = len(w2v_model.wv['не'])
for text in tqdm(train_lemmatized_descriptions):
    res = np.zeros(emb_size)
    denominator = 1e-20
    for token in text:
        idf_weight = word2idf.get(token, mean_idf_weight)
        denominator += idf_weight
        try:
            res += w2v_model.wv[token] * idf_weight
        except:
            res += np.zeros(emb_size)
    res /= denominator
    X_tr_emb.append(list(res))
X_tr_emb = np.array(X_tr_emb)

  0%|          | 0/21901 [00:00<?, ?it/s]

  0%|          | 0/22892 [00:00<?, ?it/s]

In [31]:
X_te_emb = []
emb_size = len(w2v_model.wv['не'])
for text in tqdm(test_lemmatized_descriptions):
    res = np.zeros(emb_size)
    denominator = 1e-20
    for token in text:
        idf_weight = word2idf.get(token, mean_idf_weight)
        denominator += idf_weight
        try:
            res += w2v_model.wv[token] * idf_weight
        except:
            res += np.zeros(emb_size)
    res /= denominator
    X_te_emb.append(list(res))
X_te_emb = np.array(X_te_emb)

  0%|          | 0/21747 [00:00<?, ?it/s]

In [50]:
print(X_tr_emb[0, :])
print(np.shape(X_tr_emb))

[-0.62232392 -0.0476773  -0.67265277 -0.11154408  0.36035627 -0.85433612
 -0.7293352  -0.06254526 -0.75194891  0.52786277 -0.24420437 -0.85110287
  0.63250629  0.67324755 -0.71638756  0.23487896 -0.58970651  0.81449972
  0.06440735  0.59710945 -0.09617775  0.20778339 -0.21112607 -0.47319356
  0.49213558 -1.40247931 -0.41586154  0.21096614 -0.32726582 -0.14366438
  0.14812559 -0.62541031 -0.14207857 -0.09900206 -0.7507755  -0.13075403
  0.13117144 -0.54457227 -0.01814818 -0.6151011  -0.09563059  0.02106113
  0.16496755 -0.18688816 -0.60734472 -0.4024456   0.32109422 -0.87516085
  0.46509093  0.59177074]
(22892, 50)


In [None]:
Here we got the embedding for the first review.
Nice, now we have fully embedded dataset, we can run the ml model. We are going to use logistic regression as we need to get
the best score according to ROS-AUC score and we believe that it describes our data structure rather well. Before training we
need to optimize the hyperparameter C and l1_ratio through cross-validation.

In [52]:
# define search space
search_space = list()
search_space.append(Real(0.1, 10, name='C'))
search_space.append(Real(10**(-3), 1, name='l1_ratio'))


# define the function needed to optimize
@use_named_args(search_space)
def cross_val_mean(**param):
    log_reg = LogisticRegression(**param, n_jobs=-1)
    acc = -np.mean(cross_val_score(estimator=log_reg, X=X_tr, y=y_train, scoring=make_scorer(roc_auc_score), cv=5))
    return acc


# perform optimization
result = gp_minimize(
    func=cross_val_mean,
    dimensions=search_space,
    n_calls=40,
    random_state=42,
    verbose=True
)

print('Best AUC-ROC: %.3f' % (-result.fun))
print('Best Parameters: %s' % (result.x))

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 8.6867
Function value obtained: -0.7060
Current minimum: -0.7060
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 7.6483
Function value obtained: -0.7068
Current minimum: -0.7068
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 8.0521
Function value obtained: -0.7135
Current minimum: -0.7135
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 7.7719
Function value obtained: -0.7133
Current minimum: -0.7135
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 7.3245
Function value obtained: -0.7191
Current minimum: -0.7191
Iteration No: 6 started. 

In [None]:
We got our best hyperparameters, now we can fit the model and give predictions for test dataset.

In [53]:
# Fit our best model on the whole train dataframe
C, l1_ratio = result.x
log_reg = LogisticRegression(C=C, l1_ratio=l1_ratio, n_jobs=-1)

log_reg.fit(X=X_tr_emb, y=y_train)
pred_test = log_reg.predict_proba(X=X_te_emb)[:, 1]
submission = pd.DataFrame()
submission["id"] = test_ids
submission["sentiment"] = pred_test
submission.to_csv("submission.csv", index=False)