In [1]:
import pandas as pd
import numpy as np

In [2]:
from navec import Navec

# Relative path to the Navec archive, so the file has to be in the working directory.
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
# Loaded once; `encode` further down looks words up in this object.
navec = Navec.load(path)

In [3]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where this
# mount exists. Consider a configurable data directory.
df = pd.read_csv('/home/jupyter/mnt/s3/bank-data/features/test_for_participants.csv')
# NOTE(review): the recorded preview shows `Unnamed: 0...` column(s), which look like a
# row index saved by an earlier `to_csv` — confirm and drop them if unused.
df.head()

Unnamed: 0.1,Unnamed: 0,sentence
0,0,"Очень неприятная ситуация, надеюсь, банк либо ..."
1,1,За что выражаю благодарность и банку и данному...
2,2,"Вывод: информация полученная в смс от банка, и..."
3,3,Хочу по благодарить ее за чуткое отношение к н...
4,4,"Показал, что я и вклад могу свой пополнять пря..."


In [4]:
import re

from pymystem3 import Mystem
from nltk.corpus import stopwords

import nltk
# Fetch the NLTK stopword corpus (the recorded output reports it as already up to date).
nltk.download('stopwords')

# Single Mystem instance shared by every `lemmatize_sentence` call.
mystem = Mystem() 
russian_stopwords = stopwords.words("russian")
#english_stopwords = stopwords.words("english")
# NOTE(review): this rebinds `stopwords` from the object imported from `nltk.corpus` to a
# plain list of Russian stopwords; `stopwords.words(...)` stops working after this line
# until the import above is executed again. `lemmatize_sentence` relies on the list.
stopwords = russian_stopwords #+ english_stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from collections import Counter

In [5]:
def lemmatize_sentence(x):
    """Turn a raw sentence into a list of lemmas without stopwords.

    The text is stripped, lower-cased, reduced to Cyrillic letters and whitespace,
    lemmatized with the module-level `mystem`, and filtered against the
    module-level `stopwords` collection.

    Parameters
    ----------
    x : str
        Raw sentence. Anything that is not a string (for example the float NaN that
        pandas uses for an empty CSV cell) is treated as an empty sentence.

    Returns
    -------
    list of str
        Lemmas in their original order; empty when nothing survives the filtering.
    """
    # Missing values are not strings and used to fail on `.strip()`. An empty token
    # list is a valid result that `distances_words` already handles.
    if not isinstance(x, str):
        return []
    # Characters other than Cyrillic letters and whitespace are deleted, not replaced
    # by a space, so e.g. "из-за" is collapsed into "изза" before lemmatization.
    cleaned = re.sub(r'[^А-Яа-яЁё\s]', '', x.strip().lower())
    # Whitespace-only tokens returned by the lemmatizer are skipped along with stopwords.
    return [
        token
        for token in mystem.lemmatize(cleaned)
        if token not in stopwords and not token.isspace()
    ]

In [6]:
# Lemmatize every sentence in row order and store the token lists in a new column.
df['lemmatized'] = list(map(lemmatize_sentence, df['sentence']))

In [7]:
# Preview the frame, which now carries the `lemmatized` column of token lists.
df.head()

Unnamed: 0.1,Unnamed: 0,sentence,lemmatized
0,0,"Очень неприятная ситуация, надеюсь, банк либо ...","[очень, неприятный, ситуация, надеяться, банк,..."
1,1,За что выражаю благодарность и банку и данному...,"[выражать, благодарность, банка, данный, сотру..."
2,2,"Вывод: информация полученная в смс от банка, и...","[вывод, информация, получать, смс, банк, инфор..."
3,3,Хочу по благодарить ее за чуткое отношение к н...,"[хотеть, благодарить, чуткий, отношение, пожил..."
4,4,"Показал, что я и вклад могу свой пополнять пря...","[показывать, вклад, мочь, свой, пополнять, пря..."


In [9]:
def get_most_common_words(category, data=None, top_n=100):
    """Return the most frequent lemmas of one category, most frequent first.

    Parameters
    ----------
    category : str
        Value of the `1category` column to restrict the count to, or the special
        value `'all'` to count over every row.
    data : pandas.DataFrame, optional
        Frame with a `lemmatized` column of token lists (and a `1category` column
        unless `category == 'all'`). Defaults to the notebook-level `df`, which is
        what the function always used before this parameter existed.
    top_n : int, optional
        Maximum number of words to return. Defaults to 100.

    Returns
    -------
    list of str
        Up to `top_n` distinct lemmas ordered by descending frequency.
    """
    # Passing the frame explicitly avoids depending on whatever `df` happens to be
    # bound to in the current kernel session.
    frame = df if data is None else data
    if category != 'all':
        frame = frame[frame['1category'] == category]

    counter = Counter()
    for tokens in frame['lemmatized']:
        counter.update(tokens)
    return [word for word, _ in counter.most_common(top_n)]

In [15]:
# NOTE(review): the preview of `df` loaded from test_for_participants.csv shows no
# `1category` column, and this cell's execution count (15) is out of order. The recorded
# output presumably comes from a session where `df` was a labelled frame — confirm
# before relying on Restart & Run All.
df['1category'].unique()

array(['Communication', '?', 'Quality', 'Price', 'Safety'], dtype=object)

In [8]:
def encode(word):
    """Look up the `navec` vector for `word`, using the "<unk>" entry for unknown words."""
    key = word if word in navec else "<unk>"
    return navec[key]

In [11]:
# Most common words for each category, plus the overall ranking via the 'all' selector.
words_comm, words_qual, words_price, words_safe, words_all = map(
    get_most_common_words,
    ('Communication', 'Quality', 'Price', 'Safety', 'all'),
)

In [13]:
# Keep only category-specific words: drop everything that is also among the overall
# most common words.
# `list(set(a) - set(b))` yields an arbitrary order that can differ between kernel
# sessions (string hashing is randomized), and the position of a word in these lists
# later becomes its `dist_*_<i>` column number. Filtering the ranked lists keeps the
# result deterministic and ordered by frequency; the set of words is unchanged.
common_words = set(words_all)
words_comm = [word for word in words_comm if word not in common_words]
words_qual = [word for word in words_qual if word not in common_words]
words_price = [word for word in words_price if word not in common_words]
words_safe = [word for word in words_safe if word not in common_words]

print(len(words_comm))
print(len(words_qual))
print(len(words_price))
print(len(words_safe))

17
12
58
49


In [9]:
# Hard-coded word list (17 entries) that overrides any earlier `words_comm`.
# The size matches the first length printed by the set-difference cell, so this was
# presumably pasted from its output to pin the `dist_comm_<i>` column order — TODO confirm.
words_comm = ['отказываться',
 'девушка',
 'сегодня',
 'объяснять',
 'предлагать',
 'абсолютно',
 'обманывать',
 'написать',
 'поддержка',
 'чат',
 'общение',
 'горячий',
 'сообщать',
 'писать',
 'дозваниваться',
 'смс',
 'вежливый']

In [10]:
# Hard-coded word list (12 entries) that overrides any earlier `words_qual`.
# The size matches the second length printed by the set-difference cell, so this was
# presumably pasted from its output to pin the `dist_qual_<i>` column order — TODO confirm.
words_qual = ['тинькофф',
 'минута',
 'перевод',
 'выражать',
 'удобно',
 'ужасный',
 'отвратительный',
 'оказываться',
 'удобный',
 'закрывать',
 'оплата',
 'мобильный']

In [11]:
# Hard-coded word list (58 entries) that overrides any earlier `words_price`.
# The size matches the third length printed by the set-difference cell, so this was
# presumably pasted from its output to pin the `dist_price_<i>` column order — TODO confirm.
words_price = ['снятие',
 'покупка',
 'годовой',
 'руб',
 'течение',
 'процент',
 'бесплатный',
 'тк',
 'кредитование',
 'дебетовый',
 'продукт',
 'остаток',
 'кешбек',
 'вклад',
 'выгодный',
 'курс',
 'плюс',
 'заплатить',
 'перевод',
 'вернуть',
 'снимать',
 'грабительский',
 'привлекательный',
 'момент',
 'валюта',
 'любой',
 'высокий',
 'считать',
 'закрывать',
 'бонус',
 'мало',
 'халва',
 'ставка',
 'изза',
 'акция',
 'возможность',
 'бесплатно',
 'предлагать',
 'комиссия',
 'оказываться',
 'предложение',
 'рубль',
 'одобрять',
 'открывать',
 'нравиться',
 'ипотека',
 'переводить',
 'учет',
 'платить',
 'тариф',
 'приемлемый',
 'нужный',
 'удобный',
 'брать',
 'отличный',
 'кэшбэк',
 'магазин',
 'оплата']

In [12]:
# Hard-coded word list (49 entries) that overrides any earlier `words_safe`.
# The size matches the fourth length printed by the set-difference cell, so this was
# presumably pasted from its output to pin the `dist_safe_<i>` column order — TODO confirm.
words_safe = ['трубка',
 'вносить',
 'становиться',
 'имя',
 'безопасность',
 'предоставлять',
 'абсолютно',
 'родственник',
 'тк',
 'закон',
 'просить',
 'иметь',
 'мошенник',
 'право',
 'обработка',
 'указывать',
 'россия',
 'начинать',
 'обманывать',
 'снимать',
 'написать',
 'момент',
 'случай',
 'действие',
 'банковский',
 'считать',
 'закрывать',
 'данные',
 'персональный',
 'разговор',
 'история',
 'предлагать',
 'задолженность',
 'мошеннический',
 'сообщать',
 'проводить',
 'рубль',
 'нарушение',
 'поступать',
 'платить',
 'образ',
 'требовать',
 'служба',
 'поддельный',
 'видеть',
 'получаться',
 'проверка',
 'согласие',
 'долг']

In [13]:
from scipy.spatial.distance import cosine

def distances_words(plenty_words, sentence):
    """For every reference word, return its smallest cosine distance to the sentence.

    Parameters
    ----------
    plenty_words : iterable of str
        Reference ("special") words; one output value is produced per word.
    sentence : iterable of str
        Tokens of one sentence.

    Returns
    -------
    list
        One entry per reference word, in the same order: the minimum of
        `cosine(encode(reference), encode(token))` over the sentence tokens, or the
        constant `1` for every reference word when the sentence has no tokens.
    """
    # Embed each token once instead of once per reference word; the computed
    # distances are exactly the same, only the repeated lookups are gone.
    token_vectors = [encode(token) for token in sentence]
    if not token_vectors:
        # Nothing to compare against, so no embedding is looked up at all.
        return [1 for _ in plenty_words]

    # NOTE(review): `encode` maps every out-of-vocabulary word to the same "<unk>"
    # vector, so an unknown reference word gets distance 0 to any sentence that
    # contains an unknown token. Confirm this is the intended feature.
    nearest = []
    for spec_word in plenty_words:
        spec_vector = encode(spec_word)
        nearest.append(min(cosine(spec_vector, vector) for vector in token_vectors))
    return nearest

In [14]:
from tqdm.notebook import tqdm

In [15]:
def get_features(name, special_words, data=None):
    """Build the distance-feature table for one list of reference words.

    Parameters
    ----------
    name : str
        Column prefix; columns are named `<name>_0`, `<name>_1`, ...
    special_words : sequence of str
        Reference words; column `i` holds the distances to `special_words[i]`.
    data : pandas.DataFrame, optional
        Frame with a `lemmatized` column of token lists. Defaults to the
        notebook-level `df`, which is what the function always used before this
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per row of the input frame, in the same order and with a fresh
        default index, and one float column per reference word.
    """
    frame = df if data is None else data
    columns = [f'{name}_{i}' for i in range(len(special_words))]

    # Collect all rows first and build the array once: stacking onto a growing array
    # inside the loop copied every previous row on each iteration.
    rows = [
        distances_words(special_words, tokens)
        for tokens in tqdm(frame['lemmatized'])
    ]
    # The explicit reshape keeps the result two-dimensional for zero or one input rows,
    # where the previous implementation raised or produced a mis-shaped table.
    matrix = np.asarray(rows, dtype=float).reshape(len(rows), len(columns))
    return pd.DataFrame(matrix, columns=columns)

In [16]:
# One distance-feature frame per word list; the first item of each pair is the column prefix.
df_dist_comm, df_dist_qual, df_dist_price, df_dist_safe = (
    get_features(prefix, words)
    for prefix, words in (
        ('dist_comm', words_comm),
        ('dist_qual', words_qual),
        ('dist_price', words_price),
        ('dist_safe', words_safe),
    )
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2151.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2151.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2151.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2151.0), HTML(value='')))







In [17]:
# Put the raw sentences next to the four distance-feature frames, column-wise.
final_df = pd.concat(
    [df['sentence'], df_dist_comm, df_dist_qual, df_dist_price, df_dist_safe],
    axis='columns',
)

In [20]:
# Persist the feature table to the working directory.
# NOTE(review): `index=False` is not passed, so the row index is written as an extra
# unnamed first column — confirm the consumer of this file expects that.
final_df.to_csv('TEST_distances_category.csv')

In [19]:
# Preview the combined table: the sentence followed by the `dist_*` feature columns.
final_df.head()

Unnamed: 0,sentence,dist_comm_0,dist_comm_1,dist_comm_2,dist_comm_3,dist_comm_4,dist_comm_5,dist_comm_6,dist_comm_7,dist_comm_8,dist_comm_9,dist_comm_10,dist_comm_11,dist_comm_12,dist_comm_13,dist_comm_14,dist_comm_15,dist_comm_16,dist_qual_0,dist_qual_1,dist_qual_2,dist_qual_3,dist_qual_4,dist_qual_5,dist_qual_6,dist_qual_7,dist_qual_8,dist_qual_9,dist_qual_10,dist_qual_11,dist_price_0,dist_price_1,dist_price_2,dist_price_3,dist_price_4,dist_price_5,dist_price_6,dist_price_7,dist_price_8,dist_price_9,...,dist_safe_9,dist_safe_10,dist_safe_11,dist_safe_12,dist_safe_13,dist_safe_14,dist_safe_15,dist_safe_16,dist_safe_17,dist_safe_18,dist_safe_19,dist_safe_20,dist_safe_21,dist_safe_22,dist_safe_23,dist_safe_24,dist_safe_25,dist_safe_26,dist_safe_27,dist_safe_28,dist_safe_29,dist_safe_30,dist_safe_31,dist_safe_32,dist_safe_33,dist_safe_34,dist_safe_35,dist_safe_36,dist_safe_37,dist_safe_38,dist_safe_39,dist_safe_40,dist_safe_41,dist_safe_42,dist_safe_43,dist_safe_44,dist_safe_45,dist_safe_46,dist_safe_47,dist_safe_48
0,"Очень неприятная ситуация, надеюсь, банк либо ...",0.636743,0.545505,0.482961,0.472426,0.625368,0.569874,0.665901,0.698506,0.724423,0.794806,0.719439,0.627459,0.614425,0.566074,0.733565,0.822475,0.631657,0.0,0.742133,0.702809,0.631811,0.692539,0.439048,0.325843,0.787232,0.704625,0.715301,0.610182,0.691542,0.694729,0.720757,0.653693,0.646657,0.637195,0.616703,0.718038,0.800011,0.714764,0.0,...,0.626018,0.625866,0.509159,0.631946,0.651247,0.783738,0.643038,0.709495,0.630794,0.665901,0.705245,0.698506,0.572157,0.585323,0.664859,0.359387,0.473458,0.715301,0.693033,0.609295,0.598124,0.591405,0.625368,0.668182,0.708114,0.614425,0.663398,0.565213,0.695007,0.646024,0.389751,0.737198,0.636058,0.610361,0.717173,0.576627,0.814674,0.685259,0.736806,0.589689
1,За что выражаю благодарность и банку и данному...,0.807445,0.841545,0.632007,0.675173,0.686378,0.649975,0.707264,0.745493,0.705368,0.90818,0.755878,0.886779,0.665506,0.678186,0.988081,0.920661,0.768624,1.122423,0.845339,0.761949,0.0,0.753849,0.85816,0.896177,0.857241,0.693155,0.745496,0.755175,0.760119,0.928742,0.742462,0.823996,0.660285,0.758523,0.725778,0.868377,0.878505,0.732276,1.122423,...,0.646017,0.731779,0.612513,0.821422,0.701801,0.917923,0.619089,0.783137,0.754879,0.707264,0.85823,0.745493,0.524037,0.620731,0.67476,0.547199,0.643096,0.745496,0.617858,0.743214,0.64721,0.800022,0.686378,0.735014,0.974979,0.665506,0.763034,0.763955,0.808454,0.762165,0.789953,0.652613,0.693905,0.664755,0.938565,0.734458,0.80109,0.784105,0.625096,0.694495
2,"Вывод: информация полученная в смс от банка, и...",0.680918,0.656162,0.629593,0.581495,0.558219,0.652914,0.544852,0.68679,0.657461,0.550744,0.617236,0.759711,0.457938,0.634758,0.733565,0.0,0.668193,0.0,0.772488,0.777672,0.619089,0.724344,0.75011,0.830791,0.702597,0.485209,0.628441,0.767499,0.427188,0.863422,0.77111,0.776084,0.718057,0.676922,0.672078,0.614084,0.840834,0.743829,0.0,...,0.717652,0.653664,0.0,0.631946,0.454068,0.729828,0.0,0.73163,0.632444,0.544852,0.705745,0.68679,0.624636,0.596523,0.675623,0.359387,0.61917,0.628441,0.532062,0.586674,0.0,0.661509,0.558219,0.773862,0.708114,0.457938,0.646284,0.687162,0.783921,0.565704,0.515798,0.6806,0.555691,0.65384,0.758928,0.523307,0.757976,0.655158,0.742264,0.73247
3,Хочу по благодарить ее за чуткое отношение к н...,0.616265,0.657581,0.670491,0.648124,0.700614,0.678177,0.647914,0.807196,0.713758,0.872525,0.663675,0.804223,0.779972,0.726564,0.903319,0.993579,0.583088,1.021174,0.828795,0.892693,0.695038,0.859643,0.769555,0.783761,0.820382,0.830565,0.794961,0.927112,0.885901,0.914264,0.980372,0.921601,0.892132,0.745743,0.810752,0.907314,0.976358,0.983745,1.021174,...,0.76536,0.476223,0.62245,0.74192,0.728152,0.950951,0.733427,0.830895,0.758771,0.647914,0.847369,0.807196,0.60132,0.711187,0.731292,0.910359,0.73977,0.794961,0.804483,0.851967,0.692381,0.76546,0.700614,0.930916,0.949112,0.779972,0.786156,0.882906,0.807326,0.71282,0.702828,0.700656,0.612434,0.87706,0.931754,0.646977,0.795788,0.853786,0.764244,0.740064
4,"Показал, что я и вклад могу свой пополнять пря...",0.715489,0.524262,0.500276,0.507652,0.584389,0.4897,0.672669,0.66042,0.632061,0.82037,0.788727,0.725285,0.569633,0.691193,0.733565,0.822475,0.835282,0.0,0.785035,0.781701,0.57257,0.676342,0.678083,0.743101,0.83368,0.690866,0.669209,0.719855,0.693796,0.884846,0.737471,0.766494,0.673196,0.821356,0.639587,0.730613,0.796957,0.792561,0.0,...,0.739447,0.749506,0.688263,0.863308,0.721527,0.671529,0.541066,0.715769,0.685252,0.672669,0.55784,0.66042,0.562868,0.646967,0.792428,0.660491,0.70048,0.669209,0.651089,0.605883,0.639384,0.724429,0.584389,0.738753,0.708114,0.569633,0.739334,0.761742,0.838161,0.738129,0.376353,0.626975,0.731739,0.626674,0.788439,0.614405,0.839079,0.605171,0.828965,0.599277


In [29]:
# NOTE(review): this writes the same `final_df` a second time under a different file
# name, and the execution count (29) is out of order with the cells above. Presumably
# left over from a run on another frame — confirm which file downstream code should read.
final_df.to_csv("distances_category_FINAL.csv")