# The project:
Train classifier: product query vs. everything else (a product request can be considered equal to the name or description of the product).

Add logic to search for similar products by product query.

All logic should be wrapped in a method **get_answer()**. The response to a product request should look like **"product_id title"**.

In [2]:
pip install --upgrade gensim

Collecting gensim
  Downloading gensim-4.2.0-cp38-cp38-win_amd64.whl (24.0 MB)
     ---------------------------------------- 24.0/24.0 MB 5.0 MB/s eta 0:00:00
Collecting Cython==0.29.28
  Downloading Cython-0.29.28-py2.py3-none-any.whl (983 kB)
     -------------------------------------- 983.8/983.8 kB 4.8 MB/s eta 0:00:00
Collecting smart-open>=1.8.1
  Downloading smart_open-6.2.0-py3-none-any.whl (58 kB)
     ---------------------------------------- 58.6/58.6 kB 3.0 MB/s eta 0:00:00
Installing collected packages: smart-open, Cython, gensim
  Attempting uninstall: Cython
    Found existing installation: Cython 0.29.32
    Uninstalling Cython-0.29.32:
      Successfully uninstalled Cython-0.29.32
Successfully installed Cython-0.29.28 gensim-4.2.0 smart-open-6.2.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install stop_words

Collecting stop_words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: stop_words
  Building wheel for stop_words (setup.py): started
  Building wheel for stop_words (setup.py): finished with status 'done'
  Created wheel for stop_words: filename=stop_words-2018.7.23-py3-none-any.whl size=32893 sha256=7778eb4403ef62b7fb95af70a58131b495696eacd99a886cca6f36db148e95ca
  Stored in directory: c:\users\1\appdata\local\pip\cache\wheels\eb\03\0d\3bd31c983789aeb0b4d5e2ca48590288d9db1586cf5f225062
Successfully built stop_words
Installing collected packages: stop_words
Successfully installed stop_words-2018.7.23
Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install pymorphy2

Note: you may need to restart the kernel to use updated packages.


In [38]:
import os
import string
import annoy
import pickle

from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from gensim.models import Word2Vec

import numpy as np
from tqdm import tqdm_notebook
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pickle
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import pymorphy2
from nltk.corpus import stopwords

In [39]:
# Functions

def preprocess_txt(line):
    # Let's clean the line from punctuation. To do this, let's go over each character and check if it is a punctuation mark.
    exclude = set(string.punctuation)
    spls = "".join(i for i in line.strip() if i not in exclude).split()
    morpher = pymorphy2.MorphAnalyzer()
    sw = set(stopwords.words("russian"))
    # Let's lemmatize all the words in our text
    spls = [morpher.parse(i.lower())[0].normal_form for i in spls]
    spls = [i for i in spls if i not in sw and i != ""]
    return spls

def prepro_txt(line):
    spls = "".join(i for i in line.strip() if i not in exclude).split()
    spls = [morpher.parse(i.lower())[0].normal_form for i in spls]
    spls = [i for i in spls if i not in sw and i != ""]
    return spls

# Let's train the classifier “product request vs. chatter"

Let's load and preprocess the dataset for training the classifier:

In [57]:
import pandas as pd
dataset = pd.read_csv('ProductsDataset.csv')
dataset.drop(['Unnamed: 0'], inplace = True, axis = 1)
dataset['descrirption'] = dataset['descrirption'].apply(lambda x: x[1:-1])

In [46]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35548 entries, 0 to 35547
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           35548 non-null  object 
 1   descrirption    33537 non-null  object 
 2   product_id      35536 non-null  object 
 3   category_id     35536 non-null  float64
 4   subcategory_id  35536 non-null  object 
 5   properties      35536 non-null  object 
 6   image_links     35533 non-null  object 
dtypes: float64(1), object(6)
memory usage: 1.9+ MB


In [49]:
dataset #dataset has already been pre-processed

Unnamed: 0,title,descrirption,product_id,category_id,subcategory_id,properties,image_links
0,Юбка детская ORBY,"Новая, не носили ни разу. В реале красивей чем...",58e3cfe6132ca50e053f5f82,22.0,2211,"{'detskie_razmer_rost': '81-86 (1,5 года)'}",http://cache3.youla.io/files/images/360_360/58...
1,Ботильоны,"Новые,привезены из Чехии ,указан размер 40,но ...",5667531b2b7f8d127d838c34,9.0,902,"{'zhenskaya_odezhda_tzvet': 'Зеленый', 'visota...",http://cache3.youla.io/files/images/360_360/5b...
2,Брюки,Размер 40-42. Брюки почти новые - не знаю как ...,59534826aaab284cba337e06,9.0,906,{'zhenskaya_odezhda_dzhinsy_bryuki_tip': 'Брюк...,http://cache3.youla.io/files/images/360_360/59...
3,Продам детские шапки,"Продам шапки,кажда 200р.Розовая и белая проданны.",57de544096ad842e26de8027,22.0,2217,"{'detskie_pol': 'Девочкам', 'detskaya_odezhda_...",http://cache3.youla.io/files/images/360_360/57...
4,Блузка,"Темно-синяя, 42 размер,состояние отличное,как ...",5ad4d2626c86cb168d212022,9.0,907,"{'zhenskaya_odezhda_tzvet': 'Синий', 'zhenskay...",http://cache3.youla.io/files/images/360_360/5a...
...,...,...,...,...,...,...,...
35543,Юбка,Юбка Белая по.Турция фирма adL,5b5f181c62e1c6616a7f6472,9.0,904,"{'zhenskaya_odezhda_platya_yubki_tip': 'Юбки',...",http://cache3.youla.io/files/images/360_360/5b...
35544,Новый твидовый пиджак,Новый с бирками пиджак размер S в стиле Coco C...,5bd6c8b29e94ba033d31f8d0,9.0,908,"{'brand_zhenskii': 'Chanel', 'zhenskaya_odezhd...",http://cache3.youla.io/files/images/360_360/5b...
35545,Женская зимняя куртка,Женская зимняя спортивная куртка фирмы Rossiqn...,5bd6c8bc074b3e1c056f69b2,9.0,903,"{'zhenskaya_odezhda_razmer': '48-50 (XL)', 'zh...",http://cache3.youla.io/files/images/360_360/5b...
35546,Новая золотая ветровка,Женская ветровка размер 44-46. Цвет приглушённ...,5bd6c8fb2138bbc55745362c,9.0,903,"{'zhenskaya_odezhda_razmer': '44-46 (М)', 'zhe...",http://cache3.youla.io/files/images/360_360/5b...


Divide the sample:

In [55]:
train, test = train_test_split(dataset, test_size=0.2, shuffle=True)

Vectorize the text:

In [56]:
# create TfidfVectorizer object and fit it on out training set texts

#vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features = 50000)
#vectorizer.fit(train['text'], train['label'])

In [58]:
# 1. convert texts to tf-idf vectors using .transform
# 2. convert your labels into numpy arrays 

X_train = vectorizer.transform(train['text'])
y_train = np.array(train['label'], int)
X_test = vectorizer.transform(test['text'])
y_test = np.array(test['label'], int)

Let's train the classifier::

In [60]:
# create LogisticRegression model object and fit the model

model = LogisticRegression()
model.fit(X_train, y_train)

In [10]:
predictions = model.predict(X_test)

Test:

In [11]:
accuracy = (predictions == y_test).mean()
accuracy

0.9865634892718959

In [12]:
q_1 = 'Блузка Темно-синяя'

vec = vectorizer.transform([q_1])
model.predict(vec)

array([0])

In [13]:
q_2 = 'Энергетики, вредно или нет?'

vec = vectorizer.transform([q_2])
model.predict(vec)

array([1])

For comparison, we use the model LinearSVC

In [14]:
clf = LinearSVC()
clf.fit(X_train, y_train)
predictions_clf = clf.predict(X_test)
accuracy_clf = (predictions_clf == y_test).mean()
accuracy_clf

0.990995427365459

OMG, we'll take LinearSVC as our model

In [15]:
# we'll save the model into the file and going to load: 

with open('project14_clf.pkl', 'wb') as output:
    pickle.dump(clf, output) #save

with open('project14_clf.pkl', 'rb') as pkl_file:
    regressor_from_file = pickle.load(pkl_file) #load

In [127]:
def get_predictions(question):
    vec = vectorizer.transform([question])
    predicted_answer = model.predict(vec)[0]
    return predicted_answer

We'll check saving and loading correctly 

In [128]:
get_predictions('Блузка Темно-синяя')

0

In [129]:
get_predictions('Энергетики, вредно или нет?')

1

# We implement the search for similar products in the content part of the bot

In [29]:
product_data = pd.read_csv('ProductsDataset.csv')

All product names will be rolled into a vector representation Word2Vec

In [31]:
sentences = []

morpher = MorphAnalyzer()
sw = set(get_stop_words("ru"))
exclude = set(string.punctuation)
c = 0

for line in product_data['title']:
    spls = prepro_txt(line)
    sentences.append(spls)
    c += 1
    if c > 500000:
        break

NameError: name 'MorphAnalyzer' is not defined

In [109]:
# Train the Model word2vec 
sentences = [i for i in sentences if len(i) > 2]
model_wv = Word2Vec(sentences=sentences, vector_size=100, min_count=5, window=5)
model_wv.save("w2v_model")

Now we need to build an index on the titles of the documents. Using the library 'annoy'. We go through all the names, we believe that the supply vector is the sum word2vec words, that are included in it (of course, the average)

In [110]:
index_goods = annoy.AnnoyIndex(100 ,'angular')

index_map_goods = {}
counter = 0

for line in product_data['title']:
    n_w2v = 0
    spls = line.split("\t")
    index_map_goods[counter] = spls[0]
    question = prepro_txt(spls[0])
    vector = np.zeros(100)
    for word in question:
        if word in model_wv.wv:
            vector += model_wv.wv[word]
            n_w2v += 1
    if n_w2v > 0:
        vector = vector / n_w2v
    index_goods.add_item(counter, vector)
            
    counter += 1

index_goods.build(10)
index_goods.save('smth.ann')

True

We implement the search for the answer by index

In [111]:
# working normally
def find_answer(question, model):
    preprocessed_question = prepro_txt(question)
    n_w2v = 0
    vector = np.zeros(100)
    for word in preprocessed_question:
        if word in model_wv.wv:
            vector += model_wv.wv[word]
            n_w2v += 1
    if n_w2v > 0:
        vector = vector / n_w2v
    answer_index = index_goods.get_nns_by_vector(vector, 1)
    return index_map_goods[answer_index[0]]

In [97]:
# Проверка... ну такое себе
find_answer('Юбка детская ORBY', model_wv)

'Юбка детская'

# We implement chat

Let's preprocess the mail.ru answers from the file: add 1 answer to each question and write it to the file for the future. This will allow us to save time and resources during further text preprocessing.

In [99]:
question = None
written = False

#Мы идем по всем записям, берем первую строку как вопрос
# и после знака --- находим ответ
with open("prepared_answers.txt", "w") as fout:
    with open("Otvety.txt", "r") as fin:
        for line in tqdm_notebook(fin):
            if line.startswith("---"):
                written = False
                continue
            if not written and question is not None:
                fout.write(question.replace("\t", " ").strip() + "\t" + line.replace("\t", " "))
                written = True
                question = None
                continue
            if not written:
                question = line.strip()
                continue

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm_notebook(fin):


0it [00:00, ?it/s]

Now we need to preprocess the text in order to train word2vec and get embeddings. Removing punctuation marks and doing lemmatization

In [100]:
sentences = []

morpher = MorphAnalyzer()
sw = set(get_stop_words("ru"))
exclude = set(string.punctuation)
c = 0

with open("Otvety.txt", "r") as fin:
    for line in tqdm_notebook(fin):
        spls = prepro_txt(line)
        sentences.append(spls)
        c += 1
        if c > 500000:
            break

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm_notebook(fin):


0it [00:00, ?it/s]

In [101]:
# Train the Model word2vec on our questions
sentences = [i for i in sentences if len(i) > 2]
model_chat = Word2Vec(sentences=sentences, vector_size=100, min_count=1, window=5)
model_chat.save("w2v_model_chat")

Now we need to add all the questions to the index. Using the library annoy. We go through all the answers, we believe that the sentence vector is the sum of the word2vecs of the words that are included in it (averaged, of course)

In [103]:
index = annoy.AnnoyIndex(100 ,'angular')

index_map = {}
counter = 0

with open("prepared_answers.txt", "r") as f:
    for line in tqdm_notebook(f):
        n_w2v = 0
        spls = line.split("\t")
        index_map[counter] = spls[1]
        question = prepro_txt(spls[0])
        vector = np.zeros(100)
        for word in question:
            if word in model_chat.wv:
                vector += model_chat.wv[word]
                n_w2v += 1
        if n_w2v > 0:
            vector = vector / n_w2v
        index.add_item(counter, vector)
            
        counter += 1

index.build(10)
index.save('speaker.ann')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm_notebook(f):


0it [00:00, ?it/s]

True

Now it remains to implement a method that will receive a question as an input and find an answer to it! We preprocess the question, find the closest question, and select the answer to the closest question.

In [104]:
def find_answer_chat(question):
    preprocessed_question = prepro_txt(question)
    n_w2v = 0
    vector = np.zeros(100)
    for word in preprocessed_question:
        if word in model_chat.wv:
            vector += model_chat.wv[word]
            n_w2v += 1
    if n_w2v > 0:
        vector = vector / n_w2v
    answer_index = index.get_nns_by_vector(vector, 1)
    return index_map[answer_index[0]]

In [115]:
# test
find_answer_chat('Как погодка?')

'у нас тепло и дождей не предвидеться до ноября. \n'

# Implementing a chat bot

In [137]:
def get_answer(question):
    
    # Классифицируем
    predicted_question = get_predictions(question)
    
    # Ищем ответ в таблице
    if predicted_question == 0:
        find_in_table = find_answer(question, model_wv)
        for counter, item in enumerate(product_data.title):
            if item == find_in_table:
                answ_to_return = [product_data.product_id[counter], product_data.title[counter]]
                break
    
    # Chatting
    else:
        answ_to_return = find_answer_chat(question)
        
    return answ_to_return

In [138]:
get_answer('Юбка детская ORBY')

['5922cd12de885467545e72a2', 'Юбка для девочки.']

In [139]:
get_answer('Как погодка?')

'у нас тепло и дождей не предвидеться до ноября. \n'

# Autotest

In [61]:
assert(get_answer('Юбка детская ORBY').startswith('58e3cfe6132ca50e053f5f82'))

In [143]:
assert(not get_answer('Где ключи от танка').startswith('5')) 