In [1]:
import os
import json
import pandas as pd
from elasticsearch import Elasticsearch, helpers
import multiprocessing
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('portuguese'))

[nltk_data] Downloading package stopwords to /home/johnny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/johnny/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def request_create_index():
    return {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 1,
        },
        "mappings": {
            "properties": {
              "id": { "type": "text" },
              "title": { "type": "text" },
              "description": { "type": "text" },

            }
        }
    }


def create_request_document(json_text, index):
    description = json_text["description"]
    if pd.isna(description):
        description = ""
        
    return {
        "_op_type": "create",
        "_index": index,
        "_source": {
            "id": json_text["doc_id"],
            "title": json_text["title"],
            "description": description,
        },
    } 


def request_search(query):
    return {
        "from":0, "size":107,
        "_source": ["id", "title", "description"],
            "query":{
                "multi_match": {
                    "query":    query,
                    "fields": ["title", "description"]
                }
            }
        }

def request_match_all():
    body = {"query": {"match_all": {}}}
    return body

In [3]:
class Elasticsearch_service:
    def __init__(self, index_name, ip="localhost", timeout=1000):
        self.ip = ip
        self.es = Elasticsearch(hosts=ip)
        self.timeout = timeout
        self.index_name = index_name

    def create_index(self):
        request_body = request_create_index()
        try:
            ret = self.es.indices.create(
                index=self.index_name, body=request_body, request_timeout=self.timeout
            )
        except:
            return False
        return ret["acknowledged"]

    def indexing(self, list_json_text):
        
        request_body = [create_request_document(doc, self.index_name) for doc in list_json_text]
        try:
            res = helpers.bulk(self.es, request_body, request_timeout=self.timeout)
        except:
            return False
        return True

    def index_exists(self):
        return self.es.indices.exists(index=self.index_name)

    def delete_index(self):
        self.es.indices.delete(index=self.index_name)

    def search(self, query):
        request_body = request_search(query)
        try:
            text_return = self.es.search(
                body=request_body, index=self.index_name, request_timeout=self.timeout
            )
        except:
            raise ValueError("Search Error!")

        hits = []
        for hit in text_return["hits"]["hits"]:
            hits.append({"id": hit["_source"]["id"],"title": hit["_source"]["title"], "description": hit["_source"]["description"]})

        return hits

### Inicio

In [4]:
df = pd.read_csv("../data/produtos.csv")

In [5]:
df = df.fillna("")

### Tokenização

In [6]:
df['tokenized_title'] = df.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)
df['tokenized_description'] = df.apply(lambda row: nltk.word_tokenize(row['description']), axis=1)

In [7]:
df.head()

Unnamed: 0,doc_id,title,description,consulta_1,consulta_2,tokenized_title,tokenized_description
0,0,"Samsung Smart TV 55"" Neo QLED 4K 55QN85A Mini ...",,1,0,"[Samsung, Smart, TV, 55, '', Neo, QLED, 4K, 55...",[]
1,1,"Smart TV LED LG 24"" Monitor Wi-Fi Webos 3.5 DT...","Smart TV Monitor LG 24"" LED Wi-Fi webOS 3.5 DT...",0,0,"[Smart, TV, LED, LG, 24, '', Monitor, Wi-Fi, W...","[Smart, TV, Monitor, LG, 24, '', LED, Wi-Fi, w..."
2,2,"Samsung Smart TV 50"" QLED 4K The Frame 2021 50...",,0,0,"[Samsung, Smart, TV, 50, '', QLED, 4K, The, Fr...",[]
3,3,"Android Tv Led 43"" Tcl 43s6500 Bluetooth, Cont...",,0,0,"[Android, Tv, Led, 43, '', Tcl, 43s6500, Bluet...",[]
4,4,Smart TV LED 55” Philco PTV55Q20SNBL Ultra HD ...,,0,0,"[Smart, TV, LED, 55, ”, Philco, PTV55Q20SNBL, ...",[]


In [8]:
def remove_stopwords(palavras):
    palavras = [palavra.lower() for palavra in palavras]
    result = []
    for palavra in palavras:
        if palavra not in stop_words:
            result.append(palavra)
    return result

In [9]:
stemmer = nltk.stem.RSLPStemmer()
def stemming(palavras):
    result = []
    for w in palavras:
        result.append(stemmer.stem(w))
    return result

In [10]:
def create_base(df_prod, processings):
    d_desc = df_prod['tokenized_description']
    d_title = df_prod['tokenized_title']

    if "stopwords" in processings:
        d_desc = d_desc.apply(lambda row: remove_stopwords(row))
        d_title = d_title.apply(lambda row: remove_stopwords(row))

    
    if "stemming" in processings:
        d_desc = d_desc.apply(lambda row: stemming(row))
        d_title = d_title.apply(lambda row: stemming(row))

    df_final = pd.DataFrame()
    df_final['doc_id'] = df_prod['doc_id']
    df_final['title'] = d_title.str.join(" ")
    df_final['description'] = d_desc.str.join(" ")
    return df_final

In [11]:
Base1 = create_base(df, [])
Base2 = create_base(df, ['stopwords'])
Base3 = create_base(df, ['stemming'])
Base4 = create_base(df, ['stopwords','stemming'])

In [12]:
consulta_1 = "televisão com tela de bordas infinitas"
consulta_2 = "smartphone samsung android com baterias de longa duração"

### Base 1

In [13]:
b1 = Base1.to_dict('records')

##### Criar indice da base 1:

In [14]:
es = Elasticsearch_service("base_1")
es.create_index()

True

In [15]:
es.indexing(b1)

True

##### Busca:

In [16]:
df_result_search = pd.DataFrame()

Consulta 1

In [17]:
consulta_1

'televisão com tela de bordas infinitas'

In [20]:
df_consulta1 = es.search(consulta_1)
df_consulta1 = pd.DataFrame(df_consulta1)
df_consulta1.head()

Unnamed: 0,id,title,description
0,50,Smart TV 40 '' Philco PTV40G60SNBL FHD com Tel...,Televisor com tela de 40 polegadas em resoluçã...
1,3,"Android Tv Led 43 '' Tcl 43s6500 Bluetooth , C...",
2,32,Smart Tv Philips 43 '' Full Hd Sem Bordas Hdr ...,
3,52,Smart TV Android LED 32 '' Semp 32S5300 Blueto...,
4,94,Smartphone Samsung Galaxy M62 128GB 8GB RAM Ba...,


In [22]:
df_result_search["consulta_1_B1"] = df_consulta1["id"]

Consulta 2

In [23]:
consulta_2

'smartphone samsung android com baterias de longa duração'

In [24]:
df_consulta2 = es.search(consulta_2)
df_consulta2 = pd.DataFrame(df_consulta2)
df_consulta2.head()

Unnamed: 0,id,title,description
0,3,"Android Tv Led 43 '' Tcl 43s6500 Bluetooth , C...",
1,52,Smart TV Android LED 32 '' Semp 32S5300 Blueto...,
2,55,Smart TV LED 32 ” TCL S5200 HD HDR Android com...,A Smart TV LED 32 ” TCL S5200 é Sua TV é uma a...
3,16,Smart TV LED 50 ” TCL P615 4K UHD HDR Android ...,"4K ANDORIDTV TV P615 , SUA TV É UMA ANDROID TV..."
4,56,Smartphone Motorola Moto G60s 128GB 4G Wi-Fi T...,O Smartphone Motorola Moto G60s chegou para su...


In [26]:
df_result_search["consulta_2_B1"] = df_consulta2["id"]

### Base 2

In [27]:
b2 = Base2.to_dict('records')

Criar indice da base 2

In [28]:
es = Elasticsearch_service("base_2")
es.create_index()

True

In [29]:
es.indexing(b2)

True

##### Busca:

Consulta 1

In [30]:
q1_toke = nltk.word_tokenize(consulta_1)

In [31]:
consulta_1b2 = remove_stopwords(q1_toke)
consulta_1b2 = " ".join(consulta_1b2)
consulta_1b2

'televisão tela bordas infinitas'

In [32]:
df_consulta1 = es.search(consulta_1b2)
df_consulta1 = pd.DataFrame(df_consulta1)
df_consulta1.head()

Unnamed: 0,id,title,description
0,32,smart tv philips 43 '' full hd bordas hdr plus...,
1,22,samsung smart tv led 43 '' full hd lh43betmlgg...,"smart tv led 43 '' samsung lh43betmlggxzd , eq..."
2,50,smart tv 40 '' philco ptv40g60snbl fhd tela in...,televisor tela 40 polegadas resolução hd . tec...
3,37,smart google tv philco 50 '' led borderless 4k...,televisor tela 50 polegadas resolução 4k uhd ....
4,69,smartphone motorola moto g20 128gb 4g wi-fi te...,conecte-se melhor smartphone motorola moto g20...


In [33]:
df_result_search["consulta_1_B2"] = df_consulta1["id"]

Consulta 2

In [34]:
q2_toke = nltk.word_tokenize(consulta_2)

In [35]:
consulta_2b2 = remove_stopwords(q2_toke)
consulta_2b2 = " ".join(consulta_2b2)
consulta_2b2

'smartphone samsung android baterias longa duração'

In [36]:
df_consulta2 = es.search(consulta_2b2)
df_consulta2 = pd.DataFrame(df_consulta2)
df_consulta2.head()

Unnamed: 0,id,title,description
0,72,smartphone motorola moto e7 32gb 4g wi-fi tela...,merece ter smartphone tecnológico ideal ativid...
1,82,smartphone motorola moto e7 32gb 4g wi-fi tela...,merece ter smartphone tecnológico ideal ativid...
2,56,smartphone motorola moto g60s 128gb 4g wi-fi t...,smartphone motorola moto g60s chegou surpreend...
3,77,smartphone motorola moto g60s 128gb 4g wi-fi t...,smartphone motorola moto g60s chegou surpreend...
4,64,smartphone samsung galaxy a02s 32gb 4g wi-fi t...,smartphone samsung galaxy a02s conta design co...


In [37]:
df_result_search["consulta_2_B2"] = df_consulta2["id"]

### Base 3

In [38]:
b3 = Base3.to_dict('records')

Criar indice da base 3

In [39]:
es = Elasticsearch_service("base_3")
es.create_index()

True

In [40]:
es.indexing(b3)

True

##### Busca:

Consulta 1

In [41]:
consulta_1b3 = stemming(q1_toke)
consulta_1b3 = " ".join(consulta_1b3)
consulta_1b3

'televis com tel de bord infinit'

In [42]:
df_consulta1 = es.search(consulta_1b3)
df_consulta1 = pd.DataFrame(df_consulta1)
df_consulta1.head()

Unnamed: 0,id,title,description
0,4,smart tv led 55 ” philc ptv55q20snbl ultr hd 4...,
1,50,smart tv 40 '' philc ptv40g60snbl fhd com tel ...,televi com tel de 40 poleg em resoluç hd . tec...
2,5,"samsung smart tv 65 '' qled 8k 65q800t , proce...",
3,3,"android tv led 43 '' tcl 43s6500 bluetooth , c...",
4,52,smart tv android led 32 '' semp 32s5300 blueto...,


In [43]:
df_result_search["consulta_1_B3"] = df_consulta1["id"]

Consulta 2

In [44]:
consulta_2b3 = stemming(q2_toke)
consulta_2b3 = " ".join(consulta_2b3)
consulta_2b3

'smartphon samsung android com bat de long dur'

In [45]:
df_consulta2 = es.search(consulta_2b3)
df_consulta2 = pd.DataFrame(df_consulta2)
df_consulta2.head()

Unnamed: 0,id,title,description
0,3,"android tv led 43 '' tcl 43s6500 bluetooth , c...",
1,94,smartphon samsung galaxy m62 128gb 8gb ram bat...,
2,99,smartphon samsung galaxy m62 128gb 8gb ram bat...,
3,52,smart tv android led 32 '' semp 32s5300 blueto...,
4,56,"smartphon motorol mot g60 128gb 4g wi-f tel 6,...",o smartphon motorol mot g60 cheg par surpreend...


In [46]:
df_result_search["consulta_2_B3"] = df_consulta1["id"]

### Base 4

In [47]:
b4 = Base4.to_dict('records')

Criar indice da base 4

In [48]:
es = Elasticsearch_service("base_4")
es.create_index()

True

In [49]:
es.indexing(b4)

True

##### Busca:

Consulta 1

In [50]:
consulta_1b4 = remove_stopwords(q1_toke)
consulta_1b4 = stemming(consulta_1b4)
consulta_1b4 = " ".join(consulta_1b4)
consulta_1b4

'televis tel bord infinit'

In [51]:
df_consulta1 = es.search(consulta_1b4)
df_consulta1 = pd.DataFrame(df_consulta1)
df_consulta1.head()

Unnamed: 0,id,title,description
0,4,smart tv led 55 ” philc ptv55q20snbl ultr hd 4...,
1,5,"samsung smart tv 65 '' qled 8k 65q800t , proce...",
2,50,smart tv 40 '' philc ptv40g60snbl fhd tel infi...,televi tel 40 poleg resoluç hd . tecnolog smar...
3,32,smart tv philip 43 '' full hd bord hdr plu wif...,
4,64,smartphon samsung galaxy a02 32gb 4g wi-f tel ...,smartphon samsung galaxy a02 cont design compa...


In [52]:
df_result_search["consulta_1_B4"] = df_consulta1["id"]

Consulta 2

In [53]:
consulta_2b4 = remove_stopwords(q2_toke)
consulta_2b4 = stemming(consulta_2b4)
consulta_2b4 = " ".join(consulta_2b4)
consulta_2b4

'smartphon samsung android bat long dur'

In [54]:
df_consulta2 = es.search(consulta_2b4)
df_consulta2 = pd.DataFrame(df_consulta2)
df_consulta2.head()

Unnamed: 0,id,title,description
0,94,smartphon samsung galaxy m62 128gb 8gb ram bat...,
1,99,smartphon samsung galaxy m62 128gb 8gb ram bat...,
2,72,smartphon motorol mot e7 32gb 4g wi-f tel 6.5 ...,merec ter smartphon tecnológ ideal ativ diár ....
3,82,smartphon motorol mot e7 32gb 4g wi-f tel 6.5 ...,merec ter smartphon tecnológ ideal ativ diár ....
4,56,"smartphon motorol mot g60 128gb 4g wi-f tel 6,...",smartphon motorol mot g60 cheg surpreend apaix...


In [55]:
df_result_search["consulta_2_B4"] = df_consulta2["id"]

Write data

In [62]:
df_result_search

Unnamed: 0,consulta_1_B1,consulta_2_B1,consulta_1_B2,consulta_2_B2,consulta_1_B3,consulta_2_B3,consulta_1_B4,consulta_2_B4
0,50,3.0,32.0,72.0,4,4,4.0,94.0
1,3,52.0,22.0,82.0,50,50,5.0,99.0
2,32,55.0,50.0,56.0,5,5,50.0,72.0
3,52,16.0,37.0,77.0,3,3,32.0,82.0
4,94,56.0,69.0,64.0,52,52,64.0,56.0
...,...,...,...,...,...,...,...,...
87,95,33.0,,,104,104,,
88,34,28.0,,,60,60,,
89,20,34.0,,,20,20,,
90,48,1.0,,,48,48,,


In [63]:
df_result_search.reset_index().to_csv("../data/result_search.csv", index=False)