In [1]:
import os
import json
import pandas as pd
from elasticsearch import Elasticsearch, helpers
import multiprocessing
import numpy as np
import nltk
from nltk.corpus import stopwords
# Fetch every NLTK resource this notebook relies on so it runs on a fresh
# environment: the stopword lists, the Punkt tokenizer models, and the RSLP
# rule files that nltk.stem.RSLPStemmer loads when it is instantiated below.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('rslp')
# Portuguese stopwords as a set for O(1) membership checks.
stop_words = set(stopwords.words('portuguese'))

[nltk_data] Downloading package stopwords to /home/johnny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/johnny/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def request_create_index():
    """Build the request body used to create an index.

    Returns:
        dict: a fresh body with one shard, one replica, and the fields
        ``id``, ``title`` and ``description`` all mapped as ``text``.
    """
    index_settings = {
        "number_of_shards": 1,
        "number_of_replicas": 1,
    }
    # Each field gets its own mapping dict so callers can mutate one safely.
    field_mappings = {
        field_name: {"type": "text"}
        for field_name in ("id", "title", "description")
    }
    return {
        "settings": index_settings,
        "mappings": {"properties": field_mappings},
    }


def create_request_document(json_text, index):
    """Turn one record into a bulk "create" action for ``index``.

    Args:
        json_text: mapping with the keys ``description``, ``doc_id`` and
            ``title``.
        index: name of the target index.

    Returns:
        dict: a bulk action whose ``_source`` holds ``id``, ``title`` and
        ``description``; a missing (NaN/None) description becomes ``""``.
    """
    raw_description = json_text["description"]
    source = {
        "id": json_text["doc_id"],
        "title": json_text["title"],
        # pd.isna covers both NaN and None.
        "description": "" if pd.isna(raw_description) else raw_description,
    }
    return {"_op_type": "create", "_index": index, "_source": source}


def request_search(query, size, fields=("title", "description")):
    """Build a full-text search body that returns only each hit's ``id``.

    The index mapping built by ``request_create_index`` defines the text
    fields ``title`` and ``description``; there is no ``corpus`` field, so
    the query targets the mapped fields via ``multi_match``.

    Args:
        query: free-text query string.
        size: maximum number of hits to return.
        fields: names of the document fields to search. Defaults to the
            two text fields that hold the product content.

    Returns:
        dict: the search request body.
    """
    return {
        "_source": ["id"],
        "from": 0,
        "size": size,
        "query": {"multi_match": {"query": query, "fields": list(fields)}},
    }


def request_match_all():
    """Return a new query body that matches every document."""
    return {"query": {"match_all": {}}}

In [3]:
class Elasticsearch_service:
    """Small wrapper around an ``Elasticsearch`` client bound to one index.

    Attributes are set in ``__init__``:
        ip: host passed to the client.
        es: the underlying ``Elasticsearch`` client.
        timeout: value forwarded as ``request_timeout`` on create, bulk and
            search calls.
        index_name: index every method operates on.
    """

    def __init__(self, index_name, ip="localhost", timeout=1000):
        """Create the client for ``ip`` and remember the target index."""
        self.ip = ip
        self.es = Elasticsearch(hosts=ip)
        self.timeout = timeout
        self.index_name = index_name

    def create_index(self):
        """Create the index using the body from ``request_create_index``.

        Returns:
            The ``acknowledged`` value of the response, or ``False`` when
            the create call raises (for example because the index exists).
        """
        request_body = request_create_index()
        try:
            ret = self.es.indices.create(
                index=self.index_name, body=request_body, request_timeout=self.timeout
            )
        except Exception:
            # Best-effort by design, but no longer swallows
            # KeyboardInterrupt/SystemExit like a bare ``except`` did.
            return False
        return ret["acknowledged"]

    def indexing(self, list_json_text):
        """Bulk-index records into the index.

        Args:
            list_json_text: iterable of records accepted by
                ``create_request_document``.

        Returns:
            bool: ``True`` when the bulk helper completes, ``False`` when it
            raises.
        """
        request_body = [create_request_document(doc, self.index_name) for doc in list_json_text]
        try:
            helpers.bulk(self.es, request_body, request_timeout=self.timeout)
        except Exception:
            return False
        return True

    def index_exists(self):
        """Return the client's answer to whether the index exists."""
        return self.es.indices.exists(index=self.index_name)

    def delete_index(self):
        """Delete the index; errors from the client propagate."""
        self.es.indices.delete(index=self.index_name)

    def search(self, query, topk=40):
        """Search the index and return the ``id`` of each hit.

        Args:
            query: free-text query string.
            topk: maximum number of hits requested.

        Returns:
            list: the ``_source.id`` values in the order the hits came back.

        Raises:
            ValueError: if the search call fails; the original exception is
                attached as ``__cause__`` so the root cause stays visible.
        """
        request_body = request_search(query, topk)
        try:
            text_return = self.es.search(
                body=request_body, index=self.index_name, request_timeout=self.timeout
            )
        except Exception as exc:
            raise ValueError("Search Error!") from exc

        return [hit["_source"]["id"] for hit in text_return["hits"]["hits"]]

### Início

In [4]:
# Load the products CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/produtos.csv")

In [5]:
# Replace every missing value in the frame with an empty string, so the
# title/description cells handed to the tokenizer below are never NaN.
df = df.fillna("")

### Tokenização

In [6]:
# Tokenize with the Portuguese Punkt model (word_tokenize defaults to English),
# matching the Portuguese stopword list and RSLP stemmer used later on.
# Applying per column avoids building a row object for every record.
df['tokenized_title'] = df['title'].apply(
    lambda text: nltk.word_tokenize(text, language='portuguese')
)
df['tokenized_description'] = df['description'].apply(
    lambda text: nltk.word_tokenize(text, language='portuguese')
)

In [7]:
# Preview the first rows to check the tokenized columns.
df.head()

Unnamed: 0,doc_id,title,description,consulta_1,consulta_2,tokenized_title,tokenized_description
0,0,"Samsung Smart TV 55"" Neo QLED 4K 55QN85A Mini ...",,1,0,"[Samsung, Smart, TV, 55, '', Neo, QLED, 4K, 55...",[]
1,1,"Smart TV LED LG 24"" Monitor Wi-Fi Webos 3.5 DT...","Smart TV Monitor LG 24"" LED Wi-Fi webOS 3.5 DT...",0,0,"[Smart, TV, LED, LG, 24, '', Monitor, Wi-Fi, W...","[Smart, TV, Monitor, LG, 24, '', LED, Wi-Fi, w..."
2,2,"Samsung Smart TV 50"" QLED 4K The Frame 2021 50...",,0,0,"[Samsung, Smart, TV, 50, '', QLED, 4K, The, Fr...",[]
3,3,"Android Tv Led 43"" Tcl 43s6500 Bluetooth, Cont...",,0,0,"[Android, Tv, Led, 43, '', Tcl, 43s6500, Bluet...",[]
4,4,Smart TV LED 55” Philco PTV55Q20SNBL Ultra HD ...,,0,0,"[Smart, TV, LED, 55, ”, Philco, PTV55Q20SNBL, ...",[]


In [8]:
def remove_stopwords(palavras):
    """Lowercase the tokens and drop those found in ``stop_words``.

    Args:
        palavras: iterable of string tokens.

    Returns:
        list: the lowercased tokens that are not stopwords, in input order.
    """
    minusculas = (palavra.lower() for palavra in palavras)
    return [palavra for palavra in minusculas if palavra not in stop_words]

In [9]:
# Shared stemmer instance, created once and reused for every token.
stemmer = nltk.stem.RSLPStemmer()


def stemming(palavras):
    """Return the stem of each token, keeping the input order.

    Args:
        palavras: iterable of string tokens.

    Returns:
        list: ``stemmer.stem(token)`` for every token.
    """
    return [stemmer.stem(palavra) for palavra in palavras]

In [10]:
def create_base(df_prod, processings):
    """Build a text base from the pre-tokenized product columns.

    Args:
        df_prod: frame with ``doc_id``, ``tokenized_title`` and
            ``tokenized_description`` columns (the last two hold token lists).
        processings: collection of step names; ``"stopwords"`` and
            ``"stemming"`` are recognised, anything else is ignored.

    Returns:
        pandas.DataFrame: columns ``doc_id``, ``title`` and ``description``,
        where the text columns are the processed tokens joined by spaces.
    """
    # Stopword removal always runs before stemming when both are requested.
    steps = []
    if "stopwords" in processings:
        steps.append(remove_stopwords)
    if "stemming" in processings:
        steps.append(stemming)

    tokens_by_column = {
        "title": df_prod['tokenized_title'],
        "description": df_prod['tokenized_description'],
    }
    for step in steps:
        tokens_by_column = {
            column: tokens.apply(step)
            for column, tokens in tokens_by_column.items()
        }

    return pd.DataFrame({
        "doc_id": df_prod['doc_id'],
        "title": tokens_by_column["title"].str.join(" "),
        "description": tokens_by_column["description"].str.join(" "),
    })

In [11]:
# One base per preprocessing variant: untouched tokens, stopwords removed,
# stemmed, and stopwords removed followed by stemming.
Base1 = create_base(df_prod=df, processings=[])
Base2 = create_base(df_prod=df, processings=['stopwords'])
Base3 = create_base(df_prod=df, processings=['stemming'])
Base4 = create_base(df_prod=df, processings=['stopwords', 'stemming'])

### Base 1

In [12]:
# Convert Base 1 into a list of per-row dicts, the shape indexing() iterates over.
b1 = Base1.to_dict('records')

Criar índice da base 1

In [13]:
# Bind the service wrapper to the "base_1" index and create it.
# create_index() returns False if the create call raises.
es = Elasticsearch_service("base_1")
es.create_index()

True

In [14]:
# Bulk-index the Base 1 records; indexing() returns False if the bulk call raises.
es.indexing(b1)

True

### Base 2

In [15]:
# Convert Base 2 into a list of per-row dicts, the shape indexing() iterates over.
b2 = Base2.to_dict('records')

Criar índice da base 2

In [16]:
# Rebind `es` to the "base_2" index and create it.
# create_index() returns False if the create call raises.
es = Elasticsearch_service("base_2")
es.create_index()

True

In [17]:
# Bulk-index the Base 2 records; indexing() returns False if the bulk call raises.
es.indexing(b2)

True

### Base 3

In [18]:
# Convert Base 3 into a list of per-row dicts, the shape indexing() iterates over.
b3 = Base3.to_dict('records')

Criar índice da base 3

In [20]:
# Rebind `es` to the "base_3" index and create it.
# create_index() returns False if the create call raises.
es = Elasticsearch_service("base_3")
es.create_index()

True

In [21]:
# Bulk-index the Base 3 records; indexing() returns False if the bulk call raises.
es.indexing(b3)

True

### Base 4

In [22]:
# Convert Base 4 into a list of per-row dicts, the shape indexing() iterates over.
b4 = Base4.to_dict('records')

Criar índice da base 4

In [24]:
# Rebind `es` to the "base_4" index and create it.
# create_index() returns False if the create call raises.
es = Elasticsearch_service("base_4")
es.create_index()

True

In [25]:
# Bulk-index the Base 4 records; indexing() returns False if the bulk call raises.
es.indexing(b4)

True