In [None]:
from elasticsearch import Elasticsearch, RequestsHttpConnection, helpers

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Open a client against the local Elasticsearch node; the generous 300 s
# timeout is passed through to the client as a keyword option.
es = Elasticsearch(
    hosts=[{"host": "localhost", "port": 9200}],
    timeout=300,
)

# create the index

In [None]:
# Index definition sent to `es.indices.create`: shard/replica settings, three
# named similarity configurations, and a mapping that assigns one of those
# similarities to every field of the `examplecase` type.

# Named similarity configurations, referenced by name from the field mappings.
similarity_settings = {
    # DFR (divergence from randomness) with explicit model parameters.
    "my_similarity": {
        "type": "DFR",
        "basic_model": "g",
        "after_effect": "l",
        "normalization": "h2",
        "normalization.h2.c": "3.0",
    },
    # BM25 with no overrides, so its tuning parameters keep their defaults.
    "my_similarity_bm25": {"type": "BM25"},
    # Despite the "_default" suffix, this is configured as the `boolean`
    # similarity type.
    "my_similarity_default": {"type": "boolean"},
}

# Field name -> field definition; each field names the similarity it uses.
field_mappings = {
    "pk": {"type": "text", "similarity": "my_similarity"},
    "name": {"type": "text", "similarity": "my_similarity"},
    # NOTE(review): `similarity` is set on a date field here — confirm the
    # target Elasticsearch version accepts it on non-text fields.
    "date_of_birth": {
        "format": "dateOptionalTime",
        "type": "date",
        "similarity": "my_similarity_default",
    },
    # Free-text field scored with BM25.
    "about": {"type": "text", "similarity": "my_similarity_bm25"},
    "interests": {"type": "text", "similarity": "my_similarity"},
}

request_body = {
    "settings": {
        "number_of_shards": 5,
        "number_of_replicas": 1,
        "index": {"similarity": similarity_settings},
    },
    "mappings": {
        "examplecase": {"properties": field_mappings},
    },
}

In [None]:
# Create the `new_idx` index with the settings and mappings from
# `request_body`; the API response is displayed as the cell output.
es.indices.create(
    index="new_idx",
    body=request_body,
)

In [None]:
# Uncomment the line below to delete the index (e.g. before re-creating it with a changed mapping)
#es.indices.delete(index = 'new_idx')

# prepare data

In [None]:
# Start from an empty frame with four rows (index 0..3); the columns are
# filled in by the following cell.
df = pd.DataFrame(index=range(4))

In [None]:
# Fill the four sample people, one column at a time. `pk` is later reused as
# the Elasticsearch document id; `interests` holds a list of strings per row.
df["pk"] = ["01", "02", "03", "04"]
df["name"] = ["hoang an", "nguyen ha", "Tran nam", "trong tuan"]
df["about"] = [
    "Love to play guitar",
    "I like to collect rock albums",
    "I like to love the dog",
    "Love to play football",
]
df["interests"] = [
    ["sports", "music"],
    ["music", "reading story", "watching anime", "traveling"],
    ["traveling", "shopping", "watching drama"],
    ["sports", "music"],
]

In [None]:
# Display the sample DataFrame to check the rows before indexing them.
df

In [None]:
# Build one bulk action per DataFrame row for `helpers.bulk`.
# Each action targets index `new_idx` / type `examplecase`, uses the row's
# `pk` as the document id, and carries the row's fields as `_source`.
# The source key must be "interests" (plural) so the values land in the
# `interests` field declared in the index mapping; a differently spelled key
# would be indexed as a separate, dynamically mapped field.
bulk_data = [
    {
        "_index": "new_idx",
        "_type": "examplecase",
        "_id": row["pk"],
        "_source": {
            "pk": row["pk"],
            "name": row["name"],
            "about": row["about"],
            "interests": row["interests"],
        },
    }
    for _, row in df.iterrows()
]
        

In [None]:
# Display the list of bulk actions built above before sending it.
bulk_data

# insert data

In [None]:
# Option 1: send all actions through the bulk helper. Per the client docs it
# returns a tuple summarising the number of successes and any errors.
res = helpers.bulk(client=es, actions=bulk_data)
# Option 2 (not used): call the low-level bulk API directly.
# NOTE(review): the low-level API presumably expects alternating
# action/source entries rather than this list of dicts — confirm before use.
#res = es.bulk(index = 'new_idx', body = bulk_data)

In [None]:
# Display the value returned by `helpers.bulk`.
res

In [None]:
# Refresh `new_idx` so the documents just indexed are visible to the
# searches that follow.
es.indices.refresh(index="new_idx")

# search data

In [None]:
# Fetch every document in `new_idx` with a match_all query.
es.search(
    index="new_idx",
    body={
        "query": {
            "match_all": {},
        },
    },
)

In [None]:
# Show the mapping Elasticsearch currently holds for `new_idx`, to compare
# against the mapping that was requested at creation time.
es.indices.get_mapping(index="new_idx")

In [None]:
# Full-text match: documents whose `about` field matches "like".
es.search(
    index="new_idx",
    body={
        "query": {
            "match": {"about": "like"},
        },
    },
)

In [None]:
# Same "like" match on `about`, expressed as the `must` clause of a bool query.
es.search(
    index="new_idx",
    body={
        "query": {
            "bool": {
                "must": {
                    "match": {"about": "like"},
                },
            },
        },
    },
)

In [None]:
# Bool query: `about` must match "like" and must not match the
# space-joined words "collect rock music".
es.search(
    index="new_idx",
    body={
        "query": {
            "bool": {
                "must": {
                    "match": {"about": "like"},
                },
                "must_not": {
                    "match": {
                        "about": " ".join(["collect", "rock", "music"]),
                    },
                },
            },
        },
    },
)

In [None]:
# Display the space-joined string that the previous query passes to `must_not`.
" ".join(["collect", "rock", "music"])

In [None]:
# Same must / must_not query, but the excluded text is passed as one literal
# string that still contains the quotes and commas of a Python list repr.
es.search(
    index="new_idx",
    body={
        "query": {
            "bool": {
                "must": {
                    "match": {"about": "like"},
                },
                "must_not": {
                    "match": {"about": "'collect', 'rock', 'music'"},
                },
            },
        },
    },
)

In [None]:
# Bool query: `about` must match "like", and the result is additionally
# filtered by a term query for the single word "dog" on `about`.
es.search(
    index="new_idx",
    body={
        "query": {
            "bool": {
                "must": {
                    "match": {"about": "like"},
                },
                "filter": {
                    "term": {
                        "about": " ".join(["dog"]),
                    },
                },
            },
        },
    },
)

In [None]:
# Bool query: `about` must match "like", and the result is filtered to
# documents whose `about` contains every one of the filter words.
# A `term` query is not analyzed, so a multi-word value such as "dog the"
# can never equal a single token of the analyzed `about` text field and the
# filter would always return nothing. A `match` filter with operator "and"
# analyzes the text and requires all of its words to be present.
es.search(index = 'new_idx', body = {
    "query": {
        "bool": {
            "must": {
                "match": {"about": "like"}
            },
            "filter": {
                "match": {
                    "about": {
                        "query": " ".join(["dog", "the"]),
                        "operator": "and"
                    }
                }
            }
        }
    }
})

# delete index

In [None]:
# Clean up: drop `new_idx` together with all documents indexed into it.
es.indices.delete(index="new_idx")