In [None]:
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch, helpers
import json
import time

In [None]:
# Parse the JSON file into `data`. The context manager closes the file handle
# as soon as parsing is done instead of leaving it open for the kernel's lifetime.
with open('.../data_formular/formular_norm.txt') as formula_file:
    data = json.load(formula_file)

In [None]:
# Show the object parsed from formular_norm.txt as the cell output.
data

In [None]:
# Parse the JSON file into `mathematical_symbols`. The context manager closes
# the file handle as soon as parsing is done.
with open('.../data_formular/mathematical_symbols_latex.txt') as symbols_file:
    mathematical_symbols = json.load(symbols_file)

In [None]:
# Print how many symbols were loaded, followed by the symbols themselves.
for shown in (len(mathematical_symbols), mathematical_symbols):
    print(shown)

In [None]:
# Client for the Elasticsearch node on localhost:9200, with a 300-second timeout.
es = Elasticsearch(
    hosts=[{'host': 'localhost', 'port': 9200}],
    timeout=300,
)

In [None]:
# Index definition: a custom BM25 similarity named "my_bm25" plus the mapping
# of the "example_bm25" document type.
request_body = {
    "settings": {
        "similarity": {
            "my_bm25": {
                "type": "BM25",
                "b": 0.75,
                # The BM25 term-frequency saturation option is called "k1".
                # Spelled "k" it is not a recognised BM25 option, so the
                # intended value 1.25 never replaced the default.
                "k1": 1.25
            }
        }
    },
    "mappings": {
        "example_bm25": {
            "properties": {
                # Numeric field: "similarity" is a relevance-scoring option for
                # text fields and is therefore not set here.
                "stt": {"type": "long"},
                "formular": {"type": "text", "similarity": "my_bm25"}
            }
        }
    }
}

In [None]:
# Create the index "idx_formular_02" with the settings and mappings held in
# request_body.
es.indices.create(index = "idx_formular_02", body = request_body)

In [None]:
# Build one bulk action per entry of `data`. The 1-based position of the entry
# is used both as the document _id and as its "stt" field.
bulk_data = [
    {
        "_index": "idx_formular_02",
        "_type" : "example_bm25",
        "_id": position,
        "_source": {
            "stt": position,
            "formular": formula
        }
    }
    for position, formula in enumerate(data, 1)
]

In [None]:
# Show the prepared bulk actions as the cell output.
bulk_data

In [None]:
# Send every prepared action in bulk_data to Elasticsearch through the bulk helper.
helpers.bulk(es, bulk_data)

In [None]:
# Refresh the index so the documents written by the bulk call are visible to
# the searches that follow.
es.indices.refresh(index = "idx_formular_02")

In [None]:
# Fetch the document with id 5 from the index and show the response.
es.get(index = "idx_formular_02", doc_type = "example_bm25", id = 5)

In [None]:
# Run a "match" search on the formular field, then print the id, score and
# formula of every returned hit.
match_body = {"query": {"match": {"formular": "\\sqrt { 2 ^ { 2 } .3 ^ { 4 } }"}}}
a = es.search(index='idx_formular_02', doc_type="example_bm25", body=match_body)
for hit in a['hits']['hits']:
    formula = hit["_source"]["formular"]
    print("id : {}, score: {}, formular: {}".format(hit["_id"], hit["_score"], formula))

In [None]:
# Same "match" search with a different formula; print id, score and formula
# of every returned hit.
match_body = {"query": {"match": {"formular": "\\sqrt { 2 ^ { 4 } .3 }"}}}
a = es.search(index='idx_formular_02', doc_type="example_bm25", body=match_body)
for hit in a['hits']['hits']:
    formula = hit["_source"]["formular"]
    print("id : {}, score: {}, formular: {}".format(hit["_id"], hit["_score"], formula))

In [None]:
# Keep the query text in query_string (later cells read it) and run a plain
# "match" search with it.
query_string = "\\frac { 3 } { 2 + a }"
match_body = {"query": {"match": {"formular": query_string}}}
a = es.search(index='idx_formular_02', doc_type="example_bm25", body=match_body)
for hit in a['hits']['hits']:
    formula = hit["_source"]["formular"]
    print("id : {}, score: {}, formular: {}".format(hit["_id"], hit["_score"], formula))

In [None]:
# For each whitespace-separated token of the query, echo the token when it
# contains a backslash and print 0 otherwise.
for token in query_string.split():
    print(token if '\\' in token else 0)

In [None]:
query_string = "\\frac { 3 } { 2 + a }"
# The same match clause, now wrapped as the only "must" clause of a bool query.
must_clause = {"match": {"formular": query_string}}
a = es.search(
    index='idx_formular_02',
    doc_type="example_bm25",
    body={"query": {"bool": {"must": must_clause}}},
)
for hit in a['hits']['hits']:
    formula = hit["_source"]["formular"]
    print("id : {}, score: {}, formular: {}".format(hit["_id"], hit["_score"], formula))

In [None]:
# math_symbols_in_query: the whitespace-separated tokens of the query that are
# known mathematical symbols, in query order. These are the terms the search
# should favour.
math_symbols_in_query = [
    token for token in query_string.split() if token in mathematical_symbols
]
print(math_symbols_in_query)
# math_symbols_not_in_query: every known mathematical symbol that does NOT
# occur in the query. A plain set difference states that intent directly; the
# earlier symmetric difference only gave the same result because the first list
# is a subset of the symbols. Sorting keeps the printed list stable across
# kernel restarts, since set iteration order is not.
math_symbols_not_in_query = sorted(
    set(mathematical_symbols) - set(math_symbols_in_query)
)
print(len(math_symbols_not_in_query))
print(math_symbols_not_in_query)

In [None]:
# Quick check: print 1 when "\sqrt" is among the symbols absent from the query.
if "\\sqrt" in math_symbols_not_in_query:
    print(1)

In [None]:
# Preview the space-separated string built from the symbols absent from the query.
" ".join(math_symbols_not_in_query)

In [None]:
query_string = "\\frac { 3 } { 2 + a }"
# Bool query: documents must match the query text and must not contain any
# mathematical symbol that is absent from the query.
a = es.search(index = 'idx_formular_02', doc_type = "example_bm25", body = {
    "query": {
        "bool": {
            "must": { "match": {
                "formular": query_string
            }},
            "must_not":{
                "match": {
                    # The exclusion has to target the indexed "formular" field.
                    # The field "about" used before is not part of this index's
                    # mapping, so the clause never excluded anything.
                    "formular": " ".join(math_symbols_not_in_query)
                }
            }
        }
    }
})
for i in a['hits']['hits']:
    print("id : {}, score: {}, formular: {}".format(i["_id"], i["_score"], i["_source"]["formular"]))

In [None]:
# Preview the space-separated string built from the symbols found in the query.
" ".join(math_symbols_in_query)

In [None]:
query_string = "\\frac { 3 } { 2 + a }"
# Bool query: score by the query text, keep only documents containing every
# mathematical symbol of the query, and drop documents containing any other
# known mathematical symbol.
a = es.search(index = 'idx_formular_02', doc_type = "example_bm25", body = {
    "query": {
        "bool": {
            "must": { "match": {
                "formular": query_string
            }},
            "filter": {
                # "formular" is an analyzed text field. A term query looks its
                # value up verbatim, and a space-joined symbol string is never
                # a single indexed token, so that filter could not match. A
                # match clause analyzes the value like the indexed text, and
                # "operator": "and" requires every symbol to be present.
                "match": {
                    "formular": {
                        "query": " ".join(math_symbols_in_query),
                        "operator": "and",
                        # If no symbols are found, or analysis removes them
                        # all, do not filter everything out.
                        "zero_terms_query": "all"
                    }
                }
            },
            "must_not":{
                "match": {
                    # Target the indexed "formular" field. "about" is not part
                    # of this index's mapping, so it never excluded anything.
                    "formular": " ".join(math_symbols_not_in_query)
                }
            }
        }
    }
})
for i in a['hits']['hits']:
    print("id : {}, score: {}, formular: {}".format(i["_id"], i["_score"], i["_source"]["formular"]))

In [None]:
# Show the raw response of the most recent search as the cell output.
a

In [None]:
query_string = "\\frac { 3 } { 2 + a }"
# Bool query: score by the query text and keep only documents whose formula
# contains the \frac symbol.
a = es.search(index = 'idx_formular_02', doc_type = "example_bm25", body = {
    "query": {
        "bool": {
            "must": { "match": {
                "formular": query_string
            }},
            "filter": {
                # "formular" is an analyzed text field. A term query compares
                # its value verbatim against the indexed tokens and is not
                # suited to text fields. A match clause runs the value through
                # the field's analyzer first.
                "match": {
                    "formular": "\\frac"
                }
            }
        }
    }
})
for i in a['hits']['hits']:
    print("id : {}, score: {}, formular: {}".format(i["_id"], i["_score"], i["_source"]["formular"]))

In [None]:
# Show the raw response of the most recent search as the cell output.
a

In [None]:
# Clean up: delete the index "idx_formular_02" created earlier in this notebook.
es.indices.delete(index = "idx_formular_02")