In [1]:
import json
from elasticsearch import Elasticsearch, helpers
import pandas as pd
import numpy as np

In [2]:
# Client for the node at localhost:9200, constructed with timeout=300
es = Elasticsearch(
    hosts=[{'host': 'localhost', 'port': 9200}],
    timeout=300,
)

In [3]:
# Load the formulas from the JSON file. The context manager closes the
# file handle as soon as parsing finishes instead of leaving it open until
# garbage collection.
with open(".../data_formular/formular.txt") as formular_file:
    data = json.load(formular_file)

In [4]:
# Peek at the first five loaded entries
data[:5]

['\\sqrt { 2 x + 7 }',
 '\\sqrt { \\frac { 1 } { - 1 + x } }',
 '\\sqrt { - 3 x + 4 }',
 '\\sqrt { 1 + x ^ { 2 } }',
 '\\sqrt { 0,09.64 }']

In [5]:
# Keep only the formulas that split into more than two whitespace-separated
# tokens, then echo every kept formula in its original order.
data_filter = [formula for formula in data if len(formula.split()) > 2]
for formula in data_filter:
    print(formula)

\sqrt { 2 x + 7 }
\sqrt { \frac { 1 } { - 1 + x } }
\sqrt { - 3 x + 4 }
\sqrt { 1 + x ^ { 2 } }
\sqrt { 0,09.64 }
\sqrt { 12,1.360 }
\sqrt { 2 ^ { 2 } .3 ^ { 4 } }
\sqrt { 0,36 a ^ { 2 } }
a < 0
\hat { a } \% _ { 0 }
a > 1
\frac { 1 } { a - b }
a > b
\sqrt { \frac { 289 } { 225 } }
\sqrt { \frac { 14 } { 25 } }
\sqrt { \frac { 0,25 } { 9 } }
\sqrt { \frac { 8,1 } { 1,6 } }
\sqrt { 4 x ^ { 2 } + 4 x + 1 } = 6
\sqrt { 9,11 }
\sqrt { 39,82 }
\sqrt { 9,119 } \approx 3,019
\sqrt { 911,9 }
\sqrt { 91190 }
\sqrt { 0,09119 }
\sqrt { 0,0009119 }
\frac { 5 } { 3 \sqrt { 8 } } ; \frac { 2 } { \sqrt { b } }
\frac { 5 } { 5 - 2 \sqrt { 3 } } ; \frac { 2 a } { 1 - \sqrt { a } }
a \geq 0
a \neq 1
\frac { 4 } { \sqrt { 7 } + \sqrt { 5 } } ; \frac { 6 a } { 2 \sqrt { a } - \sqrt { b } }
\frac { 5 } { \sqrt { 10 } } \frac { 5 } { 2 \sqrt { 5 } } , \frac { 1 } { 3 \sqrt { 20 } } , \quad \frac { 2 \sqrt { 2 } + 2 } { 5 \sqrt { 2 } } , \frac { y + b \cdot \sqrt { y } } { b \cdot \sqrt { y } }
B = \sqrt { 1

In [6]:
# Number of formulas that passed the token-count filter
len(data_filter)

1674

In [7]:
# Index definition: settings plus mapping for the document type.
#
# settings.similarity declares a custom BM25 configuration named "my_bm25"
# (b=0.75, k1=1.25). The "formular" text field refers to it by name.
# The numeric "stt" field no longer declares a similarity: similarity picks
# a text-scoring algorithm and has no meaning on a "long" field.
request_body = {
    "settings": {
        "similarity": {
            "my_bm25": {
                "type": "BM25",
                "b": 0.75,
                "k1": 1.25
            }
        }
    },
    "mappings": {
        "exam_bm25_01": {
            "properties": {
                "stt": {"type": "long"},
                "formular": {"type": "text", "similarity": "my_bm25"}
            }
        }
    }
}

In [8]:
# Create "idx_formular_00" from the settings and mappings in request_body
es.indices.create(
    index="idx_formular_00",
    body=request_body,
)

{'acknowledged': True, 'index': 'idx_formular_00', 'shards_acknowledged': True}

In [9]:
# Build one bulk action per filtered formula.
#
# Ids are 1-based positions within data_filter. enumerate(..., start=1)
# takes them from the list being indexed, rather than from a range sized by
# the unfiltered `data` list that only worked because zip() truncates to the
# shorter input. The resulting actions are identical.
bulk_data = [
    {
        "_index": "idx_formular_00",
        "_type": "exam_bm25_01",
        "_id": position,
        "_source": {
            "stt": position,
            "formular": formula,
        },
    }
    for position, formula in enumerate(data_filter, start=1)
]

In [10]:
# Display the prepared bulk actions
# NOTE(review): this renders the whole list; a slice such as bulk_data[:3] would keep the output short
bulk_data

[{'_id': 1,
  '_index': 'idx_formular_00',
  '_source': {'formular': '\\sqrt { 2 x + 7 }', 'stt': 1},
  '_type': 'exam_bm25_01'},
 {'_id': 2,
  '_index': 'idx_formular_00',
  '_source': {'formular': '\\sqrt { \\frac { 1 } { - 1 + x } }', 'stt': 2},
  '_type': 'exam_bm25_01'},
 {'_id': 3,
  '_index': 'idx_formular_00',
  '_source': {'formular': '\\sqrt { - 3 x + 4 }', 'stt': 3},
  '_type': 'exam_bm25_01'},
 {'_id': 4,
  '_index': 'idx_formular_00',
  '_source': {'formular': '\\sqrt { 1 + x ^ { 2 } }', 'stt': 4},
  '_type': 'exam_bm25_01'},
 {'_id': 5,
  '_index': 'idx_formular_00',
  '_source': {'formular': '\\sqrt { 0,09.64 }', 'stt': 5},
  '_type': 'exam_bm25_01'},
 {'_id': 6,
  '_index': 'idx_formular_00',
  '_source': {'formular': '\\sqrt { 12,1.360 }', 'stt': 6},
  '_type': 'exam_bm25_01'},
 {'_id': 7,
  '_index': 'idx_formular_00',
  '_source': {'formular': '\\sqrt { 2 ^ { 2 } .3 ^ { 4 } }', 'stt': 7},
  '_type': 'exam_bm25_01'},
 {'_id': 8,
  '_index': 'idx_formular_00',
  '_sour

In [11]:
# Index every prepared action through the bulk helper
helpers.bulk(client=es, actions=bulk_data)

(1674, [])

In [12]:
# Refresh the index before querying it
es.indices.refresh(index="idx_formular_00")

{'_shards': {'failed': 0, 'successful': 5, 'total': 10}}

In [13]:
# Run a match query on the "formular" field; the raw response is kept in `a`
a = es.search(
    index="idx_formular_00",
    body={"query": {"match": {"formular": "\\frac { 3 } { 2 + a }"}}},
)

In [14]:
# Report the score and formula of each returned hit, with a spacer line after it
for hit in a['hits']['hits']:
    score = hit['_score']
    formula = hit['_source']['formular']
    print("score : {} and formular : {}".format(score, formula))
    print(" ")

score : 6.0467944 and formular : \sqrt { \frac { 2 a } { 3 } }
 
score : 5.9997387 and formular : \sqrt { \frac { 3 } { 2 a ^ { 2 } } }
 
score : 5.8697634 and formular : \frac { 2 a } { \sqrt { 3 } }
 
score : 5.7890425 and formular : \sqrt { \frac { 2 a } { 3 } }
 
score : 5.7078996 and formular : \frac { a } { 3 }
 
score : 5.664493 and formular : - \frac { a } { 3 }
 
score : 5.6155534 and formular : A B = \frac { 2 } { 3 } A C
 
score : 5.1236954 and formular : \frac { 1 } { 2 \sqrt { 3 } + 3 } - \frac { 1 } { 2 \sqrt { 3 } - 3 }
 
score : 5.1206703 and formular : y = \frac { 3 } { 2 } x ^ { 2 } , y = - \frac { 3 } { 2 } x ^ { 2 }
 
score : 5.1124716 and formular : \frac { \sqrt { 3 } } { 2 }
 


In [30]:
# Clean up: remove the "idx_formular_00" index
es.indices.delete(index="idx_formular_00")

{'acknowledged': True}