In [1]:
from datetime import datetime
from elasticsearch import helpers, Elasticsearch
import csv
import csv2es



In [2]:
# Shared client object used by every cell below. No host is passed, so the
# client library's default connection target applies.
# timeout=1000 is far above a typical request timeout — presumably to keep the
# large bulk loads further down from timing out; confirm the unit (seconds?)
# against the elasticsearch-py documentation.
es = Elasticsearch(timeout=1000)

In [None]:
# Declare three fields on the "wordcloud_data" index: a date, a text field and
# an integer counter.
# NOTE(review): this cell has no execution count and "wordcloud_data" does not
# appear in the alias listing recorded further down, so it looks like a
# leftover from another experiment — confirm before running it.
# NOTE(review): the recorded search hits report `_type: _doc`; a custom
# doc_type such as "my-type" may be rejected by this cluster — verify.
es.indices.put_mapping(
    index="wordcloud_data",
    doc_type="my-type",
    body={
        "properties": {  
            "date": {"type":"date"},
            "word_data": {"type": "text"},
            "word_count": {"type": "integer"}
        }
    }
)

In [43]:
# category_index_3: `en_label` is indexed with a lowercase-only analyzer,
# searched with a lowercase + English-stop-word analyzer, and quoted phrases
# are searched with the lowercase-only analyzer again.
lowercase_analyzer = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["lowercase"],
}
lowercase_stop_analyzer = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["lowercase", "english_stop"],
}
english_stop_filter = {"type": "stop", "stopwords": "_english_"}

category_index_3_body = {
    "settings": {
        "analysis": {
            "analyzer": {
                "my_analyzer": lowercase_analyzer,
                "my_stop_analyzer": lowercase_stop_analyzer,
            },
            "filter": {"english_stop": english_stop_filter},
        }
    },
    "mappings": {
        "properties": {
            "en_label": {
                "type": "text",
                "analyzer": "my_analyzer",
                "search_analyzer": "my_stop_analyzer",
                "search_quote_analyzer": "my_analyzer",
            }
        }
    },
}

# ignore=400 makes the client swallow HTTP 400 responses instead of raising;
# drop it if you want a failed creation to be reported.
es.indices.create(index='category_index_3', body=category_index_3_body, ignore=400)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'category_index_3'}

In [135]:
# category_index_4: `en_label` is indexed through an edge n-gram filter
# (prefixes of 1 to 20 characters) and searched with the standard analyzer.
edge_ngram_filter = {
    "type": "edge_ngram",
    "min_gram": 1,
    "max_gram": 20,
}
autocomplete_analyzer = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["lowercase", "autocomplete_filter"],
}

category_index_4_body = {
    "settings": {
        "analysis": {
            "filter": {"autocomplete_filter": edge_ngram_filter},
            "analyzer": {"autocomplete": autocomplete_analyzer},
        }
    },
    "mappings": {
        "properties": {
            "en_label": {
                "type": "text",
                "analyzer": "autocomplete",
                "search_analyzer": "standard",
            }
        }
    },
}

# ignore=400 makes the client swallow HTTP 400 responses instead of raising;
# drop it if you want a failed creation to be reported.
es.indices.create(index='category_index_4', body=category_index_4_body, ignore=400)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'category_index_4'}

In [191]:
# category_index_5: same analysis chain as the edge n-gram index, but with
# prefixes capped at 10 characters and `en_label` mapped as a "completion"
# field instead of "text".
edge_ngram_filter = {
    "type": "edge_ngram",
    "min_gram": 1,
    "max_gram": 10,
}
autocomplete_analyzer = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["lowercase", "autocomplete_filter"],
}

category_index_5_body = {
    "settings": {
        "analysis": {
            "filter": {"autocomplete_filter": edge_ngram_filter},
            "analyzer": {"autocomplete": autocomplete_analyzer},
        }
    },
    "mappings": {
        "properties": {
            "en_label": {
                "type": "completion",
                "analyzer": "autocomplete",
                "search_analyzer": "standard",
            }
        }
    },
}

# ignore=400 makes the client swallow HTTP 400 responses instead of raising;
# drop it if you want a failed creation to be reported.
es.indices.create(index='category_index_5', body=category_index_5_body, ignore=400)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'category_index_5'}

In [None]:
# Stream every row of the comma-separated file into `category_index`; each
# DictReader row (a dict keyed by the header line) is passed to helpers.bulk
# as one document.
# newline='' is what the csv module requires of files handed to it; without it
# newlines embedded in quoted fields are not parsed correctly.
# NOTE(review): no encoding is given, so the platform default is used — confirm
# it matches the file, which carries Hungarian and Slovak labels.
with open('./Data/parsed_categories.csv', newline='') as f:
    reader = csv.DictReader(f)
    helpers.bulk(es, reader, index='category_index', doc_type='category')

In [68]:
# Stream every row of the tab-separated file into `category_index_2`; each
# DictReader row (a dict keyed by the header line) is passed to helpers.bulk
# as one document.
# newline='' is what the csv module requires of files handed to it; without it
# newlines embedded in quoted fields are not parsed correctly.
# NOTE(review): no encoding is given, so the platform default is used — confirm
# it matches the file, which carries Hungarian and Slovak labels.
with open('./Data/parsed_categories_2.tsv', newline='') as f:
    reader = csv.DictReader(f, delimiter='\t')
    helpers.bulk(es, reader, index='category_index_2', doc_type='category')

In [44]:
# Stream every row of the tab-separated file into `category_index_3`; each
# DictReader row (a dict keyed by the header line) is passed to helpers.bulk
# as one document.
# newline='' is what the csv module requires of files handed to it; without it
# newlines embedded in quoted fields are not parsed correctly.
# NOTE(review): no encoding is given, so the platform default is used — confirm
# it matches the file, which carries Hungarian and Slovak labels.
with open('./Data/parsed_categories_2.tsv', newline='') as f:
    reader = csv.DictReader(f, delimiter='\t')
    helpers.bulk(es, reader, index='category_index_3')

In [136]:
# Stream every row of the tab-separated file into `category_index_4`; each
# DictReader row (a dict keyed by the header line) is passed to helpers.bulk
# as one document.
# newline='' is what the csv module requires of files handed to it; without it
# newlines embedded in quoted fields are not parsed correctly.
# NOTE(review): no encoding is given, so the platform default is used — confirm
# it matches the file, which carries Hungarian and Slovak labels.
with open('./Data/parsed_categories_2.tsv', newline='') as f:
    reader = csv.DictReader(f, delimiter='\t')
    helpers.bulk(es, reader, index='category_index_4')

In [192]:
# Stream every row of the tab-separated file into `category_index_5`; each
# DictReader row (a dict keyed by the header line) is passed to helpers.bulk
# as one document.
# newline='' is what the csv module requires of files handed to it; without it
# newlines embedded in quoted fields are not parsed correctly.
# NOTE(review): no encoding is given, so the platform default is used — confirm
# it matches the file, which carries Hungarian and Slovak labels.
with open('./Data/parsed_categories_2.tsv', newline='') as f:
    reader = csv.DictReader(f, delimiter='\t')
    helpers.bulk(es, reader, index='category_index_5')

In [193]:
# Refresh the index that was just bulk-loaded, then ask the cat API for its
# document count, requesting the answer as JSON.
loaded_index = 'category_index_5'
es.indices.refresh(index=loaded_index)
es.cat.count(index=loaded_index, params={"format": "json"})

[{'epoch': '1605597702', 'timestamp': '07:21:42', 'count': '995868'}]

In [23]:
# All indices
# The "*" pattern asks for every index name; the response maps each index to
# its aliases and, as the recorded output shows, includes dot-prefixed indices.
es.indices.get_alias("*")

{'.async-search': {'aliases': {}},
 'category_index_3': {'aliases': {}},
 'ilm-history-2-000001': {'aliases': {'ilm-history-2': {'is_write_index': True,
    'is_hidden': True}}},
 '.apm-custom-link': {'aliases': {}},
 'category_index_2': {'aliases': {}},
 '.apm-agent-configuration': {'aliases': {}},
 '.kibana_1': {'aliases': {'.kibana': {}}},
 'category_index': {'aliases': {}},
 '.kibana-event-log-7.9.3-000001': {'aliases': {'.kibana-event-log-7.9.3': {'is_write_index': True}}},
 'category_index_4': {'aliases': {}},
 '.kibana_task_manager_1': {'aliases': {'.kibana_task_manager': {}}}}

In [42]:
# Remove Index
# Drops `category_index_3`; ignore=[400, 404] keeps the client from raising on
# those two HTTP status codes.
# NOTE(review): the execution counts show this ran (In [42]) right before the
# cell that creates the same index (In [43]). Running the notebook top to
# bottom would instead delete the index after it has been created and loaded.
es.indices.delete(index='category_index_3', ignore=[400, 404])

{'acknowledged': True}

In [3]:
def search(index, query, field, client=None):
    """Run a ``query_string`` search and print a numbered summary of each hit.

    Parameters
    ----------
    index : str
        Name of the index to search.
    query : str
        Text placed in the ``query`` slot of the ``query_string`` clause.
    field : list
        Field names placed in the ``fields`` slot of the clause.
    client : optional
        Object exposing ``search(index=..., body=...)``. When omitted, the
        notebook-level ``es`` client is used, as before.

    Returns
    -------
    The response object returned by the client's ``search`` call, unchanged.

    Notes
    -----
    A label that is absent from a hit's ``_source`` is printed as an empty
    string instead of raising ``KeyError``.
    """
    if client is None:
        client = es

    res = client.search(index=index, body={
        "query": {
            "query_string": {
                "query": query,
                "fields": field
            }
        }
    })

    # (caption printed to the user, key looked up in the hit's _source)
    summary_fields = (
        ("English Label: ", 'en_label'),
        ("English Broader Categories: ", 'en_broader'),
        ("Hungarian Label: ", 'hu_label'),
        ("Hungarian Broader Categories: ", 'hu_broader'),
        ("Slovak Label: ", 'sk_label'),
        ("Slovak Broader Categories: ", 'sk_broader'),
    )

    # A separate counter name keeps the `index` argument intact.
    for position, hit in enumerate(res['hits']['hits'], start=1):
        source = hit.get('_source', {})
        print(position, ". result")
        for caption, key in summary_fields:
            print(caption, source.get(key, ''))
        print()
    return res

In [6]:
# Try the helper against the edge n-gram index, matching "Mathematics" on the
# `en_label` field only. The call is the last expression in the cell, so the
# returned response is also echoed below the printed summary.
search("category_index_4", "Mathematics", ["en_label"])

1 . result
English Label:  Mathematics articles related to basic mathematics
English Broader Categories:  
Hungarian Label:  
Hungarian Broader Categories:  
Slovak Label:  
Slovak Broader Categories:  

2 . result
English Label:  Mathematics articles related to applied mathematics
English Broader Categories:  
Hungarian Label:  
Hungarian Broader Categories:  
Slovak Label:  
Slovak Broader Categories:  

3 . result
English Label:  Mathematics articles related to discrete mathematics
English Broader Categories:  
Hungarian Label:  
Hungarian Broader Categories:  
Slovak Label:  
Slovak Broader Categories:  

4 . result
English Label:  Mathematics articles related to general mathematics
English Broader Categories:  
Hungarian Label:  
Hungarian Broader Categories:  
Slovak Label:  
Slovak Broader Categories:  

5 . result
English Label:  Mathematics articles related to history of mathematics
English Broader Categories:  
Hungarian Label:  
Hungarian Broader Categories:  
Slovak Label: 

{'took': 828,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 212, 'relation': 'eq'},
  'max_score': 14.80424,
  'hits': [{'_index': 'category_index_4',
    '_type': '_doc',
    '_id': 'o1_R0HUBDVhNv2MgPvdL',
    '_score': 14.80424,
    '_source': {'key': 'Mathematics articles related to basic mathematics',
     'en_db': 'http://dbpedia.org/resource/Category:Mathematics_articles_related_to_basic_mathematics',
     'en_label': 'Mathematics articles related to basic mathematics',
     'en_wiki': 'http://en.wikipedia.org/wiki/Category:Mathematics_articles_related_to_basic_mathematics?oldid=332898130',
     'en_broader': '',
     'hu_db': '',
     'hu_label': '',
     'hu_wiki': '',
     'hu_broader': '',
     'sk_db': '',
     'sk_label': '',
     'sk_wiki': '',
     'sk_broader': ''}},
   {'_index': 'category_index_4',
    '_type': '_doc',
    '_id': 'pl_R0HUBDVhNv2MgPvdL',
    '_score': 14.80424,
    '_source': {'ke

In [28]:
# NOTE(review): no top-level cell visible here assigns `res` (the `res` inside
# search() is local to that function), so this cell appears to rely on kernel
# state left over from an earlier session — it would likely raise NameError
# after Restart & Run All. Assign the result of a search() call first, or
# remove this cell.
res

{}