Analyzer
==


Tokenizer
-------------

Tokenizers | Elasticsearch Reference [5.1] | Elastic

https://www.elastic.co/guide/en/elasticsearch/reference/5.1/analysis-tokenizers.html

### Word Oriented Tokenizers

* standard
* letter
* lowercase
* whitespace
* uax_url_email
* classic
* thai

### Partial Word Tokenizers

* ngram
* edge_ngram

### Structured Text Tokenizers

* keyword
* pattern
* path_hierarchy



In [92]:
# -*- coding: utf-8 -*-
%load_ext autoreload
%autoreload 2

import sys
import json
import pprint
sys.path.append("lib/")
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch
import graphviz as gv

# Index name and type name constants for this notebook.
INDEX = "my-index"
TYPE_NAME = "my-type"

# Shared Elasticsearch client, constructed with the host string "localhost:9200".
es = Elasticsearch("localhost:9200")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [93]:
def analyzer(index, analyzer, char_filters, tokenizer, filters, text, explain=False):
    """Call ``es.indices.analyze`` for ``text`` and return its response.

    Uses the module-level ``es`` client.

    Args:
        index: Index name passed as ``index``.
        analyzer: Not forwarded to the request; the value is ignored.
        char_filters: Passed through as ``char_filters``.
        tokenizer: Passed through as ``tokenizer``.
        filters: Passed through as ``filters``.
        text: The text to analyze.
        explain: Passed through as ``explain`` (defaults to False).

    Returns:
        Whatever ``es.indices.analyze`` returns; the cells in this notebook
        read its ``"tokens"`` list.
    """
    return es.indices.analyze(
        explain=explain,
        filters=filters,
        tokenizer=tokenizer,
        char_filters=char_filters,
        text=text,
        index=index,
    )

Structured Text Tokenizers: keyword
-----------


Structured Text Tokenizers: pattern
-----------


Structured Text Tokenizers: path_hierarchy
-----------


In [105]:
# Analyze a sentence with the "keyword" tokenizer (no char filters, no token
# filters) and print only the token strings.
text = "I have a pen. I have an apple."

tokenizer = "keyword"
char_filters = []
filters = []

analysis = analyzer(INDEX, analyzer, char_filters, tokenizer, filters, text, explain=False)
token_texts = [entry["token"] for entry in analysis["tokens"]]
print(json.dumps(token_texts, indent=4, ensure_ascii=False))

[
    "I have a pen. I have an apple."
]


In [104]:
# Analyze a sentence with the "pattern" tokenizer (no char filters, no token
# filters) and print only the token strings.
text = "I have a pen. I have an apple. Yes, I have."

tokenizer = "pattern"
char_filters = []
filters = []

analysis = analyzer(INDEX, analyzer, char_filters, tokenizer, filters, text, explain=False)
token_texts = [entry["token"] for entry in analysis["tokens"]]
print(json.dumps(token_texts, indent=4, ensure_ascii=False))

[
    "I", 
    "have", 
    "a", 
    "pen", 
    "I", 
    "have", 
    "an", 
    "apple", 
    "Yes", 
    "I", 
    "have"
]


In [99]:
# Index settings that register a custom tokenizer named "I_tokenizer":
# type "pattern" with the pattern "I".
body="""
{
  "settings": {
    "analysis": {
      "tokenizer": {
        "I_tokenizer": {
          "type": "pattern",
          "pattern": "I"
        }
      }
    }
  }
}
"""

# The index is closed around the settings update and reopened afterwards.
# NOTE(review): presumably because analysis settings cannot be changed on an
# open index -- confirm against the Elasticsearch docs.
es.indices.close(index=INDEX)
try:
    es.indices.put_settings(index=INDEX, body=body)
finally:
    # Reopen even when the update fails; otherwise the index stays closed and
    # every later cell that uses it breaks.
    reopen_response = es.indices.open(index=INDEX)

# Keep the open() response as the cell's displayed result.
reopen_response


{u'acknowledged': True}

In [103]:
# Analyze the sentence with the custom "I_tokenizer" registered in the index
# settings (no char filters, no token filters) and print the token strings.
text = "I have a pen. I have an apple. Yes, I have."

tokenizer = "I_tokenizer"
char_filters = []
filters = []

analysis = analyzer(INDEX, analyzer, char_filters, tokenizer, filters, text, explain=False)
token_texts = [entry["token"] for entry in analysis["tokens"]]
print(json.dumps(token_texts, indent=4, ensure_ascii=False))

[
    " have a pen. ", 
    " have an apple. Yes", 
    " ", 
    " have."
]


In [108]:
# Use the trim token filter to strip leading/trailing whitespace from tokens.
text = "I have a pen. I have an apple. Yes, I have."

tokenizer = "I_tokenizer"
char_filters = []
filters = ["trim"]

analysis = analyzer(INDEX, analyzer, char_filters, tokenizer, filters, text, explain=False)
token_texts = [entry["token"] for entry in analysis["tokens"]]
print(json.dumps(token_texts, indent=4, ensure_ascii=False))

[
    "have a pen.", 
    "have an apple. Yes", 
    "", 
    "have."
]


In [56]:
# Analyze a Japanese phrase with "kuromoji_tokenizer" (no char filters, no
# token filters) and print the full analyze response.
char_filters=[]
tokenizer = "kuromoji_tokenizer"
filters=[]

text="新宿三丁目"

# `print_analyzer` is not defined in the cells above, so this cell failed on a
# fresh kernel. Call the `analyzer` helper directly and pretty-print the whole
# response (offsets, type, position) instead.
print(json.dumps(analyzer(INDEX, analyzer, char_filters, tokenizer, filters, text), indent=4, ensure_ascii=False))

{
    "tokens": [
        {
            "end_offset": 2, 
            "token": "新宿", 
            "type": "word", 
            "start_offset": 0, 
            "position": 0
        }, 
        {
            "end_offset": 3, 
            "token": "三", 
            "type": "word", 
            "start_offset": 2, 
            "position": 1
        }, 
        {
            "end_offset": 5, 
            "token": "丁目", 
            "type": "word", 
            "start_offset": 3, 
            "position": 2
        }
    ]
}


In [57]:
# Analyze the same Japanese phrase with the "standard" tokenizer (no char
# filters, no token filters) and print the full analyze response.
char_filters=[]
tokenizer = "standard"
filters=[]

text="新宿三丁目"

# `print_analyzer` is not defined in the cells above, so this cell failed on a
# fresh kernel. Call the `analyzer` helper directly and pretty-print the whole
# response (offsets, type, position) instead.
print(json.dumps(analyzer(INDEX, analyzer, char_filters, tokenizer, filters, text), indent=4, ensure_ascii=False))

{
    "tokens": [
        {
            "end_offset": 1, 
            "token": "新", 
            "type": "<IDEOGRAPHIC>", 
            "start_offset": 0, 
            "position": 0
        }, 
        {
            "end_offset": 2, 
            "token": "宿", 
            "type": "<IDEOGRAPHIC>", 
            "start_offset": 1, 
            "position": 1
        }, 
        {
            "end_offset": 3, 
            "token": "三", 
            "type": "<IDEOGRAPHIC>", 
            "start_offset": 2, 
            "position": 2
        }, 
        {
            "end_offset": 4, 
            "token": "丁", 
            "type": "<IDEOGRAPHIC>", 
            "start_offset": 3, 
            "position": 3
        }, 
        {
            "end_offset": 5, 
            "token": "目", 
            "type": "<IDEOGRAPHIC>", 
            "start_offset": 4, 
            "position": 4
        }
    ]
}
