# Create a connection to Elasticsearch

In [2]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
print(es.info())

{'name': '239980ad27d0', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'GA3py-85TgiwUpINHtUXXQ', 'version': {'number': '8.11.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '64cf052f3b56b1fd4449f5454cb88aca7e739d9a', 'build_date': '2023-12-08T11:33:53.634979452Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


# Char map filters

## HTML Strip Char Filter (html_strip)

In [4]:
response = es.indices.analyze(
    body={
        "char_filter": ["html_strip"],
        "tokenizer": "standard",
        "text": "<p>Hello <b>World</b>! This is <a href='<http://example.com>'>Elasticsearch</a>.</p>"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['Hello',
 'World',
 'This',
 'is',
 'a',
 'href',
 'http',
 'example.com',
 'Elasticsearch']

## Pattern Replace Char Filter (pattern_replace)

In [5]:
response = es.indices.analyze(
    body={
        "char_filter": [
            {
                "type": "pattern_replace",
                "pattern": "[-_@.]",  # Removes hyphens, underscores, apostrophes
                "replacement": " "
            }
        ],
        "tokenizer": "standard",
        "text": "liu_xiao_guo"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['liu', 'xiao', 'guo']

## Mapping Char Filter (mapping)

In [7]:
response = es.indices.analyze(
    body={
        "tokenizer": "standard",
        "char_filter": [
            {
                "type": "mapping",
                "mappings": [
                    "@gmail.com=>",    # Replace @gmail.com with nothing
                    "$=>dollar",       # Replace $ with dollar
                ]
            }
        ],
        "text": "xiaoguo.liu@gmail.com gives me $"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['xiaoguo.liu', 'gives', 'me', 'dollar']

# Tokenizers

## Standard Tokenizer (standard)

In [8]:
response = es.indices.analyze(
    body={
        "tokenizer": "standard",
        "text": "The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['The',
 '2',
 'QUICK',
 'Brown',
 'Foxes',
 'jumps_over',
 'the',
 'lazy',
 "dog's",
 'bone']

## Letter Tokenizer (letter)

In [9]:
response = es.indices.analyze(
    body={
        "tokenizer": "letter",
        "text": "The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['The',
 'QUICK',
 'Brown',
 'Foxes',
 'jumps',
 'over',
 'the',
 'lazy',
 'dog',
 's',
 'bone']

## Lowercase Tokenizer (lowercase)

In [10]:
response = es.indices.analyze(
    body={
        "tokenizer": "lowercase",
        "text": "The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['the',
 'quick',
 'brown',
 'foxes',
 'jumps',
 'over',
 'the',
 'lazy',
 'dog',
 's',
 'bone']

## Whitespace Tokenizer (whitespace)

In [11]:
response = es.indices.analyze(
    body={
        "tokenizer": "whitespace",
        "text": "The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['The',
 '2',
 'QUICK',
 'Brown-Foxes,',
 'jumps_over',
 'the',
 "lazy-dog's",
 'bone.']

## Classic Tokenizer (classic)

In [12]:
response = es.indices.analyze(
    body={
        "tokenizer": "classic",
        "text": "The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['The',
 '2',
 'QUICK',
 'Brown',
 'Foxes',
 'jumps',
 'over',
 'the',
 'lazy',
 "dog's",
 'bone']

## UAX URL Email Tokenizer (uax_url_email)

In [13]:
response = es.indices.analyze(
    body={
        "tokenizer": "classic",
        "text": "visit https://elasticstack.blog.csdn.net to get the best materials to learn Elastic Stack"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['visit',
 'https',
 'elasticstack.blog.csdn.net',
 'to',
 'get',
 'the',
 'best',
 'materials',
 'to',
 'learn',
 'Elastic',
 'Stack']

## N-Gram Tokenizer (ngram)

In [14]:
response = es.indices.analyze(
    body={
        "tokenizer": {
            "type": "ngram",
            "min_gram": 3,
            "max_gram": 4
        },
        "text": "Hello Xiaoguo"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['Hel',
 'Hell',
 'ell',
 'ello',
 'llo',
 'llo ',
 'lo ',
 'lo X',
 'o X',
 'o Xi',
 ' Xi',
 ' Xia',
 'Xia',
 'Xiao',
 'iao',
 'iaog',
 'aog',
 'aogu',
 'ogu',
 'oguo',
 'guo']

## Edge N-Gram Tokenizer (edge_ngram)

In [15]:
response = es.indices.analyze(
    body={
        "tokenizer": {
                "type": "edge_ngram",
                "min_gram": 4,
                "max_gram": 5,
                "token_chars": ["letter", "digit"]
        },
        "text": "The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['QUIC',
 'QUICK',
 'Brow',
 'Brown',
 'Foxe',
 'Foxes',
 'jump',
 'jumps',
 'over',
 'lazy',
 'bone']

## Keyword Tokenizer (keyword)

In [16]:
response = es.indices.analyze(
    body={
        "tokenizer": "keyword",
        "text": "The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

["The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."]

## Pattern Tokenizer (pattern)

In [17]:
response = es.indices.analyze(
    body={
        "tokenizer": {
          "type": "pattern",
          "pattern": "_+"
        },
        "text": "hello_world_from_elasticsearch"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['hello', 'world', 'from', 'elasticsearch']

## Path Tokenizer (path_hierarchy)

response = es.indices.analyze(
    body={
        "tokenizer": "path_hierarchy",
        "text": "/usr/local/bin/python"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

# Token filters

## Apostrophe

In [29]:
response = es.indices.analyze(
    body={
        "filter": ["apostrophe"],
        "tokenizer": "standard",
        "text": "The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]


['The',
 '2',
 'QUICK',
 'Brown',
 'Foxes',
 'jumps_over',
 'the',
 'lazy',
 'dog',
 'bone']

## Lowercase Filter

In [31]:
response = es.indices.analyze(
    body={
        "filter": ["lowercase"],
        "text": "The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

["the 2 quick brown-foxes, jumps_over the lazy-dog's bone."]

## Uppercase Filter

In [32]:
response = es.indices.analyze(
    body={
        "filter": ["uppercase"],
        "text": "The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

["THE 2 QUICK BROWN-FOXES, JUMPS_OVER THE LAZY-DOG'S BONE."]

## Trim Filter

In [33]:
# Analyze the text using the custom analyzer
response = es.indices.analyze(
    body={
        "tokenizer": "keyword",
        "filter":[
            "lowercase",
            "trim"
         ],
        "text": " The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone. "
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

["the 2 quick brown-foxes, jumps_over the lazy-dog's bone."]

## ASCII Folding Filter (asciifolding)

In [34]:
# Analyze the text using the custom analyzer
response = es.indices.analyze(
    body={
        "filter": ["asciifolding"],
        "text": "Türkiye"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['Turkiye']

## Synonym Filter

In [35]:
# Analyze the text using the custom analyzer
response = es.indices.analyze(
    body={
        "tokenizer": "standard",
        "filter":[
            "lowercase",
            {
              "type": "synonym",
              "synonyms": ["jumps_over => leap"]
            }
         ],
        "text": "The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['the', '2', 'quick', 'brown', 'foxes', 'leap', 'the', 'lazy', "dog's", 'bone']

## Synonym Graph Filter

In [36]:
response = es.indices.analyze(
    body={
        "tokenizer": "standard",
        "filter":[
            "lowercase",
            {
              "type": "synonym_graph",
              "synonyms": ["NYC, New York City", "LA, Los Angeles"]
            }
         ],
        "text": "Flight from LA to NYC has been delayed by an hour"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['flight',
 'from',
 'los',
 'la',
 'angeles',
 'to',
 'new',
 'nyc',
 'york',
 'city',
 'has',
 'been',
 'delayed',
 'by',
 'an',
 'hour']

## Stemmer Filter

In [39]:
response = es.indices.analyze(
    body={
    "tokenizer": "standard",
    "filter": [
        {
            "type": "stemmer",
            "language": "English",
        },
        ],
    "text": "candies, ladies, plays, playing, ran, running, dresses"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['candi', 'ladi', 'plai', 'plai', 'ran', 'run', 'dress']

## KStem Filter

In [42]:
response = es.indices.analyze(
    body={
 "tokenizer": "standard",
 "filter": [
    'kstem',
  ],
 "text": "candies, ladies, plays, playing, ran, running"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['candy', 'ladies', 'play', 'play', 'ran', 'running']

## Porter Stem Filter

In [51]:
response = es.indices.analyze(
    body={
    "tokenizer": "whitespace",
    "filter": [
        {
            "type": "pattern_replace",
            "pattern": "[-|.|,]"
        },
        {
            "type": "porter_stem",
            "language": "English",
        },
        ],
    "text": "candies, ladies, plays, playing, ran, running, dresses"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['candi', 'ladi', 'plai', 'plai', 'ran', 'run', 'dress']

## Snowball Filter

In [52]:
response = es.indices.analyze(
    body={
    "tokenizer": "whitespace",
    "filter": [
        {
            "type": "snowball",
            "language": "English",
        },
        ],
    "text": "candies, ladies, plays, playing, ran, running, dresses"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['candies,', 'ladies,', 'plays,', 'playing,', 'ran,', 'running,', 'dress']

## Stemmer Override

In [57]:
response = es.indices.analyze(
    body={
      "tokenizer": "standard",
      "filter": [
        {
            "type": "stemmer_override",
            "language": "English",
            "rules": [
                "running, runs => run",
                "stemmer => stemmer"
            ]
        },
        ],
      "text": "candies, ladies, plays, playing, ran, running, dresses"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['candies', 'ladies', 'plays', 'playing', 'ran', 'run', 'dresses']

## Keyword Marker Filter

In [58]:
response = es.indices.analyze(
    body={
     "tokenizer": "whitespace",
     "filter": [
         {
            "type": "keyword_marker",
            "keywords": ["running"]  # Mark 'running' as a keyword
         },
         {
            "type": "pattern_replace",
            "pattern": "[-|.|,]"
         },
         {
            "type": "porter_stem",
            "language": "English",
         },
      ],
     "text": "candies, ladies, plays, playing, runs, running"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['candi', 'ladi', 'plai', 'plai', 'run', 'running']

## Stop Filter

In [60]:
# Analyze the text using the custom analyzer
response = es.indices.analyze(
    body={
        "tokenizer": "standard",
        "filter":{
            "type":"stop",
            "stopwords": ["is","am","are","of","if","a","the"],
            "ignore_case": True
        },
        "text": "i am sachin. I Am software engineer."
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['i', 'sachin', 'I', 'software', 'engineer']

## Unique Filter

In [61]:
response = es.indices.analyze(
    body={
     "tokenizer": "whitespace",
     "filter":[
         "lowercase", "unique",
      ],
     "text": "Happy happy joy joy"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['happy', 'joy']

## Length Filter

In [62]:
response = es.indices.analyze(
    body={
        "tokenizer": "standard",
        "filter":[
            "lowercase",
            {
              "type": "length",
              "min": 1,
              "max": 4
            }
         ],
        "text": "The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['the', '2', 'the', 'lazy', 'bone']

## NGram Token Filter

In [63]:
response = es.indices.analyze(
    body={
     "tokenizer": "whitespace",
     "filter":[
         {
            "type": "ngram",
            "min_gram": 3,
            "max_gram": 4
         }
      ],
     "text": "Skinny blue jeans by levis"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]

['Ski',
 'Skin',
 'kin',
 'kinn',
 'inn',
 'inny',
 'nny',
 'blu',
 'blue',
 'lue',
 'jea',
 'jean',
 'ean',
 'eans',
 'ans',
 'lev',
 'levi',
 'evi',
 'evis',
 'vis']

## Edge NGram Token Filter

In [64]:
response = es.indices.analyze(
    body={
     "tokenizer": "whitespace",
     "filter":[
         {
            "type": "edge_ngram",
            "min_gram": 3,
            "max_gram": 4
         }
      ],
     "text": "Skinny blue jeans by levis"
    }
)

# Extract tokens
[token['token'] for token in response['tokens']]


['Ski', 'Skin', 'blu', 'blue', 'jea', 'jean', 'lev', 'levi']

## Shingle Filter

In [79]:
response = es.indices.analyze(
    body={
     "tokenizer": "whitespace",
     "filter":[
        {
          "type": "shingle",
          "min_shingle_size": 2,
          "max_shingle_size": 3           
        }
      ],
     "text": "Welcome to use Elastic Stack"
    }
)

[token['token'] for token in response['tokens']]

['Welcome',
 'Welcome to',
 'Welcome to use',
 'to',
 'to use',
 'to use Elastic',
 'use',
 'use Elastic',
 'use Elastic Stack',
 'Elastic',
 'Elastic Stack',
 'Stack']

# Creating a custom analyzer

In [83]:
response = es.indices.analyze(
    body={
        "char_filter": [
            {
                "type": "mapping",
                "mappings": [
                    "- => ' '", # replacing hyphens with blank space
                    "_ => ' '", # replacing underscore with blank space
                 ]
            }
        ],
        "tokenizer": "standard",
        "filter": ["apostrophe", "lowercase", "stop", "porter_stem"],
        "text": "The 2 QUICK Brown-Foxes, jumps_over the lazy-dog's bone."
    }
)

# Extract and print tokens
tokens = [token['token'] for token in response['tokens']]
tokens

['2', 'quick', 'brown', 'fox', 'jump', 'over', 'lazi', 'dog', 'bone']

In [89]:
settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "my_custom_analyzer": {
                    "type": "custom",
                    "char_filter": {
                        "type": "mapping",
                        "mappings": [
                            "- => ' '",
                            "_ => ' '",
                        ]
                    },
                    "tokenizer": "standard",
                    "filter": ["lowercase", "apostrophe", "stop", "porter_stem"],
                }
            }
        },
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "routing.allocation.include._tier_preference": "data_hot"
        }
    }
}

mappings = {
        "properties": {
            "title": {"type":"text", "analyzer":"my_custom_analyzer"},
            "brand": {"type": "text", "analyzer":"my_custom_analyzer", "fields": {"raw": {"type": "keyword"}}},
            "updated_time": {"type": "date"}
        }
}

response = es.indices.create(index="trial_index", body=settings)