# Exercise #1: Getting ordered and unordered bigram matches in Elasticsearch

In [1]:
from elasticsearch import Elasticsearch
from pprint import pprint

## Indexing a toy collection 

This time, we store **term position information** and perform minimal stemming, i.e., removing only plurals (for that, we specify a custom analyzer).

Check the [Elasticsearch documentation on analyzers](https://www.elastic.co/guide/en/elasticsearch/reference/current/analyzer.html).

In [2]:
# Name of the index that every cell in this notebook reads from and writes to.
INDEX_NAME = "toy_index"

# Index definition: a custom analyzer (lowercasing, English stopword removal,
# minimal English stemming) applied to both text fields, which are stored with
# term vectors that include positions.
INDEX_SETTINGS = {
    'settings' : {
        'index' : {
            "number_of_shards" : 1,
            "number_of_replicas" : 1
        },
        'analysis': {
            'analyzer': {
                'my_english_analyzer': {
                    'type': "custom",
                    'tokenizer': "standard",
                    # Stopword removal is configured on the `english_stop`
                    # filter below; a custom analyzer is assembled from its
                    # tokenizer and filters and has no `stopwords` option of
                    # its own.
                    'filter': [
                        "lowercase",
                        "english_stop",
                        "filter_english_minimal"
                    ]
                }
            },
            'filter' : {
                # Minimal stemming, i.e., only plurals are removed.
                'filter_english_minimal' : {
                    'type': "stemmer",
                    'name': "minimal_english"
                },
                'english_stop': {
                    'type': "stop",
                    'stopwords': "_english_"
                }
            },
        }
    },
    'mappings': {
        'properties': {
            'title': {
                'type': "text",
                # Positions are needed later to rebuild the term sequence.
                'term_vector': "with_positions",
                'analyzer': "my_english_analyzer"
            },
            'content': {
                'type': "text",
                'term_vector': "with_positions",
                'analyzer': "my_english_analyzer"
            }
        }
    }
}

In [3]:
# Toy collection, keyed by document ID. Each document has a `title` and a
# `content` field, matching the two fields declared in the index mappings.
DOCS = {
    1: {"title": "Rap God",
        "content": "gonna, gonna, Look, I was gonna go easy on you and not to hurt your feelings"
        },
    2: {"title": "Lose Yourself",
        "content": "Yo, if you could just, for one minute Or one split second in time, forget everything Everything that bothers you, or your problems Everything, and follow me"
        },
    3: {"title": "Love The Way You Lie",
        "content": "Just gonna stand there and watch me burn But that's alright, because I like the way it hurts"
        },
    # Unlike the other documents, this one supplies `content` as a list of two strings.
    4: {"title": "The Monster",
        "content": ["gonna gonna I'm friends with the monster", "That's under my bed Get along with the voices inside of my head"]
        },
    5: {"title": "Beautiful",
        "content": "Lately I've been hard to reach I've been too long on my own Everybody has a private world Where they can be alone"
        },
    6: {"title": "Fake Eminem 1",
        "content": "This is not real Eminem, just some text to get more matches for a split second for a split second."
        },
    7: {"title": "Fake Eminem 2",
        "content": "I have a monster friend and I'm friends with the monster and then there are some more friends who are monsters."
        },
}

In [4]:
# Client created without arguments, i.e., with the library's default connection settings.
es = Elasticsearch()

In [5]:
# Start from a clean slate: drop the index if a previous run left it behind,
# then create it with the settings and mappings defined above.
# The index name is passed by keyword, consistent with the other calls here;
# keyword-only versions of the Python client reject a positional index.
if es.indices.exists(index=INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)

es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)

{'acknowledged': True, 'index': 'toy_index', 'shards_acknowledged': True}

Testing our analyzer.

In [6]:
# Run a sample text through the custom analyzer to inspect the emitted tokens.
es.indices.analyze(
    index=INDEX_NAME,
    body={
        'analyzer': "my_english_analyzer",
        'text': "monsters in my bed",
    },
)

{'tokens': [{'end_offset': 8,
   'position': 0,
   'start_offset': 0,
   'token': 'monster',
   'type': '<ALPHANUM>'},
  {'end_offset': 14,
   'position': 2,
   'start_offset': 12,
   'token': 'my',
   'type': '<ALPHANUM>'},
  {'end_offset': 18,
   'position': 3,
   'start_offset': 15,
   'token': 'bed',
   'type': '<ALPHANUM>'}]}

In [7]:
# Index every document of the toy collection under its dictionary key.
for doc_id, doc in DOCS.items():
    es.index(index=INDEX_NAME, id=doc_id, body=doc)

# Newly indexed documents only become searchable after a refresh. Force one so
# that the search cells below see all documents even when the notebook is run
# top to bottom without pauses. The trailing semicolon suppresses cell output.
es.indices.refresh(index=INDEX_NAME);

Notice that you also get term position information when requesting a term vector.

In [8]:
# Fetch the term vectors of both text fields for document 2 and pretty-print them.
tv = es.termvectors(
    index=INDEX_NAME,
    id=2,
    fields="title,content",
)
pprint(tv)

{'_id': '2',
 '_index': 'toy_index',
 '_type': '_doc',
 '_version': 1,
 'found': True,
 'term_vectors': {'content': {'field_statistics': {'doc_count': 7,
                                                   'sum_doc_freq': 85,
                                                   'sum_ttf': 101},
                              'terms': {'bother': {'term_freq': 1,
                                                   'tokens': [{'position': 18}]},
                                        'could': {'term_freq': 1,
                                                  'tokens': [{'position': 3}]},
                                        'everything': {'term_freq': 3,
                                                       'tokens': [{'position': 15},
                                                                  {'position': 16},
                                                                  {'position': 23}]},
                                        'follow': {'term_freq': 1,
                    

This method returns the sequence of terms for a given document field, with None values for stopwords that got removed.

In [9]:
def get_term_sequence(es, doc_id, field):
    """Returns the sequence of analyzed terms for a given document field.

    The term vector of the field is requested from the index named by the
    notebook-level `INDEX_NAME` and laid out by token position. Positions for
    which the term vector holds no term (e.g., removed stopwords) are None.

    Args:
        es: Elasticsearch client (any object with a compatible `termvectors()`).
        doc_id: ID of the document.
        field: Name of the field; its term vector must contain positions.

    Returns:
        List of terms indexed by position. An empty list is returned when the
        response holds no terms for the field.
    """
    tv = es.termvectors(index=INDEX_NAME, id=doc_id, fields=[field])
    # The field may be absent from the response or have no terms; treat both
    # cases as an empty sequence instead of failing.
    terms = tv.get('term_vectors', {}).get(field, {}).get('terms', {})
    # We first put terms in a position-indexed dict.
    pos = {}
    for term, tinfo in terms.items():
        for token in tinfo['tokens']:
            pos[token['position']] = term
    if not pos:
        return []
    # Then, turn that dict to a list; gaps between positions stay None.
    seq = [None] * (max(pos) + 1)
    for p, term in pos.items():
        seq[p] = term
    return seq

In [10]:
# Show the term sequence of document 7's content; removed stopwords appear as None.
print(get_term_sequence(es, 7, "content"))

['i', 'have', None, 'monster', 'friend', None, "i'm", 'friend', None, None, 'monster', None, None, None, None, 'some', 'more', 'friend', 'who', None, 'monster']


## Getting ordered bigram matches

Get a list of documents that contain the terms "split second" in this exact order in the `content` field.

You can use a [match_phrase query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query-phrase.html) for example (see the [Elasticsearch notebook](../../code/elasticsearch/Elasticsearch.ipynb) for usage).

In [23]:
query = "split second"
# Term-level form of the query, used when counting matches in term sequences.
# NOTE(review): plain whitespace splitting bypasses the index analyzer; this
# presumably only works when the query terms are already in analyzed form -- confirm
# before reusing with other queries.
bigram = query.split()

In [24]:
# Retrieve the documents whose content contains the query terms as a phrase.
res = es.search(
    index=INDEX_NAME,
    body={
        'query': {
            'match_phrase': {'content': query},
        },
    },
)

For each of those matching documents, count the actual number of times the phrase appears in the `content` field. Note that there is no built-in support in the Elasticsearch API for that; you'll need to get the actual term sequence from the field (using the `get_term_sequence()` helper method) and count it yourself.

In [20]:
def count_ordered_bigram_matches(text, bigram):
    """Counts how often the two bigram terms occur next to each other, in order.

    Args:
        text: Document as a list of terms.
        bigram: The two terms to look for, as a list.

    Returns:
        Number of positions where the first bigram term is immediately
        followed by the second one.
    """
    # Pair every term with its direct successor.
    adjacent_pairs = zip(text, text[1:])
    return sum(
        1
        for left, right in adjacent_pairs
        if left == bigram[0] and right == bigram[1]
    )

In [26]:
# For every hit, rebuild the content term sequence and count the phrase occurrences.
for hit in res['hits']['hits']:
    doc_id = hit['_id']
    count = count_ordered_bigram_matches(
        get_term_sequence(es, doc_id, "content"),
        bigram,
    )
    print("(#{}) {} -- {}".format(doc_id, hit['_source']['title'], count))

(#6) Fake Eminem 1 -- 2
(#2) Lose Yourself -- 1


## Getting unordered bigram matches

Get a list of documents that contain the terms "friends monster" in an unordered window of size 4.

You may use Elasticsearch's [span near query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-span-near-query.html) for that. NOTE: for span queries, you'll need to analyze the query terms beforehand.

In [42]:
# Span queries work on analyzed terms, so the terms are given in the form in
# which they are stored in the index ("friend" instead of "friends").
# A slop of 2 permits at most two other positions between the two terms, which
# corresponds to both terms falling within a window of size 4.
# The query gets its own name so that the `query` string used by the phrase
# search cell is not overwritten when cells are re-run.
span_query = {
    'span_near': {
        'clauses': [
            {'span_term': {'content': "friend"}},
            {'span_term': {'content': "monster"}}
        ],
        'slop': 2,
        'in_order': False
    }
}
bigram = ["friend", "monster"]

res = es.search(index=INDEX_NAME, body={'query': span_query})

For each of the matching documents, count the actual number of unordered matches in the `content` field. As before, use the `get_term_sequence()` method.

In [40]:
def count_unordered_bigram_matches(text, bigram, w):
    """Counts unordered co-occurrences of the two bigram terms within a window.

    A position counts as a match when it holds one of the bigram terms and the
    other bigram term appears among the following w-1 positions. Each position
    contributes at most one match.

    Args:
        text: Document as a list of terms.
        bigram: The two terms to look for, as a list.
        w: Window size, in number of positions.

    Returns:
        Number of matching positions.
    """
    matches = 0
    # The last position has no successors, so it can never start a match.
    for idx, term in enumerate(text[:-1]):
        if term not in bigram:
            continue
        partner = bigram[1] if term != bigram[1] else bigram[0]
        lookahead = text[idx + 1:idx + w]
        if partner in lookahead:
            matches += 1
    return matches

In [41]:
# For every hit, rebuild the content term sequence and count the unordered
# matches within a window of size 4.
for hit in res['hits']['hits']:
    doc_id = hit['_id']
    count = count_unordered_bigram_matches(
        get_term_sequence(es, doc_id, "content"),
        bigram,
        4,
    )
    print("(#{}) {} -- {}".format(doc_id, hit['_source']['title'], count))

(#7) Fake Eminem 2 -- 3
(#4) The Monster -- 1
