## Connect to the Elasticsearch client with your credentials

In [83]:
from elasticsearch import Elasticsearch, helpers
from getpass import getpass

#Connect to the elastic cloud server
ELASTIC_CLOUD_ID = getpass("Elastic Cloud ID: ")
ELASTIC_API_KEY = getpass("Elastic API Key: ")

# Create an Elasticsearch client using the provided credentials
client = Elasticsearch(
    cloud_id=ELASTIC_CLOUD_ID,  # cloud id can be found under deployment management
    api_key=ELASTIC_API_KEY, # your username and password for connecting to elastic, found under Deplouments - Security
)

Get the samle lyrics data [we just downloaded with the API.](/lyrics.ipynb)

In [142]:
import json
with open('data/ts_song.json', 'r') as f:
  songs = json.load(f)

## Put the data in an index

We are creating a nested field for the lyrics so we can search for the inner hits to get the exact lines we want.

In [133]:
index_name = 'ts_songs'

mappings = {
  "properties": {
    "lyrics": {
        "type": "nested",
        "properties": {
          "line": {
            "type": "text"
          }
        }
    },
  }
}

# Create the Elasticsearch index with the specified name (delete if already existing)
if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)
client.indices.create(index=index_name, mappings=mappings)

def generate_docs(data, index_name):
    for document in data:
        yield dict(_index=index_name, _id=f"{document['id']}", _source=document)


# Use the Elasticsearch helpers.bulk() method to index the DataFrame data into Elasticsearch
load = helpers.bulk(client, generate_docs(songs, index_name), index_name)

## Look for a specific line in a song

We can now use a nested query to look up words in our songs and get the specific passage where this would be mentioned.

In [134]:
def simple_search(query):
  query = {
      "nested": {
        "path": "lyrics",
        "query": {
          "match": {
            "lyrics.line": query
          }
        },
        "inner_hits" : {
          "docvalue_fields" : [
            "lyrics.line.keyword"
          ]
        }
      }
  }

  #Run a simple query, for example looking for problems with the engine
  response = client.search(index=index_name, query=query)

  print(f'We get back {response["hits"]["total"]["value"]} songs that fit, here are the top results:')
  for hit in response["hits"]["hits"][0:5]:
      print(f'From {hit["_source"]["artist"]} : {hit["_source"]["name"]}: ')
      for inner_hit in hit["inner_hits"]["lyrics"]["hits"]["hits"][0:1]:
          print(inner_hit["_source"]["line"])
      print()

simple_search("I am in love with you")


We get back 19 songs that fit, here are the top results:
From Taylor Swift : You Are in Love: 
You are in love, true love

From Taylor Swift : Dancing With Our Hands Tied: 
Oh, keeping you with me, I-

From Taylor Swift : Come Back... Be Here: 
4 AM, the second day

From Taylor Swift : New Romantics: 
Come on, come along with me

From Taylor Swift : So It Goes...: 
Do bad things with you



However, this is only returning exact matches, missing out on similar songs about "lovers", "loving", or any similar phrases which I might still want to find. 

So we can take this a step further and add a semantic search model into the mix, to help us really look for meaning in the lyrics.



## Adding ELSER inference for semantic search

We will use a [foreach](https://www.elastic.co/guide/en/elasticsearch/reference/current/foreach-processor.html) processor to loop through all lines of the lyrics.

See [the ELSER Notebook](https://github.com/elastic/elasticsearch-labs/blob/main/notebooks/search/03-ELSER.ipynb) for a simple get-started quide for semantic search; and [this document chunking example](https://github.com/elastic/elasticsearch-labs/blob/main/notebooks/document-chunking/with-index-pipelines.ipynb) for another instance of embedding inner hits. 

In [91]:
client.ingest.put_pipeline(
    id="adding_ELSER_to_lyrics", 
    processors=
    [
        {
            "foreach": {
                "field": "lyrics",
                "processor": {
                    "inference": {
                        "model_id": ".elser_model_2",
                        "input_output": [
                            {"input_field": "_ingest._value.line", "output_field": "_ingest._value.tokens"}
                        ],
                        "on_failure" : [
                        {
                            "set" : {
                                "field": "_ingest._value.errors",
                                "value": "failed in foreach processor"
                            }
                        }]
                    }
                }
            }
        }
    ]
)

mappings = {
    "dynamic" : True,
    "properties" : 
    {
        "lyrics": {
            "type": "nested",
            "properties": {
                "line" : {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
                },
                "tokens": { 
                    "type": "sparse_vector" 
                }
            }
        }
    }
}

#Creating the new index with enriched data
index_name_new = "ts_songs_semantic"
if client.indices.exists(index=index_name_new):
    client.indices.delete(index=index_name_new)
client.indices.create(index=index_name_new, mappings=mappings)

client.reindex(body={
      "source": {
          "index": index_name},
      "dest": {"index": index_name_new, "pipeline" : "adding_ELSER_to_lyrics"}
    }, wait_for_completion=False)

ObjectApiResponse({'task': 'eclQBhHoS0CN09g-_bZM5w:63679788'})

We can now run the same query again, but using `text_expansion` on the generated tokens rather than `match` directly on the text field.

In [135]:
def semantic_search(query):
    query = {
        "nested": {
            "path": "lyrics",
            "query": {
                "text_expansion": {
                    "lyrics.tokens": {
                        "model_id": ".elser_model_2",
                        "model_text": query,
                    }
                }
            },
            "inner_hits" : {
                "docvalue_fields" : [
                    "lyrics.line.keyword"
                ]
            }
        }
    }

    #Run a simple query, for example looking for problems with the engine
    response = client.search(index=index_name_new, query=query)

    print(f'We get back {response["hits"]["total"]["value"]} songs that fit, here are the top results:')
    for hit in response["hits"]["hits"][0:5]:
        print(f'From {hit["_source"]["artist"]} : {hit["_source"]["name"]}: ')
        for inner_hit in hit["inner_hits"]["lyrics"]["hits"]["hits"][0:1]:
            print(inner_hit["_source"]["line"])
        print()

semantic_search("I am in love with you")


We get back 19 songs that fit, here are the top results:
From Taylor Swift : You Are in Love: 
You are in love

From Taylor Swift : Gorgeous: 
(I hate you so much)

From Taylor Swift : End Game (Ft. Ed Sheeran & Future): 
You love it, I love it, too 'cause you my type (you my type)

From Taylor Swift : Come Back... Be Here: 
This is falling in love in the cruelest way

From Taylor Swift : Don't Blame Me: 
You're lovin' me



With the new semantic search layer, we actually get back pretty much every song in our sample dataset. 

This could either mean the model has too low of a treshold for the match, or that simply all songs are in some form about love. 

Let's try some more specific searches.

In [136]:
simple_search("I have been betrayed")

We get back 19 songs that fit, here are the top results:
From Taylor Swift : King of My Heart: 
And all at once, you are the one I have been waiting for

From Taylor Swift : This Is Why We Can't Have Nice Things: 
This is why we can't have-

From Taylor Swift : Girl at Home: 
This I have previously learned

From Taylor Swift : Gorgeous: 
There's nothing I hate more than what I can't have

From Taylor Swift : The Moment I Knew: 
And I would've been so happy



In [137]:
semantic_search("I have been betrayed")

We get back 19 songs that fit, here are the top results:
From Taylor Swift : This Is Why We Can't Have Nice Things: 
Friends don't try to trick you

From Taylor Swift : I Did Something Bad: 
And I let them think they saved me

From Taylor Swift : Gorgeous: 
(I hate you so much)

From Taylor Swift : Dancing With Our Hands Tied: 
I, I loved you in spite of

From Taylor Swift : Getaway Car: 
'Cause us traitors never win



In [139]:
simple_search("you broke up with me")

We get back 19 songs that fit, here are the top results:
From Taylor Swift : I Did Something Bad: 
Light me up (light me up), light me up (light me up)

From Taylor Swift : New Romantics: 
Come on, come along with me

From Taylor Swift : Look What You Made Me Do: 
The role you made me play

From Taylor Swift : Dancing With Our Hands Tied: 
Oh, keeping you with me, I-

From Taylor Swift : Girl at Home: 
Want to see you pick up your phone



In [138]:
semantic_search("you broke up with me")

We get back 19 songs that fit, here are the top results:
From Taylor Swift : Gorgeous: 
You've ruined my life by not being mine

From Taylor Swift : This Is Why We Can't Have Nice Things: 
Because you break them

From Taylor Swift : You Are in Love: 
You are in love

From Taylor Swift : Look What You Made Me Do: 
Isn't cool, no, I don't like you (oh)

From Taylor Swift : Girl at Home: 
You're about to lose your girl



Semantic search does seem to capture the meaning better, however it still seems like the spirit of the songs isn't fully represented. This may be due to the chunking strategy. In these examples, each line of the lyrics is its own document, however the sentences or paragraphs end up broken up and some of the context is lost. 

This brings up a very important point about semantic search - performance isn't only determined by the model chosen, but also the way data is processed and introduced to the model.
Let's try a few strategies to offer the model more context.


## Adding chunking for context

We can add multiple versions of our documents (or songs in this instance) to the index. For this simple example, we've chosen three dymensions: the entire lyrics as one entry, the line-by-line approach we just tested above, and a custom recursive chunking strategy that is an industry best practice when working with LLMs.

The chunker gives us paragraphs of about 150 characters, with some overlap. The overlap means some lines get included in multiple chunks with a rolling window appraoch to preserve as much context as possible in each chunk.

(insert more stats from data processing notebook)

In [146]:
index_name = 'ts_songs_chunks'

mappings = {
  "properties": {
    "lyrics": {
      "type": "nested",
      "properties": {
        "line": {
          "type": "text"
        }
      },
    },
    "full_lyrics": {
      "type": "nested",
      "properties": {
        "line": {
          "type": "text"
        }
      }, 
    },
    "chunks": {
      "type": "nested",
      "properties": {
        "line": {
          "type": "text"
        }
      },   
    },
  }
}

# Create the Elasticsearch index with the specified name (delete if already existing)
if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)
client.indices.create(index=index_name, mappings=mappings)

def generate_docs(data, index_name):
    for document in data:
        yield dict(_index=index_name, _id=f"{document['id']}", _source=document)


# Use the Elasticsearch helpers.bulk() method to index the DataFrame data into Elasticsearch
load = helpers.bulk(client, generate_docs(songs, index_name), index_name)

### Expanding the search function now that we have more parameter options

In [159]:
def simple_search(query, path, content=True):
  # path determines on which field to search (full_lyrics, lyrics, or chunks in this case)
  # content determines if we print the lyrics matched or only the song title
  query = {
        "nested": {
          "path": path,
          "query": {
            "match": {
              path + ".line": query
            }
          },
          "inner_hits" : {
            "docvalue_fields" : [
              path + ".line.keyword"
            ]
          }
        }
    }

  #Run a simple query, for example looking for problems with the engine
  response = client.search(index=index_name, query=query)

  print(f'We get back {response["hits"]["total"]["value"]} songs that fit, here are the top results:')
  for hit in response["hits"]["hits"][0:5]:
      print(f'From {hit["_source"]["artist"]} : {hit["_source"]["name"]}: ')
      if content:
        for inner_hit in hit["inner_hits"][path]["hits"]["hits"][0:1]:
            print(inner_hit["_source"]["line"])


query = "I am in love with you"
path = "chunks"
simple_search(query, path, False)

We get back 19 songs that fit, here are the top results:
From Taylor Swift : You Are in Love: 
From Taylor Swift : Dancing With Our Hands Tied: 
From Taylor Swift : Come Back... Be Here: 
From Taylor Swift : ...Ready for It?: 
From Taylor Swift : Getaway Car: 


In [160]:
query = "I am in love with you"
simple_search(query, "lyrics", False)

We get back 19 songs that fit, here are the top results:
From Taylor Swift : You Are in Love: 
From Taylor Swift : Dancing With Our Hands Tied: 
From Taylor Swift : Come Back... Be Here: 
From Taylor Swift : New Romantics: 
From Taylor Swift : So It Goes...: 


In [161]:
query = "I am in love with you"
simple_search(query, "full_lyrics", False)

We get back 19 songs that fit, here are the top results:
From Taylor Swift : Come Back... Be Here: 
From Taylor Swift : Girl at Home: 
From Taylor Swift : You Are in Love: 
From Taylor Swift : Dancing With Our Hands Tied: 
From Taylor Swift : End Game (Ft. Ed Sheeran & Future): 


We can observe that different chunking sizes give different results for the same query. Some songs appear in all results but ranked differently, while others are unique to a certain chunk-sized search.

We can now also generate embeddings for the various sizes and see how those also perform differently. 

In [162]:
processor = {
    "inference": {
        "model_id": ".elser_model_2",
        "input_output": [
            {"input_field": "_ingest._value.line", "output_field": "_ingest._value.tokens"}
        ],
        "on_failure" : [{
            "set" : {
                "field": "_ingest._value.errors",
                "value": "failed in foreach processor"
            }
        }]
    }
}

client.ingest.put_pipeline(
    id="adding_ELSER_to_lyrics_chunks", 
    processors=
    [
        {
            "foreach": {
                "field": "lyrics",
                "processor": processor
            },
            "foreach": {
                "field": "full_lyrics",
                "processor": processor
            },
            "foreach": {
                "field": "chunks",
                "processor": processor
            },
        }
    ]
)

properties = {
    "line": {
        "type": "text",
        "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
    },
    "tokens": { 
        "type": "sparse_vector" 
    }
}

mappings = {
    "dynamic" : True,
    "properties" : 
    {
        "lyrics": {
            "type": "nested",
            "properties": properties
        },
        "full_lyrics": {
            "type": "nested",
            "properties": properties
        },
        "chunks": {
            "type": "nested",
            "properties": properties
        },
    }
}


#Creating the new index with enriched data
index_name_new = "ts_songs_semantic_chunked"
if client.indices.exists(index=index_name_new):
    client.indices.delete(index=index_name_new)
client.indices.create(index=index_name_new, mappings=mappings)

client.reindex(body={
      "source": {
          "index": index_name},
      "dest": {"index": index_name_new, "pipeline" : "adding_ELSER_to_lyrics_chunks"}
    }, wait_for_completion=False)

ObjectApiResponse({'task': 'JqYuDbWsRueybLrxY3c9Cg:95601039'})

In [163]:
def semantic_search(query, path, content=True):
    query = {
        "nested": {
            "path": path,
            "query": {
                "text_expansion": {
                    path + ".tokens": {
                        "model_id": ".elser_model_2",
                        "model_text": query,
                    }
                }
            },
            "inner_hits" : {
                "docvalue_fields" : [
                    path + ".line.keyword"
                ]
            }
        }
    }

    #Run a simple query, for example looking for problems with the engine
    response = client.search(index=index_name_new, query=query)

    print(f'We get back {response["hits"]["total"]["value"]} songs that fit, here are the top results:')
    for hit in response["hits"]["hits"][0:5]:
        print(f'From {hit["_source"]["artist"]} : {hit["_source"]["name"]}: ')
        if content:
            for inner_hit in hit["inner_hits"][path]["hits"]["hits"][0:1]:
                print(inner_hit["_source"]["line"])

In [169]:
query = "I am in love with you"
semantic_search(query, "lyrics", False)

We get back 0 songs that fit, here are the top results:


In [167]:
query = "I am in love with you"
semantic_search(query, "chunks", False)

We get back 19 songs that fit, here are the top results:
From Taylor Swift : You Are in Love: 
From Taylor Swift : Gorgeous: 
From Taylor Swift : Don't Blame Me: 
From Taylor Swift : Dancing With Our Hands Tied: 
From Taylor Swift : Delicate: 
