In [90]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient 
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
    ComplexField,
    CorsOptions,
    SearchIndex,
    ScoringProfile,
    SearchFieldDataType,
    SimpleField,
    SearchableField
)

In [143]:
import yaml

# Read the service name and admin key from the YAML credentials file.
with open('../creds.dev.yml') as credsfi:
    creds = yaml.safe_load(credsfi)


service_name = creds['service_name']
admin_key = creds['primary_admin']

index_name = "hotels-quickstart"

# Create the SDK clients. Both authenticate with the same admin key, so one
# credential object is shared between them.
endpoint = "https://{}.search.windows.net/".format(service_name)
credential = AzureKeyCredential(admin_key)

# SearchIndexClient creates/deletes indexes and is not bound to a single
# index (each call names the index it acts on), so it takes no `index_name`.
client_type = SearchIndexClient
admin_client = client_type(endpoint=endpoint,
                           credential=credential)


# SearchClient uploads and queries documents for one specific index.
client_type = SearchClient
search_client = client_type(endpoint=endpoint,
                            index_name=index_name,
                            credential=credential)

In [144]:
# Start from a clean slate: drop the index if it already exists.
# Any failure (e.g. nothing to delete) is reported rather than raised.
try:
    result = admin_client.delete_index(index_name)
except Exception as ex:
    print(ex)
else:
    print('Index {} Deleted'.format(index_name))

Index hotels-quickstart Deleted


In [145]:
# Index schema: a string key, free-text fields (two with language analyzers),
# facetable/filterable/sortable attributes, and a nested Address.
name = index_name

# Attribute combinations shared by several of the fields below.
_facet_filter = dict(facetable=True, filterable=True)
_facet_filter_sort = dict(facetable=True, filterable=True, sortable=True)

# Address sub-fields: the street is search-only, the rest share the same attributes.
_address_subfields = [
    SearchableField(name="StreetAddress", type=SearchFieldDataType.String),
] + [
    SearchableField(name=part, type=SearchFieldDataType.String, **_facet_filter_sort)
    for part in ("City", "StateProvince", "PostalCode", "Country")
]

fields = [
    SimpleField(name="HotelId", type=SearchFieldDataType.String, key=True),
    SearchableField(name="HotelName", type=SearchFieldDataType.String, sortable=True),
    SearchableField(name="Description", type=SearchFieldDataType.String, analyzer_name="en.lucene"),
    SearchableField(name="Description_fr", type=SearchFieldDataType.String, analyzer_name="fr.lucene"),
    SearchableField(name="Category", type=SearchFieldDataType.String, **_facet_filter_sort),
    SearchableField(name="Tags", collection=True, type=SearchFieldDataType.String, **_facet_filter),
    SimpleField(name="ParkingIncluded", type=SearchFieldDataType.Boolean, **_facet_filter_sort),
    SimpleField(name="LastRenovationDate", type=SearchFieldDataType.DateTimeOffset, **_facet_filter_sort),
    SimpleField(name="Rating", type=SearchFieldDataType.Double, **_facet_filter_sort),
    ComplexField(name="Address", fields=_address_subfields),
]

cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
scoring_profiles = list()
# Suggester 'sg' is what the autocomplete call refers to by name.
suggester = [dict(name='sg', source_fields=['Tags', 'Address/City', 'Address/Country'])]

In [98]:
# Fallback idiom check: a falsy left-hand value (here None) yields the default.
a = None
b = 'goodbye' if not a else a

b

'goodbye'

In [43]:
# Specify the index schema
# BUT WHAT IF THERE'S NO KEY?!

# (OperationNotAllowed) The request is invalid. Details: index : Found 0 key fields in index 'hotels-quickstart'. Each index must have exactly one key field.
# Code: OperationNotAllowed
# Message: The request is invalid. Details: index : Found 0 key fields in index 'hotels-quickstart'. Each index must have exactly one key field.
# Exception Details:	(MissingKeyField) Found 0 key fields in index 'hotels-quickstart'. Each index must have exactly one key field. Parameters: index
# 	Code: MissingKeyField
# 	Message: Found 0 key fields in index 'hotels-quickstart'. Each index must have exactly one key field. Parameters: index

name = index_name

# Attribute combinations shared by several of the fields below.
_facet_filter = dict(facetable=True, filterable=True)
_facet_filter_sort = dict(facetable=True, filterable=True, sortable=True)

# Address sub-fields: the street is search-only, the rest share the same attributes.
_address_subfields_nokey = [
    SearchableField(name="StreetAddress", type=SearchFieldDataType.String),
] + [
    SearchableField(name=part, type=SearchFieldDataType.String, **_facet_filter_sort)
    for part in ("City", "StateProvince", "PostalCode", "Country")
]

# Same schema as the keyed variant, except HotelId is NOT marked key=True.
fields_nokey = [
    SimpleField(name="HotelId", type=SearchFieldDataType.String),
    SearchableField(name="HotelName", type=SearchFieldDataType.String, sortable=True),
    SearchableField(name="Description", type=SearchFieldDataType.String, analyzer_name="en.lucene"),
    SearchableField(name="Description_fr", type=SearchFieldDataType.String, analyzer_name="fr.lucene"),
    SearchableField(name="Category", type=SearchFieldDataType.String, **_facet_filter_sort),
    SearchableField(name="Tags", collection=True, type=SearchFieldDataType.String, **_facet_filter),
    SimpleField(name="ParkingIncluded", type=SearchFieldDataType.Boolean, **_facet_filter_sort),
    SimpleField(name="LastRenovationDate", type=SearchFieldDataType.DateTimeOffset, **_facet_filter_sort),
    SimpleField(name="Rating", type=SearchFieldDataType.Double, **_facet_filter_sort),
    ComplexField(name="Address", fields=_address_subfields_nokey),
]

cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
scoring_profiles = list()
suggester = [dict(name='sg', source_fields=['Tags', 'Address/City', 'Address/Country'])]

In [146]:
# Bundle the schema pieces defined above into an index definition and ask the
# service to create it; failures are printed instead of raised.
index = SearchIndex(
    name=name,
    fields=fields,
    scoring_profiles=scoring_profiles,
    suggesters=suggester,
    cors_options=cors_options,
)

try:
    result = admin_client.create_index(index)
    print(result)
    print('Index {} created'.format(result.name))
except Exception as ex:
    print(ex)

{'additional_properties': {}, 'name': 'hotels-quickstart', 'fields': [<azure.search.documents.indexes.models._index.SearchField object at 0x1046c20e0>, <azure.search.documents.indexes.models._index.SearchField object at 0x104b8eec0>, <azure.search.documents.indexes.models._index.SearchField object at 0x1062839a0>, <azure.search.documents.indexes.models._index.SearchField object at 0x106282d70>, <azure.search.documents.indexes.models._index.SearchField object at 0x106747b50>, <azure.search.documents.indexes.models._index.SearchField object at 0x105e05a80>, <azure.search.documents.indexes.models._index.SearchField object at 0x106740040>, <azure.search.documents.indexes.models._index.SearchField object at 0x10648a320>, <azure.search.documents.indexes.models._index.SearchField object at 0x10648b310>, <azure.search.documents.indexes.models._index.SearchField object at 0x106488490>], 'scoring_profiles': [], 'default_scoring_profile': None, 'cors_options': <azure.search.documents.indexes._gen

In [147]:
import yaml

import json

# documents.json is JSON, so use the JSON parser instead of relying on the
# YAML loader happening to accept it. Read it as UTF-8 explicitly: the
# documents contain accented French text and the platform default encoding
# is not guaranteed to be UTF-8.
with open('documents.json', encoding='utf-8') as infi:
    documents = json.load(infi)

print(len(documents))

4


In [148]:
# Upload the documents to the index and report whether the upload succeeded.
try:
    result = search_client.upload_documents(documents=documents)
    # One result comes back per document; report success only if all succeeded,
    # not just the first one.
    print("Upload of new {} document(s) succeeded: {}".format(
        len(documents), all(r.succeeded for r in result)))
except Exception as ex:
    # A generic Exception has no `.message` attribute, so print the exception
    # itself (same style as the index create/delete cells).
    print(ex)
    
# it seems that `key`s are unique (in our case HotelId) so if you upload a duplicate document,
# it will reupload the same doc with no exception

Upload of new 4 document(s) succeeded: True


In [48]:
# let's test this out by changing the name of one of the hotels
# update: yep!

# NOTE: these `+=` edits mutate `documents` in place, so every re-run of this
# cell appends the suffixes again.
documents[0]['HotelName'] += ' IS CHANGED NOW! AGAIN!'
documents[-1]['Address']['City'] += ' Some More Words'

try:
    result = search_client.upload_documents(documents=documents)
    # One result comes back per document; report success only if all succeeded,
    # not just the first one.
    print("Upload of new {} document(s) succeeded: {}".format(
        len(documents), all(r.succeeded for r in result)))
except Exception as ex:
    # A generic Exception has no `.message` attribute, so print the exception
    # itself (same style as the index create/delete cells).
    print(ex)
    
# it seems that `key`s are unique (in our case HotelId) so if you upload a duplicate document,
# it will reupload the same doc with no exception

Upload of new 4 document(s) succeeded: True


In [149]:
# executes an empty search (search=*), returning an unranked list (search score = 1.0) of arbitrary documents.
# Because there are no criteria, all documents are included in results
# same thing happens if `search_text` param is missing
# if `include_total_count` is `False`, `results.get_count()` does not fail, but is `None`


# Match-everything query; also ask the service for the total hit count.
results = search_client.search(search_text='*', include_total_count=True)

print(f"Total Documents Matching Query: {results.get_count()}")
for result in results:
    print(f"{result['HotelId']}: {result['HotelName']}\n{result['Address']}")

Total Documents Matching Query: 4
1: Secret Point Motel
{'StreetAddress': '677 5th Ave', 'City': 'New York', 'StateProvince': 'NY', 'PostalCode': '10022', 'Country': 'USA'}
2: Twin Dome Motel
{'StreetAddress': '140 University Town Center Dr', 'City': 'Sarasota', 'StateProvince': 'FL', 'PostalCode': '34243', 'Country': 'USA'}
3: Triple Landscape Hotel
{'StreetAddress': '3393 Peachtree Rd', 'City': 'Atlanta', 'StateProvince': 'GA', 'PostalCode': '30326', 'Country': 'USA'}
4: Sublime Cliff Hotel
{'StreetAddress': '7400 San Pedro Ave', 'City': 'San Antonio', 'StateProvince': 'TX', 'PostalCode': '78216', 'Country': 'USA'}


In [151]:
# adds whole terms to the search expression ("wifi")
# specifies that results contain only those fields in the `select` statement
# reduces data sent/rcved
# NOTE: searches all fields, even if they aren't in `select` stmt

# Whole-term query for "wifi"; the `select` restriction is currently commented out.
results = search_client.search(search_text="wifi", include_total_count=True) #, select='HotelId,HotelName,Tags')

print(f"Total Documents Matching Query: {results.get_count()}")
for result in results:
    print(f"{result['HotelId']}: {result['HotelName']}: {result['Tags']}")

Total Documents Matching Query: 1
2: Twin Dome Motel: ['pool', 'free wifi', 'concierge']


In [121]:
# filter, sort
# Keep hotels rated above 4, best rating first, returning three fields only.
results = search_client.search(
    search_text="hotels",
    select='HotelId,HotelName,Rating',
    filter='Rating gt 4',
    order_by='Rating desc',
)

for result in results:
    print(f"{result['HotelId']}: {result['HotelName']} - {result['Rating']} rating")

3: Triple Landscape Hotel - 4.8 rating
4: Sublime Cliff Hotel - 4.6 rating


In [22]:
# scope query matching

# Restrict matching to the HotelName field.
results = search_client.search(
    search_text="sublime",
    search_fields=['HotelName'],
    select='HotelId,HotelName',
)

for result in results:
    print(f"{result['HotelId']}: {result['HotelName']}")

4: Sublime Cliff Hotel


In [23]:
# add facets and counts
# back to a match-everything search

# Match-everything query that also requests facet buckets for Category.
results = search_client.search(search_text="*", facets=["Category"])

facets = results.get_facets()

for facet in facets["Category"]:
    print(f"    {facet}")

    {'value': 'Boutique', 'count': 3}
    {'value': 'Resort and Spa', 'count': 1}


In [141]:
# just return a single doc by id ("key")
# we specified that HotelId was the key like:
# SimpleField(name="HotelId", type=SearchFieldDataType.String, key=True)

# Fetch a single document directly by its key value.
result = search_client.get_document(key="3")

print("Details for hotel '3' are:")
# (label shown, document field read) pairs, printed in this order.
for label, field_name in (("Name", "HotelName"), ("Rating", "Rating"), ("Category", "Category")):
    print("{}: {}".format(label, result[field_name]))

Details for hotel '3' are:
Name: Triple Landscape Hotel
Rating: 4.8
Category: Resort and Spa


In [31]:
# ooh fun, autocomplete!
# When the index was created, a suggester named sg was also created as part of the request
# suggester = [{'name': 'sg', 'source_fields': ['Tags', 'Address/City', 'Address/Country']}]
# what does `twoTerms` mean?
# The default is 'oneTerm'. Use 'twoTerms' to get shingles and 'oneTermWithContext'
# to use the current context while producing auto-completed terms. 
# Possible values include: 'oneTerm', 'twoTerms', 'oneTermWithContext'.
# note that `twoTerms` LITERALLY RETURNS ONLY TWO TERMS even if there are more tokens in the field?!

# Ask the 'sg' suggester to complete a partial term, in two-term mode.
search_suggestion = 'sa'
results = search_client.autocomplete(
    search_text=search_suggestion,
    suggester_name="sg",
    mode='twoTerms',
)

print(f"Autocomplete for: {search_suggestion}")
for result in results:
    print(result['text'])

Autocomplete for: sa
san antonio
sarasota some


In [50]:
# what do we actually get back?
import json

# Pretty-print every raw result so the full response shape is visible,
# including the '@search.*' metadata keys.
results = search_client.search(search_text="*", include_total_count=True)

for res in results:
    serialized = json.dumps(res, indent=2, sort_keys=True)
    print(serialized)

{
  "@search.captions": null,
  "@search.highlights": null,
  "@search.reranker_score": null,
  "@search.score": 1.0,
  "Address": {
    "City": "New York",
    "Country": "USA",
    "PostalCode": "10022",
    "StateProvince": "NY",
    "StreetAddress": "677 5th Ave"
  },
  "Category": "Boutique",
  "Description": "The hotel is ideally located on the main commercial artery of the city in the heart of New York. A few minutes away is Time's Square and the historic centre of the city, as well as other places of interest that make New York one of America's most attractive and cosmopolitan cities.",
  "Description_fr": "L'h\u00f4tel est id\u00e9alement situ\u00e9 sur la principale art\u00e8re commerciale de la ville en plein c\u0153ur de New York. A quelques minutes se trouve la place du temps et le centre historique de la ville, ainsi que d'autres lieux d'int\u00e9r\u00eat qui font de New York l'une des villes les plus attractives et cosmopolites de l'Am\u00e9rique.",
  "HotelId": "1",
  "

In [125]:
# what about when we do an actual search?
# NOTE: `change*` matches `changed` but `change` does not -- did we do any stemming/lemmatization?
# answer: nope! we didn't specify an analyzer? let's try with some analyzed fields
# search_mode defaults to "any"

# NOTES ABOUT HIGHLIGHTING:
# all three of highlight_pre_tag, highlight_post_tag, and highlight_fields are necessary
# falls under toplevel `@search.highlights`, NOT included in the original doc returned
# therefore json must be merged for display
# for some reason the value of `@search.highlights.HotelName` is an Array[String]?!

# Only searchable fields can be used in the highlight list. (eg. HotelId is a SimpleField)

import json

# `search_text` takes a plain query string, so pass "hotel" rather than a
# one-element list.
# NOTE(review): highlights come back as None here, presumably because
# `highlight_fields` names HotelName while matching is restricted to
# Description via `search_fields` -- confirm.
results = search_client.search(
    search_text="hotel",
    include_total_count=True,
    search_fields=['Description'],
    search_mode="any",
    highlight_pre_tag="<i>",
    highlight_post_tag="</i>",
    highlight_fields="HotelName",
)

print(results.get_count())
for res in results:
    print(res['HotelName'])
    print(res['@search.highlights'])

4
Triple Landscape Hotel
None
Twin Dome Motel
None
Secret Point Motel
None
Sublime Cliff Hotel
None


In [116]:
# NOTE: `change*` matches `changed` but `change` does not -- did we do any stemming/lemmatization?
# answer: nope! we didn't specify an analyzer for the hotel name! let's try with some analyzed fields

# `search_text` takes a plain query string, so pass "hotel" rather than a
# one-element list.
results = search_client.search(
    search_text="hotel",
    include_total_count=True,
    highlight_pre_tag="<i>",
    highlight_post_tag="</i>",
    highlight_fields="Description",
)
for res in results:
    print(res['HotelName'])
    # '@search.highlights' can be present with a None value; a `.get(key, {})`
    # default is not used in that case and the chained `.get` would raise
    # AttributeError. `or {}` covers both the missing and the None case.
    highlights = res.get('@search.highlights') or {}
    print(highlights.get('Description'))

In [140]:
# Maps the `field_type` string used in the JSON schema to the SDK constructor.
FIELD_TYPES = dict(
    SimpleField=SimpleField,
    SearchableField=SearchableField,
    ComplexField=ComplexField,
)

# Maps the `type` string used in the JSON schema to the SDK data type.
# TODO add more of these
DATA_TYPES = dict(
    string=SearchFieldDataType.String,
    double=SearchFieldDataType.Double,
    datetime_offset=SearchFieldDataType.DateTimeOffset,
    boolean=SearchFieldDataType.Boolean,
)

def build_fields(json_fields):
    """Convert JSON field descriptions into SDK field objects.

    Each entry of ``json_fields`` is a dict whose ``field_type`` key names a
    constructor in ``FIELD_TYPES``; the remaining keys are passed to that
    constructor as keyword arguments. A ``type`` key, when present, is
    translated through ``DATA_TYPES``. ``ComplexField`` entries have their
    nested ``fields`` list built recursively.

    The input dicts are not modified, so the same schema can be built more
    than once and can still be read by other code afterwards.

    Args:
        json_fields: list of field-description dicts.

    Returns:
        list of field objects, in the same order as ``json_fields``.

    Raises:
        KeyError: if ``field_type`` is missing or unknown, if ``type`` names
            an unknown data type, or if a ``ComplexField`` has no ``fields``.
    """
    fields = []
    for field in json_fields:
        # Work on a shallow copy so the caller's schema keeps its original
        # string values and its 'field_type' key.
        kwargs = dict(field)
        field_type_name = kwargs.pop('field_type')
        field_type = FIELD_TYPES[field_type_name]

        # Only translate a data type when one is declared; a complex field
        # may legitimately have none.
        if kwargs.get('type') is not None:
            kwargs['type'] = DATA_TYPES[kwargs['type']]

        if field_type_name == 'ComplexField':
            kwargs['fields'] = build_fields(kwargs['fields'])

        fields.append(field_type(**kwargs))

    return fields

# Load the JSON description of the index schema. JSON text is UTF-8, so the
# encoding is given explicitly instead of relying on the platform default.
# NOTE: this rebinds `index`, which earlier held the SearchIndex object.
with open('../indexes/hotels-quickstart.json', encoding='utf-8') as infi:
    index = json.load(infi)
    
#print(len(index))

#print(build_fields(index['fields']))

# def get_field_names(json_fields, prefix='', sep='/'):
#     field_names = ['/'.join([prefix, f['name']]).strip('/') for f in json_fields
#                    if f['field_type'] != 'ComplexField']
#     for f in json_fields:
#         if f['field_type'] == 'ComplexField':
#             base_name = '/'.join([prefix, f['name']]).strip('/')
#             subfields = get_field_names(f['fields'], base_name)
#             field_names.extend(subfields)
    
#     return field_names

# get_field_names(index['fields'])

def get_field_names(json_fields, prefix='', sep='/', only_highlightable=False):
    """Return the flattened names of the fields in a JSON index schema.

    Sub-fields of a ``ComplexField`` are reported as ``<parent><sep><child>``
    (for example ``Address/City``); the complex field itself is not listed.

    Args:
        json_fields: list of field dicts, each with ``name`` and
            ``field_type`` keys; ``ComplexField`` entries also carry a nested
            ``fields`` list.
        prefix: path of the enclosing complex field, or '' at the top level.
        sep: separator placed between path components.
        only_highlightable: when True, keep only ``SearchableField`` leaves,
            at every nesting level.

    Returns:
        list of str, in schema order.
    """
    field_names = []
    for f in json_fields:
        # At the top level prefix is '', so the join produces a leading
        # separator, which strip() removes.
        base_name = sep.join([prefix, f['name']]).strip(sep)
        if f['field_type'] == 'ComplexField':
            # Pass `sep` and `only_highlightable` down; otherwise nested
            # fields silently fall back to the defaults.
            field_names.extend(
                get_field_names(
                    f['fields'],
                    base_name,
                    sep=sep,
                    only_highlightable=only_highlightable,
                )
            )
        elif not only_highlightable or f['field_type'] == 'SearchableField':
            field_names.append(base_name)

    return field_names

# Flatten the loaded schema's field names with only_highlightable=True; as the
# cell's last expression, the resulting list is displayed.
get_field_names(index['fields'], only_highlightable=True)

['HotelName',
 'Description',
 'Description_fr',
 'Category',
 'Tags',
 'Address/StreetAddress',
 'Address/City',
 'Address/StateProvince',
 'Address/PostalCode',
 'Address/Country']