# Data

The data used for the example can be found at:

* https://www.kaggle.com/datasets/juanmah/world-cities

In [3]:
#setup
import uuid
from elasticsearch import Elasticsearch
import pandas as pd
from geojson import Point
import json
import spacy_sentence_bert
import spacy
from tqdm.notebook import tqdm


esClient = Elasticsearch("http://localhost:9200")
esConnected = esClient.ping()
print("Connected to elastic:", esConnected)

nlp = spacy.load("../model/en_stsb_roberta_large/en_stsb_roberta_large-0.1.2")
#nlp = spacy_sentence_bert.load_model('en_stsb_roberta_large')


Connected to elastic: False




In [6]:
# setup elasticsearch
mappingsStr = '''
{
  "properties": {
    "phrase-vector": {
      "type": "dense_vector",
      "similarity": "l2_norm",
      "dims": 1024
    },
    "phrase": {
      "type": "text"
    },
    "id": {
      "type": "text"
    },
    "name": {
      "type": "text"
    },
    "lat": {
      "type": "float"
    },
    "lon": {
      "type": "float"
    },
    "country": {
      "type": "text"
    },
    "population": {
      "type": "integer"
    },
    "location":{
      "type": "object"
    }
  }
}
'''

mappingJson = json.loads(mappingsStr)
# mappingJson["properties"]["phrase-vector"]["dims"]=vectorDims
# mappingJson["properties"]["phrase-vector"]["similarity"]="dot_product"

indexName = "knn-index"
esClient.indices.create(index=indexName, ignore=400, mappings=mappingJson)

  esClient.indices.create(index=indexName, ignore=400, mappings=mappingJson)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'knn-index'})

In [9]:
# parse the csv data

docEntryTmpl = """
{ "name": "title_goes_here", "country": "country_goes_here", "location": {"type":"unknown"}, "phrase-vector": [0,0,0], "phrase":"phase_goes_here", "population": 0.0, "lat": 0.1, "lon": 0.2, "id":"id_goes_here"}
"""

csvFile = "../data/worldcities.csv"

def parseCSVEntries(df):

    indexEntries = []
    with tqdm(total=len(df)) as pbar:
        for index, row in df.iterrows():
            indexEntry = docEntryTmpl
            indexEntryJson = json.loads(indexEntry)

            id = str(uuid.uuid4())
            city = row["city"]
            country =row["country"]
            geoPoint = Point([float(row["lng"]), float(row["lat"])])

            population = 0
            try:
                population = int(row["population"])
            except:
                population = -1

            indexEntryJson["id"] = id
            indexEntryJson["name"] = city
            indexEntryJson["country"] = country
            indexEntryJson["population"] = population
            indexEntryJson["location"] = geoPoint
            indexEntryJson["lat"] = row["lat"]
            indexEntryJson["lon"] = row["lng"]

            phrase = "The city of {city} in {country}, has a population of {population} people.".format(city=city, country=country, population=population)
            phraseVector = nlp(phrase)
            indexEntryJson["phrase"] = phrase
            indexEntryJson["phrase-vector"] = phraseVector.vector

            if index == 1:
                print("Example Index Entry: ", indexEntryJson)
            indexEntries.append(indexEntryJson)
            pbar.update(1)
    
    return indexEntries


def readCSV(filePath):
    df = pd.read_csv(filePath)
    return df

df = readCSV(csvFile)

searchEntries = parseCSVEntries(df)
# print(searchEntries[0])


print("indexing {} entries".format(len(searchEntries)))

for searchEntry in searchEntries:

    esClient.index(
        index=indexName,
        id=searchEntry["id"],
        document=searchEntry,
    )

print("indexed {} entries".format(len(searchEntries)))

exampleResponse = esClient.search(
    index=indexName, body = {
        'size' : 1,
        'query': {
        'match_all' : {}
        }
    }
)

print("example indexed response:")
print(exampleResponse)



  0%|          | 0/44691 [00:00<?, ?it/s]

Example Index Entry:  {'name': 'Jakarta', 'country': 'Indonesia', 'location': {"coordinates": [106.8275, -6.175], "type": "Point"}, 'phrase-vector': array([ 0.7157666 ,  0.24740766, -0.69210047, ...,  1.1362973 ,
       -1.0295087 , -0.01820834], dtype=float32), 'phrase': 'The city of Jakarta in Indonesia, has a population of 33756000 people.', 'population': 33756000, 'lat': -6.175, 'lon': 106.8275, 'id': 'eeb32b14-4efb-4e90-a786-326cd83620a1'}
indexing 44691 entries
indexed 44691 entries
example indexed response:
{'took': 177, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 10000, 'relation': 'gte'}, 'max_score': 1.0, 'hits': [{'_index': 'knn-index', '_id': 'e3855a3b-17eb-491e-9882-1233cc6147dc', '_score': 1.0, '_source': {'name': 'Tokyo', 'country': 'Japan', 'location': {'type': 'Point', 'coordinates': [139.6922, 35.6897]}, 'phrase-vector': [0.44340047240257263, 0.1447250247001648, 0.26648181676864624, 0.74244701862