In [1]:
# load required libraries

import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

import os
import json
import dotenv
import logging

# Read the GCP project and region from the environment, after
# dotenv.load_dotenv() has had a chance to populate it.
dotenv.load_dotenv()
project_id = os.environ.get("PROJECT_ID")
location = os.environ.get("REGION")

# Surface missing settings instead of silently passing None through: the
# notebook keeps running, but the reader is told that the SDK is now choosing
# the project/region on their behalf.
for env_name, env_value in (("PROJECT_ID", project_id), ("REGION", location)):
    if not env_value:
        logging.warning(
            "%s is not set; vertexai.init() will use its own default.", env_name
        )

# init vertex ai with project_id and location
vertexai.init(project=project_id, location=location)


In [8]:
# Load the city guides: one JSON object per line with "city" and "guide" keys.
# The guides contain non-ASCII text, so decode explicitly as UTF-8 rather than
# relying on the platform's default encoding.
guides = {}
with open("city_guides.jsonl", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at end of file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue
        data = json.loads(line)
        guides[data['city']] = data['guide']
        

In [9]:
# Number of distinct cities loaded into `guides`.
len(guides)

48

In [20]:
# Pick the guide of the first city in `guides` (dicts preserve insertion
# order) and show its length in characters.
first_city = list(guides)[0]
guide = guides[first_city]
len(guide)

65087

In [21]:
# Count how many tokens the guide occupies for the Gemini model.
GEMINI_MODEL_NAME = "gemini-1.5-flash-002"
model = GenerativeModel(model_name=GEMINI_MODEL_NAME)
token_count = model.count_tokens(guide)
print(token_count)


total_tokens: 19667
total_billable_characters: 54259



In [23]:
# Embed the full guide with the text embedding model.
# NOTE(review): the token-count cell that follows reports ~20k tokens for this
# guide. text-embedding-004 is documented with a much smaller per-input token
# limit and the SDK truncates over-long inputs by default, so this vector
# presumably reflects only the beginning of the guide -- confirm against the
# Vertex AI embeddings docs before relying on it.
EMBED_MODEL_NAME = "text-embedding-004"
embed_model = TextEmbeddingModel.from_pretrained(EMBED_MODEL_NAME)
embeddings = embed_model.get_embeddings([guide])


In [37]:
# Token count of the same guide as reported by the embedding model's
# count_tokens (passed as a one-element list).
embed_model.count_tokens([guide])

CountTokensResponse(total_tokens=20468, total_billable_characters=54259, _count_tokens_response=total_tokens: 20468
total_billable_characters: 54259
)

In [46]:
# Number of values in the first embedding vector.
len(embeddings[0].values)


768

In [48]:
# Experiment with embedding task types on a short prefix of the guide.
TRUNC_CHARS = 1024  # prefix length in characters (string slice), not tokens
trunc_guide = guide[:TRUNC_CHARS]


"{{pagebanner|Agra banner Taj Mahal.jpg|unesco=yes}}\n'''Agra''' (Hindi: आगरा ''Āgrā'') is the city of the Taj Mahal, in the north [[India]]n state of [[Uttar Pradesh]], some 200&nbsp;km from [[Delhi]].\n\nAgra has three [[UNESCO World Heritage List|UNESCO World Heritage]] sites, the '''Taj Mahal''' and the '''Agra Fort''' in the city and '''[[Fatehpur Sikri]]''' 40 km away. There are also many other buildings and tombs from Agra's days of glory as the capital of the [[Mughal Empire]].\n\nBesides these three sites, the city has little else to recommend it. Pollution, especially smog and litter, is rampant and visitors are pestered by swarms of touts and hawkers at every monument, besides the inner Taj Mahal which, once you are in, is free of scams and touts. The sites are some of the wonders of the world and no trip to India is complete without at least one visit to the Taj. For the vast majority of visitors, a single day in Agra is more than enough.\n\n==Understand==\n\nWhile the heyd

In [49]:
# Wrap the same truncated text under each task type. RETRIEVAL_DOCUMENT is
# repeated three times, followed by RETRIEVAL_QUERY and QUESTION_ANSWERING.
task_types = ["RETRIEVAL_DOCUMENT"] * 3 + ["RETRIEVAL_QUERY", "QUESTION_ANSWERING"]
inputs = [TextEmbeddingInput(trunc_guide, task_type) for task_type in task_types]


In [51]:
# Embed all entries of `inputs` with a single get_embeddings call; this
# rebinds `embeddings`, replacing the full-guide result from the earlier cell.
embeddings = embed_model.get_embeddings(inputs)
# NOTE(review): the bare expression displays every value of every vector;
# consider previewing a slice instead to keep the notebook output small.
embeddings

[TextEmbedding(values=[0.015897389501333237, 0.03316788375377655, 0.010219445452094078, -0.05440279841423035, -0.01795378513634205, 0.01967422105371952, -0.05591551214456558, -0.07765986025333405, -0.01197514496743679, 0.1068020612001419, -0.027489451691508293, -0.04200753942131996, 0.04385225102305412, -0.04079624265432358, 0.03113626129925251, -0.0181539598852396, 0.05600706860423088, -0.009504806250333786, -0.05576562136411667, -0.042506322264671326, 0.027680814266204834, 0.016630778089165688, -0.01653320901095867, -0.024979805573821068, 0.012332777492702007, -0.0023177291732281446, -0.0030722045339643955, 0.08351946622133255, 0.008124026469886303, -0.018110929057002068, 0.023136870935559273, 0.10808590799570084, -0.012824470177292824, -0.051105618476867676, 0.019684286788105965, 0.014067677780985832, 0.0360352024435997, -0.02876073308289051, 0.03255864232778549, -0.030468719080090523, -0.053108152002096176, 0.0427626334130764, -0.0009338866220787168, -0.02100309729576111, -0.057286

In [52]:
# Number of embeddings returned; one is expected per entry in `inputs`.
len(embeddings)

5

In [54]:
# Vector for the first input (task type RETRIEVAL_DOCUMENT).
first_document_vector = embeddings[0].values
print(first_document_vector)

[0.015897389501333237, 0.03316788375377655, 0.010219445452094078, -0.05440279841423035, -0.01795378513634205, 0.01967422105371952, -0.05591551214456558, -0.07765986025333405, -0.01197514496743679, 0.1068020612001419, -0.027489451691508293, -0.04200753942131996, 0.04385225102305412, -0.04079624265432358, 0.03113626129925251, -0.0181539598852396, 0.05600706860423088, -0.009504806250333786, -0.05576562136411667, -0.042506322264671326, 0.027680814266204834, 0.016630778089165688, -0.01653320901095867, -0.024979805573821068, 0.012332777492702007, -0.0023177291732281446, -0.0030722045339643955, 0.08351946622133255, 0.008124026469886303, -0.018110929057002068, 0.023136870935559273, 0.10808590799570084, -0.012824470177292824, -0.051105618476867676, 0.019684286788105965, 0.014067677780985832, 0.0360352024435997, -0.02876073308289051, 0.03255864232778549, -0.030468719080090523, -0.053108152002096176, 0.0427626334130764, -0.0009338866220787168, -0.02100309729576111, -0.05728669837117195, -0.059358

In [55]:
# Vector for the second input: same text and task type as index 0, printed
# for side-by-side comparison with it.
second_document_vector = embeddings[1].values
print(second_document_vector)

[0.015897389501333237, 0.03316788375377655, 0.010219445452094078, -0.05440279841423035, -0.01795378513634205, 0.01967422105371952, -0.05591551214456558, -0.07765986025333405, -0.01197514496743679, 0.1068020612001419, -0.027489451691508293, -0.04200753942131996, 0.04385225102305412, -0.04079624265432358, 0.03113626129925251, -0.0181539598852396, 0.05600706860423088, -0.009504806250333786, -0.05576562136411667, -0.042506322264671326, 0.027680814266204834, 0.016630778089165688, -0.01653320901095867, -0.024979805573821068, 0.012332777492702007, -0.0023177291732281446, -0.0030722045339643955, 0.08351946622133255, 0.008124026469886303, -0.018110929057002068, 0.023136870935559273, 0.10808590799570084, -0.012824470177292824, -0.051105618476867676, 0.019684286788105965, 0.014067677780985832, 0.0360352024435997, -0.02876073308289051, 0.03255864232778549, -0.030468719080090523, -0.053108152002096176, 0.0427626334130764, -0.0009338866220787168, -0.02100309729576111, -0.05728669837117195, -0.059358

In [56]:
# Vector for the last input (task type QUESTION_ANSWERING): same text as
# index 0 but a different task type.
question_answering_vector = embeddings[-1].values
print(question_answering_vector)

[0.013275518082082272, 0.043954554945230484, -0.03184553235769272, -0.021339820697903633, 0.009085732512176037, 0.04147480055689812, -0.00505526177585125, -0.04750632494688034, 0.0003982528578490019, 0.07577382028102875, 0.0013471965212374926, -0.001658832305110991, 0.027909215539693832, -0.0352165661752224, 0.05985870212316513, -0.03126201406121254, 0.09204615652561188, 0.01083969697356224, -0.028716444969177246, -0.09018561989068985, -0.0030709931161254644, 0.04590848088264465, -0.017654696479439735, 0.0038497024215757847, 0.02753075398504734, -0.004533745348453522, -0.007451934739947319, 0.06784041970968246, 0.013512023724615574, -0.030696703121066093, 0.033896587789058685, 0.09831488877534866, -0.0012915108818560839, -0.045629918575286865, 0.027113353833556175, 0.028020109981298447, 0.028338482603430748, -0.01381932944059372, 0.017636042088270187, -0.056319452822208405, -0.06471629440784454, 0.020871222019195557, -0.012439064681529999, -0.007282075472176075, -0.043530967086553574, 