In [35]:
import json, re
from pathlib import Path
from typing import Dict
import math

# Hyperspace config and setup

In [36]:
import json

config = {
    "configuration": {
        "id": {
            "type":"float"
        },
        "title":{
        "type":"keyword"
        },
        "bundle_id": {
            "type":"keyword"
        },
        "ios":{
        "type":"boolean"
        },
        "categories": {
            "type":"keyword",
            "struct_type":"list"
        },
        "content": {
            "type":"keyword"
        },
        "embedded_app": {
            "type": "dense_vector",
            "dim": 384,
            "index_type": "brute_force",
            "metric": "IP"
        }
    }
}

with open('advec_config.json', 'w') as f:
    f.write(json.dumps(config, indent=2))

In [37]:
import hyperspace

username = "amir@argmaxml.com"
password = "wcEa^w5g4D"

host = "https://beta.prod.hyper-space.xyz/"

hyperspace_client = hyperspace.HyperspaceClientApi(host=host,
                                                   username=username, password= password)

collection_name = 'advec'


In [40]:

try:
    hyperspace_client.delete_collection(collection_name)
except:
    pass
hyperspace_client.create_collection('advec_config.json', collection_name)
hyperspace_client.collections_info()

{'collections': {'advec': {'creation_time': '2023-10-01T13:33:29Z',
   'last_query_time': '2023-10-02T08:45:41Z',
   'size': 89001},
  'amazon-images-norm': {'creation_time': '2023-10-02T13:55:17Z',
   'last_query_time': '2023-10-03T06:03:49Z',
   'size': 100000}}}

In [34]:
import numpy as np
vecs = np.load('data/vectors.npy')
metadata_file = open('data/context.jsonl',encoding="utf8")

In [85]:
BATCH_SIZE = 500

batch = []
for i, (metadata_row, vec) in enumerate(zip(metadata_file, vecs)):
    row = {key: value for key, value in json.loads(metadata_row).items() if key in config["configuration"].keys()}
    row['embedded_app'] = np.ndarray.tolist(vec)

    batch.append(hyperspace.Document(str(i), row))

    if i % BATCH_SIZE == 0:
        response = hyperspace_client.add_batch(batch, collection_name)
        batch.clear()
        print(i, response)
response = hyperspace_client.add_batch(batch, collection_name)
batch.clear()
print(i, response)
hyperspace_client.commit(collection_name)

0 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
500 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
1000 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
1500 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
2000 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
2500 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
3000 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
3500 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
4000 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
4500 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
5000 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
5500 {'code': 200,
 'mess

In [41]:
hyperspace_client.collections_info()

{'collections': {'advec': {'creation_time': '2023-10-01T13:33:29Z',
   'last_query_time': '2023-10-02T08:45:41Z',
   'size': 89001},
  'amazon-images-norm': {'creation_time': '2023-10-02T13:55:17Z',
   'last_query_time': '2023-10-03T06:03:49Z',
   'size': 100000}}}

In [43]:
input_document = hyperspace_client.get_document(collection_name, 42)
print(input_document['title'] + "\n" + str(input_document['categories']))

Sensors: Temp and Humidity
['WEATHER', 'APPLICATION']


In [13]:

query_with_knn = {
    'params': input_document,
    'knn' : {
        'embedded_app' : {"boost":1},
        'query' : {'boost':0}
    }
}

results = hyperspace_client.search(query_with_knn,
                                        size=5,
                                        collection_name=collection_name)

def print_res(result,keys=["title","bundle_id","categories"]):
    for i,result in enumerate(results['similarity']):
        vector_api_response = hyperspace_client.get_document(document_id=result['document_id'], collection_name=collection_name)
        response = f"{i+1} - {result['document_id']} : {result['score']} --- "
        keys_str = " - ".join([str(vector_api_response[k]) for k in keys])
        print(response+keys_str)

In [14]:
print_res(results)

1 - 12145 : 1.0000001192092896 --- Sensors: Temp and Humidity - com.ydvisual.s4envtrak - ['WEATHER', 'APPLICATION']
2 - 42 : 1.0000001192092896 --- Sensors: Temp and Humidity - com.ydvisual.s4envtrak - ['WEATHER', 'APPLICATION']
3 - 43618 : 0.9984776973724365 --- Temperature humidity barometeF - jp.metersfree - ['TOOLS', 'APPLICATION']
4 - 46921 : 0.9973560571670532 --- Real Mercury Thermometer - com.discipleskies.dsthermometer - ['WEATHER', 'APPLICATION']
5 - 9002 : 0.9973560571670532 --- Real Mercury Thermometer - com.discipleskies.dsthermometer - ['WEATHER', 'APPLICATION']


## Classic score

In [15]:
#classic score func
sf_file = 'classic_score.py'
hyperspace_client.set_function(sf_file, collection_name=collection_name, function_name='classic_score')

{'code': 200, 'message': 'Function was set successfully', 'status': 'OK'}

In [16]:
input_document = hyperspace_client.get_document(collection_name, 42)
input_document['title'] + "\n" + str(input_document['categories'])

"Sensors: Temp and Humidity\n['WEATHER', 'APPLICATION']"

In [46]:
query_with_knn = {
    'params': input_document,
    'knn' : {
        'embedded_app' : {"boost":0},
        'query' : {'boost':1}
    }
}

results = hyperspace_client.search(query_with_knn,
                                        size=10,
                                   function_name="classic_score",
                                        collection_name=collection_name)
print_res(results,["title","bundle_id","categories"])

1 - 16250 : 19.664777755737305 --- Weather Home - Live Radar - com.home.weather.radar - ['WEATHER', 'APPLICATION']
2 - 16689 : 19.664777755737305 --- Angry Lion City Attack : Anima - com.hush.game.angrylionstrike.amazing - ['WEATHER', 'APPLICATION']
3 - 16968 : 19.664777755737305 --- Windsock - Automatic METAR/TAF - com.iawix.windsock - ['WEATHER', 'APPLICATION']
4 - 17381 : 19.664777755737305 --- NOAA Buoys Live Marine Weather - com.sherpaoutdoorapp.noaaweatherbuoys - ['WEATHER', 'APPLICATION']
5 - 17629 : 19.664777755737305 --- Images of the Sun from SOHO - com.silentlexx.sohowrapper - ['WEATHER', 'APPLICATION']
6 - 17894 : 19.664777755737305 --- OnTheSnow Ski & Snow Report - com.skireport - ['WEATHER', 'APPLICATION']
7 - 17976 : 19.664777755737305 --- Skymet Weather - com.skymet.indianweather - ['WEATHER', 'APPLICATION']
8 - 18535 : 19.664777755737305 --- Weather Station for Galaxy S4 - tk.giesecke.weatherstation - ['WEATHER', 'APPLICATION']
9 - 18710 : 19.664777755737305 --- iMeteo

## Generate new embeddings and search

In [18]:
from sentence_transformers import SentenceTransformer
emb_model = SentenceTransformer('BAAI/bge-small-en')

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
sim_sentence = """a great app for gaming with my friends """
# Embedding using sentence-transformers - irrelevant at advec-be deployment machine
sim_embedding = emb_model.encode([sim_sentence], normalize_embeddings=True)[0]

query_with_knn = {
    'params': {
        
        'embedded_app':sim_embedding.tolist()
    },
    'knn' : {
        'embedded_app' : {"boost":1},

    }
}


In [32]:
%%time
results = hyperspace_client.search(query_with_knn,
                                        size=20,
                                        collection_name=collection_name)

CPU times: total: 0 ns
Wall time: 168 ms


In [33]:
results
print_res(results,['title','bundle_id'])

1 - 20141 : 0.8744263648986816 --- Game Offline 3D no Wi Fi - com.chillapps.gameoffline3d
2 - 70640 : 0.8744263648986816 --- Game Offline 3D no Wi Fi - com.chillapps.gameoffline3d
3 - 76652 : 0.8744263648986816 --- Game Offline 3D no Wi Fi - com.chillapps.gameoffline3d
4 - 37344 : 0.8736395835876465 --- Vlinder avatar maker: Anime - com.dressup.doll.vlinder.avatar.maker.anime
5 - 50248 : 0.8733617663383484 --- Real Fireworks - com.mustafademir.realfireworks
6 - 40079 : 0.8724929690361023 --- Dragon Blast - com.dragonblast.free
7 - 42537 : 0.8724929690361023 --- Monster Blast - com.candy.cute.monster.blast.gp
8 - 9564 : 0.8724929690361023 --- Dragon Blast - com.dragonblast.free
9 - 74551 : 0.871880054473877 --- KurtMaster2D - com.plbm.plbm1
10 - 84062 : 0.871880054473877 --- KurtMaster2D - com.plbm.plbm1
11 - 39177 : 0.871452808380127 --- ReallyMake: Pottery Sculpting - 1191748553
12 - 50148 : 0.8714419603347778 --- KoGaMa Brazil - com.multiverse.brkogama
13 - 50151 : 0.8714419603347778

## Hybrid
We search for similar descriptions but filter out apps that are not from ios using score function

In [47]:
#classic score func
sf_file = 'only_ios_score.py'
hyperspace_client.set_function(sf_file, collection_name=collection_name, function_name='ios_score')

{'code': 200, 'message': 'Function was set successfully', 'status': 'OK'}

In [84]:
input_document = hyperspace_client.get_document(collection_name, 71960)
input_document['title'] + "\n" + str(input_document['categories'])

"Fish Island - Fishing Paradise\n['Game', 'Petualangan', 'Kasual']"

In [92]:
query_with_knn = {
    'params': input_document,
    'knn' : {
        'embedded_app' : {"boost":1},
        'query' : {'boost':1}
    }
}

results = hyperspace_client.search(query_with_knn,
                                        size=10,
                                   function_name="ios_score",
                                        collection_name=collection_name)
print_res(results,["ios","title","bundle_id","categories"])

In [93]:
results

{'candidates': 1, 'similarity': [], 'took_ms': 3.02952}