In [1]:
!pip install -qU datasets cohere openai lancedb



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [52]:
from datasets import load_dataset

en = dataset = load_dataset("wikipedia", "20220301.en", streaming=True,)
fr = load_dataset("wikipedia", "20220301.fr", streaming=True)

datasets = {"english": iter(en['train']), "french": iter(fr['train'])}

In [5]:
next(iter(en['train']))

{'id': '12',
 'url': 'https://en.wikipedia.org/wiki/Anarchism',
 'title': 'Anarchism',
 'text': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\n\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latte

In [6]:
next(iter(fr['train']))

{'id': '3',
 'url': 'https://fr.wikipedia.org/wiki/Antoine%20Meillet',
 'title': 'Antoine Meillet',
 'text': "Paul Jules Antoine Meillet, né le  à Moulins (Allier) et mort le  à Châteaumeillant (Cher), est le principal linguiste français des premières décennies du . Il est aussi philologue.\n\nBiographie \nD'origine bourbonnaise, fils d'un notaire de Châteaumeillant (Cher), Antoine Meillet fait ses études secondaires au lycée de Moulins.\n\nÉtudiant à la faculté des lettres de Paris à partir de 1885 où il suit notamment les cours de Louis Havet, il assiste également à ceux de Michel Bréal au Collège de France et de Ferdinand de Saussure à l'École pratique des hautes études.\n\nEn 1889, il est major de l'agrégation de grammaire.\n\nIl assure à la suite de Saussure le cours de grammaire comparée, qu'il complète à partir de 1894 par une conférence sur les langues persanes.\n\nEn 1897, il soutient sa thèse pour le doctorat ès lettres (Recherches sur l'emploi du génitif-accusatif en vieux-s

In [50]:
import os
import lancedb
import getpass
from lancedb.embeddings import EmbeddingFunctionRegistry
from lancedb.pydantic import LanceModel, Vector

if "COHERE_API_KEY" not in os.environ:
    os.environ['COHERE_API_KEY'] = getpass.getpass("Enter your Cohere API key: ")
    
registry = EmbeddingFunctionRegistry().get_instance()
cohere = registry.get("cohere").create() # uses multi-lingual model by default (768 dim)

class Schema(LanceModel):
    vector: Vector(cohere.ndims()) = cohere.VectorField()
    text: str = cohere.SourceField()
    url: str
    title: str
    id: str
    lang: str

db = lancedb.connect("~/lancedb")
tbl_cohere = db.create_table("wikipedia-cohere", schema=Schema, mode="overwrite")

In [56]:
import os
import lancedb
import getpass
from lancedb.embeddings import EmbeddingFunctionRegistry
from lancedb.pydantic import LanceModel, Vector

if "OPENAI_API_KEY" not in os.environ:
    os.environ['OPENAI_API_KEY'] = getpass.getpass("Enter your OpenAI API key: ")
    
registry = EmbeddingFunctionRegistry().get_instance()
openai = registry.get("openai").create() # uses multi-lingual model by default (768 dim)

class Schema(LanceModel):
    vector: Vector(openai.ndims()) = openai.VectorField()
    text: str = openai.SourceField()
    url: str
    title: str
    id: str
    lang: str

db = lancedb.connect("~/lancedb")
tbl_openai = db.create_table("wikipedia-openai", schema=Schema, mode="overwrite")

[2023-10-17T10:26:24Z WARN  lance::dataset] No existing dataset at /Users/ayush/lancedb/wikipedia-openai.lance, it will be created


In [60]:
from tqdm.auto import tqdm

# let's use openai embeddings. Use can also set it to cohere
tbl =  tbl_openai # tbl_cohere
batch_size = 500
lang_limit = 5000  # number of records to index from each language
data = []

for i in tqdm(range(0, lang_limit, batch_size)):
    i_end = min(i+batch_size, lang_limit)

    # we do for each language
    for lang, dataset in datasets.items():
        # get the relevant batch
        batch = [next(dataset) for _ in range(batch_size)]
        # extract text
        texts = [x['text'] for x in batch]

        # create ids
        ids = [f"{lang}-{x['id']}" for x in batch]
        data.extend({
           'text': x['text'], 'title': x['title'], 'url': x['url'], 'lang': lang, 'id': f"{lang}-{x['id']}"
        } for x in batch)

tbl.add(data)

100%|██████████| 10/10 [00:00<00:00, 19.84it/s]


RateLimitError: Rate limit reached for text-embedding-ada-002 in organization org-2w85yWn5Y4VhF4vnomj67xoX on tokens per min. Limit: 1000000 / min. Current: 1 / min. Contact us through our help center at help.openai.com if you continue to have issues.

In [47]:
rs = tbl.search("who is giovanni falcone?").limit(5).to_pydantic(Schema)

In [48]:
for r in rs:
    print(r.title, r.url, r.lang)

Corrado Gini https://en.wikipedia.org/wiki/Corrado%20Gini english
Definition of music https://en.wikipedia.org/wiki/Definition%20of%20music english
Garbage collection (computer science) https://en.wikipedia.org/wiki/Garbage%20collection%20%28computer%20science%29 english
Distributed computing https://en.wikipedia.org/wiki/Distributed%20computing english
History of the Soviet Union (1982–1991) https://en.wikipedia.org/wiki/History%20of%20the%20Soviet%20Union%20%281982%E2%80%931991%29 english


In [40]:
tbl.to_pandas()[100:105]

Unnamed: 0,vector,text,url,title,id,lang
100,"[0.1953125, 0.296875, -0.5131836, -0.026107788...",Le mois de nivôse est le quatrième mois du cal...,https://fr.wikipedia.org/wiki/Niv%C3%B4se,Nivôse,french-5599,french
101,"[-0.10498047, 0.45922852, -0.4345703, -0.24072...",Le mois de pluviôse était le cinquième mois du...,https://fr.wikipedia.org/wiki/Pluvi%C3%B4se,Pluviôse,french-5600,french
102,"[-0.097717285, 0.36206055, -0.3413086, -0.4645...",Le mois de ventôse était le sixième mois du ca...,https://fr.wikipedia.org/wiki/Vent%C3%B4se,Ventôse,french-5601,french
103,"[-0.16491699, 0.3395996, -0.32250977, -0.08172...",Le mois de germinal était le septième mois du ...,https://fr.wikipedia.org/wiki/Germinal,Germinal,french-5602,french
104,"[-0.21960449, 0.25854492, -0.44433594, -0.1755...",Le mois de floréal était le huitième mois du c...,https://fr.wikipedia.org/wiki/Flor%C3%A9al,Floréal,french-5603,french
