In [1]:
!pip install elasticsearch
!pip install sentence-transformers



In [2]:
import json
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node addressed as host `elasticsearch`, port 9200.
es = Elasticsearch(hosts="http://elasticsearch:9200")


In [3]:
import pandas as pd

# Read the driver descriptions from a JSON Lines file (one JSON record per line).
df = pd.read_json(
    path_or_buf='./data/unstructured/drivers_wikipedias.jsonl',
    lines=True,
)
df

Unnamed: 0,driverId,wikipedia_description
0,1,British racing driver\nRivalries\nSir Lewis Ca...
1,2,Nick Lars Heidfeld (German pronunciation: [nɪk...
2,3,Nico Erik Rosberg (born 27 June 1985) is a Ger...
3,4,Fernando Alonso Díaz (Spanish pronunciation: [...
4,5,Heikki Johannes Kovalainen (Finnish pronunciat...
...,...,...
855,857,Oscar Jack Piastri (born 6 April 2001) is an A...
856,858,"Logan Hunter Sargeant (born December 31, 2000)..."
857,859,Liam Lawson (born 11 February 2002) is a New Z...
858,860,Oliver James Bearman (born 8 May 2005) is a Br...


In [4]:
# Show the dtype pandas assigned to each column of the loaded frame.
df.dtypes

driverId                  int64
wikipedia_description    object
dtype: object

In [6]:
from sentence_transformers import SentenceTransformer

# Load the multilingual E5 sentence-embedding model.
model = SentenceTransformer('intfloat/multilingual-e5-large')

# Encode every Wikipedia description; the result holds one embedding row per text.
# NOTE(review): the E5 model card recommends prefixing inputs with "query: " or
# "passage: "; the texts are passed here without a prefix - confirm this is intended
# and that queries are encoded the same way.
embeddings = model.encode(df["wikipedia_description"].tolist())
embeddings

array([[ 0.00288702, -0.03604181,  0.0042402 , ...,  0.00089948,
        -0.06555907,  0.01624825],
       [-0.01659629, -0.02656355, -0.01617689, ...,  0.00301233,
        -0.05957953,  0.04344898],
       [-0.01109292, -0.02974781, -0.002038  , ...,  0.00244354,
        -0.0783103 ,  0.01465064],
       ...,
       [ 0.00593708,  0.00384961,  0.00308535, ..., -0.02574416,
        -0.02467067,  0.03154134],
       [-0.01262657,  0.00586637,  0.00518287, ...,  0.01456884,
        -0.03208819,  0.01872776],
       [ 0.02477685, -0.00691548,  0.00730457, ...,  0.01904931,
        -0.03958486, -0.0046946 ]], dtype=float32)

In [9]:
# Store each embedding row next to its driver as a new column (one vector per row).
df["wikipedia_description_vector"] = [vector for vector in embeddings]
df

Unnamed: 0,driverId,wikipedia_description,wikipedia_description_vector
0,1,British racing driver\nRivalries\nSir Lewis Ca...,"[0.0028870185, -0.036041815, 0.0042401953, -0...."
1,2,Nick Lars Heidfeld (German pronunciation: [nɪk...,"[-0.016596286, -0.02656355, -0.016176887, -0.0..."
2,3,Nico Erik Rosberg (born 27 June 1985) is a Ger...,"[-0.011092918, -0.029747814, -0.0020379967, -0..."
3,4,Fernando Alonso Díaz (Spanish pronunciation: [...,"[0.0108537795, -0.010756244, 0.0008550639, -0...."
4,5,Heikki Johannes Kovalainen (Finnish pronunciat...,"[-0.018135607, -0.030896196, -0.011842099, -0...."
...,...,...,...
855,857,Oscar Jack Piastri (born 6 April 2001) is an A...,"[-0.00938711, -0.011900783, -0.0076955585, -0...."
856,858,"Logan Hunter Sargeant (born December 31, 2000)...","[0.016696291, 0.0070512095, -0.007261097, -0.0..."
857,859,Liam Lawson (born 11 February 2002) is a New Z...,"[0.0059370785, 0.0038496093, 0.0030853467, -0...."
858,860,Oliver James Bearman (born 8 May 2005) is a Br...,"[-0.012626566, 0.0058663664, 0.005182872, -0.0..."


In [14]:
# Index that stores the driver descriptions together with their embeddings.
index_name = 'drivers_descriptions'

# Field mapping: integer id, full-text description, and a dense vector indexed
# for cosine similarity whose dimensionality is taken from the embedding matrix.
mapping = {
    "mappings": {
        "properties": {
            "driverId": {"type": "integer"},
            "wikipedia_description": {"type": "text"},
            "wikipedia_description_vector": {
                "type": "dense_vector",
                "dims": embeddings.shape[1],
                "index": True,
                "similarity": "cosine",
            },
        },
    },
}

# Start from a clean slate: drop an existing index of the same name before creating it.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, body=mapping)

f"Index '{index_name}' created with mapping."

"Index 'drivers_descriptions' created with mapping."

In [32]:
from elasticsearch.helpers import bulk

def bulk_index_data(es, data, index_name, batch_size=50):
    """Index documents into Elasticsearch in fixed-size batches.

    Every document becomes one bulk action whose ``_id`` is the document's
    ``driverId`` value and whose ``_source`` is the document itself. After each
    batch, the two values returned by the ``bulk`` helper are printed.

    Args:
        es: Client object handed to the ``bulk`` helper.
        data: Sliceable sequence of dict documents; each needs a ``driverId`` key.
        index_name: Name of the index the documents are written to.
        batch_size: Number of documents sent per ``bulk`` call (default 50).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a document has no ``driverId`` key.

    Any exception raised by ``bulk`` (it is called with ``raise_on_error=True``)
    propagates to the caller.
    """
    # A zero step makes range() fail and a negative one silently indexes
    # nothing, so reject both up front with a clear message.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for start in range(0, len(data), batch_size):
        actions = [
            {"_index": index_name, "_id": doc['driverId'], "_source": doc}
            for doc in data[start:start + batch_size]
        ]
        # Keep the helper's return value so the per-batch outcome can be reported.
        resp = bulk(es, actions, raise_on_error=True)
        print("Indexed:", resp[0], "Errors:", resp[1])

# Turn the frame into one dict per row and push all of them into the index.
drivers = df.to_dict('records')
bulk_index_data(es, drivers, index_name)

Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 50 Errors: []
Indexed: 10 Errors: []


## Comprobar que se han cargado los datos

In [39]:
# Refresh first: documents written by the bulk load only become visible to
# search after a refresh, so counting immediately could otherwise undercount.
es.indices.refresh(index=index_name)

# `track_total_hits` asks for the exact number of matches; without it the
# reported total is only exact up to 10,000 hits.
res = es.search(index=index_name, body={
    'query': {
        'match_all': {}
    },
    'track_total_hits': True
})
print(json.dumps(res.body['hits']['total']['value'], indent=4))

860
