# ElasticSearch Obstoječi vektorji
Če imaš težave z računanjem vektorjev, pokličeš Elasticsearch.  
Spet kličem vektorizacijo na stolpcu 'Tekst' in dobim stolpec z vektorjem 'Embedding'.

In [None]:
!pip install pandas numpy networkx numpy scikit-learn requests

In [None]:
import os
import pandas as pd

# Read the article table from the workbook in the sibling "data" directory
# and show its first rows.
data = pd.read_excel(
    os.path.join(os.pardir, 'data', 'Reworkan data 4.xlsx')
)
data.head()

V realnem projektu pazi na okoljske spremenljivke.

In [None]:
# Sanity-check the connection settings without echoing their values: print
# only the length of each variable, or 0 when it is not set at all
# (os.getenv returns None for a missing variable, and len(None) would raise).
for env_name in ("CPTM_SPASS", "CPTM_SURL", "CPTM_SUSER"):
    print(f'{env_name} is set: {len(os.getenv(env_name, ""))}')

Funkcije za pridobivanje že obstoječih vektorjev iz elastike.

In [None]:
import numpy
import json
import requests
from requests.auth import HTTPBasicAuth

def embed_elastic(df: pd.DataFrame, col_name: str = 'Tekst', target_col_name: str = 'Embedding') -> pd.DataFrame:
    # Split the DataFrame into batches of 500 rows
    batch_size = 1000
    num_batches = (len(df) + batch_size - 1) // batch_size
    es_column = 'vector_768___textonic_v2'
    id_column = 'Article ID'
    
    responses = pd.DataFrame()
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        
        batch = df[id_column].iloc[start_idx:end_idx].tolist()
        
        query = '''
        {
          "query": {
            "bool": {
              "filter": [
                {"range": {"created": {"gte": "1996-01-01", "lt": "2030-01-01"}}},
                {"terms": {"uuid": ''' + json.dumps(batch) + '''}}
              ]
            }
          },
          "_source": ["uuid", ''' + json.dumps(es_column) + '''],
          "from": 0, "size": 1000, "sort": { "created": {"order": "asc"}}
        }
        '''
        
        result = {
            id_column: [],
            target_col_name: []
        }
        # Call the remote service for the current batch
        resp_text = ''
        try:
            resp = requests.post(os.getenv("CPTM_SURL"),
                                 headers={'Content-Type': 'application/json'},
                                 auth=HTTPBasicAuth(os.getenv("CPTM_SUSER"), os.getenv("CPTM_SPASS")),
                                 data=query)
            resp_text = json.loads(resp.text)
            for hit in resp_text['hits']['hits']:
                if es_column not in hit['_source']:
                    continue
                if 'uuid' not in hit['_source']:
                    continue
                vect = numpy.array(hit['_source'][es_column])  # here is article vector
                result[id_column].append(hit['_source']['uuid'])  # here is uuid of an article 
                result[target_col_name].append(vect)
            print(f'Requested/received/valid vector : {len(batch)}/{len(resp_text["hits"]["hits"])}/{len(result[target_col_name])}')
        except Exception as error:
            print(f'Elasticsearch request error [{error}] for query [{query}] with response [{resp_text}]')
            return df
        
        # Convert the collected responses to a DataFrame
        response_df = pd.DataFrame(result)
        responses = pd.concat([responses, response_df])
        
    # Merge the responses back into the original DataFrame
    orig_len = len(df)
    df = pd.merge(df, responses, on=id_column, how='left').dropna(subset=[target_col_name])
    print(f'Requested/Successful {orig_len}/{len(df)}')
    return df

Kličem vektorizacijo na stolpcu 'Tekst' in dobim stolpec z vektorjem 'Embedding'.

In [None]:
# Fetch the stored vectors into the 'Embedding' column.
data = embed_elastic(data, col_name='Tekst', target_col_name='Embedding')
# Preview the first 100 rows together with their vectors.
data.loc[:, ['Article ID', 'Datum', 'Medij', 'Tekst', 'Embedding']].head(100)

Definiram funkcijo za cluster

In [None]:
import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cluster_louvain(df: pd.DataFrame, col_name: str = 'Embedding', similarity_threshold: float = 0.96,
                    resolution: float = 0.1, seed=None):
    """Assign a Louvain community label to every row of ``df``.

    Rows are graph nodes; two rows are linked when the cosine similarity of
    their vectors in ``col_name`` exceeds ``similarity_threshold``.  The label
    written to the ``'Cluster'`` column is the smallest row position (0-based,
    not the index label) among the members of the row's community.

    Args:
        df: Frame holding one vector per row in ``col_name``.  The frame is
            modified in place: a ``'Cluster'`` column is added or overwritten.
        col_name: Column with the vectors.
        similarity_threshold: Similarities strictly above this value become
            graph edges.
        resolution: Passed to ``louvain_communities``.
        seed: Passed to ``louvain_communities``; set it for reproducible
            communities.

    Returns:
        The same ``df`` object, with the ``'Cluster'`` column set.
    """
    print("Clustering ...")
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to compare; still provide the column callers expect.
        df['Cluster'] = np.empty(0, dtype=np.int64)
        print("Clustered")
        return df

    embeddings = np.array(df[col_name].to_list())
    similarity = cosine_similarity(embeddings, embeddings)
    graph = nx.from_numpy_array(similarity > similarity_threshold)
    communities = nx.algorithms.community.louvain_communities(
        graph, resolution=resolution, seed=seed)

    labels = np.zeros(n_rows, dtype=np.int64)
    for community in communities:
        labels[list(community)] = min(community)

    # Assign a plain array so the labels are matched by row position.  A
    # pd.Series with a fresh 0..n-1 index would be aligned on index labels and
    # end up on the wrong rows when df's index has gaps (e.g. after dropna).
    df['Cluster'] = labels
    print("Clustered")
    return df

Izračunam cluster

In [None]:
# Cluster the articles; 0.92 is the similarity_threshold.
data = cluster_louvain(data, col_name='Embedding', similarity_threshold=0.92)
# The vectors are not needed after clustering, so drop that column.
data = data.drop('Embedding', axis=1)
data.loc[:, ['Article ID', 'Datum', 'Medij', 'Cluster', 'Tekst']].head(100)