# Textonic API
Če imaš težave z računanjem vektorjev; Funkcije za vektorizacijo z Textonic API.   
Spet kličem vektorizacijo na stolpcu 'Tekst' in dobim stolpec z vektorji 'Embedding'.

In [None]:
!pip install pandas numpy networkx numpy scikit-learn

In [None]:
import os
import pandas as pd

# Workbook location, relative to the notebook's working directory.
path_parts = ('..', 'data', 'Reworkan data.xlsx')
excel_path = os.path.join(*path_parts)

# Load the workbook into a DataFrame and preview the first rows.
data = pd.read_excel(excel_path)

data.head()

V realnosti pazi na okoljske spremenljivke: ključ API mora biti nastavljen v TTNX_API_KEY.

In [None]:
# Report only whether the key is present. os.getenv returns None when the
# variable is unset, so it must not be passed to len(); the key itself is
# never printed.
print(f'TTNX_API_KEY is set: {bool(os.getenv("TTNX_API_KEY"))}')

Definiram Textonic API funkcije za vektorizacijo:

In [None]:
import uuid
import numpy as np
from ttnx.api import call_textonic


def _ttnx_is_missing(value) -> bool:
    """Return True when *value* is a scalar missing marker (None, NaN, NaT, pd.NA)."""
    if value is None:
        return True
    flag = pd.isna(value)
    # pd.isna returns an array for list-like cells; those are not "missing".
    return np.ndim(flag) == 0 and bool(flag)


def _ttnx_body_text(value):
    """Return the stripped text to embed for one cell, or None if there is none.

    Missing values are skipped instead of being stringified, so the literal
    text "nan" is never sent for embedding. Non-string values go through str().
    """
    if _ttnx_is_missing(value):
        return None
    text = value if isinstance(value, str) else str(value)
    return text.strip() or None


def ttnx_embed(df: pd.DataFrame, col_name: str = 'Tekst', target_col_name: str = 'Embedding',
               batch_size: int = 200, id_column: str = 'Article ID') -> pd.DataFrame:
    """Compute document embeddings for a text column via the Textonic API.

    Rows are sent in batches to ``/api/public/ml/process`` through
    ``call_textonic`` with a single ``doc_embed`` step. Each document is
    identified by its ``id_column`` value. Rows with a missing/empty id or a
    missing/blank text are not sent, and every id is sent at most once across
    the whole DataFrame.

    Args:
        df: Input data; must contain ``id_column`` and ``col_name``.
        col_name: Column holding the text to embed.
        target_col_name: Name of the column that receives the vectors. If it
            already exists in ``df`` (e.g. the cell is re-run) it is replaced.
        batch_size: Maximum number of rows per request.
        id_column: Column holding the document identifier.

    Returns:
        A new DataFrame with the rows of ``df`` for which a vector was
        received, plus ``target_col_name`` holding one ``numpy`` array per
        row. The index is reset to 0..n-1. ``df`` itself is not modified.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If ``id_column`` or ``col_name`` is not a column of ``df``.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')
    missing_columns = [c for c in (id_column, col_name) if c not in df.columns]
    if missing_columns:
        raise KeyError(f'DataFrame is missing required column(s): {missing_columns}')

    # Shared across batches so an id that appears in two batches is only sent
    # once; duplicate ids in the responses would multiply rows in the merge.
    seen_ids = set()
    response_ids = []
    response_vectors = []

    for start_idx in range(0, len(df), batch_size):
        batch = df.iloc[start_idx:start_idx + batch_size]

        documents = []
        for doc_id, raw_text in zip(batch[id_column], batch[col_name]):
            if _ttnx_is_missing(doc_id) or not doc_id:
                continue
            if doc_id in seen_ids:
                continue
            body_text = _ttnx_body_text(raw_text)
            if body_text is None:
                continue
            # Mark the id as used only once a document is really queued, so a
            # blank first occurrence does not block a later valid one.
            seen_ids.add(doc_id)
            documents.append({
                'id': doc_id,
                'title': doc_id,
                'lang': 'sl',
                'sections': [
                    {
                        'outline': 'body',
                        'data': body_text
                    }
                ]
            })
        if not documents:
            continue

        request = {
            'requestId': str(uuid.uuid4()),
            'process': {
                'analysis': {
                    'steps': [
                        {
                            'step': 'doc_embed',
                            'engine': 'e5',
                            'model': 'multilingual-e5-base',
                            'attributes': [
                                {'named_sentence_filters': 'kl_transcript'}
                            ]
                        }
                    ]
                }
            },
            'documents': documents
        }
        resp_obj = call_textonic('/api/public/ml/process', request)

        valid_in_batch = 0
        for res_item in resp_obj['data']:
            for res in res_item['result']:
                # Keep only result entries that carry a doc_embed vector.
                if 'c' in res and 'v' in res and 'doc_embed' in res['c']:
                    response_ids.append(res_item['id'])
                    response_vectors.append(np.array(res['v']))
                    valid_in_batch += 1

        print(f'Requested/received/valid vector : {len(batch)}/{len(resp_obj["data"])}/{valid_in_batch}')

    orig_len = len(df)
    # Drop a stale target column first; otherwise the merge would produce
    # '<target>_x' / '<target>_y' and the dropna below would fail.
    base = df.drop(columns=[target_col_name], errors='ignore')
    if response_ids:
        responses = pd.DataFrame({id_column: response_ids, target_col_name: response_vectors})
        embedded = pd.merge(base, responses, on=id_column, how='left').dropna(subset=[target_col_name])
    else:
        # Nothing came back (or nothing was sent): return an empty frame with
        # the expected columns instead of failing in the merge.
        embedded = base.iloc[0:0].copy()
        embedded[target_col_name] = pd.Series(dtype=object)
    # dropna leaves gaps in the index; make it contiguous for positional use.
    embedded = embedded.reset_index(drop=True)
    print(f'Requested/Successful {orig_len}/{len(embedded)}')
    return embedded

Oddaljeni klic za izračun vektorjev (embeddings) s Textonic API, po 200 člankov naenkrat:

In [None]:
# Embed the 'Tekst' column; the vectors are stored in 'Embedding'.
data = ttnx_embed(data, col_name='Tekst', target_col_name='Embedding')

# Preview up to 200 rows of the embedded data.
preview_columns = ['Article ID', 'Datum', 'Medij', 'Tekst', 'Embedding']
data[preview_columns].head(200)

Funkcija za clusterizacijo

In [None]:
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

def cluster_louvain(df: pd.DataFrame, col_name: str = 'Embedding', similarity_threshold: float = 0.96,
                    resolution: float = 0.1, seed=None):
    """Label rows with Louvain communities of a cosine-similarity graph.

    Two rows are connected when the cosine similarity of their vectors is
    strictly greater than ``similarity_threshold``. Louvain community
    detection runs on that graph, and every row is labelled with the smallest
    row position found in its community.

    Args:
        df: Data with one vector per row in ``col_name``. A ``'Cluster'``
            column is added to this DataFrame in place.
        col_name: Column holding the embedding vectors.
        similarity_threshold: Similarity above which two rows are linked.
        resolution: Passed to ``louvain_communities``.
        seed: Passed to ``louvain_communities``; set it for reproducible
            clusters.

    Returns:
        The same ``df`` object, with the ``'Cluster'`` column set.
    """
    print("Clustering ...")
    if len(df) == 0:
        # cosine_similarity rejects empty input; there is nothing to cluster.
        df['Cluster'] = np.empty(0, dtype=int)
        print("Clustered")
        return df

    embeddings = np.array(df[col_name].to_list())
    similarity = cosine_similarity(embeddings, embeddings)
    adjacency = similarity > similarity_threshold
    graph = nx.from_numpy_array(adjacency)
    communities = nx.algorithms.community.louvain_communities(graph, resolution=resolution, seed=seed)

    # Graph nodes are row positions 0..n-1, so labels is positional as well.
    labels = [0] * len(embeddings)
    for community in communities:
        representative = min(community)
        for member in community:
            labels[member] = representative

    # Assign as a plain list (by position). Wrapping it in pd.Series would
    # align on index labels and misplace clusters whenever df's index is not
    # exactly 0..n-1, e.g. after rows were dropped.
    df['Cluster'] = labels
    print("Clustered")
    return df

Zračunam clustering

In [None]:
# Assign a cluster label to every article; 0.96 is the similarity_threshold.
data = cluster_louvain(data, col_name='Embedding', similarity_threshold=0.96)

# The vectors are not needed any more once the clusters are assigned.
data = data.drop('Embedding', axis=1)

cluster_preview_columns = ['Article ID', 'Datum', 'Medij', 'Cluster', 'Tekst']
data[cluster_preview_columns].head(100)