In [None]:
!pip install pandas numpy networkx numpy scikit-learn

In [None]:
import os
import pandas as pd

# Read the article export (Excel workbook, path relative to the notebook) and preview it.
excel_path = os.path.join('..', 'data', 'Reworkan data 4.xlsx')
data = pd.read_excel(excel_path)
data.head()

V realnem projektu pazi na okoljske spremenljivke.

In [None]:
# Report only whether each variable is set. The values (CPTM_SPASS is a password)
# must never be echoed: notebook output is stored with the file and would leak them.
for env_name in ('CPTM_SPASS', 'CPTM_SURL', 'CPTM_SUSER'):
    status = 'set' if os.getenv(env_name) else 'MISSING'
    print(f'{env_name}: {status}')

Funkcija za pridobivanje že obstoječih vektorjev iz Elastike.

In [None]:
import numpy
from esdl import Elastika

def embed_elastic(df: pd.DataFrame, col_name: str = 'Tekst', target_col_name: str = 'Embedding',
                  batch_size: int = 1000) -> pd.DataFrame:
    """Attach pre-computed embedding vectors fetched via ``Elastika`` to ``df``.

    The ids in the ``'Article ID'`` column are sent to the service in batches.
    For every returned article that carries the ``'vector_768___textonic_v2'``
    field, the vector is stored as a ``numpy`` array in ``target_col_name``.
    Rows for which no vector was obtained are dropped from the result.

    Args:
        df: Input frame; must contain an ``'Article ID'`` column.
        col_name: Currently unused; kept so existing calls keep working.
        target_col_name: Name of the column that receives the vectors.
        batch_size: Number of ids requested per service call. It is also
            passed to ``Elastika.limit`` (the original hard-coded 1000 for both).

    Returns:
        A new DataFrame with the columns of ``df`` plus ``target_col_name``,
        containing only rows that received a vector, with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    es_column = 'vector_768___textonic_v2'
    id_column = 'Article ID'
    orig_len = len(df)

    # Accumulate plain lists and build the DataFrame once after the loop,
    # instead of re-concatenating a growing frame on every batch.
    fetched_ids = []
    fetched_vectors = []
    for start_idx in range(0, orig_len, batch_size):
        batch = df[id_column].iloc[start_idx:start_idx + batch_size].tolist()

        # Query the remote service for the current batch of ids
        query = Elastika()
        query.limit(batch_size)
        query.field(['uuid', es_column])
        query.filter_uuid(batch)
        articles = query.gets('1996-01-01', '2030-01-01')

        valid = 0
        for a in articles:
            # Articles without a stored vector are skipped
            if es_column not in a.data:
                continue
            fetched_ids.append(a.uuid)
            fetched_vectors.append(numpy.array(a.data[es_column]))
            valid += 1
        print(f'Requested/received/valid vector : {len(batch)}/{len(articles)}/{valid}')

    if not fetched_ids:
        # Empty input or no vectors at all: return an empty frame with the
        # expected columns instead of failing inside the merge.
        empty = df.iloc[0:0].copy()
        empty[target_col_name] = pd.Series(dtype=object)
        print(f'Requested/Successful {orig_len}/{len(empty)}')
        return empty

    responses = pd.DataFrame({id_column: fetched_ids, target_col_name: fetched_vectors})
    # The same id returned more than once would multiply rows in the merge
    responses = responses.drop_duplicates(subset=[id_column])

    # Merge the vectors back and keep only rows that received one. The index is
    # reset so that positional and label-based row access agree afterwards.
    merged = pd.merge(df, responses, on=id_column, how='left')
    merged = merged.dropna(subset=[target_col_name]).reset_index(drop=True)
    print(f'Requested/Successful {orig_len}/{len(merged)}')
    return merged

Kličem vektorizacijo na stolpcu 'Tekst' in dobim stolpec z vektorjem 'Embedding'.

In [None]:
# Fetch the stored vectors for the articles, then preview the first 100 rows.
data = embed_elastic(data, col_name='Tekst', target_col_name='Embedding')
data.loc[:, ['Article ID', 'Datum', 'Medij', 'Tekst', 'Embedding']].head(100)

Definiram funkcijo za cluster

In [None]:
import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cluster_louvain(df: pd.DataFrame, col_name: str = 'Embedding', similarity_threshold: float = 0.96,
                    resolution: float = 0.1, seed=None):
    """Group rows by Louvain community detection on a cosine-similarity graph.

    Pairwise cosine similarity is computed between the vectors in ``col_name``.
    Two rows are connected when their similarity is strictly greater than
    ``similarity_threshold``, and Louvain communities of that graph become the
    clusters. The full n x n similarity matrix is built in memory.

    Args:
        df: Frame holding one vector per row in ``col_name``. It is modified
            in place: a ``'Cluster'`` column is added or overwritten.
        col_name: Column containing the embedding vectors.
        similarity_threshold: Minimum (exclusive) cosine similarity for an edge.
        resolution: Passed to ``louvain_communities`` (the original hard-coded 0.1).
        seed: Passed to ``louvain_communities``. ``None`` keeps the previous
            unseeded behaviour; pass an integer for reproducible clusters.

    Returns:
        The same ``df`` object. Each ``'Cluster'`` value is the smallest row
        position (0-based, not the index label) among that community's members.
    """
    print("Clustering ...")
    n_rows = len(df)
    labels = [0] * n_rows

    # An empty frame has nothing to compare; skip the similarity computation.
    if n_rows > 0:
        embeddings = np.array(df[col_name].to_list())
        similarity = cosine_similarity(embeddings, embeddings)
        adjacency = similarity > similarity_threshold
        graph = nx.from_numpy_array(adjacency)
        communities = nx.algorithms.community.louvain_communities(
            graph, resolution=resolution, seed=seed
        )
        for community in communities:
            initial_member = min(community)
            for member in community:
                labels[member] = initial_member

    # Assign positionally. Wrapping the labels in a default-indexed Series would
    # align on index labels and misplace values when df.index is not exactly
    # 0..n-1 (for example after rows were dropped).
    df['Cluster'] = np.asarray(labels, dtype='int64')
    print("Clustered")
    return df

Izračunam cluster

In [None]:
# Cluster with a similarity threshold of 0.92 (lower than the function default of 0.96).
data = cluster_louvain(data, col_name='Embedding', similarity_threshold=0.92)
# The vectors are no longer needed once the cluster labels exist.
data = data.drop('Embedding', axis=1)
data.loc[:, ['Article ID', 'Datum', 'Medij', 'Cluster', 'Tekst']].head(100)