# Lokalna Vektorizacija
Vektorje za reprezentacijo članka (embedding) računam lokalno.
Če nimaš grafične kartice, traja 100 let.

In [None]:
!pip install pandas transformers numpy torch networkx numpy scikit-learn

In [None]:
import os
import pandas as pd

# Read the article table from ../data and preview its first five rows.
data = pd.read_excel(io=os.path.join('..', 'data', 'Reworkan data 4.xlsx'))
data.head(n=5)

Funkcije za vektorizacijo z nevronskim modelom multilingual-e5-base (lokalno računanje)

In [None]:
import torch
import torch.nn.functional as functional

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

def e5embed(df: pd.DataFrame, col_name: str, target_col_name: str = 'Embedding'):
    """Embed every value of ``df[col_name]`` with the multilingual-e5-base model.

    The tokenizer and model are loaded by name, the model is moved to the
    module-level ``device``, and the texts are embedded one at a time. Each text
    is prefixed with ``'passage: '``, truncated to 512 tokens, mean-pooled over
    its non-padding tokens and L2-normalised.

    Args:
        df: Frame holding the texts. It is modified in place.
        col_name: Name of the column with the texts to embed.
        target_col_name: Name of the column that receives one 1-D NumPy vector
            per row.

    Returns:
        The same ``df`` object, with ``target_col_name`` added or overwritten.
    """
    def _average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        # Zero the padding positions so they do not contribute to the sum, then
        # divide by the number of real tokens to get the mean.
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def _e5_embed(text, _tokenizer, _model, max_len):
        # Any non-string cell (NaN, None, numbers) is stringified so the prefix
        # concatenation below cannot fail; a missing value becomes the text 'nan'.
        if not isinstance(text, str):
            text = str(text)
        batch_dict = _tokenizer(
            ['passage: ' + text], max_length=max_len,
            padding=True, truncation=True, return_tensors='pt'
        )
        batch_dict = batch_dict.to(device)
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            outputs = _model(**batch_dict)
        embeddings = _average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        embeddings = functional.normalize(embeddings, p=2, dim=1)
        # The batch holds exactly one text, so return its single row.
        return embeddings.detach().cpu().numpy()[0]

    model_name = "intfloat/multilingual-e5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # trust_remote_code is deliberately not enabled: this checkpoint loads with
    # the architecture shipped in transformers, so no repository code has to run.
    model = AutoModel.from_pretrained(model_name)
    model.to(device)
    model.eval()
    print("Loaded model, will compute embeddings ...")
    # Series.apply forwards extra keyword arguments to the function, so the
    # keyword names must match the parameter names of _e5_embed exactly.
    df[target_col_name] = df[col_name].apply(
        _e5_embed, _tokenizer=tokenizer, _model=model, max_len=512
    )
    print("Computed embeddings")
    return df

Kličem vektorizacijo na stolpcu 'Tekst' in dobim stolpec z vektorjem 'Embedding'

In [None]:
# Add an 'Embedding' vector for every article text in the 'Tekst' column.
data = e5embed(data, col_name='Tekst', target_col_name='Embedding')
data.loc[:, ['Article ID', 'Datum', 'Medij', 'Tekst', 'Embedding']].head(n=100)

Definiram funkcijo za clustering

In [None]:
import networkx as nx
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

def cluster_louvain(df: pd.DataFrame, col_name: str = 'Embedding', similarity_threshold: float = 0.96,
                    resolution: float = 0.1, seed: 'int | None' = 42):
    """Cluster the rows of ``df`` by Louvain communities on a similarity graph.

    Two rows are linked when the cosine similarity of their embedding vectors
    is strictly greater than ``similarity_threshold``. Louvain community
    detection is run on that graph, and every row is labelled with the
    smallest row position found in its community.

    Args:
        df: Frame holding one embedding vector per row. It is modified in place.
        col_name: Name of the column with the embedding vectors.
        similarity_threshold: Cosine-similarity cut-off for linking two rows.
        resolution: Resolution passed to the Louvain algorithm.
        seed: Random seed passed to the Louvain algorithm so that repeated runs
            give the same clusters; pass ``None`` for unseeded behaviour.

    Returns:
        The same ``df`` object, with an integer ``'Cluster'`` column added or
        overwritten.
    """
    print("Clustering ...")
    embeddings = np.array(df[col_name].to_list())
    labels = np.zeros(len(embeddings), dtype=np.int64)
    # An empty frame has nothing to compare; skip the graph construction.
    if len(embeddings) > 0:
        similarity_matrix = cosine_similarity(embeddings, embeddings) > similarity_threshold
        graph = nx.from_numpy_array(similarity_matrix)
        communities = nx.algorithms.community.louvain_communities(
            graph, resolution=resolution, seed=seed
        )
        for community in communities:
            members = list(community)
            # Graph nodes are row positions 0..n-1; the smallest one names the cluster.
            labels[members] = min(members)

    # Assign positionally (plain array, not a Series) so that frames whose
    # index is not 0..n-1 are labelled correctly instead of being realigned.
    df['Cluster'] = labels
    print("Clustered")
    return df

Izračunam cluster

In [None]:
# Cluster the articles; 0.92 is the cosine-similarity cut-off for linking two of them.
data = cluster_louvain(data, col_name='Embedding', similarity_threshold=0.92)
# The embedding vectors are no longer needed once the clusters are assigned.
data = data.drop(columns='Embedding')
data.loc[:, ['Article ID', 'Datum', 'Medij', 'Cluster', 'Tekst']].head(n=100)