# OpenAI API
Če imaš težave z računanjem vektorjev: funkcije za vektorizacijo z OpenAI API.   
Spet kličem vektorizacijo na stolpcu 'Tekst' in dobim stolpec z vektorji 'Embedding'.

In [None]:
!pip install pandas numpy tiktoken openai networkx numpy scikit-learn

In [None]:
import os
import pandas as pd

# Relative path to the Excel workbook; a shorter file is used here as an example.
excel_path = os.path.join(os.pardir, 'data', 'Reworkan data.xlsx')
data = pd.read_excel(io=excel_path)
# Preview the first rows of the loaded table.
data.head(5)

Pazi na okoljske spremenljivke:

In [None]:
def env_var_status(name):
    """Return 'set' or 'NOT SET' for the environment variable *name*.

    An unset or empty variable counts as 'NOT SET'. The value itself is
    deliberately never returned, so secrets do not end up in notebook output.
    """
    return 'set' if os.getenv(name) else 'NOT SET'


# Report only whether each variable is present: printing the raw API key
# would store the secret in the notebook's saved cell output.
print('OPENAI_API_KEY:', env_var_status('OPENAI_API_KEY'))
print('OPENAI_ORG_ID:', env_var_status('OPENAI_ORG_ID'))

Definiram OpenAI API funkcije za vektorizacijo:

In [None]:
import tiktoken
import openai

def openai_embed(df: pd.DataFrame, col_name: str, target_col_name: str = 'Embedding') -> pd.DataFrame:
    """Embed every text in ``df[col_name]`` with the OpenAI embeddings API.

    One API request is issued per row, so the run time grows with the number
    of rows. The frame is modified in place (the target column is added or
    overwritten) and the same object is returned.

    Args:
        df: Frame holding the texts to embed.
        col_name: Name of the column with the input texts.
        target_col_name: Name of the column that receives the embedding
            vectors.

    Returns:
        ``df`` itself, with ``target_col_name`` filled in.
    """
    def _oai_embed(text, encoding):
        # Tokenize locally and keep at most 8191 tokens.
        # NOTE(review): 8191 looks like the model's input limit - confirm.
        tokens = encoding.encode(text)[:8191]
        embedding = openai.embeddings.create(  # call OpenAI
            input=tokens, model="text-embedding-ada-002"
        )
        return embedding.data[0].embedding

    encoding = tiktoken.get_encoding('cl100k_base')
    df[target_col_name] = df[col_name].apply(_oai_embed, encoding=encoding)
    # Return the frame that was passed in, not the notebook-level `data`.
    return df

Kličem OpenAI API, da dobim embeddinge (vsak članek posebej, zato traja):

In [None]:
# Compute an 'Embedding' column from the 'Tekst' column, then preview up to
# the first 100 rows of the relevant columns.
data = openai_embed(data, col_name='Tekst', target_col_name='Embedding')
data.loc[:, ['Article ID', 'Datum', 'Medij', 'Tekst', 'Embedding']].head(n=100)

Funkcija za clusterizacijo

In [None]:
import networkx as nx
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
# Assuming your data variable is a DataFrame that you have already prepared
# Continuing from the last step of the previous script

def cluster_louvain(df: pd.DataFrame, col_name: str = 'Embedding', similarity_threshold: float = 0.96,
                    resolution: float = 0.1, seed=None) -> pd.DataFrame:
    """Label rows by Louvain communities of a cosine-similarity graph.

    Two rows are connected when the cosine similarity of their embedding
    vectors is strictly greater than ``similarity_threshold``. Louvain
    community detection is run on that graph, and every row is labelled with
    the smallest row position found in its community. The frame is modified
    in place (a 'Cluster' column is added or overwritten) and returned.

    Args:
        df: Frame holding one embedding vector per row.
        col_name: Name of the column with the embedding vectors.
        similarity_threshold: Minimum (exclusive) cosine similarity for two
            rows to be linked.
        resolution: Resolution passed to the Louvain algorithm.
        seed: Optional random seed passed to the Louvain algorithm; set it
            to get the same partition on every run.

    Returns:
        ``df`` itself, with the 'Cluster' column filled in.
    """
    print("Clustering ...")
    if len(df) == 0:
        # Nothing to compare: skip the similarity and graph steps, which
        # expect at least one embedding vector.
        df['Cluster'] = pd.Series([], index=df.index, dtype='int64')
        print("Clustered")
        return df

    embeddings = np.array(df[col_name].to_list())
    similarity_matrix = cosine_similarity(embeddings, embeddings) > similarity_threshold
    # NOTE(review): the diagonal is normally True as well (self-similarity),
    # so nodes carry self-loops - confirm this is intended.
    graph = nx.from_numpy_array(similarity_matrix)
    communities = nx.algorithms.community.louvain_communities(
        graph, resolution=resolution, seed=seed
    )

    labels = [0] * len(embeddings)
    for community in communities:
        initial_member = min(community)
        for member in community:
            labels[member] = initial_member

    # Attach df's own index so labels are assigned by row position; a bare
    # Series would be aligned on 0..n-1 and misplace labels (or give NaN)
    # when df has a filtered or otherwise non-default index.
    df['Cluster'] = pd.Series(labels, index=df.index)
    print("Clustered")
    return df

Izračunam clustering

In [None]:
# Assign a cluster label to every row; 0.96 is the similarity_threshold.
data = cluster_louvain(data, col_name='Embedding', similarity_threshold=0.96)
# The vectors are no longer needed once the labels exist, so drop them.
data = data.drop(columns='Embedding')
data.loc[:, ['Article ID', 'Datum', 'Medij', 'Cluster', 'Tekst']].head(n=100)