In [None]:
from kLLMmeans import kLLMmeans, get_embeddings, summarize_cluster, sequentialMiniBatchKmeans, miniBatchKLLMeans, miniBatchNLPeans
from experiment_utils import load_dataset, cluster_metrics, avg_closest_distance
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.preprocessing import LabelEncoder

import numpy as np
import json, pickle

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Instruction text and answer prefix handed to the LLM-based clustering call
# further down in this script (as its `prompt` and `text_type` arguments).
prompt = (
    "The following is a cluster of questions from the same community. "
    "Write a summary that represents the cluster:"
)
text_type = "Summary:"

In [None]:
# Shared settings for the clustering runs below.
max_iter = 120
emb_type = 'openai'

# The 2023 snapshot is used only to decide which labels are kept and to fit
# the label encoder that every yearly run reuses.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("processed_data/data_stackexchange_openai_2023.pkl", "rb") as f:
    data_dict = pickle.load(f)

# Per-label non-null counts for every column; the 'Year' count is used as the
# size of each label group.
groups_df = data_dict['data'].groupby('Label').count().reset_index()

# Keep only labels with more than 500 rows in the 2023 snapshot.
frequent_mask = groups_df['Year'] > 500
selected = list(groups_df.loc[frequent_mask, 'Label'])

keep_rows = data_dict['data']['Label'].isin(selected)
data_dict['data'] = data_dict['data'][keep_rows]

# Fit the string -> integer label mapping on the filtered 2023 labels.
encoder = LabelEncoder()
numeric_labels = encoder.fit_transform(list(data_dict['data']['Label']))

max_batch_size = 10000

def _save_results(year, results_dict):
    """Pickle ``results_dict`` to the results file of ``year`` (overwriting it)."""
    with open("results/sims_stackexchange_results_" + str(year) + ".pkl", "wb") as f:
        pickle.dump(results_dict, f)


# For every yearly snapshot: load embeddings and sentences, then run each
# clustering method for 5 seeds, skipping any (seed, method) combination that
# is already present in the previously saved results file.
for year in [2020,2021,2022,2023]:

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open("processed_data/data_stackexchange_openai_" + str(year) + ".pkl", "rb") as f:
        data_dict = pickle.load(f)

    # Resume from earlier results when available. Only "file missing or
    # unreadable" is treated as "no previous results"; anything else
    # (including KeyboardInterrupt) propagates.
    try:
        with open("results/sims_stackexchange_results_" + str(year) + ".pkl", "rb") as f:
            results_dict = pickle.load(f)
        print('Old results_dict loaded')
    except (OSError, EOFError, pickle.UnpicklingError):
        print('No previous results')
        results_dict = {}

    # Attach embeddings as a column so that filtering and sorting keep each
    # text aligned with its embedding.
    data_dict['data']['embeddings'] = list(data_dict['embeddings'])
    data_dict['data'] = data_dict['data'][data_dict['data']['Label'].isin(selected)]

    data_dict['data'] = data_dict['data'].sort_values('CreationDate')

    text_data = list(data_dict['data']['Text'])
    labels = list(encoder.transform(list(data_dict['data'].Label)))
    true_labels = np.array(labels)
    num_clusters = len(np.unique(labels))
    text_features = list(data_dict['data']['embeddings'])

    # Free the raw snapshot before loading the sentence data.
    del data_dict

    with open("processed_data/data_sentences_stackexchange_openai_" + str(year) + ".pkl", "rb") as f:
        sentences_dict = pickle.load(f)
    text_sentences = sentences_dict[emb_type]

    del sentences_dict

    # Oracle centroids: mean embedding of each ground-truth label.
    # NOTE(review): this indexes by encoded label and therefore assumes the
    # labels present in this year are exactly 0..num_clusters-1; a selected
    # label missing from a year would raise KeyError here - confirm.
    oracle_clustered_embeddings = {i: [] for i in range(num_clusters)}
    for embedding, cluster in zip(text_features, labels):
        oracle_clustered_embeddings[cluster].append(embedding)
    oracle_centroids = [np.mean(oracle_clustered_embeddings[i], axis=0) if oracle_clustered_embeddings[i] else None for i in range(num_clusters)]

    for seed in range(5):
        if results_dict.get(seed) is None:
            results_dict[seed] = {}

        # Tracks whether any baseline below was newly computed, so it can be
        # written to disk even when every later configuration is cached.
        baselines_updated = False

        #minibatch
        if results_dict[seed].get('minibatchkmeans') is None:
            minibatchkmeans = MiniBatchKMeans(n_clusters=num_clusters,
                                     random_state=seed,
                                     batch_size=max_batch_size,
                                     init="k-means++")
            minibatch_assignments = minibatchkmeans.fit_predict(text_features)
            minibatch_centroids = minibatchkmeans.cluster_centers_
            results = cluster_metrics(true_labels, minibatch_assignments, oracle_centroids, minibatch_centroids, minibatch_centroids)

            results_dict[seed]['minibatchkmeans'] = {'assignments':minibatch_assignments,
                                                     'final_centroids':minibatch_centroids,
                                                     'results':results}
            baselines_updated = True
            print([year, seed, 'minibatchkmeans', results])

        #seq minibatch
        if results_dict[seed].get('seqminibatchkmeans') is None:
            seqminibatchKMeans = sequentialMiniBatchKmeans(text_features,
                                                   num_clusters,
                                                   random_state=seed,
                                                   max_batch_size=max_batch_size)
            seqminibatch_assignments = seqminibatchKMeans.predict(text_features)
            seqminibatch_centroids = seqminibatchKMeans.cluster_centers_
            results = cluster_metrics(true_labels, seqminibatch_assignments, oracle_centroids, seqminibatch_centroids, seqminibatch_centroids)

            results_dict[seed]['seqminibatchkmeans'] = {'assignments':seqminibatch_assignments,
                                                        'final_centroids':seqminibatch_centroids,
                                                        'results':results}
            baselines_updated = True
            print([year, seed, 'seqminibatchkmeans', results])

        #kmeans
        if results_dict[seed].get('kmeans') is None:
            kmeans = KMeans(n_clusters=num_clusters, random_state=seed)
            kmeans_assignments = kmeans.fit_predict(text_features)
            kmeans_centroids = kmeans.cluster_centers_
            results = cluster_metrics(true_labels, kmeans_assignments, oracle_centroids, kmeans_centroids, kmeans_centroids)

            # Store the k-means outputs themselves (previously the sequential
            # mini-batch assignments/centroids were saved under this key).
            results_dict[seed]['kmeans'] = {'assignments':kmeans_assignments,
                                            'final_centroids':kmeans_centroids,
                                            'results':results}
            baselines_updated = True
            print([year, seed, 'kmeans', results])

        if baselines_updated:
            _save_results(year, results_dict)

        #kLLMmeans
        for force_context_length in [10, 50]:
            if results_dict[seed].get(force_context_length) is None:
                results_dict[seed][force_context_length] = {}

            for max_llm_iter in [1, 5]:

                if results_dict[seed][force_context_length].get(max_llm_iter) is None:
                    # Called under the name imported at the top of the file
                    # (`miniBatchKLLMeans`); the previous spelling
                    # `miniBatchKLLMmeans` was never imported.
                    # NOTE(review): confirm which spelling kLLMmeans exports.
                    summaries, centroids = miniBatchKLLMeans(text_data,
                                                            num_clusters,
                                                            max_batch_size = max_batch_size,
                                                            init = 'k-means++',
                                                            prompt = prompt, text_type = text_type,
                                                            force_context_length = force_context_length, max_llm_iter = max_llm_iter,
                                                            max_iter = max_iter, tol=1e-4, random_state = seed,
                                                            emb_type = emb_type, text_features = text_features)

                    # Derive assignments from the returned centroids via a
                    # KMeans initialised at them and capped at one iteration.
                    kmeans2 = KMeans(n_clusters=num_clusters, init=centroids, max_iter=1)
                    cluster_assignments = kmeans2.fit_predict(text_features)
                    results = cluster_metrics(true_labels, cluster_assignments, oracle_centroids, centroids, centroids)

                    results_dict[seed][force_context_length][max_llm_iter] = {'assignments':cluster_assignments,
                                                                              'summaries':summaries,
                                                                              'final_centroids':centroids,
                                                                              'results':results}
                    print([year, seed, force_context_length, max_llm_iter, results])

                    # Checkpoint after every new configuration.
                    _save_results(year, results_dict)

                else:
                    results = results_dict[seed][force_context_length][max_llm_iter]['results']
                    print([year, seed, force_context_length, max_llm_iter, results])

        #kNLPmeans
        # NOTE(review): `top_k` is not defined anywhere in the visible source;
        # it must be set before this section runs or the call below raises
        # NameError.
        for nlp_type in ['lsa', 'centroid', 'textrank']:
            if results_dict[seed].get(nlp_type) is None:
                results_dict[seed][nlp_type] = {}

            for max_llm_iter in [1, 5]:

                if results_dict[seed][nlp_type].get(max_llm_iter) is None:
                    # Called under the name imported at the top of the file
                    # (`miniBatchNLPeans`); the previous spelling
                    # `miniBatchKNLPmeans` was never imported.
                    # NOTE(review): confirm which spelling kLLMmeans exports.
                    summaries, centroids = miniBatchNLPeans(text_data,
                                                            num_clusters,
                                                            max_batch_size = max_batch_size,
                                                            init = 'k-means++',
                                                            force_context_length = 0, max_llm_iter = max_llm_iter,
                                                            max_iter = max_iter, tol=1e-4, random_state = seed,
                                                            emb_type = emb_type, text_features = text_features,
                                                            top_k = top_k, text_sentences = text_sentences,
                                                            nlp = nlp_type)

                    kmeans2 = KMeans(n_clusters=num_clusters, init=centroids, max_iter=1)
                    cluster_assignments = kmeans2.fit_predict(text_features)
                    results = cluster_metrics(true_labels, cluster_assignments, oracle_centroids, centroids, centroids)

                    results_dict[seed][nlp_type][max_llm_iter] = {'assignments':cluster_assignments,
                                                                  'summaries':summaries,
                                                                  'final_centroids':centroids,
                                                                  'results':results}
                    print([year, seed, nlp_type, max_llm_iter, results])

                    # Checkpoint after every new configuration.
                    _save_results(year, results_dict)

                else:
                    results = results_dict[seed][nlp_type][max_llm_iter]['results']
                    print([year, seed, nlp_type, max_llm_iter, results])

                