In [6]:
import pandas as pd
import numpy as np
import json
import os
import collections
import time
import chromadb
from tqdm import tqdm
from chromadb.config import Settings
from sklearn.preprocessing import MinMaxScaler

## Function generator

In [80]:
class VirtualAggregator:
    """
    Generates a distribution of selected papers based on specified parameters.

    Parameters:
    -----------
    k : int
        Number of citations to sample.
    N : int
        Page size for pagination.
    p : list
        List of weights for criteria: [semantic similarity, publication year, number of citations, publication venue].
    Q : str
        Query used for selecting papers.
    results_df : pandas.DataFrame
        DataFrame containing query results with columns: 'id', 'title', 'similarity', 'year', 'n_citation', 'gov_score'.

    Returns:
    --------
    Counter
        Counter object containing identifiers of selected papers and their counts.
    """
    def __init__(self, N, k, pn):
        self.N = N
        self.k = k
        self.pn = pn
        self.collection = None

    def get_similar_articles(self, query):
        collection_status = False
        max_retries = 5
        retries = 0

        while not collection_status and retries < max_retries:
            try:
                chroma_client = chromadb.HttpClient(host="localhost", port=8000, settings=Settings(allow_reset=True, anonymized_telemetry=False))
                chroma_collection = chroma_client.get_or_create_collection(name="articles_with_score")
                query_result = chroma_collection.query(query_texts=[query], n_results=self.k)
                return query_result
            except Exception as e:
                print(e)
                retries += 1
            # finally:
            #     if chroma_client:
            #         chroma_client.close()
     
        if not collection_status:
            raise Exception("Failed to connect to the collection after 5 attempts")

    def distribution_function(self, page_count):
        pages_distribution = np.exp(-np.arange(1, page_count + 1))
        pages_distribution /= pages_distribution.sum()
        return pages_distribution

    def distribution_generator(self, collection_df):    
        # Normalizacja lat, liczby cytowań i punktów za publikacje
        scaler = MinMaxScaler()
        collection_df[['year_normalized', 'citations_normalized', 'points_normalized']] = scaler.fit_transform(collection_df[['year', 'n_citation', 'gov_score']])

        # Obliczanie rankingu
        collection_df['score'] = self.pn[0] * collection_df['similarity'] + \
              self.pn[1] * collection_df['year_normalized'] + \
              self.pn[2] * collection_df['citations_normalized'] + \
              self.pn[3] * collection_df['points_normalized']

        # Sortowanie po rankingu
        df_sorted = collection_df.sort_values(by='score', ascending=False)

        # Stronicowanie wyników
        ranked_indices = df_sorted['id'].to_numpy()
        pages = [ranked_indices[i:i + self.N] for i in range(0, len(ranked_indices), self.N)]
        pages_distribution = self.distribution_function(len(pages))
    
        # Losowanie k prac
        np.random.seed(42)  # Ustawienie ziarna losowości dla powtarzalności wyników
        selected_papers = []
        for _ in range(self.k):
            selected_page_index = np.random.choice(len(pages), p=pages_distribution)
            selected_page = pages[selected_page_index]
            selected_paper_index = np.random.choice(selected_page)
            selected_papers.append(selected_paper_index)

        # Zapisanie identyfikatorów wylosowanych prac
        selected_paper_counts = collections.Counter(selected_papers)
        
        # Wyświetlenie wyników
        # print(f"Selected paper indices: {selected_papers}")
        # print(f"Selected paper counts: {selected_paper_counts}")
        # display(results_df.head())

        return selected_paper_counts

    def generate_query(self):
        # Funkcja losujaca zapytanie z bazy za pomoca klienta do chromaDB
        pass
    
    def build_ranking(self, query):
        # load N similar articles
        articles = self.chroma_collection.query(
            query_texts=[query],
            n_results=self.N
        )

        # Use Q function to get bettrt
        pass

    # todo - unused
    def select_papers(self, ranking):
        selected_papers = random.sample(ranking, self.k)
        return selected_papers

class Experiment:
    def __init__(self, settings):
        self.virtual_aggregator = None
        self.queries = None
        self.settings = settings

    def run_experiment(self):
        self.load_queries()
        print(f"Loaded: {len(self.queries)} queries")

        result_df = pd.DataFrame({
            'title': None,
            'settings': [],
            'distribution': None,
        })

        for i, query in enumerate(tqdm(self.queries, total=len(self.queries), desc="Queries", unit="query")):  
            for sample in self.settings:
                self.virtual_aggregator = VirtualAggregator(sample['N'], sample['k'], sample['pn'])
                distribution = self.step(query)

                # Save result
                result_df = pd.concat([result_df, pd.DataFrame([
                    {'title': query, 'settings': sample, 'distribution': dict(distribution)}
                ])], ignore_index=True)           

            self.save_results(result_df)

    def step(self, query):
        similar_articles = self.virtual_aggregator.get_similar_articles(query)

        collection_df = pd.DataFrame({
            'id': similar_articles['ids'][0],
            'title': similar_articles['documents'][0],
            'similarity': similar_articles['distances'][0],
            'year': [metadata['year'] for metadata in similar_articles['metadatas'][0]],
            'n_citation': [metadata['n_citation'] for metadata in similar_articles['metadatas'][0]],
            'gov_score': [metadata['gov_score'] for metadata in similar_articles['metadatas'][0]]
        })
        # display(collection_df)
        return self.virtual_aggregator.distribution_generator(collection_df)

    def load_queries(self):
        df_query = pd.read_csv('../data/queries_df.csv')
        self.queries = df_query['Query'].tolist()

    def save_results(self, results_df):
        results_df.to_csv('../data/results.csv', index=False)


In [83]:
# Parametry wirtualnego agregatora
settings = [
    {
        'N': 20,
        'k': 15,
        'pn': [0.5, 0.3, 0.1, 0.1],
    }
]

experiment = Experiment(settings)
experiment.run_experiment()

Loaded: 850000 queries


Queries:   0%|                                                                | 6/850000 [00:01<74:04:31,  3.19query/s]


KeyboardInterrupt: 

In [82]:
df_results = pd.read_csv('../data/results.csv')
display(df_results.head())

Unnamed: 0,title,settings,distribution
0,Is Proxy Record Customizable Manager?,"{'N': 20, 'k': 15, 'pn': [0.5, 0.3, 0.1, 0.1]}","{'345143': 2, '321846': 4, '413251': 2, '47508..."
1,Is Accelerator Explore Boolean Selector?,"{'N': 20, 'k': 15, 'pn': [0.5, 0.3, 0.1, 0.1]}","{'710026': 2, '814098': 4, '494531': 2, '77464..."
2,Is Request Organize Mapped Recommender?,"{'N': 20, 'k': 15, 'pn': [0.5, 0.3, 0.1, 0.1]}","{'232610': 2, '323064': 4, '785944': 2, '67275..."
3,Is Queue Differentiate Private Viewer?,"{'N': 20, 'k': 15, 'pn': [0.5, 0.3, 0.1, 0.1]}","{'4430': 2, '344689': 4, '363043': 2, '186106'..."
4,Is Executor Explain Finite Architect?,"{'N': 20, 'k': 15, 'pn': [0.5, 0.3, 0.1, 0.1]}","{'327285': 2, '77569': 4, '156780': 2, '188865..."
