In [1]:
import pandas as pd
import numpy as np
import json
import os
import collections
import time
import chromadb
from tqdm import tqdm
from chromadb.config import Settings
from sklearn.preprocessing import MinMaxScaler

## Function generator

In [16]:
class VirtualAggregator:
    """
    Generates a distribution of selected papers based on specified parameters.

    Parameters:
    -----------
    k : int
        Number of citations to sample.
    N : int
        Page size for pagination.
    p : list
        List of weights for criteria: [semantic similarity, publication year, number of citations, publication venue].
    Q : str
        Query used for selecting papers.
    results_df : pandas.DataFrame
        DataFrame containing query results with columns: 'id', 'title', 'similarity', 'year', 'n_citation', 'gov_score'.

    Returns:
    --------
    Counter
        Counter object containing identifiers of selected papers and their counts.
    """
    def __init__(self):
        self.collection = None
        self.N = None
        self.k = None
        self.pn = None
        self.chroma_collection = None
        self.init_connection()

    def set_parameters(self, N, k, pn):
        self.N = N
        self.k = k
        self.pn = pn

    def init_connection(self):
        collection_status = False
        max_retries = 5
        retries = 0

        while not collection_status and retries < max_retries:
            try:
                chroma_client = chromadb.HttpClient(host="localhost", port=8000, settings=Settings(allow_reset=True, anonymized_telemetry=False))
                self.chroma_collection = chroma_client.get_or_create_collection(name="articles_with_score")
                collection_status = True
            except Exception as e:
                print(e)
                retries += 1
            # finally:
            #     if chroma_client:
            #         chroma_client.close() # we cant close connection 
     
        if not collection_status:
            raise Exception("Failed to connect to the collection after 5 attempts")

    def get_similar_articles(self, query, k):
        collection_status = False
        max_retries = 5
        retries = 0

        while not collection_status and retries < max_retries:
            try:
                return self.chroma_collection.query(query_texts=[query], n_results=2*k)
            except Exception as e:
                print(e)
                retries += 1
            # finally:
            #     if chroma_client:
            #         chroma_client.close() # we cant close connection 
     
        if not collection_status:
            raise Exception("Failed to connect to the collection after 5 attempts")

    def distribution_function(self, page_count):
        pages_distribution = np.exp(-np.arange(1, page_count + 1))
        pages_distribution /= pages_distribution.sum()
        return pages_distribution

    def distribution_generator(self, collection_df):    
        scaler = MinMaxScaler()
        collection_df[['year_normalized', 'citations_normalized', 'points_normalized']] = scaler.fit_transform(collection_df[['year', 'n_citation', 'gov_score']])

        collection_df['score'] = self.pn[0] * collection_df['similarity'] + \
            self.pn[1] * collection_df['year_normalized'] + \
            self.pn[2] * collection_df['citations_normalized'] + \
            self.pn[3] * collection_df['points_normalized'] 

        df_sorted = collection_df.sort_values(by='score', ascending=False)

        # Stronicowanie wyników
        ranked_indices = df_sorted['id'].to_numpy()
        pages = [ranked_indices[i:i + self.N] for i in range(0, len(ranked_indices), self.N)]
        pages_distribution = self.distribution_function(len(pages))
        
        # Losowanie k prac
        np.random.seed(42)  # Ustawienie ziarna losowości dla powtarzalności wyników

        selected_papers = []
        for _ in range(self.k):
            selected_page_index = np.random.choice(len(pages), p=pages_distribution)
            selected_page = pages[selected_page_index]
            selected_paper_index = np.random.choice(selected_page)
            selected_papers.append(selected_paper_index)

            # Usuwanie wylosowanych wyników
            index_to_remove = np.where(selected_page == selected_paper_index)[0]
            selected_page = np.delete(selected_page, index_to_remove)

        # Zapisanie identyfikatorów wylosowanych prac
        selected_paper_counts = collections.Counter(selected_papers)
        
        # Wyświetlenie wyników
        # print(f"Selected paper indices: {selected_papers}")
        # print(f"Selected paper counts: {selected_paper_counts}")
        # display(results_df.head())

        return selected_paper_counts

    def select_papers(self, ranking):
        selected_papers = random.sample(ranking, self.k)
        return selected_papers


In [17]:
class Experiment:
    def __init__(self, settings):
        self.virtual_aggregator = VirtualAggregator()
        self.queries = None
        self.settings = settings
        self.similar_articles = None

    def run_experiment(self):
        self.load_queries()
        print(f"Loaded: {len(self.queries)} queries")

        result_df = pd.DataFrame({
            'title': None,
            'settings': [],
            'distribution': None,
        })

        max_k = max(self.settings, key=lambda x: x['k'])['k']
        for i, query in enumerate(tqdm(self.queries, total=len(self.queries), desc="Queries", unit="query")):
            self.similar_articles = self.virtual_aggregator.get_similar_articles(query, max_k)

            for sample in self.settings:
                self.virtual_aggregator.set_parameters(sample['N'], sample['k'], sample['pn'])
                distribution = self.step(query)

                # Save result
                result_df = pd.concat([result_df, pd.DataFrame([
                    {'title': query, 'settings': sample, 'distribution': dict(distribution)}
                ])], ignore_index=True)           

            self.save_results(result_df)

    def step(self, query):
        collection_df = pd.DataFrame({
            'id': self.similar_articles['ids'][0],
            'title': self.similar_articles['documents'][0],
            'similarity': self.similar_articles['distances'][0],
            'year': [metadata['year'] for metadata in self.similar_articles['metadatas'][0]],
            'n_citation': [metadata['n_citation'] for metadata in self.similar_articles['metadatas'][0]],
            'gov_score': [metadata['gov_score'] for metadata in self.similar_articles['metadatas'][0]]
        })

        return self.virtual_aggregator.distribution_generator(collection_df)

    def load_queries(self):
        df_query = pd.read_csv('../data/queries_df.csv')
        self.queries = df_query['Query'].tolist()

    def save_results(self, results_df):
        results_df.to_csv('../data/results.csv', index=False)


# Test

## 1. Health test

In [11]:
# Parametry wirtualnego agregatora
settings = [
    {
        'N': 20,
        'k': 10,
        'pn': [0.5, 0.3, 0.1, 0.1],
    },
    {
        'N': 20,
        'k': 15,
        'pn': [0.5, 0.2, 0.2, 0.1],
    },
]

experiment = Experiment(settings)
experiment.run_experiment()

Loaded: 850000 queries


15

Queries:   0%|                                                              | 580/850000 [01:05<26:39:12,  8.85query/s]


KeyboardInterrupt: 

## 2. Fill experiment

In [14]:
def generate_examples_with_fixed_pn(num_examples):
    examples = []
    for _ in range(num_examples):
        N = np.random.randint(10, 30)
        k = np.random.randint(5, N)
        pn = np.random.dirichlet(np.ones(4), size=1)[0]
        pn = np.round(pn, 2).tolist()
        examples.append({
            'N': N,
            'k': k,
            'pn': pn,
        })
    return examples

settings = generate_examples_with_fixed_pn(500)
#display(settings)

In [15]:
experiment = Experiment(settings)
experiment.run_experiment()

Loaded: 850000 queries


28

Queries:   0%|                                                              | 68/850000 [02:07<441:17:56,  1.87s/query]


KeyboardInterrupt: 

# Read result

In [20]:
df_results = pd.read_csv('../data/results.csv')
display(df_results.head()) 

Unnamed: 0,title,settings,distribution
0,Is Proxy Record Customizable Manager?,"{'N': 11, 'k': 8, 'pn': [0.32, 0.0, 0.56, 0.11]}","{'645696': 2, '132239': 1, '700503': 2, '47508..."
1,Is Proxy Record Customizable Manager?,"{'N': 21, 'k': 13, 'pn': [0.19, 0.53, 0.08, 0....","{'158554': 2, '39779': 1, '565416': 1, '461731..."
2,Is Proxy Record Customizable Manager?,"{'N': 25, 'k': 19, 'pn': [0.2, 0.5, 0.07, 0.23]}","{'46166': 2, '733536': 1, '461731': 2, '158554..."
3,Is Proxy Record Customizable Manager?,"{'N': 12, 'k': 9, 'pn': [0.22, 0.04, 0.02, 0.71]}","{'658043': 2, '132239': 1, '551126': 2, '47508..."
4,Is Proxy Record Customizable Manager?,"{'N': 23, 'k': 22, 'pn': [0.47, 0.02, 0.25, 0....","{'603747': 2, '780573': 1, '210987': 1, '15855..."
