In [1]:
import collections
import csv
import itertools
import json
import os
import random
import time

import chromadb
import numpy as np
import pandas as pd
from chromadb.config import Settings
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

# Sampling of k papers: seed every random number generator used in this notebook
# so that repeated runs produce the same draws.
SEED = 42
np.random.seed(SEED)  # drives np.random.choice in VirtualAggregator's page sampling
random.seed(SEED)     # drives random.sample in VirtualAggregator.select_papers

## Distribution generator

In [2]:
class VirtualAggregator:
    """
    Ranks candidate papers for a query and samples "cited" papers from the ranking.

    Candidates are fetched from the ChromaDB collection "articles_with_score",
    scored with a weighted sum of four criteria, split into pages of size ``N``
    and then ``k`` papers are drawn without replacement: the page is chosen with
    probability proportional to exp(-page_rank), the paper uniformly within it.

    Attributes
    ----------
    N : int
        Page size used when paginating the ranking.
    k : int
        Number of papers to sample per ranking.
    pn : list
        Four weights applied, in order, to: 'similarity', normalized 'year',
        normalized 'n_citation' and normalized 'gov_score'.
    chroma_collection
        Collection handle obtained in ``init_connection``.
    """

    # Number of attempts for connecting / querying before giving up.
    MAX_RETRIES = 5
    # Pause between failed attempts so a briefly unavailable server can recover.
    RETRY_DELAY_SECONDS = 1.0

    def __init__(self):
        self.collection = None
        self.N = None
        self.k = None
        self.pn = None
        self.chroma_collection = None
        self.init_connection()

    def set_parameters(self, N, k, pn):
        """Set page size ``N``, sample size ``k`` and the four criterion weights ``pn``."""
        self.N = N
        self.k = k
        self.pn = pn

    def init_connection(self):
        """
        Connect to the local ChromaDB server and store the collection handle.

        Raises
        ------
        ConnectionError
            If the collection cannot be obtained after ``MAX_RETRIES`` attempts.
        """
        last_error = None
        for attempt in range(1, self.MAX_RETRIES + 1):
            try:
                # The client is intentionally not closed: the collection handle is used later.
                chroma_client = chromadb.HttpClient(host="localhost", port=8000, settings=Settings(allow_reset=True, anonymized_telemetry=False))
                self.chroma_collection = chroma_client.get_or_create_collection(name="articles_with_score")
                return
            except Exception as e:
                print(e)
                last_error = e
                if attempt < self.MAX_RETRIES:
                    time.sleep(self.RETRY_DELAY_SECONDS)

        raise ConnectionError(
            f"Failed to connect to the collection after {self.MAX_RETRIES} attempts"
        ) from last_error

    def get_similar_articles(self, query, max_similarities):
        """
        Query the collection for the ``max_similarities`` nearest entries to ``query``.

        Returns
        -------
        The raw result of ``chroma_collection.query``.

        Raises
        ------
        RuntimeError
            If the query keeps failing after ``MAX_RETRIES`` attempts.
        """
        last_error = None
        for attempt in range(1, self.MAX_RETRIES + 1):
            try:
                return self.chroma_collection.query(query_texts=[query], n_results=max_similarities)
            except Exception as e:
                print(e)
                last_error = e
                if attempt < self.MAX_RETRIES:
                    time.sleep(self.RETRY_DELAY_SECONDS)

        raise RuntimeError(
            f"Failed to query the collection after {self.MAX_RETRIES} attempts"
        ) from last_error

    def distribution_function(self, page_count):
        """
        Return page-selection probabilities proportional to exp(-rank), rank = 1..page_count.

        An empty array is returned when ``page_count`` is not positive.
        """
        if page_count <= 0:
            return np.array([])
        pages_distribution = np.exp(-np.arange(1, page_count + 1))
        return pages_distribution / pages_distribution.sum()

    def distribution_generator(self, collection_dict):
        """
        Score, rank and sample papers from one set of query results.

        Parameters
        ----------
        collection_dict : dict
            Column-oriented results with equally long lists under the keys
            'id', 'similarity', 'year', 'n_citation' and 'gov_score'.
            'similarity' is used as given (not rescaled here), and the ranking is
            descending by score, so higher values must mean "more relevant".
            The dict is extended in place with 'year_normalized',
            'citations_normalized', 'points_normalized' and 'score'.

        Returns
        -------
        collections.Counter
            Sampled paper ids mapped to how often each was drawn.

        Raises
        ------
        ValueError
            If ``set_parameters`` has not been called yet.
        """
        if self.N is None or self.k is None or self.pn is None:
            raise ValueError("Parameters are not set; call set_parameters(N, k, pn) first")

        paper_ids = collection_dict['id']
        if len(paper_ids) == 0:
            # Nothing to rank; MinMaxScaler would fail on an empty matrix.
            return collections.Counter()

        values_to_scale = np.array([
            collection_dict['year'],
            collection_dict['n_citation'],
            collection_dict['gov_score']
        ]).T

        # Fit and transform: every criterion column is rescaled to [0, 1]
        scaled_values = MinMaxScaler().fit_transform(values_to_scale)

        collection_dict['year_normalized'] = scaled_values[:, 0].tolist()
        collection_dict['citations_normalized'] = scaled_values[:, 1].tolist()
        collection_dict['points_normalized'] = scaled_values[:, 2].tolist()

        collection_dict['score'] = [
            self.pn[0] * collection_dict['similarity'][i] +
            self.pn[1] * collection_dict['year_normalized'][i] +
            self.pn[2] * collection_dict['citations_normalized'][i] +
            self.pn[3] * collection_dict['points_normalized'][i]
            for i in range(len(paper_ids))
        ]

        # Rank ids by descending score; sorted() is stable, so ties keep input order.
        scores = collection_dict['score']
        ranked_order = sorted(range(len(paper_ids)), key=scores.__getitem__, reverse=True)
        ranked_ids = [paper_ids[i] for i in ranked_order]

        return self._sample_papers(ranked_ids)

    def _sample_papers(self, ranked_ids):
        """Draw up to ``k`` distinct ids from the paginated ranking and count them."""
        if self.N is None or self.k is None:
            raise ValueError("Parameters are not set; call set_parameters(N, k, pn) first")
        if self.N < 1:
            raise ValueError("Page size N must be at least 1")

        # Pagination; each page is a private copy so the caller's list is untouched.
        pages = [list(ranked_ids[start:start + self.N]) for start in range(0, len(ranked_ids), self.N)]

        selected_papers = []
        # `pages` only ever holds non-empty pages: an exhausted page is dropped and the
        # following pages move up one rank. Stop early once every paper has been drawn.
        while pages and len(selected_papers) < self.k:
            page_probabilities = self.distribution_function(len(pages))
            # Page is chosen with probability proportional to exp(-rank)
            page_index = int(np.random.choice(len(pages), p=page_probabilities))
            page = pages[page_index]
            # Paper is chosen uniformly within the page
            paper_index = int(np.random.choice(len(page)))
            # Remove the drawn paper from the page it actually came from
            selected_papers.append(page.pop(paper_index))
            if not page:
                del pages[page_index]

        # Identifiers of the drawn papers with their counts
        return collections.Counter(selected_papers)

    def select_papers(self, ranking):
        """Return ``k`` distinct elements drawn uniformly at random from ``ranking``."""
        return random.sample(list(ranking), self.k)


In [None]:
# Note: `../data/queries_df.csv` was previously generated in 3., but it has since been replaced with a .pkl file

class Experiment:
    """
    Runs every settings combination against every query and stores the sampled papers.

    Parameters
    ----------
    settings : list of dict
        Each dict has the keys 'N' (page size), 'k' (sample size) and 'pn' (weights).
    n_candidates : int, optional
        Number of nearest articles requested per query (default 250).

    Outputs
    -------
    RESULTS_PATH : one appended row per (query, setting) with the sampled ids.
    DISTRIBUTION_PATH : per-setting counts aggregated over all processed queries,
        rewritten on every checkpoint.
    """

    QUERIES_PATH = '../data/queries_df.csv'
    RESULTS_PATH = '../data/results.csv'
    DISTRIBUTION_PATH = '../distributions.csv'

    def __init__(self, settings, n_candidates=250):
        self.virtual_aggregator = VirtualAggregator()
        self.queries = None
        self.settings = settings
        self.similar_articles = None
        self.n_candidates = n_candidates

    @staticmethod
    def _empty_results():
        """Return a fresh, empty column-oriented buffer for per-step results."""
        return {
            'query_id': [],
            'settings': [],
            'distribution': [],
        }

    @staticmethod
    def _distances_to_similarities(distances):
        """
        Convert query distances (smaller = closer) into similarities in [0, 1].

        Distances are min-max scaled within the result set and inverted, so the
        closest article gets 1.0 and the farthest 0.0, which puts this criterion on
        the same scale as the other normalized criteria. If all distances are
        equal, every article gets 1.0.
        """
        distances = np.asarray(distances, dtype=float)
        if distances.size == 0:
            return []
        lowest = distances.min()
        spread = distances.max() - lowest
        if spread == 0:
            return [1.0] * distances.size
        return (1.0 - (distances - lowest) / spread).tolist()

    def run_experiment(self, checkpoint_every=50):
        """
        Process all queries and persist the results.

        Parameters
        ----------
        checkpoint_every : int, optional
            Flush results to disk after this many queries (default 50). A falsy
            value disables intermediate checkpoints; results are always saved at the end.
        """
        self.load_queries()
        print(f"Loaded: {len(self.queries)} queries")

        distribution_dict = {}
        result_dict = self._empty_results()

        for i, query in enumerate(tqdm(self.queries, total=len(self.queries), desc="Queries", unit="query")):
            # One retrieval per query; every setting re-ranks the same candidates.
            self.similar_articles = self.virtual_aggregator.get_similar_articles(query, self.n_candidates)

            for sample in self.settings:
                self.virtual_aggregator.set_parameters(sample['N'], sample['k'], sample['pn'])
                step_distribution = self.step(query)

                # Save result
                result_dict['query_id'].append(i)
                result_dict['settings'].append(sample)
                result_dict['distribution'].append(dict(step_distribution))

                # Aggregate counts per setting; Counter.update adds counts.
                setting_key = str(sample)
                if setting_key in distribution_dict:
                    distribution_dict[setting_key].update(step_distribution)
                else:
                    distribution_dict[setting_key] = collections.Counter(step_distribution)

            # Checkpoint on a fixed query cadence, independent of how many settings there are.
            if checkpoint_every and (i + 1) % checkpoint_every == 0:
                self.save_distribution(distribution_dict)
                self.save_results(result_dict)
                # Rows are appended to the file, so the in-memory buffer can be dropped.
                result_dict = self._empty_results()

        self.save_results(result_dict)
        self.save_distribution(distribution_dict)

    def step(self, query):
        """
        Rank and sample papers for the current query results with the current parameters.

        ``query`` is accepted for interface compatibility; the candidates are read
        from ``self.similar_articles``.
        """
        hits = self.similar_articles
        metadatas = hits['metadatas'][0]
        collection_dict = {
            'id': hits['ids'][0],
            'title': hits['documents'][0],
            # The store returns distances (lower = closer); the ranking is descending
            # by score, so they are converted to a higher-is-better similarity first.
            'similarity': self._distances_to_similarities(hits['distances'][0]),
            'year': [metadata['year'] for metadata in metadatas],
            'n_citation': [metadata['n_citation'] for metadata in metadatas],
            'gov_score': [metadata['gov_score'] for metadata in metadatas]
        }

        return self.virtual_aggregator.distribution_generator(collection_dict)

    def load_queries(self):
        """Load the 'query' column of the queries CSV into ``self.queries``."""
        df_query = pd.read_csv(self.QUERIES_PATH)
        self.queries = df_query['query'].tolist()

    def save_results(self, result_dict):
        """
        Append the buffered per-step rows to the results CSV.

        The header is written only when the file is missing or empty.
        """
        path = self.RESULTS_PATH
        needs_header = not os.path.isfile(path) or os.path.getsize(path) == 0
        fieldnames = list(result_dict.keys())
        with open(path, 'a', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
            if needs_header:
                dict_writer.writeheader()
            # Transpose the column-oriented buffer into one dict per row
            dict_writer.writerows(
                dict(zip(fieldnames, row)) for row in zip(*result_dict.values())
            )

    def save_distribution(self, distribution_dict):
        """Overwrite the distribution CSV with the aggregated counts per setting."""
        distribution_df = pd.DataFrame(
            [(setting_key, dict(counts)) for setting_key, counts in distribution_dict.items()],
            columns=['settings', 'distribution']
        )
        distribution_df.to_csv(self.DISTRIBUTION_PATH)


In [4]:
def generate_all_settings(page_sizes=None, citation_numbers=None, weights=None, tolerance=1e-9):
    """
    Build the full grid of experiment settings.

    Every setting combines a page size, a number of citations to sample and a
    4-element weight vector whose entries sum to a value between 0.99 and 1.0
    (0.99 is admitted so that three weights of 0.33 count as a valid split).

    Parameters
    ----------
    page_sizes : list of int, optional
        Page sizes to try. Defaults to [10, 100].
    citation_numbers : list of int, optional
        Sample sizes to try. Defaults to [10, 25, 50].
    weights : list of float, optional
        Values each of the four weights may take.
        Defaults to [0., 0.1, 0.25, 0.33, 0.5, 0.75, 0.9, 1.0].
    tolerance : float, optional
        Slack applied to both bounds of the sum check so floating-point rounding
        in the sum cannot drop a combination that is valid on paper.

    Returns
    -------
    list of dict
        Dicts with keys 'N', 'k' and 'pn', ordered by page size, then citation
        number, then weight vector. Each dict owns its own 'pn' list.
    """
    if page_sizes is None:
        page_sizes = [10, 100]
    if citation_numbers is None:
        citation_numbers = [10, 25, 50]
    if weights is None:
        weights = [0., 0.1, 0.25, 0.33, 0.5, 0.75, 0.9, 1.0]

    valid_configs = [
        combo
        for combo in itertools.product(weights, repeat=4)
        if 0.99 - tolerance <= sum(combo) <= 1.0 + tolerance
    ]

    # Generate all possible settings
    return [
        {'N': page_size, 'k': citation_number, 'pn': list(config)}
        for page_size in page_sizes
        for citation_number in citation_numbers
        for config in valid_configs
    ]

## Program

### 1. Generate settings

In [5]:
# Build the list of experiment settings ('N', 'k', 'pn' dicts) used by the run below
settings = generate_all_settings()
# Show the first configuration as a quick sanity check
display(settings[0])
# The printed label is Polish for "Number of generated configurations"
print("Liczba wygenerowanych konfiguracji: ", len(settings))

{'N': 10, 'k': 10, 'pn': [0.0, 0.0, 0.0, 1.0]}

Liczba wygenerowanych konfiguracji:  306


### 2. Run main code

In [25]:
# Run every settings combination against every loaded query; results are written to disk
# by Experiment.save_results / Experiment.save_distribution.
# NOTE(review): the output recorded for this cell ends in a KeyboardInterrupt at about
# 2,108 of 850,000 queries, so the saved files presumably cover only part of the queries.
experiment = Experiment(settings)
experiment.run_experiment()

Loaded: 850000 queries


Queries:   0%|▏                                                           | 2108/850000 [18:39<125:02:44,  1.88query/s]


KeyboardInterrupt: 

### 3. Read result

In [6]:
# Load the per-(query, setting) rows from the same path that Experiment.save_results appends to.
# NOTE(review): despite its name, this frame holds the results file, not '../distributions.csv'.
distribution_df = pd.read_csv('../data/results.csv')
# Preview the first rows only
display(distribution_df.head())

Unnamed: 0,query_id,settings,distribution
0,150,"{'N': 10, 'k': 10, 'pn': [0.0, 0.0, 0.0, 1.0]}","{'358372': 1, '228408': 1, '261158': 1, '19472..."
1,150,"{'N': 10, 'k': 10, 'pn': [0.0, 0.0, 0.1, 0.9]}","{'694324': 1, '453823': 1, '444086': 1, '56808..."
2,150,"{'N': 10, 'k': 10, 'pn': [0.0, 0.0, 0.25, 0.75]}","{'694324': 1, '453823': 1, '444086': 1, '56808..."
3,150,"{'N': 10, 'k': 10, 'pn': [0.0, 0.0, 0.5, 0.5]}","{'453823': 1, '541503': 1, '694324': 1, '19472..."
4,150,"{'N': 10, 'k': 10, 'pn': [0.0, 0.0, 0.75, 0.25]}","{'70014': 1, '257451': 1, '541503': 1, '194723..."
