In [1]:
import pandas as pd
import numpy as np
import json
import os
import collections
import time
import chromadb
from chromadb.config import Settings
from sklearn.preprocessing import MinMaxScaler

In [8]:
def distribution_function(page_count):
    """
    Build an exponentially decaying probability vector over result pages.

    Parameters:
    -----------
    page_count : int
        Number of pages to assign probabilities to.

    Returns:
    --------
    numpy.ndarray
        1-D float array of length ``page_count`` whose i-th entry (1-based)
        is proportional to ``exp(-i)``; the entries are normalised by their sum.
    """
    page_numbers = np.arange(1, page_count + 1)
    raw_weights = np.exp(-page_numbers)
    return raw_weights / raw_weights.sum()

def distribution_generator(k, N, p, Q, results_df):
    """
    Rank the papers in ``results_df`` by a weighted score and sample ``k`` of them page by page.

    The frame is scored as
    ``p[0]*similarity + p[1]*year_norm + p[2]*citations_norm + p[3]*points_norm``
    (the last three min-max normalised), sorted by descending score, cut into
    pages of ``N`` ids, and then ``k`` draws are made: first a page (with the
    probabilities returned by ``distribution_function``), then one id uniformly
    from that page. Draws are with replacement, so an id can repeat.

    Parameters:
    -----------
    k : int
        Number of citations to sample.
    N : int
        Page size for pagination.
    p : list
        List of weights for criteria: [semantic similarity, publication year, number of citations, publication venue].
    Q : str
        Query used for selecting papers. Currently unused by the function body;
        kept for interface compatibility.
    results_df : pandas.DataFrame
        DataFrame containing query results. The columns read here are
        'id', 'similarity', 'year', 'n_citation' and 'gov_score'.
        The frame is not modified; scoring is done on a copy.

    Returns:
    --------
    Counter
        Counter object containing identifiers of selected papers and their counts.

    Notes:
    ------
    The random draws use a private generator seeded with 42, so repeated calls
    with the same inputs return the same selection and the global NumPy random
    state is left untouched.

    NOTE(review): 'similarity' is used as-is and higher scores rank first. The
    query cells in this notebook fill that column from ``results['distances']``;
    if those are distances (smaller = closer), the similarity term ranks in the
    opposite direction to what is intended -- confirm with the data owner.
    """
    # Score a private copy: the original code read and mutated the notebook
    # global `collection_df` instead of the `results_df` argument.
    ranked_df = results_df.copy()

    # Min-max normalise year, citation count and venue points
    scaler = MinMaxScaler()
    ranked_df[['year_normalized', 'citations_normalized', 'points_normalized']] = scaler.fit_transform(
        ranked_df[['year', 'n_citation', 'gov_score']]
    )

    # Weighted ranking score
    ranked_df['score'] = (
        p[0] * ranked_df['similarity']
        + p[1] * ranked_df['year_normalized']
        + p[2] * ranked_df['citations_normalized']
        + p[3] * ranked_df['points_normalized']
    )

    # Best score first
    df_sorted = ranked_df.sort_values(by='score', ascending=False)

    # Paginate the ranked ids; the last page may hold fewer than N ids
    ranked_indices = df_sorted['id'].to_numpy()
    pages = [ranked_indices[i:i + N] for i in range(0, len(ranked_indices), N)]
    pages_distribution = distribution_function(len(pages))

    # Draw k papers. A local legacy RandomState seeded with 42 produces the same
    # stream as np.random.seed(42) + np.random.choice, without reseeding the
    # process-wide generator on every call.
    rng = np.random.RandomState(42)
    selected_papers = []
    for _ in range(k):
        selected_page_index = rng.choice(len(pages), p=pages_distribution)
        selected_page = pages[selected_page_index]
        selected_papers.append(rng.choice(selected_page))

    # Tally how often each paper id was drawn
    selected_paper_counts = collections.Counter(selected_papers)

    # Report the outcome
    print(f"Selected paper indices: {selected_papers}")
    print(f"Selected paper counts: {selected_paper_counts}")

    return selected_paper_counts

In [46]:
# Experiment parameters (the single-query cell passes the same values as literals)
k, N = 15, 20  # k: number of papers to sample; N: page size
weights = [0.5, 0.3, 0.1, 0.1]  # order: [semantic similarity, publication year, citations, venue]

In [4]:
# Open connection: keep trying until the Chroma client is created and the
# "articles_with_score" collection has been obtained.
collection_status = False
while not collection_status:
    try:
        chroma_client = chromadb.HttpClient(
            host="localhost",
            port=8000,
            settings=Settings(allow_reset=True, anonymized_telemetry=False),
        )
        collection = chroma_client.get_or_create_collection(name="articles_with_score")
    except Exception as e:
        print(e)
        time.sleep(1)  # short delay so failed attempts do not fire back-to-back
    else:
        collection_status = True

## Single query

In [5]:
# Query text passed as `query_texts` to collection.query in the query cells of this notebook.
example_query = "Example query"

In [11]:
# Query the collection for the 500 nearest results, retrying once per second
# until the call succeeds; the elapsed time of the successful attempt is printed.
query_status = False
while not query_status:
    try:
        start_time = time.time()
        results = collection.query(query_texts=[example_query], n_results=500)  # include=['embeddings']
        end_time = time.time()
        print(f"Query time: {end_time - start_time} seconds")
    except Exception as e:
        print(e)
        time.sleep(1)  # short delay so failed attempts do not fire back-to-back
    else:
        query_status = True

# Convert the query results to a DataFrame (index 0 = the single query text submitted).
# NOTE(review): the 'similarity' column is filled from results['distances'];
# confirm whether larger values really mean "more similar" before ranking on it.
collection_df = pd.DataFrame({
    'id': results['ids'][0],
    'title': results['documents'][0],
    'similarity': results['distances'][0],
    **{
        field: [metadata[field] for metadata in results['metadatas'][0]]
        for field in ('year', 'n_citation', 'gov_score')
    },
})

Query time: 0.058737993240356445 seconds


In [86]:
# Time one sampling run (k=15, page size 20, weights [0.5, 0.3, 0.1, 0.1]) and
# display the returned Counter. The printed label says "Query time", but the
# interval measured here is the distribution_generator call plus its display,
# not the database query.
start_time = time.time()
display(distribution_generator(15, 20, [0.5, 0.3, 0.1, 0.1], example_query, collection_df))

end_time = time.time()
print(f"Query time: {end_time - start_time} seconds")

Selected paper indices: [197957, 195594, 197995, 199493, 199904, 174919, 196192, 190326, 198354, 199373, 197743, 197957, 198354, 199904, 195594]
Selected paper counts: Counter({197957: 2, 195594: 2, 199904: 2, 198354: 2, 197995: 1, 199493: 1, 174919: 1, 196192: 1, 190326: 1, 199373: 1, 197743: 1})


Unnamed: 0,id,title,similarity,year,n_citation,gov_score,year_normalized,citations_normalized,points_normalized,score
0,706409,Code query by example,0.44305,2011,50,100,0.914634,0.001743,0.444444,0.540534
1,365230,Queries = examples + counterexamples,0.623748,1996,3,100,0.731707,0.000105,0.444444,0.575841
2,145730,XML QUERY BY EXAMPLE,0.677612,2002,50,20,0.804878,0.001743,0.0,0.580444
3,190632,"Query by class, rule, and concept",0.777606,1994,50,70,0.707317,0.001743,0.277778,0.62895
4,194736,Query processing techniques in the summary-tab...,0.823286,1989,87,100,0.646341,0.003034,0.444444,0.650293


Counter({197957: 2,
         195594: 2,
         199904: 2,
         198354: 2,
         197995: 1,
         199493: 1,
         174919: 1,
         196192: 1,
         190326: 1,
         199373: 1,
         197743: 1})

Query time: 0.14789605140686035 seconds


## Multiple query

In [9]:
from tqdm.notebook import tqdm

# Weight vectors for the sweep: 30 rows, each passed as `p` to distribution_generator
# and each summing to 1.
# NOTE: this rebinds `weights`, which the experiment-parameters cell defines as a
# single flat list; after this cell runs it is a list of lists.
weights = [
    [0.5, 0.3, 0.1, 0.1],
    [0.25, 0.25, 0.25, 0.25],
    [0.4, 0.3, 0.2, 0.1],
    [0.3, 0.4, 0.2, 0.1],
    [0.2, 0.3, 0.3, 0.2],
    [0.1, 0.1, 0.4, 0.4],
    [0.6, 0.2, 0.1, 0.1],
    [0.1, 0.5, 0.3, 0.1],
    [0.2, 0.2, 0.2, 0.4],
    [0.3, 0.3, 0.3, 0.1],
    [0.4, 0.1, 0.4, 0.1],
    [0.35, 0.25, 0.25, 0.15],
    [0.45, 0.15, 0.2, 0.2],
    [0.5, 0.2, 0.1, 0.2],
    [0.3, 0.3, 0.2, 0.2],
    [0.2, 0.4, 0.2, 0.2],
    [0.1, 0.2, 0.3, 0.4],
    [0.15, 0.25, 0.35, 0.25],
    [0.2, 0.2, 0.4, 0.2],
    [0.25, 0.25, 0.2, 0.3],
    [0.35, 0.15, 0.3, 0.2],
    [0.4, 0.2, 0.1, 0.3],
    [0.1, 0.3, 0.4, 0.2],
    [0.2, 0.3, 0.2, 0.3],
    [0.3, 0.2, 0.3, 0.2],
    [0.4, 0.3, 0.1, 0.2],
    [0.25, 0.2, 0.3, 0.25],
    [0.2, 0.25, 0.25, 0.3],
    [0.15, 0.35, 0.2, 0.3],
    [0.3, 0.1, 0.3, 0.3]
]

# Query the collection for up to 200000 results, retrying once per second
# until the call succeeds; the elapsed time of the successful attempt is printed.
query_status = False
while not query_status:
    try:
        start_time = time.time()
        results = collection.query(query_texts=[example_query], n_results=200000)  # include=['embeddings']
        end_time = time.time()
        print(f"Query time: {end_time - start_time} seconds")
    except Exception as e:
        print(e)
        time.sleep(1)  # short delay so failed attempts do not fire back-to-back
    else:
        query_status = True

# Convert the query results to a DataFrame (index 0 = the single query text submitted).
# NOTE(review): the 'similarity' column is filled from results['distances'];
# confirm whether larger values really mean "more similar" before ranking on it.
collection_df = pd.DataFrame({
    'id': results['ids'][0],
    'title': results['documents'][0],
    'similarity': results['distances'][0],
    **{
        field: [metadata[field] for metadata in results['metadatas'][0]]
        for field in ('year', 'n_citation', 'gov_score')
    },
})

# Run one sampling pass per weight vector (k=15, page size 20) over the same
# query results and time the whole sweep. The printed label says "Query time",
# but the interval covers the distribution_generator calls, not the database query.
start_time = time.time()
for weight in tqdm(weights):
    distribution_generator(15, 20, weight, example_query, collection_df)

end_time = time.time()
print(f"Query time: {end_time - start_time} seconds")

Query time: 14.46756649017334 seconds


  0%|          | 0/30 [00:00<?, ?it/s]

Selected paper indices: [197957, 195594, 197995, 199493, 199904, 174919, 196192, 190326, 198354, 199373, 197743, 197957, 198354, 199904, 195594]
Selected paper counts: Counter({197957: 2, 195594: 2, 199904: 2, 198354: 2, 197995: 1, 199493: 1, 174919: 1, 196192: 1, 190326: 1, 199373: 1, 197743: 1})
Selected paper indices: [102178, 199614, 81235, 25300, 174919, 190326, 196682, 104395, 125668, 168526, 178676, 102178, 125668, 174919, 199614]
Selected paper counts: Counter({102178: 2, 199614: 2, 174919: 2, 125668: 2, 81235: 1, 25300: 1, 190326: 1, 196682: 1, 104395: 1, 168526: 1, 178676: 1})
Selected paper indices: [199614, 197438, 199174, 174919, 140853, 104395, 197549, 190326, 133670, 199493, 199415, 199614, 133670, 140853, 197438]
Selected paper counts: Counter({199614: 2, 197438: 2, 140853: 2, 133670: 2, 199174: 1, 174919: 1, 104395: 1, 197549: 1, 190326: 1, 199493: 1, 199415: 1})
Selected paper indices: [199415, 197438, 199174, 25300, 174919, 190326, 195522, 104395, 199155, 199493, 199