In [1]:
import pandas as pd
import numpy as np
import json
import os
import collections
import time
import chromadb
from chromadb.config import Settings
from sklearn.preprocessing import MinMaxScaler

In [2]:
def distribution_function(page_count):
    pages_distribution = np.exp(-np.arange(1, page_count + 1))
    pages_distribution /= pages_distribution.sum()
    return pages_distribution

def distribution_generator(k, N, p, Q, results_df): 
    """
    Generates a distribution of selected papers based on specified parameters.

    Parameters:
    -----------
    k : int
        Number of citations to sample.
    N : int
        Page size for pagination.
    p : list
        List of weights for criteria: [semantic similarity, publication year, number of citations, publication venue].
    Q : str
        Query used for selecting papers.
    results_df : pandas.DataFrame
        DataFrame containing query results with columns: 'id', 'title', 'similarity', 'year', 'n_citation', 'gov_score'.

    Returns:
    --------
    Counter
        Counter object containing identifiers of selected papers and their counts.
    """

    # Normalizacja lat, liczby cytowań i punktów za publikacje
    scaler = MinMaxScaler()
    collection_df[['year_normalized', 'citations_normalized', 'points_normalized']] = scaler.fit_transform(collection_df[['year', 'n_citation', 'gov_score']])

    # Obliczanie rankingu
    collection_df['score'] = p[0] * collection_df['similarity'] + \
          p[1] * collection_df['year_normalized'] + \
          p[2] * collection_df['citations_normalized'] + \
          p[3] * collection_df['points_normalized']
    
    # Sortowanie po rankingu
    df_sorted = collection_df.sort_values(by='score', ascending=False)

    # Stronicowanie wyników
    ranked_indices = df_sorted['id'].to_numpy()
    pages = [ranked_indices[i:i + N] for i in range(0, len(ranked_indices), N)]
    pages_distribution = distribution_function(len(pages))

    # Losowanie k prac
    np.random.seed(42)  # Ustawienie ziarna losowości dla powtarzalności wyników
    selected_papers = []
    for _ in range(k):
        selected_page_index = np.random.choice(len(pages), p=pages_distribution)
        selected_page = pages[selected_page_index]
        selected_paper_index = np.random.choice(selected_page)
        selected_papers.append(selected_paper_index)

    # Zapisanie identyfikatorów wylosowanych prac
    selected_paper_counts = collections.Counter(selected_papers)
    
    # Wyświetlenie wyników
    print(f"Selected paper indices: {selected_papers}")
    print(f"Selected paper counts: {selected_paper_counts}")
    # display(results_df.head())

    return selected_paper_counts

In [3]:
# Parametry eksperymentu
k = 15
N = 20
weights = [0.5, 0.3, 0.1, 0.1]

In [5]:
# Open connection
collection_status = False
while collection_status != True:
    try:
        chroma_client = chromadb.HttpClient(host="localhost", port=8000, settings=Settings(allow_reset=True, anonymized_telemetry=False))
        collection = chroma_client.get_or_create_collection(name="articles_with_score")

        collection_status = True
    except Exception as e:
        print(e)
        time.sleep(1)  # Dodajemy małe opóźnienie, aby uniknąć zbyt wielu prób
        pass

Could not connect to a Chroma server. Are you sure it is running?
Could not connect to a Chroma server. Are you sure it is running?


KeyboardInterrupt: 

## Single query

In [5]:
example_query = "Example query"

In [None]:
query_status = False
while query_status != True:
    try:
        start_time = time.time()
        results = collection.query(query_texts=[example_query], n_results=500) #include=['embeddings']

        end_time = time.time()
        print(f"Query time: {end_time - start_time} seconds")

        query_status = True
    except Exception as e:
        print(e)
        time.sleep(1)  # Dodajemy małe opóźnienie, aby uniknąć zbyt wielu prób
        pass

# Przekształcenie wyników zapytania do DataFrame      
collection_df = pd.DataFrame({
    'id': results['ids'][0],
    'title': results['documents'][0],
    'similarity': results['distances'][0],
    'year': [metadata['year'] for metadata in results['metadatas'][0]],
    'n_citation': [metadata['n_citation'] for metadata in results['metadatas'][0]],
    'gov_score': [metadata['gov_score'] for metadata in results['metadatas'][0]]
})

('Connection aborted.', ConnectionResetError(10054, 'Istniejące połączenie zostało gwałtownie zamknięte przez zdalnego hosta', None, 10054, None))
('Connection aborted.', ConnectionResetError(10054, 'Istniejące połączenie zostało gwałtownie zamknięte przez zdalnego hosta', None, 10054, None))
('Connection aborted.', ConnectionResetError(10054, 'Istniejące połączenie zostało gwałtownie zamknięte przez zdalnego hosta', None, 10054, None))


In [86]:
start_time = time.time()
display(distribution_generator(15, 20, [0.5, 0.3, 0.1, 0.1], example_query, collection_df))

end_time = time.time()
print(f"Query time: {end_time - start_time} seconds")

Selected paper indices: [197957, 195594, 197995, 199493, 199904, 174919, 196192, 190326, 198354, 199373, 197743, 197957, 198354, 199904, 195594]
Selected paper counts: Counter({197957: 2, 195594: 2, 199904: 2, 198354: 2, 197995: 1, 199493: 1, 174919: 1, 196192: 1, 190326: 1, 199373: 1, 197743: 1})


Unnamed: 0,id,title,similarity,year,n_citation,gov_score,year_normalized,citations_normalized,points_normalized,score
0,706409,Code query by example,0.44305,2011,50,100,0.914634,0.001743,0.444444,0.540534
1,365230,Queries = examples + counterexamples,0.623748,1996,3,100,0.731707,0.000105,0.444444,0.575841
2,145730,XML QUERY BY EXAMPLE,0.677612,2002,50,20,0.804878,0.001743,0.0,0.580444
3,190632,"Query by class, rule, and concept",0.777606,1994,50,70,0.707317,0.001743,0.277778,0.62895
4,194736,Query processing techniques in the summary-tab...,0.823286,1989,87,100,0.646341,0.003034,0.444444,0.650293


Counter({197957: 2,
         195594: 2,
         199904: 2,
         198354: 2,
         197995: 1,
         199493: 1,
         174919: 1,
         196192: 1,
         190326: 1,
         199373: 1,
         197743: 1})

Query time: 0.14789605140686035 seconds


## Multiple query

In [9]:
from tqdm.notebook import tqdm

weights = [
    [0.5, 0.3, 0.1, 0.1],
    [0.25, 0.25, 0.25, 0.25],
    [0.4, 0.3, 0.2, 0.1],
    [0.3, 0.4, 0.2, 0.1],
    [0.2, 0.3, 0.3, 0.2],
    [0.1, 0.1, 0.4, 0.4],
    [0.6, 0.2, 0.1, 0.1],
    [0.1, 0.5, 0.3, 0.1],
    [0.2, 0.2, 0.2, 0.4],
    [0.3, 0.3, 0.3, 0.1],
    [0.4, 0.1, 0.4, 0.1],
    [0.35, 0.25, 0.25, 0.15],
    [0.45, 0.15, 0.2, 0.2],
    [0.5, 0.2, 0.1, 0.2],
    [0.3, 0.3, 0.2, 0.2],
    [0.2, 0.4, 0.2, 0.2],
    [0.1, 0.2, 0.3, 0.4],
    [0.15, 0.25, 0.35, 0.25],
    [0.2, 0.2, 0.4, 0.2],
    [0.25, 0.25, 0.2, 0.3],
    [0.35, 0.15, 0.3, 0.2],
    [0.4, 0.2, 0.1, 0.3],
    [0.1, 0.3, 0.4, 0.2],
    [0.2, 0.3, 0.2, 0.3],
    [0.3, 0.2, 0.3, 0.2],
    [0.4, 0.3, 0.1, 0.2],
    [0.25, 0.2, 0.3, 0.25],
    [0.2, 0.25, 0.25, 0.3],
    [0.15, 0.35, 0.2, 0.3],
    [0.3, 0.1, 0.3, 0.3]
]

query_status = False
while query_status != True:
    try:
        start_time = time.time()
        results = collection.query(query_texts=[example_query], n_results=200000) #include=['embeddings']

        end_time = time.time()
        print(f"Query time: {end_time - start_time} seconds")

        query_status = True
    except Exception as e:
        print(e)
        time.sleep(1)  # Dodajemy małe opóźnienie, aby uniknąć zbyt wielu prób
        pass

# Przekształcenie wyników zapytania do DataFrame      
collection_df = pd.DataFrame({
    'id': results['ids'][0],
    'title': results['documents'][0],
    'similarity': results['distances'][0],
    'year': [metadata['year'] for metadata in results['metadatas'][0]],
    'n_citation': [metadata['n_citation'] for metadata in results['metadatas'][0]],
    'gov_score': [metadata['gov_score'] for metadata in results['metadatas'][0]]
})

start_time = time.time()
for weight in tqdm(weights):
    distribution_generator(15, 20, weight, example_query, collection_df)

end_time = time.time()
print(f"Query time: {end_time - start_time} seconds")

Query time: 14.46756649017334 seconds


  0%|          | 0/30 [00:00<?, ?it/s]

Selected paper indices: [197957, 195594, 197995, 199493, 199904, 174919, 196192, 190326, 198354, 199373, 197743, 197957, 198354, 199904, 195594]
Selected paper counts: Counter({197957: 2, 195594: 2, 199904: 2, 198354: 2, 197995: 1, 199493: 1, 174919: 1, 196192: 1, 190326: 1, 199373: 1, 197743: 1})
Selected paper indices: [102178, 199614, 81235, 25300, 174919, 190326, 196682, 104395, 125668, 168526, 178676, 102178, 125668, 174919, 199614]
Selected paper counts: Counter({102178: 2, 199614: 2, 174919: 2, 125668: 2, 81235: 1, 25300: 1, 190326: 1, 196682: 1, 104395: 1, 168526: 1, 178676: 1})
Selected paper indices: [199614, 197438, 199174, 174919, 140853, 104395, 197549, 190326, 133670, 199493, 199415, 199614, 133670, 140853, 197438]
Selected paper counts: Counter({199614: 2, 197438: 2, 140853: 2, 133670: 2, 199174: 1, 174919: 1, 104395: 1, 197549: 1, 190326: 1, 199493: 1, 199415: 1})
Selected paper indices: [199415, 197438, 199174, 25300, 174919, 190326, 195522, 104395, 199155, 199493, 199

In [13]:
def read_specific_line(file_path, line_number):
    with open(file_path, 'r') as file:
        # Pomijamy linie przed linią docelową
        for _ in range(line_number - 1):
            file.readline()
        
        # Czytamy i zwracamy docelową linię
        target_line = file.readline()
    
    return target_line

# Definiowanie ścieżki do pliku i numeru linii
file_path = '../data/process_args.csv'
line_number = 177139

# Wczytywanie określonej linii
line = read_specific_line(file_path, line_number)

# Wyświetlanie zawartości linii
print(line)
# Is Loader Connect Connected Integrator?,20,14,"Is Linker Query Implicit Recruiter?,20,17,"[0.28, 0.53, 0.15, 0.04]"

Is Loader Connect Connected Integrator?,20,14,"Is Linker Query Implicit Recruiter?,20,17,"[0.28, 0.53, 0.15, 0.04]"



In [8]:
import pandas as pd

# Definiowanie numeru linii, którą chcesz załadować
line_number = 177139

# Wczytanie tylko jednej określonej linii z pliku CSV
df_query = pd.read_csv(
    '../data/process_args.csv',
    skiprows=[line_number],
)

# Wyświetlenie rozmiaru DataFrame (ilość wierszy i kolumn)
display(df_query.size)

ParserError: Error tokenizing data. C error: Expected 4 fields in line 177139, saw 7


In [17]:
def count_lines_with_text(file_path, search_text):
    count = 0
    search_text = f'"{search_text}'  # Dodanie cudzysłowów do szukanego tekstu
    
    with open(file_path, 'r') as file:
        # Uzyskaj liczbę linii w pliku do poprawnego wyświetlania paska postępu
        total_lines = sum(1 for line in open(file_path, 'r'))
        
        # Użyj tqdm do śledzenia postępu
        for line in tqdm(file, total=total_lines, desc="Przeszukiwanie linii", unit="linia"):
            if search_text in line:
                count += 1
    
    return count

# Ścieżka do pliku CSV i tekst do wyszukania
file_path = '../data/process_args.csv'
search_text = 'Is'  # Tekst do wyszukania w liniach

# Liczenie linii zawierających określony tekst
line_count = count_lines_with_text(file_path, search_text)

print(f"Liczba linii zawierających '{search_text}' (z cudzysłowami): {line_count}")


Przeszukiwanie linii: 100%|██████████████████████████████████████| 250011271/250011271 [02:22<00:00, 1754934.00linia/s]

Liczba linii zawierających 'Is' (z cudzysłowami): 21





In [32]:
from tqdm import tqdm

def count_lines_with_text(file_path):
    count = 0
    search_text = 'Is'  # Tekst do wyszukania w liniach
    search_text = f'"{search_text}'  # Dodanie cudzysłowów do szukanego tekstu
    lines = {
        'index': [],
        'line': [],
    }
    
    with open(file_path, 'r') as file:
        # Uzyskaj liczbę linii w pliku do poprawnego wyświetlania paska postępu
        total_lines = sum(1 for line in open(file_path, 'r'))
        
        # Użyj tqdm do śledzenia postępu
        for idx, line in enumerate(tqdm(file, total=total_lines, desc="Przeszukiwanie linii", unit="linia")):
            if search_text in line:
                count += 1;
                lines['index'].append(idx)
                lines['line'].append(line)
    
    display(lines)
    return count

# Ścieżka do pliku CSV i tekst do wyszukania
file_path = '../data/process_args.csv'

# Liczenie linii zawierających określony tekst
line_count = count_lines_with_text(file_path,)

print(f"Liczba linii zawierających '{search_text}' (z cudzysłowami): {line_count}")


Przeszukiwanie linii: 100%|██████████████████████████████████████| 250011271/250011271 [02:49<00:00, 1477308.06linia/s]


{'index': [177138,
  13460000,
  16521887,
  19115349,
  47186524,
  48554320,
  51046753,
  92159322,
  95497827,
  99947901,
  116349733,
  121468990,
  124996171,
  143352753,
  159927118,
  193484653,
  199703631,
  219015383,
  219965770,
  233342978,
  235483898],
 'line': ['Is Loader Connect Connected Integrator?,20,14,"Is Linker Query Implicit Recruiter?,20,17,"[0.28, 0.53, 0.15, 0.04]"\n',
  'Is Module Handle Concurrent Reviewer?,12,7,"Is Model Explain Multidimensional Bot?,27,19,"[0.73, 0.04, 0.09, 0.14]"\n',
  'Is Cryptosystem Reinforce Decentralized Educator?,26,22,"[0.22, 0.61, 0.02, 0.15]"Is Scheduler Recover Finite Specialist?,24,16,"[0.35, 0.18, 0.4, 0.07]"\n',
  'Is Tunnel Decrypt Layered Proxy?,19,14,"Is Runtime Automate Designed Gateway?,18,16,"[0.35, 0.09, 0.31, 0.26]"\n',
  '21, 0.17, 0.27, 0.35]"Is Library Compress Mapped Moderator?,11,5,"[0.01, 0.11, 0.09, 0.8]"\n',
  'Is Software Monitor Buffered Generator?,20,14,"Is Cryptography Authenticate Infinite Responder?

Liczba linii zawierających 'Is' (z cudzysłowami): 21


In [29]:
def read_specific_line(file_path, line_number):
    search_text = "Is"
    count = 0
    search_text = f'"{search_text}'  # Dodanie cudzysłowów do szukanego tekstu
    lines = {
        'index': [],
        'line': [],
    }


    with open(file_path, 'r') as file:
        #total_lines = sum(1 for line in open(file_path, 'r'))

        for _ in range(line_number ):
            line = file.readline()
        display(line)
        for idx, li in enumerate(tqdm([line], total=1, desc="Przeszukiwanie linii", unit="linia")):
            display(idx)
            display(li)
            
            if search_text in li:
                count += 1;
                lines['line'].append(li)

    display(lines)
    return count

# Definiowanie ścieżki do pliku i numeru linii
file_path = '../data/process_args.csv'
line_number = 177139

# Wczytywanie określonej linii
line = read_specific_line(file_path, line_number)

# Wyświetlanie zawartości linii
print(line)

'Is Loader Connect Connected Integrator?,20,14,"Is Linker Query Implicit Recruiter?,20,17,"[0.28, 0.53, 0.15, 0.04]"\n'

Przeszukiwanie linii:   0%|                                                                   | 0/1 [00:00<?, ?linia/s]

0

'Is Loader Connect Connected Integrator?,20,14,"Is Linker Query Implicit Recruiter?,20,17,"[0.28, 0.53, 0.15, 0.04]"\n'

Przeszukiwanie linii: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 225.66linia/s]


{'index': [],
 'line': ['Is Loader Connect Connected Integrator?,20,14,"Is Linker Query Implicit Recruiter?,20,17,"[0.28, 0.53, 0.15, 0.04]"\n']}

1


# TEST QUERY 

In [2]:
import pandas as pd
import numpy as np
import json
import os
import collections
import time
import chromadb
from chromadb.config import Settings

In [14]:
query = 'The Parlay proxy manager — Architecture considerations'
id = '345143'
try:
    chroma_client = chromadb.PersistentClient(path="../data/chroma")
    # chroma_client = chromadb.HttpClient(host="localhost", port=8000, settings=Settings(allow_reset=True, anonymized_telemetry=False))
    chroma_collection = chroma_client.get_or_create_collection(name="articles_with_score")
    #result = chroma_collection.query(query_texts=[query], n_results=1)
    result = chroma_collection.get(ids=[id], include=['metadatas', 'documents', 'embeddings'])
    # display(result)
    new_order_dict = {
        'ids': result['ids'],
        'documents': result['documents'],
        'metadatas': result['metadatas'],
        'uris': result['uris'],
        'data': result['data'],
        'embeddings': result['embeddings'],
    }
    display(new_order_dict)

except Exception as e:
    print(e)
                

{'ids': ['345143'],
 'documents': ['The Parlay proxy manager — Architecture considerations'],
 'metadatas': [{'gov_score': 100, 'n_citation': 2, 'year': 2003}],
 'uris': None,
 'data': None,
 'embeddings': [[-0.023660996928811073,
   -0.05538276955485344,
   0.0037664470728486776,
   -0.07373573631048203,
   -0.09675989300012589,
   -0.08433341234922409,
   0.00044770335080102086,
   -0.023750444874167442,
   -0.08346416056156158,
   -0.05838252231478691,
   -0.010637793689966202,
   0.049456220120191574,
   -0.01020707655698061,
   -0.024917516857385635,
   0.05492120981216431,
   0.02982347272336483,
   0.0862276554107666,
   -0.017597965896129608,
   0.050892967730760574,
   -0.06354695558547974,
   -0.07932235300540924,
   -0.12982389330863953,
   -0.08781290054321289,
   -0.0021734966430813074,
   -0.057305388152599335,
   0.02898515947163105,
   -0.022760603576898575,
   -0.0004416572046466172,
   -0.02239416539669037,
   -0.055121954530477524,
   0.059956520795822144,
   -0.0220