In [1]:
import os
import sys
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import MinMaxScaler
import chromadb

In [2]:
# Record the NumPy version in the notebook output for reproducibility.
numpy_version = np.__version__
print(numpy_version)

2.2.6


In [3]:
# Quick sanity check of the chromadb import made in the first cell:
# print which version of the package the kernel picked up.
chroma_version = chromadb.__version__
print("Chroma OK:", chroma_version)

Chroma OK: 1.4.0


### 1. Prepare paths and working directory

Ensure the project root is the working directory so that relative paths
and imports from the `src` package behave consistently with .py scripts.

In [2]:
# When the kernel starts inside a directory named exactly `notebooks`, step up
# one level so relative data paths and `src` imports resolve from its parent.
# The final path component is compared exactly: a plain suffix check would
# also match unrelated directories such as "old_notebooks".
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')
    print(f"Zmieniono katalog roboczy na: {os.getcwd()}")

# Make the (possibly updated) working directory importable. The membership
# test keeps repeated runs of this cell from appending duplicate entries.
working_dir = os.getcwd()
if working_dir not in sys.path:
    sys.path.append(working_dir)

Zmieniono katalog roboczy na: d:\$projects\python\master_thesis


### 2. Generating a Global Scaler

Before running the simulation, it is crucial to create a **global scaler** file (`.pkl`) to ensure consistent feature scaling across the entire project. This step guarantees that popularity weights and other features remain comparable across experiments.

In [13]:
# Steps 1-2: read the interim articles table and add a log1p-compressed
# citation column, so that a few very highly cited records do not dominate
# the range the scaler learns.
df_interim = pd.read_csv('data/interim/articles_with_score_df.csv').assign(
    n_citation_log=lambda frame: np.log1p(frame['n_citation'])
)

# Step 3: fit the min-max scaler on a plain ndarray whose columns are, in
# order, year, n_citation_log and gov_score.
global_feature_columns = ['year', 'n_citation_log', 'gov_score']
scaler = MinMaxScaler().fit(df_interim[global_feature_columns].to_numpy())

# Step 4: persist the fitted scaler in the models folder for later use.
os.makedirs('models', exist_ok=True)
joblib.dump(scaler, 'models/global_scaler.pkl')

print("Health Check - Scaler: Global Scaler saved to models/global_scaler.pkl")


Health Check - Scaler: Global Scaler saved to models/global_scaler.pkl


### 3. Declare Experiment Settings

Experiment settings are stored as a **list of dictionaries**. Each dictionary represents one configuration of parameters for the simulation, including dataset size, number of citations, and feature weights. This setup allows systematic testing of multiple combinations.

In [None]:
# NOTE: every line in this cell is commented out, so running it does nothing.
# It is a hand-written example of a settings list (dicts with the keys "N",
# "k" and "pn"); the following cell builds the settings with
# generate_all_settings() instead.
# # Example configurations (you may have 300+)
# settings = [
#     {"N": 10, "k": 5, "pn": [0.25, 0.25, 0.25, 0.25]}, # Equal distribution of weights
#     {"N": 10, "k": 5, "pn": [1.0, 0.0, 0.0, 0.0]},     # Only semantic similarity
#     {"N": 100, "k": 10, "pn": [0.0, 0.0, 1.0, 0.0]}    # Only citations (N=100)
# ]

# # Optionally, save the experiment settings to a file
# # This allows other modules or experiments to read the configurations consistently
# os.makedirs('data/external', exist_ok=True)
# pd.to_pickle(settings, 'data/external/settings.pkl')


In [9]:
# Project-local import; it relies on the path-setup cell having made the
# working directory importable (sys.path) before this cell runs.
from src.config.settings_generator import generate_all_settings

# Single source of truth for where the generated settings are stored.
SETTINGS_PATH = "data/external/settings.pkl"
settings = generate_all_settings()

# Derive the target directory from SETTINGS_PATH instead of repeating the
# literal, so the directory that is created always matches the file written.
os.makedirs(os.path.dirname(SETTINGS_PATH), exist_ok=True)
pd.to_pickle(settings, SETTINGS_PATH)

### 4. Partial Smoke Test of the Engine

Before running a full batch of tens of thousands of queries (the main step below processes 50,000 per session), it is recommended to perform a **smoke test**. This ensures that the `Experiment` class correctly initializes the `VirtualAggregator` and can process at least one query without errors.

In [11]:
# Single-query smoke test: build an Experiment from the persisted settings and
# push exactly one query (index 0) through it before committing to a long run.

from src.models.experiment import Experiment


# Load whatever settings are stored in the settings pickle (the complete file
# is read; no subset is selected here).
settings = pd.read_pickle("data/external/settings.pkl")

# NOTE(review): the original cell carried a trailing "test_mode=True" remark,
# but no such argument is passed - confirm whether that option exists.
exp = Experiment(settings)

# Run one query; the result is iterated below as a mapping from setting id to
# a counter-like object (it supports len() and most_common()).
single_result = exp.run_single_query(0)

for setting_id, picked_counter in single_result.items():
    report_lines = (
        f"--- Setting {setting_id} ---",
        f"Liczba wybranych unikalnych prac: {len(picked_counter)}",
        f"Top 3 najczęściej wybrane ID: {picked_counter.most_common(3)}",
        "\n",
    )
    # Joined with newlines this emits the same text as four separate prints.
    print(*report_lines, sep="\n")


2025-12-28 14:04:41,783 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-12-28 14:04:41,886 - INFO - Loading queries from data/interim/queries_with_embeddings.pkl


--- Setting 0 ---
Liczba wybranych unikalnych prac: 5
Top 3 najczęściej wybrane ID: [(np.str_('535037'), 1), (np.str_('745295'), 1), (np.str_('4315'), 1)]


--- Setting 1 ---
Liczba wybranych unikalnych prac: 5
Top 3 najczęściej wybrane ID: [(np.str_('387008'), 1), (np.str_('494046'), 1), (np.str_('841109'), 1)]


--- Setting 2 ---
Liczba wybranych unikalnych prac: 10
Top 3 najczęściej wybrane ID: [(np.str_('491641'), 1), (np.str_('464010'), 1), (np.str_('791746'), 1)]






### 5. Run main step

This step launches the full simulation. Thanks to the **built-in health check** in the `Experiment` class, the pipeline can automatically resume from the last saved query if the process was interrupted.

In [3]:
from src.models.experiment import Experiment

# How many queries to process in this session. The original note gave 40,000
# as an example; this run uses 50,000.
queries_this_session = 50000

# Build the orchestrator from the persisted experiment settings.
settings = pd.read_pickle("data/external/settings.pkl")
experiment_orchestrator = Experiment(settings)

# Per the original author's note, the class saves CSV checkpoints every 500
# queries; that interval is configured inside Experiment and is not visible in
# this notebook.
experiment_orchestrator.run_experiment(batch=queries_this_session)

print("Experiment batch finished.")


2025-12-31 05:59:07,447 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-12-31 05:59:07,595 - INFO - Starting experiment execution
2025-12-31 05:59:07,595 - INFO - Loading queries from data/interim/queries_with_embeddings.pkl
2025-12-31 06:03:44,851 - INFO - Skipping already processed queries: 800000
2025-12-31 06:03:44,851 - INFO - Processing query range: 800000–850000
Queries: 100%|██████████| 50000/50000 [2:03:53<00:00,  6.73it/s]   
2025-12-31 08:07:38,050 - INFO - Final result persistence


Experiment batch finished.
