In [2]:
import pandas as pd

#!{sys.executable} -m pip install "numpy<2.0"
import numpy as np
import os
import sys

# !{sys.executable} -m pip install tqdm
from tqdm import tqdm


# !{sys.executable} -m pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util

# !{sys.executable} -m pip install torch
import torch


!{sys.executable} -m pip install chromadb
import chromadb



  from .autonotebook import tqdm as notebook_tqdm




In [3]:
# Print the installed NumPy version.
#
# NOTE(review): this cell originally warned that ChromaDB requires NumPy < 2.0.
# The saved output shows NumPy 2.2.6, and the Chroma check further down imports
# chromadb 1.4.0 without error, so that constraint appears outdated — confirm
# against the ChromaDB release in use before relying on it.
print(np.__version__)

2.2.6


In [4]:
# Smoke-test sentence-transformers: load the "all-MiniLM-L6-v2" model, encode a
# single string, and print the shape of the returned embedding.
model = SentenceTransformer("all-MiniLM-L6-v2")
emb = model.encode("test")
print(emb.shape)

(384,)


In [5]:
# Smoke-test torch: print the installed version and whether torch reports a
# usable CUDA device.
print(torch.__version__)
print("CUDA available:", torch.cuda.is_available())

2.9.1+cpu
CUDA available: False


In [6]:
# Smoke-test ChromaDB: the import succeeded if this prints the installed version.
print("Chroma OK:", chromadb.__version__)

Chroma OK: 1.4.0


### 1. Prepare paths and working directory

Ensure the project root is the working directory so that relative paths
and imports from the `src` package behave consistently with .py scripts.

In [7]:
def ensure_project_root():
    """Make the project root the working directory and put it on ``sys.path``.

    If the current working directory is a folder named exactly ``notebooks``,
    step up one level and report the new location. The final path component is
    compared rather than the string suffix, so a directory such as
    ``old_notebooks`` is not mistaken for the notebooks folder.

    Afterwards the (possibly updated) working directory is appended to
    ``sys.path`` if it is not already present. Calling this more than once is
    harmless.
    """
    if os.path.basename(os.getcwd()) == 'notebooks':
        os.chdir('..')
        print(f"Zmieniono katalog roboczy na: {os.getcwd()}")

    # Re-read the cwd here: it may have just changed above.
    if os.getcwd() not in sys.path:
        sys.path.append(os.getcwd())


ensure_project_root()

Zmieniono katalog roboczy na: d:\$projects\python\master_thesis


### 2. Unit tests

#### a) Small Embedding Batch

Instead of processing the full dataset, we start with a small subset of 100 articles to validate the embedding pipeline.
This test ensures that:

- the Sentence Transformer model loads correctly,

- embeddings are generated with the expected dimensionality,

- the output `.pkl` file is saved properly and can be reloaded.

Running this lightweight test helps catch configuration or environment issues early, before launching large-scale embedding generation.

In [13]:
# Project feature-building entry point; the import relies on the project root
# being on sys.path (set up in section 1).
from src.features.build_features import FeatureBuilder

# Name of the separate ChromaDB collection used by the test cells, so the test
# data stays out of the main collection.
CHROMA_TEST_COLLECTION = "test_collection"

In [9]:
# Paths for the small-batch test. The sample CSV and the resulting pickle are
# written under data/interim/test, separate from the full-run outputs.
test_input = "data/interim/articles_with_score_df.csv"
test_sample_path = "data/interim/test/test_sample.csv"
test_output = "data/interim/test/titles_embeddings.pkl"

# to_csv does not create missing directories, so make sure the test folder exists.
os.makedirs(os.path.dirname(test_sample_path), exist_ok=True)

# Read only the first 100 rows instead of parsing the whole file and discarding
# the rest.
df_sample = pd.read_csv(test_input, nrows=100)
df_sample.to_csv(test_sample_path, index=False)

# Run embedding generation on the sample only.
builder = FeatureBuilder()
builder.create_article_embeddings(
    input_path=test_sample_path,
    output_path=test_output,
    batch_size=32
)

# Verification: reload the pickle, report the length of the first embedding
# vector, and show a few rows.
check_df = pd.read_pickle(test_output)
print(f"Embedding dimensionality: {len(check_df.iloc[0]['embedding'])}")
display(check_df.head(3))

2025-12-28 11:28:56,061 - INFO - Initializing model 'all-MiniLM-L6-v2' on device: cpu
2025-12-28 11:28:56,063 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-12-28 11:28:57,977 - INFO - Loading articles from data/interim/test/test_sample.csv
2025-12-28 11:28:57,996 - INFO - Encoding 100 article titles (batch_size=32)
Batches: 100%|██████████| 4/4 [00:00<00:00, 20.20it/s]
2025-12-28 11:28:58,205 - INFO - Article embeddings saved to data/interim/test/titles_embeddings.pkl


Embedding dimensionality: 384


Unnamed: 0,id,title,year,references,authors,n_citation,venue,gov_score,embedding
0,00dc2bba-3237-4d4e-b541-1205b97df981,Software Evolution through Transformations.,2003,[],"['Reiko Heckel', 'Tom Mens', 'Michel Wermeling...",50,Electronic Notes in Theoretical Computer Science,40,"[-0.060643066, 0.0054728007, -0.020285834, -0...."
1,019a51de-5136-41cb-81fa-ed659c915913,Context Dependent Automatic Textile Image Anno...,2007,[],"['Yosuke Furukawa', 'Yusuke Kamoi', 'Tatsuya S...",50,Journal of Advanced Computational Intelligence...,20,"[-0.019282764, 0.074676365, -0.0070751593, 0.0..."
2,03385413-7c3a-44ce-b40c-58ff21304270,On the signed total chromatic number of a graph.,2006,[],['Michael A. Henning'],7,Ars Combinatoria,40,"[0.037024684, 0.019165784, 0.025710292, -0.053..."


#### b) Test ChromaDB (Test Collection)

Verify whether the data is correctly stored in the vector database. A separate collection name (`test_collection`) is used to avoid polluting the main database.

In [10]:
# Ingest the sample embeddings into a dedicated on-disk ChromaDB store.
chroma_test_dir = "data/chroma_test"
builder.load_to_chroma(
    df_path=test_output,
    chroma_path=chroma_test_dir,
    collection_name=CHROMA_TEST_COLLECTION,
)

# Open the same store directly and report how many records the test collection
# holds. chromadb is already imported in the notebook's first cell.
client = chromadb.PersistentClient(path=chroma_test_dir)
col = client.get_collection(CHROMA_TEST_COLLECTION)
record_count = col.count()
print(f"Number of records in the test database: {record_count}")

2025-12-28 11:31:01,905 - INFO - Initializing ChromaDB at data/chroma_test
2025-12-28 11:31:01,940 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-12-28 11:31:02,159 - INFO - Uploading 100 records to ChromaDB
Chroma Upload: 100%|██████████| 1/1 [00:00<00:00,  7.36it/s]
2025-12-28 11:31:02,302 - INFO - ChromaDB ingestion completed


Number of records in the test database: 100


#### c) Test Query Generation

Check whether the mechanism for randomly selecting words from categories works correctly and can generate unique phrases.

In [None]:
# Generate only 10 queries instead of 850,000
# `limit=10` caps the number of generated queries; `raw_data_dir` points at the
# directory holding the raw inputs used to build them.
test_queries_df = builder.generate_queries(raw_data_dir="data/raw", limit=10)

# Show the generated queries for a visual check.
print("Generated test queries:")
display(test_queries_df)

In [14]:
# Verify that query processing (query embeddings) runs without errors
#
# NOTE(review): unlike the generate_queries(..., limit=10) cell above, this call
# passes no limit. The saved log for this cell shows 850000 queries generated and
# 53125 batches encoded in about 15 minutes, so this "test" is a full-size run
# whose result is written to the test path. Check whether process_queries
# accepts a size cap before treating this as a lightweight check.
builder.process_queries(
    raw_dir="data/raw", 
    output_path="data/interim/test/test_queries.pkl", 
    batch_size=16
)


2025-12-28 11:36:35,465 - INFO - Generating synthetic queries
Query Generation: 100%|██████████| 10/10 [00:00<00:00, 18666.24it/s]

Generated test queries:





Unnamed: 0,query
0,Container Compress Clustered Manager
1,Linker Merge Diagnostic Operator
2,Key Distribute Distributed Gateway
3,Template Compress Enterprise Reducer
4,Algorithm Optimize Decentralized Orchestrator
5,Source Override Direct Architect
6,Anomaly Override Lightweight Supporter
7,Repository Replace Deployed Client
8,Spectrum Develop Feature-rich Tester
9,Algorithm Overload Optimized Originator


2025-12-28 11:36:35,479 - INFO - Generating synthetic queries
Query Generation: 100%|██████████| 850000/850000 [00:01<00:00, 614637.44it/s]
2025-12-28 11:36:37,066 - INFO - Encoding query embeddings
Batches: 100%|██████████| 53125/53125 [15:21<00:00, 57.66it/s]
2025-12-28 11:52:09,425 - INFO - Query embeddings saved to data/interim/test/test_queries.pkl


### 3. Run main step

**Article Embeddings.** The `create_article_embeddings` method converts all article titles into vector embeddings suitable for semantic search. This is performed on the full dataset (~850,000 records).

**ChromaDB Ingestion.** The resulting embeddings are loaded into a ChromaDB persistent collection, enabling fast vector similarity searches.

**Query Generation and Embedding.** Queries are generated from raw word lists and converted to embeddings using the same model as for articles. The output is stored as a pickle file for later use in experiments.


In [None]:
# FeatureBuilder is imported again here so the main-run section can be executed
# without first running the test cells.
from src.features.build_features import FeatureBuilder

# Input CSV passed to create_article_embeddings.
ARTICLES_INTERIM = "data/interim/articles_with_score_df.csv"
# Pickle written by create_article_embeddings and read by load_to_chroma and
# validate_collection.
TITLES_PICKLE = "data/interim/titles_with_embeddings.pkl"
# Directory of the main ChromaDB store.
CHROMA_DIR = "data/chroma"
# Directory handed to process_queries as the source of the raw inputs.
RAW_WORDS_DIR = "data/raw"
# Pickle that process_queries writes the query embeddings to.
QUERIES_PICKLE = "data/interim/queries_with_embeddings.pkl"

In [None]:
# Fresh FeatureBuilder for the full run; this rebinds the `builder` name that the
# test cells also use.
builder = FeatureBuilder()

In [None]:
# 1. Article Embeddings (for 850k records)
# Reads the articles CSV at ARTICLES_INTERIM and writes the embeddings to
# TITLES_PICKLE.
builder.create_article_embeddings(ARTICLES_INTERIM, TITLES_PICKLE)

In [9]:
# 2. Load embeddings into ChromaDB
# No collection name is passed here, so the builder's default is used.
#
# NOTE(review): the saved output under this cell also contains model
# initialisation and query-generation/encoding log lines, which presumably come
# from other steps — the output looks like it was captured from a different cell
# arrangement. Re-run the section top to bottom to refresh it.
builder.load_to_chroma(TITLES_PICKLE, CHROMA_DIR)

2025-12-28 12:34:40,902 - INFO - Initializing model 'all-MiniLM-L6-v2' on device: cpu
2025-12-28 12:34:40,904 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-12-28 12:34:42,864 - INFO - Initializing ChromaDB at data/chroma
2025-12-28 12:34:46,932 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-12-28 12:34:47,135 - INFO - Uploading 850406 records to ChromaDB
Chroma Upload: 100%|██████████| 171/171 [13:34<00:00,  4.76s/it]
2025-12-28 12:48:21,479 - INFO - ChromaDB ingestion completed
2025-12-28 12:48:22,727 - INFO - Generating synthetic queries
Query Generation: 100%|██████████| 850000/850000 [00:01<00:00, 623658.43it/s]
2025-12-28 12:48:24,321 - INFO - Encoding query embeddings
Batches: 100%|██████████| 6641/6641 [10:44<00:00, 10.30it/s]
2025-12-28 12:59:19,270 - INFO - Query embeddings saved to data/interim/queries_with_embeddings.pkl


In [None]:
# 3. Generate queries and create embeddings for all 850k queries
# The raw inputs are taken from RAW_WORDS_DIR and the result is written to
# QUERIES_PICKLE.
builder.process_queries(RAW_WORDS_DIR, QUERIES_PICKLE)

#### Health check

In [10]:
# Ask the builder to validate the main ChromaDB collection against the
# embeddings pickle, then report the outcome in one place.
is_valid = builder.validate_collection(TITLES_PICKLE, CHROMA_DIR)

status_message = (
    " ✅ Health check passed: You can safely proceed with experiments."
    if is_valid
    else " ❌ WARNING: Data integrity issue detected in ChromaDB. Please investigate before running experiments!"
)
print(status_message)


2025-12-28 12:59:19,444 - INFO - Checking integrity for collection: articles_with_score
2025-12-28 12:59:25,149 - INFO - HEALTH CHECK PASSED: Collection size (850406) matches DataFrame.


 ✅ Health check passed: You can safely proceed with experiments.
