In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

## Load data

In [64]:
# Wczytaj dane z pliku CSV
csv_file_path = "../data/interim//articles_with_score_df.csv"
df = pd.read_csv(csv_file_path)

### Prepare embedding

In [7]:
from sentence_transformers import SentenceTransformer, util
import torch

if torch.cuda.is_available():
    print("CUDA is available: ", torch.cuda.is_available())   
    print("Number of CUDA devices: ", torch.cuda.device_count())
    print("CUDA current device: ", torch.cuda.current_device())
    print("CUDA device name: ", torch.cuda.get_device_name(0))

    # Utwórz instancję modelu SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2", device='cuda')
else:
    model = SentenceTransformer("all-MiniLM-L6-v2", device='cpu')

# Zanurzenie tytułów
titles = df['title'].tolist()

titles_embeddings = []

for title in tqdm(titles):
    embedding = model.encode([title])
    titles_embeddings.append(embedding[0])

df['embedding'] = titles_embeddings

100%|███████████████████████████████████████████████████████████████████████| 850406/850406 [1:27:03<00:00, 162.79it/s]


## Load data to Vector DB (chroma)

In [3]:
import chromadb
from chromadb.config import Settings

chroma_client = chromadb.HttpClient(host="localhost", port=8000, settings=Settings(allow_reset=True, anonymized_telemetry=False))

collection_status = False
while collection_status != True:
    try:
        document_collection = chroma_client.get_or_create_collection(name="articles_with_score")
        collection_status = True
    except Exception as e:
        print(e)
        pass

In [37]:
batch_size = 10000
last_confirmed_id = 0

# Dzielimy DataFrame na tablice po 10 000 elementów
for batch_start in tqdm(range(0, df.shape[0], batch_size), desc='Batches', unit='batch'):
    batch_df = df.iloc[batch_start:batch_start + batch_size]

    # Pobieramy listę embeddingów, dokumentów, metadanych i id dla bieżącej partii
    batch_embeddings = batch_df['embedding'].apply(lambda x: x.tolist()).tolist()
    batch_documents = batch_df['title'].tolist()
    batch_metadatas = [{'year': row['year'], 'n_citation': row['n_citation'], 'gov_score': row['gov_score']} for index, row in batch_df.iterrows()]
    batch_ids = [str(index + 1) for index in batch_df.index]
    
    # Dodajemy partię danych do kolekcji
    document_collection.add(
        embeddings=batch_embeddings,
        documents=batch_documents,
        metadatas=batch_metadatas,
        ids=batch_ids
    )

    # Aktualizujemy last_confirmed_id
    last_confirmed_id = batch_df.index[-1] + 1

# Wypisz ostatnio potwierdzone ID
# print("Last confirmed ID:", last_confirmed_id)

# Sprawdzamy rozmiar kolekcji
try:
    collection_size = document_collection.count()
    print("Size of the collection:", collection_size)
except Exception as e:
    print("Failed to get collection size:", e)

Batches: 100%|██████████████████████████████████████████████████████████████████████| 86/86 [27:16<00:00, 19.03s/batch]


Last confirmed ID: 850406
Size of the collection: 850406


In [42]:
# Usuwamy dataset aby opróżnić pamięć RAM
del df

### Health check

In [71]:
if document_collection.count() == df.shape[0]:
    print("Correct size of the articles collection:", document_collection.count())
else:
    print("Data inconsistency detected!!!")

Correct size of the articles collection: 850406


## Word collection initialization

In this section collection of `verb`, `noun`, `adjectives` and `participles` were prepared. These will later be used to build random impressions and phrases. To achive this we will use the **spacy** library.

The previously mentioned collections will be stored in the form of a dictionary due to the indexing properties of this data structure

In [6]:
import spacy

dict_verb = {}
dict_noun = {}
dict_adj = {}
dict_ger = {}

# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

titles_data = document_collection.get()
title_df = pd.DataFrame({'title': titles_data['documents']})

spacy.require_gpu()
nlp = spacy.load('en_core_web_sm')

def process_titles(title):
    document = nlp(title)

    for token in document:
        # Ignore punctuation marks and whitespaces
        if not token.is_punct and not token.is_space:
            lemma = token.lemma_

            # 1. Imiesłowy czasownikowe:
            # - Imiesłowy bierny (czasownik bierny): Tag VBN (Past participle)
            # - Imiesłowy czynny (czasownik czynny): Tag VBG (Gerund or present participle)
            # 2. imiesłowy przymiotnikowe
            # - Imiesłowy bierny (przymiotnik bierny): VAPP (perfect participle, auxiliary)
            # - Imiesłowy czynny (przymiotnik czynny): VMPP (perfect participle, modal)
            # - VVPP (perfect participles)
            # 3. Imiesłowy przymiotnikowe:
            #    np "running shoes"
            if token.pos_ == "VERB" and token.tag_ in ["VBN", "VBG"]:
                dict_ger[lemma] = lemma
            elif token.pos_ == "VERB":
                dict_verb[lemma] = lemma
            elif token.pos_ == "ADJ" and token.tag_ in ["VAPP", "VMPP", "VVPP"]:
                dict_ger[lemma] = lemma
            elif token.pos_ == "ADJ":
                dict_adj[lemma] = lemma
            elif token.pos_ == "NOUN" and token.tag_ == "VBG":
                dict_ger[lemma] = lemma
            elif token.pos_ == "NOUN":
                dict_noun[lemma] = lemma

# Processing titles and building dictionaries
title_df['title'].progress_apply(process_titles)

# Summary
print("Verb dictionary:")
print(len(dict_verb))
print("\nDictionary of nouns:")
print(len(dict_noun))
print("\nDictionary of adjectives:")
print(len(dict_adj))
print("\nDictionary of participles:")
print(len(dict_ger))

100%|████████████████████████████████████████████████████████████████████████| 850406/850406 [6:07:14<00:00, 38.59it/s]

Verb dictionary:
6459

Dictionary of nouns:
49604

Dictionary of adjectives:
21932

Dictionary of participles:
6049





**Note**
Removing more than 41 666 records at once throw error:

Error adding noun collection: {"error":"ValueError('Cannot submit more than 41,666 embeddings at once. Please submit your embeddings in batches of size 41,666 or less.')"}

In [10]:
from tqdm import tqdm

collections = {
    "verbs": dict_verb,
    "noun": dict_noun,
    "adjectives": dict_adj,
    "gerounds": dict_ger
}

for collection_name, collection_data in collections.items():
    batch_size = 1000
    last_confirmed_id = 0

    print(collection_name)
    try:
        connection = chroma_client.get_or_create_collection(name=collection_name)

        # Clear collection
        if connection.count() > 0:
            print(connection.count())
            ids = connection.get()['ids']
            for start_idx in range(0, len(ids), batch_size * 10):
                connection.delete(ids[start_idx:start_idx + batch_size * 10])

        # Add new data
        words_collection = list(collection_data.keys())
        for start_idx in tqdm(range(0, len(words_collection), batch_size), desc=collection_name, unit='batch'):
            collection_element = words_collection[start_idx:start_idx + batch_size]
            connection.add(documents=collection_element, ids=collection_element)
    except Exception as e:
        print(f"Error adding {collection_name} collection: {e}")

verbs


verbs: 100%|██████████████████████████████████████████████████████████████████████████| 7/7 [02:59<00:00, 25.65s/batch]


noun


noun: 100%|█████████████████████████████████████████████████████████████████████████| 50/50 [23:19<00:00, 27.98s/batch]


adjectives


adjectives: 100%|███████████████████████████████████████████████████████████████████| 22/22 [10:13<00:00, 27.89s/batch]


gerounds


gerounds: 100%|███████████████████████████████████████████████████████████████████████| 7/7 [02:56<00:00, 25.18s/batch]


In [13]:
### TEST COLLECTION READ
verbs_connection = chroma_client.get_or_create_collection(name="verbs")
display(verbs_connection.get()['documents'])

['--in',
 '-PAM',
 '-approach',
 '-assignment',
 '-base',
 '-binomial',
 '-conjugate',
 '-constraine',
 '-design',
 '-fold',
 '-from',
 '-hard',
 '-hypermodule',
 '-induce',
 '-linear',
 '-linke',
 '-opt',
 '-oriente',
 '-pairwise',
 '-perfect',
 '-pseudo',
 '-term',
 '-wise',
 '103lr',
 '1839–1910',
 '1T1C',
 '2T2C',
 '2d',
 '3)^{\\ast}$',
 '3D',
 '3d',
 '3did',
 '3gpp',
 '4BOK',
 '4RTD',
 '=',
 '@trust',
 'ABINIT',
 'ACSE',
 'ADOPEL',
 'AGGLOMERATIVE',
 'ALGORITHM',
 'ASSIGNMENT',
 'ATGC',
 'ATTENTIONAL',
 'AUDIO',
 'AWGN',
 'AssociatesISBN',
 'AutoGraphiX',
 'BIND',
 'BLAST',
 'BODB',
 'BPEL',
 'BROADCAST',
 'BURST',
 'BioBIKE',
 'BioSAVE',
 'Burst',
 'CARSA',
 'CATASTROPHIC',
 'CEREVISIAE',
 'CHEBYSHEV',
 'CIFEr',
 'CIRCULANT',
 'CLONE',
 'CMMI',
 'COEFFICIENT',
 'COLONY',
 'COLOR',
 'COMPLEX',
 'CONSTANT',
 'CONTRAST',
 'CONTROLLER',
 'COULD',
 'CREATIVITY',
 'CRITERIA',
 'CRYPTOGRAPHIC',
 'CUBE',
 'CURVE',
 'CaIrO3',
 'Canonical-',
 'CircaDB',
 'Cl',
 'Cost',
 'D0C',
 'D2/3',
 'D

In [None]:
# Cuda SPACY Help Links
# https://learn.microsoft.com/en-us/answers/questions/1398254/installed-visual-studio-build-tools-but-cannot-fin
# https://stackoverflow.com/questions/70840683/installed-visual-studio-2022-but-cl-is-not-recognized-as-an-internal-or-extern
# https://stackoverflow.com/questions/73961872/cupy-cuda-failed-to-import-cupy
# https://stackoverflow.com/questions/75355264/how-to-enable-cuda-gpu-acceleration-for-spacy-on-windows