In [8]:
import pandas as pd
import numpy as np
import os
import random
from tqdm import tqdm

## Load data

Queries generator will use set of four lists with static words:
1. Nouns
2. Verbs
3. Adjectives
4. Gerund (participles)

In [9]:
collections = ["nouns", "verbs", "adjectives", "participles", "gerounds"]
dfs = {}

for name in collections:
    column_name = name.capitalize()
    csv_path = f"../data/raw/{name}.csv"
    try:
        df = pd.read_csv(csv_path, header=None, names=[column_name])
        dfs[name] = df
    except FileNotFoundError:
        print(f"Nie można znaleźć pliku {csv_path}, pomijam...")
    except Exception as e:
        print(f"Wystąpił błąd podczas odczytu pliku {csv_path}: {e}")

nouns_df = dfs["nouns"]
verbs_df = dfs["verbs"]
adjectives_df = dfs["adjectives"]
participles_df = dfs["participles"]

Nie można znaleźć pliku ../data/raw/gerounds.csv, pomijam...


## Query generator

Sample code snippet to generate sentences that are candidate phrases for search queries. Which will be used in a search engine to find articles on a similar topic.

In [10]:
def query_generator(nouns, verbs, adjectives, participles, limit):
    generated_queries = set()
    
    while len(generated_queries) < limit:
        noun = random.choice(nouns)[0]
        verb = random.choice(verbs)[0]
        adjective = random.choice(adjectives)[0]
        participle = random.choice(participles)[0]
        
        query = f"{noun} {verb} {adjective} {participle}"
        
        if query not in generated_queries:
            generated_queries.add(query)
            yield query

### 1. Language Tool

First attempt to make queries more natrual with `Language Tool` library

Source: [Githube](https://github.com/Findus23/pyLanguagetool) </br>
Tutorial: [here](https://www.kaggle.com/code/yeoyunsianggeremie/how-to-use-language-tool-python-without-internet)

In [None]:
from language_tool_python import LanguageTool

def correct_sentence_lt(sentence: list, debug: bool = False) -> str:
    """
    Corrects a sentence using LanguageTool.

    Args:
        sentence (str): The sentence to correct.
        debug (bool, optional): A flag indicating whether to display matches, defaults to False.

    Returns:
        str: The corrected sentence.
    """
    lt = LanguageTool('en-US')
    matches = lt.check(sentence)

    if debug:
        display(matches)

    return lt.correct(sentence)

### 2. Gramformer

Second attempt to build more consistent sentences this time with `Gramformer`

Source: [Githube](https://github.com/thevkrant/gramformer) </br>
Tutorial: [here](https://www.vennify.ai/gramformer-correct-grammar-transformer-nlp/)

In [None]:
from gramformer import Gramformer
import torch

def set_seed(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    else:
        print(f"`Cuda` is unavailable")

def correct_sentence_gf(gf: Gramformer, sentence: str, debug = False) -> str:
    """
    Corrects a sentence using Gramformer.

    Args:
        gf (Gramformer): The Gramformer object to use for correction.
        sentence (str): The sentence to correct.
        debug (bool, optional): A flag indicating whether to display matches, defaults to False.

    Returns:
        str: The corrected sentence.
    """

    return gf.correct(sentence, max_candidates=1)[0]

### 1. Generate queries

In [11]:
queries = query_generator(nouns_df.values.tolist(), verbs_df.values.tolist(), adjectives_df.values.tolist(), participles_df.values.tolist(), 850000)

queries_df = pd.DataFrame({'query': queries})
display(len(queries_df['query'].unique()))

# TEST LIBRARIES FIXING SENTENCES
# First: dont work very well
# Second: Make sentences better

850000

In [13]:
queries_df.drop_duplicates(subset=['query'])
queries_df.to_csv('../data/queries_df.csv', index=False)

### 2. Generate embeddings for queries

We generate embeddings for raw queries to expedite the target experiment. By querying the vector database ChromaDB with pre-embedded content, we offload the burden of embedding the raw query received in the request before searching and returning the K most similar titles of scientific papers.

In [14]:
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch

In [22]:
queries_df = pd.read_csv('../data/queries_df.csv')

In [23]:
if torch.cuda.is_available():
    print("CUDA is available: ", torch.cuda.is_available())   
    print("Number of CUDA devices: ", torch.cuda.device_count())
    print("CUDA current device: ", torch.cuda.current_device())
    print("CUDA device name: ", torch.cuda.get_device_name(0))

    # Utwórz instancję modelu SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2", device='cuda')
else:
    model = SentenceTransformer("all-MiniLM-L6-v2", device='cpu')


titles = queries_df['query'].tolist()
batch_size = 2000

titles_embeddings = []

for i in tqdm(range(0, len(titles), batch_size), desc="Embedding titles"):
    batch = titles[i:i + batch_size]
    batch_embeddings = model.encode(batch)
    titles_embeddings.extend(batch_embeddings)

queries_df['embedding'] = titles_embeddings

CUDA is available:  True
Number of CUDA devices:  1
CUDA current device:  0
CUDA device name:  NVIDIA GeForce RTX 3060 Laptop GPU


Embedding titles: 100%|██████████████████████████████████████████████████████████████| 425/425 [02:47<00:00,  2.53it/s]


In [25]:
def convert_embedding_to_string(embedding):
    return str(embedding)

tqdm.pandas()
queries_df['embedding'] = queries_df['embedding'].progress_apply(convert_embedding_to_string)

display(queries_df.head())
display(len(queries_df))

100%|█████████████████████████████████████████████████████████████████████████| 850000/850000 [15:53<00:00, 891.47it/s]


Unnamed: 0,query,embedding
0,Graph Construct Encoded Reviewer,[-7.29738623e-02 5.73125109e-02 -6.29330575e-...
1,Analytics Promote Elastic Interpreter,[-1.60536561e-02 -9.04399843e-04 -8.83568376e-...
2,Token Recompile Derivative Architect,[-7.27694631e-02 -2.80314535e-02 3.56670655e-...
3,Script Execute Asynchronous Debugger,[-5.32670319e-02 5.11088921e-03 -6.76202029e-...
4,Scheduler Navigate Compiled Practitioner,[-4.12793793e-02 -1.11413710e-02 -3.63953374e-...


850000

In [26]:
chunk_size = 5000

queries_df.iloc[:0].to_csv('../data/queries_with_embedding.csv', index=False)

for i in tqdm(range(0, len(queries_df), chunk_size), desc="Saving progress"):
    df_chunk = queries_df.iloc[i:i+chunk_size]
    df_chunk.to_csv('../data/queries_with_embedding.csv', mode='a', header=False, index=False)

Saving progress: 100%|███████████████████████████████████████████████████████████████| 170/170 [01:34<00:00,  1.80it/s]
