In [20]:
import pandas as pd
import numpy as np
import os
import random
from tqdm import tqdm

## Load data

The query generator uses a set of four static word lists:
1. Nouns
2. Verbs
3. Adjectives
4. Gerund (participles)

In [62]:
# Word lists to load; each one is read from ../data/raw/<name>.csv as a single
# headerless column.
# NOTE(review): "gerounds" has no matching CSV (the loader reports it as missing)
# and looks like a misspelling of "gerunds" - confirm whether it is still needed.
collections = ["nouns", "verbs", "adjectives", "participles", "gerounds"]
# Lists the cells below index directly, so they must have loaded successfully.
required_collections = ["nouns", "verbs", "adjectives", "participles"]
dfs = {}

for name in collections:
    column_name = name.capitalize()
    csv_path = f"../data/raw/{name}.csv"
    try:
        # The files have no header row, so the column is named after the list.
        df = pd.read_csv(csv_path, header=None, names=[column_name])
        dfs[name] = df
    except FileNotFoundError:
        print(f"Nie można znaleźć pliku {csv_path}, pomijam...")
    except Exception as e:
        print(f"Wystąpił błąd podczas odczytu pliku {csv_path}: {e}")

# Loading is best-effort per file, but a missing required list would otherwise
# only surface as a bare KeyError on the lookups below; fail with the full list
# of what is missing instead.
missing = [name for name in required_collections if name not in dfs]
if missing:
    raise KeyError(f"Required word lists could not be loaded: {', '.join(missing)}")

nouns_df = dfs["nouns"]
verbs_df = dfs["verbs"]
adjectives_df = dfs["adjectives"]
participles_df = dfs["participles"]

Nie można znaleźć pliku ../data/raw/gerounds.csv, pomijam...


## Query generator

Sample code snippet that generates sentences to serve as candidate phrases for search queries. These queries will be used in a search engine to find articles on a similar topic.

In [57]:
def query_generator(nouns, verbs, adjectives, participles, limit):
    """
    Lazily yield randomly assembled candidate search queries.

    For every query one random entry is drawn from each word list, the first
    element of that entry is taken as the word, and the four words are placed
    into a fixed question template.

    Args:
        nouns: Sequence of entries whose first element is a noun.
        verbs: Sequence of entries whose first element is a verb.
        adjectives: Sequence of entries whose first element is an adjective.
        participles: Sequence of entries whose first element is a participle.
        limit (int): Number of queries to yield.

    Yields:
        str: A query of the form "Is <noun> <verb> <adjective> <participle>?".
    """
    word_pools = (nouns, verbs, adjectives, participles)

    for _ in range(limit):
        # Pools are sampled in a fixed order: noun, verb, adjective, participle.
        noun, verb, adjective, participle = [
            random.choice(pool)[0] for pool in word_pools
        ]
        yield f"Is {noun} {verb} {adjective} {participle}?"

### 1. Language Tool

First attempt to make the queries more natural, using the `Language Tool` library.

Source: [GitHub](https://github.com/Findus23/pyLanguagetool) <br>
Tutorial: [here](https://www.kaggle.com/code/yeoyunsianggeremie/how-to-use-language-tool-python-without-internet)

In [72]:
from language_tool_python import LanguageTool

def correct_sentence_lt(sentence: str, debug: bool = False, tool=None) -> str:
    """
    Corrects a sentence using LanguageTool.

    Args:
        sentence (str): The sentence to correct.
        debug (bool, optional): When True, display the matches LanguageTool
            reports for the sentence. Defaults to False.
        tool (LanguageTool, optional): An existing LanguageTool instance to
            reuse, which avoids constructing a new tool for every sentence.
            When omitted, a temporary 'en-US' instance is created for this
            call and closed before returning.

    Returns:
        str: The corrected sentence.
    """
    # Only close a tool this call created; a caller-supplied tool stays open.
    owns_tool = tool is None
    if owns_tool:
        tool = LanguageTool('en-US')

    try:
        if debug:
            # The matches are only needed for inspection; `correct` does not
            # take them as input, so skip the extra check otherwise.
            display(tool.check(sentence))
        return tool.correct(sentence)
    finally:
        if owns_tool:
            tool.close()

### 2. Gramformer

Second attempt to build more consistent sentences, this time with `Gramformer`.

Source: [GitHub](https://github.com/thevkrant/gramformer) <br>
Tutorial: [here](https://www.vennify.ai/gramformer-correct-grammar-transformer-nlp/)

In [None]:
from gramformer import Gramformer
import torch

def set_seed(seed):
    """
    Seed PyTorch's random number generators.

    The default generator is always seeded. When CUDA is available the CUDA
    generators on all devices are seeded too; otherwise a notice is printed.

    Args:
        seed (int): Seed value passed to PyTorch.
    """
    torch.manual_seed(seed)

    # Guard clause: without CUDA there is nothing more to seed.
    if not torch.cuda.is_available():
        print("`Cuda` is unavailable")
        return

    torch.cuda.manual_seed_all(seed)

def correct_sentence_gf(gf: Gramformer, sentence: str, debug = False) -> str:
    """
    Return Gramformer's first correction candidate for a sentence.

    Args:
        gf (Gramformer): The Gramformer object that performs the correction.
        sentence (str): The text to correct.
        debug (bool, optional): Currently unused by this function. Defaults to False.

    Returns:
        str: The first item of the result of ``gf.correct`` when a single
        candidate is requested.
        NOTE(review): the exact shape of that item is decided by the installed
        Gramformer version - confirm it is a plain string.
    """
    candidates = gf.correct(sentence, max_candidates=1)
    # A single candidate was requested, so take the first entry.
    return candidates[0]

In [66]:
# Number of candidate queries to draw. Draws are independent, so the same
# sentence can come up more than once and the distinct count may be lower.
N_QUERIES = 850_000
# Seed Python's RNG so that re-running the notebook regenerates the same queries.
QUERY_SEED = 1212
random.seed(QUERY_SEED)

queries = query_generator(
    nouns_df.values.tolist(),
    verbs_df.values.tolist(),
    adjectives_df.values.tolist(),
    participles_df.values.tolist(),
    N_QUERIES,
)

queries_df = pd.DataFrame({'Query': queries})
# Show how many of the generated queries are distinct.
display(queries_df['Query'].nunique())

# CODE TO TEST THE SENTENCE-FIXING LIBRARIES
# First (LanguageTool): does not work very well
# Second (Gramformer): makes the sentences better
#
# for query in queries:
#     print(query)

#     if True:
#         lt = LanguageTool('en-US')
#         correct_sentence_lt(lt, query)
#     else:
#         gf = Gramformer(models = 1, use_gpu=True)
#         correct_sentence_gf(gf, query)

847796

In [69]:
# Save the generated sentences without repeats.
# `drop_duplicates` returns a new frame rather than modifying in place, so the
# result has to be assigned back; otherwise the duplicates are still written.
queries_df = queries_df.drop_duplicates(subset=['Query']).reset_index(drop=True)
queries_df.to_csv('../data/queries_df.csv', index=False)

### Improve the sentence test

In [None]:
# Gramformer makes the sentences better, but correcting the whole set takes
# too long (about 160 h).
corrected_queries = []

# Position of the query in `queries_df` -> corrected sentence.
index_map = {}

set_seed(1212)
gf = Gramformer(models=1, use_gpu=True)

for i, query in enumerate(tqdm(queries_df['Query'])):
    correct_query = correct_sentence_gf(gf, query)
    # The helper is annotated as returning `str`; indexing a plain string with
    # [0] would keep only its first character. Unwrap the first item only when
    # the result is a container (e.g. a (sentence, score) pair).
    if not isinstance(correct_query, str):
        correct_query = correct_query[0]
    corrected_queries.append(correct_query)
    index_map[i] = correct_query

gf_query = pd.DataFrame(corrected_queries, columns=['Query'])
gf_query.to_csv('../data/queries_gramformer_df.csv', index=False)