In [1]:
# Restart the session after this cell to avoid Google Colab errors
!pip install --upgrade --force-reinstall numpy==1.26.4 pandas

[0mCollecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting python-dateutil>=2.8.2 (from pandas)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas)
  Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Using cached pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
Using cached pytz-20

In [2]:
!pip install pybibx
!pip install tabulate tqdm

[0m

In [3]:
# Download .bib file
#!wget https://github.com/Valdecy/pyBibX/raw/main/assets/bibs/scopus.bib

In [4]:
# Required Libraries
import textwrap

from pybibx.base import pbx_probe
from tabulate import tabulate

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the Scopus .bib export into a pyBibX probe object.
# pbx_probe arguments: file_bib = 'filename.bib'; db = 'scopus', 'wos', 'pubmed'; del_duplicated = True, False
# `bibfile` is reused by later cells (health_bib(), .data).
file_name = 'dados/teste_scopus.bib'
database  = 'scopus'
bibfile   = pbx_probe(file_bib = file_name, db = database, del_duplicated = True)

A Total of 207 Documents were Found ( 207 Documents and 0 Duplicates )

Article = 158
Book chapter = 7
Conference paper = 13
Data paper = 1
Editorial = 1
Erratum = 1
Note = 2
Review = 23
Short survey = 1


In [6]:
# Health Analysis
health = bibfile.health_bib()

# Check Health
health

Unnamed: 0,Entries,Completeness (%),Number of Docs
0,Sources,100.00%,207
1,Abstracts,98.07%,203
2,Affiliation,100.00%,207
3,Author(s),100.00%,207
4,DOI,98.55%,204
5,Keywords - Authors,89.86%,186
6,Keywords - Plus,72.46%,150
7,References,0.00%,0
8,Year,100.00%,207


In [7]:
# Sanity check: the parsed records expose an 'abstract' column, which the classifier below reads.
print(bibfile.data['abstract'].head(2))

0    Electric vehicles (EVs) rely heavily on lithiu...
1    Technology integration in education holds prom...
Name: abstract, dtype: object


In [8]:
!pip install pybtex
!pip install bibtexparser

[0m

In [9]:
import pandas as pd
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase


In [None]:
import time
import os
import traceback
import random
from multiprocessing import Pool, cpu_count
from functools import partial
import pandas as pd
import ollama

# === CONFIGURATION ===
# Throttling: SECONDS_BETWEEN_REQUESTS is the base delay each worker sleeps before an LLM call.
MAX_REQUESTS_PER_MINUTE = 240
SECONDS_BETWEEN_REQUESTS = 60 / MAX_REQUESTS_PER_MINUTE
# Number of abstracts packed into a single prompt.
BATCH_SIZE = 1
# Worker processes for the multiprocessing pool (capped at 4).
WORKERS = min(4, cpu_count())
MODEL = "gemma3:27b"  # Set this to the local Ollama model you want to use
# NOTE(review): MODELS is not referenced by any function visible in this notebook — confirm whether it is still needed.
MODELS = ["gemma3:4b", "deepseek-r1:1.5b"]
# Sampling temperature forwarded to Ollama through the `options` dict.
TEMPERATURE = 0.2
# Checkpoint CSV rewritten after every batch, and the append-only execution log.
RESULT_CSV_PATH = "temp_files/resultados_parciais.csv"
LOG_PATH = "temp_files/log_execucao.txt"
# Screening question placed at the top of every prompt.
QUERY = (
    "Does this abstract discuss artificial intelligence in feedback for learning management systems on education?"
)

# === UTILITÁRIOS ===
def log(text: str) -> None:
    """
    Append a timestamped line to the execution log and echo it to stdout.

    The log file lives at the module-level ``LOG_PATH``. Its parent directory is
    created on demand, so the very first call works on a fresh checkout instead
    of failing with ``FileNotFoundError``.

    Args:
        text: Message to record; it is prefixed with ``YYYY-MM-DD HH:MM:SS - ``.
    """
    line = f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {text}"

    # os.makedirs("") raises, so only create a directory when the path has one.
    log_dir = os.path.dirname(LOG_PATH)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    with open(LOG_PATH, "a", encoding="utf-8") as f:
        f.write(line + "\n")
    print(line)


def chunk_dataframe(df, batch_size: int):
    """
    Lazily split ``df`` into consecutive positional slices.

    Yields:
        ``(chunk, start)`` tuples, where ``chunk`` holds at most ``batch_size``
        rows and ``start`` is the position of its first row within ``df``.
    """
    offsets = range(0, len(df), batch_size)
    yield from ((df.iloc[begin : begin + batch_size], begin) for begin in offsets)

# === FUNÇÃO PARA CHAMAR O LLM LOCAL ===
def call_local_llm(messages, model: str, temperature=None):
    """
    Send a chat conversation to a local model through Ollama and return the reply text.

    Args:
        messages: Chat messages (dicts with "role" and "content") passed unchanged
            to ``ollama.chat``.
        model: Ollama model tag to query.
        temperature: Sampling temperature for this call. When ``None`` (the
            default) the module-level ``TEMPERATURE`` is used. The parameter exists
            because the batch processor calls this function with
            ``temperature=...``; without it every call raised ``TypeError``.

    Returns:
        The reply content with surrounding whitespace removed, or ``""`` when the
        response carries no content.
    """
    if temperature is None:
        temperature = TEMPERATURE

    response = ollama.chat(
        model=model,
        messages=messages,
        options={"temperature": temperature},
        stream=False
    )
    # Guard against a missing content field so callers get "" rather than an AttributeError.
    content = response.message.content
    return (content or "").strip()

# === FUNÇÃO DE PROCESSAMENTO DE LOTE COM RETRIES ===
def process_batch_with_retry(
    batch_df, global_index, query=QUERY, model=MODEL, retry_limit=5
):
    """
    Classify every abstract of one batch with the local LLM, retrying on failure.

    Args:
        batch_df: DataFrame slice whose rows carry an 'abstract' column.
        global_index: Identifier of the batch, used only in log messages.
        query: Yes/no question placed at the top of the prompt.
        model: Ollama model tag; the text before ':' names the result column
            (``relevant_<model>``).
        retry_limit: Maximum number of attempts.

    Returns:
        A list with one dict per row of ``batch_df`` (the row's fields plus the
        boolean ``relevant_<model>`` flag), or ``[]`` when every attempt failed.
        A reply that does not contain exactly one yes/no verdict per abstract
        counts as a failed attempt, so rows are never silently dropped or
        paired with the wrong answer.
    """
    delay = SECONDS_BETWEEN_REQUESTS + random.uniform(0, 5)

    for attempt in range(1, retry_limit + 1):
        try:
            time.sleep(delay)
            coluna = "relevant_" + model.split(":")[0]

            # Build the conversation for the LLM.
            messages = [
                {"role": "system", "content": (
                    "You are a research assistant who helps analyze scientific articles."
                )}
            ]
            prompt = f"{query}\n\nRestrict yourself to answering the question with exclusively 'yes' or 'no'.\n\n"
            if len(batch_df) > 1:
                # The parser expects one verdict per line, so say so explicitly.
                prompt += "Give one answer per line, in the same order as the abstracts.\n\n"

            # Number abstracts by position inside the batch, not by index label.
            for position, (_, row) in enumerate(batch_df.iterrows(), start=1):
                prompt += f"Abstract {position}:\n{row['abstract']}\n\n"

            messages.append({"role": "user", "content": prompt})

            # call_local_llm already applies TEMPERATURE through its options dict.
            content = call_local_llm(messages, model=model)
            verdicts = _parse_yes_no_answers(content, expected=len(batch_df))

            results = []
            for verdict, (_, row) in zip(verdicts, batch_df.iterrows()):
                result = row.to_dict()
                result[coluna] = verdict
                results.append(result)

            log(f"Lote {global_index} processado com sucesso (temp={TEMPERATURE}).")
            return results

        except Exception as e:
            log(f"[ERRO] Lote {global_index}, tentativa {attempt}: {e}")
            # Exponential backoff with jitter; pointless after the final attempt.
            if attempt < retry_limit:
                time.sleep(2 ** attempt + random.uniform(0, 1))

    log(f"[FALHA] Lote {global_index} excedeu o limite de tentativas.")
    return []


def _parse_yes_no_answers(content, expected):
    """
    Extract exactly ``expected`` yes/no verdicts from an LLM reply.

    Text up to the last ``</think>`` tag is discarded. Each remaining line counts
    as a verdict when it starts with "yes" or "no", optionally preceded by
    markup or a label such as "1." or "Abstract 2:". Other lines are ignored.

    Returns:
        A list of booleans (True for "yes") in reply order.

    Raises:
        ValueError: if the number of verdicts found differs from ``expected``.
    """
    import re

    if "</think>" in content:
        content = content.rsplit("</think>", 1)[1]

    pattern = re.compile(r"^\W*(?:abstract\s*\d+\W*|\d+\W+)?(yes|no)\b", re.IGNORECASE)
    verdicts = []
    for line in content.splitlines():
        match = pattern.match(line)
        if match:
            verdicts.append(match.group(1).lower() == "yes")

    if len(verdicts) != expected:
        raise ValueError(
            f"expected {expected} yes/no answer(s), got {len(verdicts)}: {content[:200]!r}"
        )
    return verdicts

# === WRAPPER PARA MULTIPROCESSING ===
def process_args_wrapper(args, query, model):
    """
    Adapter for the worker pool: expand one packed work item.

    ``args`` is the positional argument tuple for ``process_batch_with_retry``;
    ``query`` and ``model`` are forwarded as keyword arguments.
    """
    shared_kwargs = {"query": query, "model": model}
    return process_batch_with_retry(*args, **shared_kwargs)

# === FUNÇÃO PRINCIPAL ===
def analyze_abstracts_parallel(df: pd.DataFrame, query=QUERY, model=MODEL,
                                batch_size=BATCH_SIZE, workers=WORKERS):
    """
    Classify every abstract in ``df`` using a pool of worker processes.

    Progress is checkpointed to ``RESULT_CSV_PATH`` after every batch. If that
    file already exists, the run resumes at row ``len(checkpoint)``. For that
    offset to be valid the checkpoint must always hold the first N rows of
    ``df`` in order, so:

    * results are consumed with the ordered ``Pool.imap`` instead of
      ``imap_unordered``;
    * rows of a batch that came back without a verdict (all retries failed) are
      still written, with the relevance column left empty, instead of being
      dropped.

    The checkpoint is keyed only by row count: delete it before running with a
    different ``df``, ``query`` or ``model``.

    Args:
        df: Records to classify; each row must carry an 'abstract' column.
        query: Yes/no question sent with every abstract.
        model: Ollama model tag.
        batch_size: Number of abstracts per prompt.
        workers: Number of worker processes.

    Returns:
        DataFrame with the checkpointed rows plus the rows processed in this call.
    """
    # Make sure the checkpoint directory exists before the first write.
    out_dir = os.path.dirname(RESULT_CSV_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Resume from the checkpoint file when there is one.
    if os.path.exists(RESULT_CSV_PATH):
        acumulado = pd.read_csv(RESULT_CSV_PATH)
        start = len(acumulado)
        log(f"Retomando a partir do índice {start}")
    else:
        acumulado = pd.DataFrame()
        start = 0

    to_process = df.iloc[start:].reset_index(drop=True)
    batches = [(batch, idx + start) for batch, idx in chunk_dataframe(to_process, batch_size)]

    log(f"Iniciando {len(batches)} lotes com {workers} workers")

    with Pool(processes=workers) as pool:
        processor = partial(process_args_wrapper, query=query, model=model)
        # Ordered imap lets each outcome be paired with the batch that produced it.
        for (batch, idx), outcome in zip(batches, pool.imap(processor, batches)):
            rows = list(outcome)

            # Keep unclassified rows so the checkpoint length stays a valid offset.
            pending = batch.iloc[len(rows):]
            if len(pending):
                log(f"[AVISO] Lote {idx}: {len(pending)} registro(s) sem classificação.")
                rows.extend(pending.to_dict("records"))

            df_part = pd.DataFrame(rows)
            if acumulado.empty:
                acumulado = df_part
            else:
                acumulado = pd.concat([acumulado, df_part], ignore_index=True)
            acumulado.to_csv(RESULT_CSV_PATH, index=False)

    log("Processamento completo.")
    return acumulado


In [11]:
def load_and_filter_bases(directory: str) -> pd.DataFrame:
    """
    Read every ``.csv`` file found directly in ``directory`` and stack them.

    Files are visited in sorted name order and the extension check ignores case.
    A file that fails to load is logged and skipped.

    Returns:
        The row-wise concatenation of all loaded files with a fresh index, or an
        empty DataFrame when nothing could be loaded.
    """
    log(f"Carregando bases de '{directory}'")

    csv_names = [
        name for name in sorted(os.listdir(directory))
        if name.lower().endswith('.csv')
    ]

    frames = []
    for fname in csv_names:
        try:
            frame = pd.read_csv(os.path.join(directory, fname))
            log(f"{fname}: {len(frame)} registros")
            frames.append(frame)
        except Exception as e:
            log(f"Erro lendo {fname}: {e}")

    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame()


In [12]:
# Scopus records as parsed by pyBibX; `dados` is rebound to the merged corpus further down.
dados = bibfile.data

# CSV exports found under "dados", restricted to the shared bibliographic fields.
colunas_desejadas_ieee = [
    'Document Title', 'Abstract', 'Author Affiliations', 'Authors', 'DOI',
    'ISBNs', 'ISSN', 'Publication Title', 'Publication Year',
]
df_ieee = load_and_filter_bases("dados")[colunas_desejadas_ieee].copy()

# Same fields on the Scopus side, renamed to the CSV headers so both sources line up.
colunas_desejadas_scopus = [
    'title', 'abstract', 'journal', 'affiliation', 'author',
    'doi', 'isbn', 'issn', 'year',
]
df_scopus = dados[colunas_desejadas_scopus].rename(columns={
    'title': 'Document Title',
    'abstract': 'Abstract',
    'journal': 'Publication Title',
    'affiliation': 'Author Affiliations',
    'author': 'Authors',
    'doi': 'DOI',
    'isbn': 'ISBNs',
    'issn': 'ISSN',
    'year': 'Publication Year',
})

print(df_ieee.shape, '\t', df_scopus.shape)

# Merge both sources, then keep one row per distinct, non-missing abstract.
dados = pd.concat([df_ieee, df_scopus], ignore_index=True)
print('Antes da remoção de duplicados: ', dados.shape)
dados = (
    dados
    .rename(columns=str.lower)
    .dropna(subset=['abstract'])
    .drop_duplicates(subset=['abstract'])
    .reset_index(drop=True)
)
print('Após remoção de duplicados: ', dados.shape)

# The frame handed to the classifier must expose an 'abstract' column.
resultados = analyze_abstracts_parallel(
    dados,
    query=QUERY,
    model=MODEL,
    batch_size=BATCH_SIZE,
    workers=WORKERS,
)

# Save the final CSV (optional).
resultados.to_csv("temp_files/resultados_finais.csv", index=False)

2025-04-20 10:02:12 - Carregando bases de 'dados'
2025-04-20 10:02:12 - export_ieee_2022-2025.csv: 936 registros
(936, 9) 	 (207, 9)
Antes da remoção de duplicados:  (1143, 9)
Após remoção de duplicados:  (1140, 9)
2025-04-20 10:02:12 - Iniciando 1140 lotes com 4 workers
2025-04-20 10:02:22 - Lote 1 processado com sucesso.
2025-04-20 10:02:22 - Lote 2 processado com sucesso.
2025-04-20 10:02:23 - Lote 3 processado com sucesso.
2025-04-20 10:02:23 - Lote 0 processado com sucesso.
2025-04-20 10:02:28 - Lote 7 processado com sucesso.
2025-04-20 10:02:29 - Lote 5 processado com sucesso.
2025-04-20 10:02:30 - Lote 4 processado com sucesso.
2025-04-20 10:02:32 - Lote 8 processado com sucesso.
2025-04-20 10:02:32 - Lote 6 processado com sucesso.
2025-04-20 10:02:34 - Lote 9 processado com sucesso.
2025-04-20 10:02:38 - Lote 12 processado com sucesso.
2025-04-20 10:02:39 - Lote 10 processado com sucesso.
2025-04-20 10:02:41 - Lote 11 processado com sucesso.
2025-04-20 10:02:41 - Lote 13 proces

In [14]:
# Show the result schema and the verdict distribution of every relevance column.
# The column name is derived from the model tag ("relevant_<model>"), so it is
# discovered here instead of being hard-coded for one particular model.
print(resultados.columns)
colunas_relevancia = [c for c in resultados.columns if str(c).startswith("relevant")]
for coluna in colunas_relevancia:
    # dropna=False also reports rows that have no verdict.
    print(resultados[coluna].value_counts(dropna=False))

Index(['document title', 'abstract', 'author affiliations', 'authors', 'doi',
       'isbns', 'issn', 'publication title', 'publication year',
       'relevant_deepseek-r1:1.5b'],
      dtype='object')
relevant_deepseek-r1:1.5b
False    1140
Name: count, dtype: int64


In [None]:
# Keep the abstracts flagged as relevant. The classifier stores its verdict in a
# column named "relevant_<model>", never in a plain "relevant" column, so the
# relevance columns are looked up by prefix; a row is kept when any of them is True.
colunas_relevancia = [c for c in resultados.columns if str(c).startswith("relevant")]
if not colunas_relevancia:
    raise KeyError("Nenhuma coluna 'relevant*' encontrada em resultados.")
positivos = resultados[resultados[colunas_relevancia].eq(True).any(axis=1)].copy()
print(positivos.head(5))
