In [None]:
# Restart the session after this cell to avoid Google Colab errors
!pip install --upgrade --force-reinstall numpy==1.26.4 pandas

In [None]:
!pip install pybibx
!pip install tabulate

In [None]:
# Download .bib file
#!wget https://github.com/Valdecy/pyBibX/raw/main/assets/bibs/scopus.bib

In [None]:
# Required Libraries
import textwrap

from pybibx.base import pbx_probe
from tabulate import tabulate

In [None]:
# Load .bib
# Builds the pyBibX probe object from the .bib export; `bibfile.data` is read by later cells.
# Arguments: file_bib = 'filename.bib'; db = 'scopus', 'wos', 'pubmed'; del_duplicated = True, False
file_name = 'dados/scopus.bib'
database  = 'scopus'
bibfile   = pbx_probe(file_bib = file_name, db = database, del_duplicated = True)

In [None]:
# Health Analysis: keep the result of bibfile.health_bib() for inspection
health = bibfile.health_bib()

# Check Health: a bare last expression so the notebook renders it as the cell output
health

In [None]:
# Peek at the first two entries of the 'abstract' column of the loaded data.
print(bibfile.data['abstract'].head(2))

In [None]:
!pip install pybtex
!pip install bibtexparser

In [None]:
import pandas as pd
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase


In [None]:
import time
import os
import traceback
import random
from multiprocessing import Pool, cpu_count
from functools import partial
import pandas as pd
import ollama

# === CONFIGURATION ===
# Request pacing: SECONDS_BETWEEN_REQUESTS is the base delay each batch sleeps
# before calling the model (random jitter is added on top of it).
MAX_REQUESTS_PER_MINUTE = 600
SECONDS_BETWEEN_REQUESTS = 60 / MAX_REQUESTS_PER_MINUTE
BATCH_SIZE = 1  # default number of abstracts sent in one model request
WORKERS = min(12, cpu_count())  # default pool size, capped at 12
MODEL = "gemma3:27b"  # Adjust to the desired local model
TEMPERATURE = 0.2  # passed to the model as the "temperature" option
RESULT_CSV_PATH = "temp_files/resultados_parciais.csv"  # partial results, also used to resume a run
LOG_PATH = "temp_files/log_execucao.txt"  # file that log() appends to
# Screening question asked about every abstract.
QUERY = (
    "Does this abstract discuss artificial intelligence in feedback for learning management systems on education?"
)

# === UTILITIES ===
def log(text: str, *args) -> None:
    """Append a timestamped message to the log file and echo it to stdout.

    Args:
        text: Message to record. When ``args`` are supplied it is used as a
            %-style format string, mirroring the standard ``logging`` API.
        *args: Optional values interpolated into ``text``.
    """
    message = text % args if args else text
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
    line = f"{timestamp} - {message}"

    # The log folder may not exist on a fresh run; create it on demand so the
    # very first log call does not fail with FileNotFoundError.
    log_dir = os.path.dirname(LOG_PATH)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    with open(LOG_PATH, "a", encoding="utf-8") as f:
        f.write(line + "\n")
    print(line)


def chunk_dataframe(df, batch_size: int):
    """Lazily split ``df`` into consecutive row slices.

    Yields ``(chunk, offset)`` pairs: ``chunk`` holds at most ``batch_size``
    rows and ``offset`` is the position of its first row within ``df``.
    """
    total_rows = len(df)
    offsets = range(0, total_rows, batch_size)
    yield from ((df.iloc[first:first + batch_size], first) for first in offsets)

# === CALL THE LOCAL LLM ===
def call_local_llm(messages, model: str):
    """
    Send the list of chat messages to the local model through Ollama and return the reply text.

    The request is non-streaming and uses the module-level TEMPERATURE; the
    returned string has its surrounding whitespace stripped.
    """
    generation_options = {"temperature": TEMPERATURE}
    reply = ollama.chat(
        model=model,
        messages=messages,
        options=generation_options,
        stream=False,
    )
    # The reply text is read from reply.message.content
    reply_text = reply.message.content
    return reply_text.strip()

# === BATCH PROCESSING WITH RETRIES ===
def _is_affirmative(answer: str) -> bool:
    """Return True when the first yes/no word found in ``answer`` is 'yes'."""
    # Replace every non-letter with a space so "Yes.", "**Yes**" or
    # "Abstract 1: yes" are all reduced to plain words.
    words = "".join(ch if ch.isalpha() else " " for ch in answer.lower()).split()
    for word in words:
        if word in ("yes", "no"):
            return word == "yes"
    return False


def process_batch_with_retry(
    batch_df, global_index, query=QUERY, model=MODEL, retry_limit=5
):
    """Ask the local LLM the yes/no ``query`` about every abstract in a batch.

    Args:
        batch_df: DataFrame slice; each row must have an 'abstract' column.
        global_index: Identifier of the batch, used only in log messages.
        query: Question placed at the top of the prompt.
        model: Name of the model passed to ``call_local_llm``.
        retry_limit: Maximum number of attempts before giving up.

    Returns:
        A list with one dict per row of ``batch_df`` (the row's columns plus
        ``"relevant"`` as a bool and ``"tokens_used"`` set to None), or an
        empty list when every attempt failed.
    """
    retry_count = 0
    delay = SECONDS_BETWEEN_REQUESTS + random.uniform(0, 5)

    while retry_count < retry_limit:
        try:
            time.sleep(delay)
            # Build the messages for the LLM
            messages = [
                {"role": "system", "content": (
                    "You are a research assistant who helps analyze scientific articles."
                )}
            ]
            prompt = f"{query}\n\nRestrict yourself to answering the question with exclusively 'yes' or 'no'.\n\n"

            # Number abstracts by their position inside this batch (1..n). The
            # DataFrame index label is not used: it is not guaranteed to be an
            # integer and does not restart at 1 for later batches.
            for position, (_, row) in enumerate(batch_df.iterrows(), start=1):
                prompt += f"Abstract {position}:\n{row['abstract']}\n\n"

            messages.append({"role": "user", "content": prompt})

            # Call the local LLM
            content = call_local_llm(messages, model=model)
            # Ignore blank lines so they cannot shift answers onto the wrong row.
            answers = [line for line in content.splitlines() if line.strip()]
            if len(answers) < len(batch_df):
                # Too few answers would silently drop rows; retry instead.
                raise ValueError(
                    f"Resposta com {len(answers)} linha(s) para {len(batch_df)} abstract(s)"
                )

            # Build the results; answer lines beyond the batch size are ignored.
            results = []
            for answer, (_, row) in zip(answers, batch_df.iterrows()):
                result = row.to_dict()
                result["relevant"] = _is_affirmative(answer)
                result["tokens_used"] = None  # Not available from local Ollama
                results.append(result)

            log(f"Lote {global_index} processado com sucesso.")
            return results

        except Exception as e:
            retry_count += 1
            log(f"[ERRO] Lote {global_index}, tentativa {retry_count}: {e}")
            # Exponential backoff with jitter; skipped when no attempt is left.
            if retry_count < retry_limit:
                wait_time = 2 ** retry_count + random.uniform(0, 1)
                time.sleep(wait_time)

    log(f"[FALHA] Lote {global_index} excedeu o limite de tentativas.")
    return []

# === MULTIPROCESSING WRAPPER ===
def process_args_wrapper(args, query, model):
    """Adapter for pool mapping.

    Expands the ``args`` tuple into the positional arguments of
    ``process_batch_with_retry`` and forwards ``query`` and ``model`` as keywords.
    """
    forwarded = {"query": query, "model": model}
    return process_batch_with_retry(*args, **forwarded)

# === MAIN FUNCTION ===
def analyze_abstracts_parallel(df: pd.DataFrame, query=QUERY, model=MODEL,
                                batch_size=BATCH_SIZE, workers=WORKERS):
    """Classify the abstracts in ``df`` with the local LLM, checkpointing to CSV.

    Results are appended to ``RESULT_CSV_PATH`` as each batch finishes. When
    that file already exists the run resumes: rows whose 'abstract' value is
    already present in the file are skipped, so batches that finished out of
    order or failed in a previous run are handled correctly.

    Args:
        df: Input records; rows need an 'abstract' column.
        query: Question asked about each abstract.
        model: Name of the local model.
        batch_size: Number of abstracts per model request.
        workers: Number of concurrent worker threads.

    Returns:
        DataFrame with the previously saved results followed by the new ones.
    """
    # Threads instead of processes: the work is waiting on the model server,
    # and a process pool cannot reliably run functions defined in a notebook
    # when the platform uses the "spawn" start method.
    from multiprocessing.pool import ThreadPool

    # Make sure the checkpoint folder exists before any result is written.
    result_dir = os.path.dirname(RESULT_CSV_PATH)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)

    # Resume from the checkpoint file if it exists
    if os.path.exists(RESULT_CSV_PATH):
        acumulado = pd.read_csv(RESULT_CSV_PATH)
        if "abstract" in acumulado.columns and "abstract" in df.columns:
            # Resume by content, not by row count: completion order is not the
            # input order and failed batches leave gaps.
            done = set(acumulado["abstract"].dropna())
            to_process = df[~df["abstract"].isin(done)]
        else:
            to_process = df
        log(f"Retomando: {len(acumulado)} registros já processados, {len(to_process)} pendentes")
    else:
        acumulado = pd.DataFrame()
        to_process = df

    to_process = to_process.reset_index(drop=True)
    batches = list(chunk_dataframe(to_process, batch_size))

    log(f"Iniciando {len(batches)} lotes com {workers} workers")

    new_parts = []
    processor = partial(process_args_wrapper, query=query, model=model)
    with ThreadPool(processes=workers) as pool:
        for outcome in pool.imap_unordered(processor, batches):
            if outcome:
                df_part = pd.DataFrame(outcome)
                new_parts.append(df_part)
                # Append only the new rows instead of rewriting the whole file;
                # the header is written once, when the file is first created.
                df_part.to_csv(
                    RESULT_CSV_PATH,
                    mode="a",
                    index=False,
                    header=not os.path.exists(RESULT_CSV_PATH),
                )

    # Single concat at the end; empty frames are left out of it.
    frames = [frame for frame in [acumulado, *new_parts] if not frame.empty]
    if frames:
        acumulado = pd.concat(frames, ignore_index=True)

    log("Processamento completo.")
    return acumulado


In [None]:
def load_and_filter_bases(directory: str) -> pd.DataFrame:
    """Read every ``.csv`` file in ``directory`` and stack them into one DataFrame.

    Files are read in sorted name order. A file that cannot be parsed is
    logged and skipped rather than aborting the whole load.

    Args:
        directory: Folder to scan for CSV files (not recursive).

    Returns:
        The concatenation of all readable CSV files with a fresh integer
        index, or an empty DataFrame when none was loaded.
    """
    log(f"Carregando bases de '{directory}'")
    dfs = []
    for fname in sorted(os.listdir(directory)):
        if fname.lower().endswith('.csv'):
            path = os.path.join(directory, fname)
            try:
                df = pd.read_csv(path)
                log(f"{fname}: {len(df)} registros")
                dfs.append(df)
            except Exception as e:
                log(f"Erro lendo {fname}: {e}")
    if not dfs:
        return pd.DataFrame()
    # The original built this frame but never returned it, so callers got None.
    return pd.concat(dfs, ignore_index=True)

In [None]:
# The log file and the result CSVs live in this folder; create it up front.
os.makedirs("temp_files", exist_ok=True)

# Scopus records parsed by pyBibX. Kept under its own name and never mutated,
# so re-running this cell always starts from the same input.
dados_scopus = bibfile.data

# IEEE CSV exports found in the data folder.
df_ieee = load_and_filter_bases("dados")
colunas_desejadas_ieee = ['Document Title', 'Abstract', 'Author Affiliations', 'Authors', 'DOI', 'ISBNs',
                             'ISSN', 'Publication Title', 'Publication Year']
# Both sources are brought to the lowercase Scopus column names because the
# analysis code reads the 'abstract' column. Each IEEE column maps to exactly
# one target, so no duplicated column names are produced.
mapa_ieee_para_scopus = {
    'Document Title': 'title',
    'Abstract': 'abstract',
    'Author Affiliations': 'affiliation',
    'Authors': 'author',
    'DOI': 'doi',
    'ISBNs': 'isbn',
    'ISSN': 'issn',
    'Publication Title': 'journal',
    'Publication Year': 'year'
}
# reindex (instead of plain selection) also copes with an empty frame or a
# missing column: the column is created and filled with NaN.
df_ieee = df_ieee.reindex(columns=colunas_desejadas_ieee).rename(columns=mapa_ieee_para_scopus)
print(df_ieee.columns)

colunas_desejadas_scopus = ['title', 'abstract', 'abbrev_source_title',
                            'affiliation', 'author', 'document_type', 'doi', 'isbn',
                             'issn', 'journal', 'references', 'url', 'year']
df_scopus = dados_scopus[colunas_desejadas_scopus].copy()

# Stack both sources, then keep one row per distinct, non-missing abstract.
dados = pd.concat([df_ieee, df_scopus], ignore_index=True)
print(dados.shape)
dados = dados.dropna(subset=['abstract'])
dados = dados.drop_duplicates(subset=['abstract'])
dados = dados.reset_index(drop=True)
log(f'Após remoção de duplicados {dados.shape}')

resultados = analyze_abstracts_parallel(
    df=dados,
    query=QUERY,
    model=MODEL,
    batch_size=BATCH_SIZE,
    workers=WORKERS
)

# Save the final CSV (optional)
resultados.to_csv("temp_files/resultados_finais.csv", index=False)

In [None]:
# Inspect the result columns and how many rows fall on each value of 'relevant'.
print(resultados.columns)
print(resultados['relevant'].value_counts())
# NOTE(review): the disabled filter below compares against the string 'False',
# while 'relevant' is produced as a boolean — confirm before re-enabling.
#dados_filtered = resultados[resultados['relevant'] != 'False']
#dados_filtered.head(3)

In [None]:
# Keep an independent copy of the rows whose 'relevant' flag equals True.
is_relevant = resultados["relevant"].eq(True)
positivos = resultados.loc[is_relevant].copy()
#print(positivos.head(5))
