In [None]:
# Restart the session after this cell to avoid Google Colab errors
!pip install --upgrade --force-reinstall numpy==1.26.4 pandas

In [None]:
!pip install pybibx
!pip install tabulate tqdm

In [None]:
# Download .bib file
#!wget https://github.com/Valdecy/pyBibX/raw/main/assets/bibs/scopus.bib

In [None]:
# Required Libraries
import textwrap

from pybibx.base import pbx_probe
from tabulate import tabulate

In [None]:
# Load .bib
# Arguments: file_bib = 'filename.bib'; db = 'scopus', 'wos', 'pubmed'; del_duplicated = True, False
# The path is relative to the notebook's working directory.
file_name = 'dados/scopus.bib'
database  = 'scopus'
# `bibfile` is the object every later cell reads from (`bibfile.data`, `bibfile.health_bib()`, ...).
bibfile   = pbx_probe(file_bib = file_name, db = database, del_duplicated = True)

In [None]:
# Show the document-type distribution, keep only the types listed in `filtro`,
# then show the distribution again to confirm the effect of the filter.
print(bibfile.data.document_type.value_counts())
filtro = ['Article','Conference paper']
# Overwrites bibfile.data in place, so the unfiltered records are only recoverable by reloading the .bib file.
bibfile.data = bibfile.data[bibfile.data['document_type'].isin(filtro)]
print(bibfile.data.document_type.value_counts())

In [None]:
# Load the ACM export as a separate object.
# NOTE(review): the argument comment in the Scopus load cell lists only 'scopus', 'wos' and 'pubmed'
# as db values - confirm that the installed pybibx accepts 'acm'.
# NOTE(review): `bibfile_acm` is not referenced again in the visible cells; the merge cell passes
# the file path (`file_name_acm`) rather than this object.
file_name_acm = 'dados/acm.bib'
database_acm  = 'acm'
bibfile_acm   = pbx_probe(file_bib = file_name_acm, db = database_acm, del_duplicated = True)


In [None]:
# Merge the ACM file into the Scopus object.
# NOTE(review): this runs after the document-type filter was applied to bibfile.data, so that
# filter has not been applied to whatever records this call adds - confirm this is intended.
bibfile.merge_database(file_bib=file_name_acm, db=database_acm, del_duplicated=True)

In [None]:
# Health Analysis
# Stored in a variable so the report can be reused without calling health_bib() again.
health = bibfile.health_bib()

# Check Health
# A bare expression on the last line makes the notebook render the object.
health

In [None]:
# Quick sanity check: print the first two abstracts of the loaded records.
print(bibfile.data['abstract'].head(2))

In [None]:
!pip install pybtex
!pip install bibtexparser

In [None]:
import pandas as pd
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase


In [None]:
import time
import os
import random
from multiprocessing import Pool, cpu_count
from functools import partial
import pandas as pd
import ollama

# === GLOBAL SETTINGS ===
# Request pacing: 300 requests per minute gives a base delay of 0.2 s between requests.
MAX_REQUESTS_PER_MINUTE = 300
SECONDS_BETWEEN_REQUESTS = 60 / MAX_REQUESTS_PER_MINUTE
# Number of abstracts sent in a single prompt.
BATCH_SIZE = 1
# Base worker-process count, capped at 8; run_all_models adjusts it for some models.
WORKERS = min(8, cpu_count())
# Model tags to run; the part before ':' is used to name result columns and files.
MODELS = ["llama3:8b", "gemma3:27b-it-qat", "phi4-mini", "phi4"] # "llama3:8b", "gemma3:27b-it-qat", "cogito:8b", "phi4"
# Sampling temperature forwarded to the chat call.
TEMPERATURE = 0

# Screening question prepended to every prompt. The adjacent string literals are
# concatenated into a single string; the first line is a previous wording kept as a comment.
QUERY = (
    #"Does this abstract discuss artificial intelligence in feedback for learning management systems or online learning environment on education?"
    "Analyze the following scientific article abstract and determine whether it "
     "addresses the use of artificial intelligence to provide feedback in virtual learning environments.\n"
     "Consider aspects such as: the application of AI techniques, automated feedback systems, "
     "digital educational platforms, and online learning. Respond only with ‘Yes’ if the article is related, "
     "or ‘No’ if it is not.\n\n"
)

# === UTILITÁRIOS ===
def log(text: str, log_path: str) -> None:
    """Append a timestamped message to ``log_path`` and echo it to stdout.

    The parent directory of ``log_path`` is created when it does not exist, so
    the first message of a fresh run does not fail with ``FileNotFoundError``.

    Args:
        text: Message to record.
        log_path: Path of the UTF-8 log file; opened in append mode.
    """
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
    line = f"{timestamp} - {text}"

    # A bare file name has no directory component; only create one when present.
    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    with open(log_path, "a", encoding="utf-8") as f:
        f.write(line + "\n")
    print(line)


def chunk_dataframe(df, batch_size: int):
    """Yield ``(slice, offset)`` pairs covering ``df`` in consecutive row windows.

    Each slice holds at most ``batch_size`` rows taken positionally with
    ``iloc``; ``offset`` is the position of the slice's first row in ``df``.
    """
    yield from (
        (df.iloc[offset:offset + batch_size], offset)
        for offset in range(0, len(df), batch_size)
    )


# === CHAMADA AO LLM LOCAL ===
def call_local_llm(messages, model: str, temperature: float):
    """Send ``messages`` to ``ollama.chat`` and return the reply text, stripped.

    Args:
        messages: Chat messages forwarded unchanged to ``ollama.chat``.
        model: Model tag forwarded to ``ollama.chat``.
        temperature: Passed as the ``temperature`` entry of the request options.

    Returns:
        ``response.message.content`` with surrounding whitespace removed.
    """
    sampling_options = {"temperature": temperature}
    reply = ollama.chat(
        model=model,
        messages=messages,
        options=sampling_options,
        stream=False,
    )
    reply_text = reply.message.content
    return reply_text.strip()


# === PROCESSAMENTO DE LOTE COM RETRIES ===
def process_batch_with_retry(
    batch_df, global_index, query, model, temperature,
    seconds_between_requests, log_path, retry_limit=5
):
    """Classify the abstracts of ``batch_df`` with one chat request, retrying on errors.

    The prompt is ``query`` followed by a yes/no instruction and one
    ``Abstract <n>`` section per row. The reply is read line by line; the
    k-th non-blank line is taken as the answer for the k-th row.

    Args:
        batch_df: Rows to classify; must contain an ``abstract`` column.
        global_index: Identifier of the batch, used only in log messages.
        query: Screening question placed at the top of the prompt.
        model: Model tag; the part before ``:`` names the result column.
        temperature: Forwarded to ``call_local_llm``.
        seconds_between_requests: Base pause before each request; a random
            0-5 s jitter is added once per call.
        log_path: Log file passed to ``log``.
        retry_limit: Maximum number of attempts.

    Returns:
        A list with exactly one dict per row of ``batch_df`` (the row's
        fields plus ``relevant_<model name>``). The flag is ``True`` for a
        "yes" answer, ``False`` for any other answer, and ``None`` when the
        reply contained no line for that row. Returns ``[]`` when every
        attempt raised.
    """
    retry_count = 0
    delay = seconds_between_requests + random.uniform(0, 5)
    coluna = "relevant_" + model.split(":")[0]
    # Characters models commonly wrap around a one-word answer: quotes (straight
    # and curly), markdown emphasis and trailing punctuation.
    answer_wrapping = " \t'\"`*.,:;!\u2018\u2019\u201c\u201d"

    while retry_count < retry_limit:
        try:
            time.sleep(delay)
            if model == "cogito:8b":
                messages = [
                    {"role": "system", "content": (
                        "Enable deep thinking subroutine."
                    )}
                ]
                prompt = f"{query}\n\nYou are a research assistant who helps analyze scientific articles. Restrict yourself to answering the question with exclusively 'yes' or 'no'.\n\n"
            else:
                messages = [
                    {"role": "system", "content": (
                        "You are a research assistant who helps analyze scientific articles."
                    )}
                ]
                prompt = f"{query}\n\nRestrict yourself to answering the question with exclusively 'yes' or 'no'.\n\n"

            # The label is the row's index value plus one, so the index must be integer-valued.
            for i, row in batch_df.iterrows():
                prompt += f"Abstract {i + 1}:\n{row['abstract']}\n\n"

            messages.append({"role": "user", "content": prompt})

            content = call_local_llm(messages, model=model, temperature=temperature)
            # Blank lines carry no answer; keeping them would shift every later
            # answer onto the wrong abstract.
            answers = [line for line in content.splitlines() if line.strip()]

            if len(answers) < len(batch_df):
                log(
                    f"[{model}][AVISO] Lote {global_index}: {len(answers)} respostas "
                    f"para {len(batch_df)} resumos",
                    log_path,
                )

            results = []
            for position, (_, row) in enumerate(batch_df.iterrows()):
                result = row.to_dict()
                if position < len(answers):
                    clean = answers[position].strip().strip(answer_wrapping).lower()
                    result[coluna] = (clean == "yes")
                else:
                    # No answer for this row: keep the row so callers stay
                    # aligned with the input, and mark the flag as unknown.
                    result[coluna] = None
                results.append(result)

            log(f"[{model}] Lote {global_index} OK", log_path)
            return results

        except Exception as e:
            # Any failure (connection, model error, malformed row) triggers an
            # exponential back-off with jitter before the next attempt.
            retry_count += 1
            wait_time = 2 ** retry_count + random.uniform(0, 1)
            log(f"[{model}][ERRO] Lote {global_index}, tentativa {retry_count}: {e}", log_path)
            time.sleep(wait_time)

    log(f"[{model}][FALHA] Lote {global_index} excedeu tentativas", log_path)
    return []


def process_args_wrapper(args, query, model, temperature, seconds_between_requests, log_path):
    """Adapter that lets a single-argument pool map drive ``process_batch_with_retry``.

    ``args`` is unpacked as the leading positional arguments of
    ``process_batch_with_retry``; every other parameter is forwarded by keyword.
    """
    shared_settings = {
        "query": query,
        "model": model,
        "temperature": temperature,
        "seconds_between_requests": seconds_between_requests,
        "log_path": log_path,
    }
    return process_batch_with_retry(*args, **shared_settings)


# === PIPELINE PARA UM MODELO ===
def analyze_abstracts_parallel(
    df: pd.DataFrame,
    query: str,
    model: str,
    batch_size: int,
    workers: int,
    result_csv_path: str,
    log_path: str,
    temperature: float,
    seconds_between_requests: float,
) -> pd.DataFrame:
    """Classify every row of ``df`` with ``model`` using a process pool, checkpointing to CSV.

    When ``result_csv_path`` already exists, its rows are loaded and the run
    resumes at row ``len(existing)`` of ``df``. Results are consumed in input
    order so that row *k* of the returned frame always corresponds to row *k*
    of ``df``; both the positional resume and the index-based merge done by
    callers rely on that. A batch that produced no result is stored with the
    relevance flag set to ``None`` instead of being dropped, which would shift
    every later row.

    Args:
        df: Input rows; must contain an ``abstract`` column.
        query: Screening question forwarded to the batch processor.
        model: Model tag; the part before ``:`` names the result column.
        batch_size: Rows per request.
        workers: Number of pool processes.
        result_csv_path: Checkpoint file, rewritten after every batch.
        log_path: Log file passed to ``log``.
        temperature: Forwarded to the batch processor.
        seconds_between_requests: Forwarded to the batch processor.

    Returns:
        The accumulated results (previous checkpoint plus this run).
    """
    # Make sure the checkpoint and log locations exist before anything writes there.
    for target in (result_csv_path, log_path):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

    if os.path.exists(result_csv_path):
        acumulado = pd.read_csv(result_csv_path)
        start = len(acumulado)
        log(f"[{model}] Retomando do índice {start}", log_path)
    else:
        acumulado = pd.DataFrame()
        start = 0

    to_process = df.iloc[start:].reset_index(drop=True)
    batches = [
        (batch, idx + start)
        for batch, idx in chunk_dataframe(to_process, batch_size)
    ]

    log(f"[{model}] Iniciando {len(batches)} lotes com {workers} workers", log_path)

    coluna = "relevant_" + model.split(":")[0]

    # NOTE(review): Pool pickles the worker callable; under the "spawn" start
    # method, functions defined in a notebook cell may not be importable by the
    # worker processes - confirm the target platform uses "fork".
    with Pool(processes=workers) as pool:
        processor = partial(
            process_args_wrapper,
            query=query,
            model=model,
            temperature=temperature,
            seconds_between_requests=seconds_between_requests,
            log_path=log_path
        )
        # imap (not imap_unordered) yields results in submission order while the
        # workers still run concurrently.
        for (batch_df, batch_index), outcome in zip(batches, pool.imap(processor, batches)):
            if outcome:
                df_part = pd.DataFrame(outcome)
            else:
                log(f"[{model}][AVISO] Lote {batch_index} sem resultado; marcado como indefinido", log_path)
                df_part = batch_df.assign(**{coluna: None})
            acumulado = pd.concat([acumulado, df_part], ignore_index=True)
            # Rewritten after every batch so an interrupted run can resume.
            acumulado.to_csv(result_csv_path, index=False)

    log(f"[{model}] Processamento completo.", log_path)
    return acumulado


# === RUN_ALL_MODELS MODIFICADA ===
def run_all_models(df: pd.DataFrame) -> pd.DataFrame:
    """Run the screening pipeline for every model in ``MODELS`` and combine the flags.

    For each model, partial results and the execution log are written under
    ``temp_files/`` (created when missing), named after the model tag without
    its ``:<variant>`` suffix. The model's ``relevant_<name>`` column is then
    joined onto a copy of ``df`` by index.

    Args:
        df: Input rows; must contain an ``abstract`` column.

    Returns:
        A copy of ``df`` with one ``relevant_<name>`` column per model. The
        join is an inner join on the index, so only rows for which every
        model returned a result are kept.
    """
    # Worker processes added on top of WORKERS, keyed by the tag-less model
    # name so that e.g. "llama3:8b" is matched by "llama3".
    extra_workers = {"gemma3": 1, "phi4-mini": 4, "llama3": 4}

    os.makedirs("temp_files", exist_ok=True)
    combined = df.copy()

    for model in MODELS:
        model_name = model.split(":")[0]
        result_path = f"temp_files/resultados_parciais_{model_name}.csv"
        log_path = f"temp_files/log_execucao_{model_name}.txt"

        resultados = analyze_abstracts_parallel(
            df=df,
            query=QUERY,
            model=model,
            batch_size=BATCH_SIZE,
            workers=WORKERS + extra_workers.get(model_name, 0),
            result_csv_path=result_path,
            log_path=log_path,
            temperature=TEMPERATURE,
            seconds_between_requests=SECONDS_BETWEEN_REQUESTS
        )

        col = f"relevant_{model_name}"
        combined = combined.merge(
            resultados[[col]],
            left_index=True, right_index=True
        )

    return combined


In [None]:
def load_and_filter_bases(directory: str) -> pd.DataFrame:
    """Read every ``.csv`` file in ``directory`` and stack them into one frame.

    Files are visited in sorted name order and matched case-insensitively on
    the extension. A file that fails to load is reported on stdout and
    skipped. Returns an empty DataFrame when nothing could be loaded;
    otherwise the concatenation of all loaded frames with a fresh index.
    """
    print(f"Carregando bases de '{directory}'")

    csv_names = [
        entry for entry in sorted(os.listdir(directory))
        if entry.lower().endswith('.csv')
    ]

    frames = []
    for csv_name in csv_names:
        csv_path = os.path.join(directory, csv_name)
        try:
            frame = pd.read_csv(csv_path)
            print(f"{csv_name}: {len(frame)} registros")
            frames.append(frame)
        except Exception as e:
            print(f"Erro lendo {csv_name}: {e}")

    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame()


In [None]:
# Build the screening dataset: CSV exports found in "dados" plus the bibliographic
# records held by `bibfile`, harmonised to one column layout, then run every model.
dados = bibfile.data
# Every .csv file in "dados" is loaded here; the variable name suggests these are IEEE exports.
df_ieee = load_and_filter_bases("dados")
colunas_desejadas_ieee = ['Document Title', 'Abstract', 'Author Affiliations', 'Authors', 'DOI', 'ISBNs',
                             'ISSN', 'Publication Title', 'Publication Year']
# Raises KeyError if any of the listed columns is missing (e.g. when no CSV was loaded).
df_ieee = df_ieee[colunas_desejadas_ieee].copy()
#print(df_ieee.columns)
colunas_desejadas_scopus = ['title', 'abstract', 'journal', 
                            'affiliation', 'author', 'doi', 'isbn',
                             'issn', 'year']

df_scopus = dados[colunas_desejadas_scopus].copy()
# Map the bibliographic column names onto the CSV column names.
# 'abbrev_source_title', 'references' and 'url' are not among the columns selected
# above, so those entries have no effect on this frame.
df_scopus = df_scopus.rename(columns={
    'title': 'Document Title',
    'abstract': 'Abstract',
    'abbrev_source_title': 'Publication Title',
    'affiliation': 'Author Affiliations',
    'author': 'Authors',
    'doi': 'DOI',
    'isbn': 'ISBNs',
    'issn': 'ISSN',
    'journal': 'Publication Title',
    'references': 'References',
    'url': 'URL',
    'year': 'Publication Year'
})


# Make sure the `dados` DataFrame contains at least the 'abstract' column and the other desired columns
#resultados = analyze_abstracts_parallel(dados, query=query_global, model=model, batch_size=5, workers=8)

print(df_ieee.shape, '\t', df_scopus.shape)
# From here on `dados` is the combined frame, no longer bibfile.data.
dados = pd.concat([df_ieee, df_scopus], ignore_index=True)
print('Antes da remoção de duplicados: ', dados.shape)
# Lower-case the column names: the pipeline reads row['abstract'].
dados.columns = dados.columns.str.lower()
# Rows without an abstract cannot be screened; duplicates are detected on the exact abstract text.
dados = dados.dropna(subset=['abstract'])
dados = dados.drop_duplicates(subset=['abstract'])
# A clean 0..n-1 index is required because run_all_models joins the results by index.
dados = dados.reset_index(drop=True)
print('Após remoção de duplicados: ', dados.shape)

resultados = run_all_models(dados)

# Save the final CSV (optional)
resultados.to_csv("temp_files/resultados_finais.csv", index=False)

In [None]:
# Do not truncate long cell values (titles, abstracts) when frames are displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
print(resultados.columns)
# Display the results: for each model, count the values of its relevance flag.
for model in MODELS:
    # Result columns are named after the model tag without its ':<variant>' suffix.
    model_name = model.split(":")[0]
    print(f"\nResultados para o modelo {model_name}:")
    print(resultados[f'relevant_{model_name}'].value_counts())
#dados_filtered = resultados[resultados['relevant'] != 'False']
#dados_filtered.head(3)

In [None]:
# Column names were lower-cased before the run, hence 'document title'.
print(resultados['document title'].head(3))

In [None]:
# Rows that the gemma3 model flagged as relevant.
print(resultados[resultados['relevant_gemma3'] == True])

In [None]:
# The pipeline only produces one flag per model (relevant_<model name>); there is
# no plain 'relevant' column, so selecting it raised KeyError.
# A row is kept here when at least one model flagged it as relevant.
# NOTE(review): use .all(axis=1) instead of .any(axis=1) to require agreement of every model.
colunas_relevancia = [f"relevant_{model.split(':')[0]}" for model in MODELS]
# .eq(True) treats missing/undefined flags as not relevant.
positivos = resultados[resultados[colunas_relevancia].eq(True).any(axis=1)].copy()
print(positivos.head(5))


## Critérios de exclusão