In [296]:
from llama_parse import LlamaParse
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.kdbai import KDBAIVectorStore
from getpass import getpass
import kdbai_client as kdbai

from dotenv import load_dotenv

load_dotenv()

True

In [297]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()

import os
# API access to llama-cloud
# os.environ["LLAMA_CLOUD_API_KEY"] = os.getenv('LLAMA_CLOUD_API_KEY')

KDBAI_ENDPOINT = (os.environ["KDBAI_ENDPOINT"] if "KDBAI_ENDPOINT" in os.environ else input("KDB.AI endpoint: "))
KDBAI_API_KEY = (os.environ["KDBAI_API_KEY"] if "KDBAI_API_KEY" in os.environ else getpass("KDB.AI API key: "))

#connect to KDB.AI
session = kdbai.Session(api_key=KDBAI_API_KEY, endpoint=KDBAI_ENDPOINT)

KDBAIException: Your KDB.AI server is not compatible with this client (kdbai_client==1.5.0).
Please use kdbai_client >=1.6.0 and <=latest.

In [22]:
# Connect with kdbai database
db = session.database("default")

In [31]:
# The schema contains two metadata columns (document_id, text) and one embeddings column
schema = [
        dict(name="document_id", type="str"),
        dict(name="text", type="str"),
        dict(name="embeddings", type="float32s"),
    ]

# indexflat, define the index name, type, column to apply the index to (embeddings)
# and params which include thesearch metric (Euclidean distance), and dims
indexFlat = {
        "name": "flat",
        "type": "flat",
        "column": "embeddings",
        "params": {'dims': 1536, 'metric': 'L2'},
    }

KDBAI_TABLE_NAME = "LlamaParse_Table"

# First ensure the table does not already exist
try:
    db.table(KDBAI_TABLE_NAME).drop()
except kdbai.KDBAIException:
    pass

#Create the table
table = db.create_table(table=KDBAI_TABLE_NAME, schema=schema, indexes=[indexFlat])

In [24]:
EMBEDDING_MODEL  = "text-embedding-3-small"
GENERATION_MODEL = "gpt-4o"

llm = OpenAI(model=GENERATION_MODEL)
embed_model = OpenAIEmbedding(model=EMBEDDING_MODEL)

Settings.llm = llm
Settings.embed_model = embed_model

pdf_file_name = './MACRec.pdf'

In [25]:
# parsing_instructions = '''The document titled "LLM In-Context Recall is Prompt Dependent" is an academic preprint from April 2024, authored by Daniel Machlab and Rick Battle from the VMware NLP Lab. It explores the in-context recall capabilities of Large Language Models (LLMs) using a method called "needle-in-a-haystack," where a specific factoid is embedded in a block of unrelated text. The study investigates how the recall performance of various LLMs is influenced by the content of prompts and the biases in their training data. The research involves testing multiple LLMs with varying context window sizes to assess their ability to recall information accurately when prompted differently. The paper includes detailed methodologies, results from numerous tests, discussions on the impact of prompt variations and training data, and conclusions on improving LLM utility in practical applications. It contains many tables. Answer questions using the information in this article and be precise.'''
# print(parsing_instructions)

documents = LlamaParse(
    result_type="markdown", 
    # parsing_instructions=parsing_instructions
).load_data(pdf_file_name)
# print(documents[0].text[:1000])

Started parsing the file under job_id aa583c54-abd1-4eee-947a-f13adbfc51c2
..

In [26]:
for i, doc in enumerate(documents, start=1):
    print(f"{i}".ljust(100, '-'))
    print(doc.text)

1---------------------------------------------------------------------------------------------------
# MACRec: a Multi-Agent Collaboration Framework for Recommendation

Zhefan Wang∗

DCST, Tsinghua University

Beijing 100084, China

wzf23@mails.tsinghua.edu.cn

Yuanqing Yu∗

DCST, Tsinghua University

Beijing 100084, China

yyq23@mails.tsinghua.edu.cn

Wendi Zheng

DCST, Tsinghua University

Beijing 100084, China

zhengwd23@mails.tsinghua.edu.cn

Weizhi Ma†

AIR, Tsinghua University

Beijing 100084, China

mawz@tsinghua.edu.cn

Min Zhang†

DCST, Tsinghua University

Beijing 100084, China

z-m@tsinghua.edu.cn

arXiv:2402.15235v3 [cs.IR] 1 Nov 2024

# ABSTRACT

LLM-based agents have gained considerable attention for their decision-making skills and ability to handle complex tasks. Recognizing the current gap in leveraging agent capabilities for multi-agent collaboration in recommendation systems, we introduce MACRec, a novel framework designed to enhance recommendation systems through mu

In [27]:
# Parse the documents using MarkdownElementNodeParser
node_parser = MarkdownElementNodeParser(llm=llm, num_workers=8).from_defaults()

# Retrieve nodes (text) and objects (table)
nodes = node_parser.get_nodes_from_documents(documents)

base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

# insert the table markdown into the text of each table object
for i in range(len(objects)):
  objects[i].text = objects[i].obj.text[:]

0it [00:00, ?it/s]
1it [00:00, 18558.87it/s]
1it [00:00, 15650.39it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [32]:
vector_store = KDBAIVectorStore(table)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

#Create the index, inserts base_nodes and objects into KDB.AI
recursive_index = VectorStoreIndex(
    nodes= base_nodes + objects, storage_context=storage_context
)

# Query KDB.AI to ensure the nodes were inserted
table.query()

Unnamed: 0,document_id,text,embeddings
0,2b516ba5-4f80-4723-85bd-1210fc2131e4,MACRec: a Multi-Agent Collaboration Framework ...,"[-0.0010586621, 0.041707043, 0.02925757, 0.029..."
1,a6c378f2-ab9d-46fd-89cb-25f4cf62b35e,"SIGIR ’24, July 14–18, 2024, Washington, DC, U...","[-0.0015073667, 0.03517134, 0.057041712, 0.030..."
2,b1dd3c00-1b68-47fc-917d-d2543f7c36a9,Varying requirements for agents in different s...,"[-0.004967418, 0.037573203, 0.04944687, 0.0059..."
3,6a286a2e-717a-4419-a3e3-fd5c173631a3,MACRec: a Multi-Agent Collaboration Framework ...,"[-0.0118856495, 0.06537797, 0.021507366, 0.003..."
4,2ad81017-66f8-4e74-8598-c902244f810c,Code is available at https://github.com/wzf200...,"[-0.018466502, 0.047101013, 0.060254864, 0.020..."
5,dfda5935-52df-4254-98a0-58761eb9b69a,"SIGIR ’24, July 14–18, 2024, Washington, DC, U...","[-0.027369712, 0.04296594, -0.007082258, 0.006..."
6,224ab1d8-a8f7-4fe9-bdb7-e1a7fa37e50b,"Moreover, given that the answers to the sequen...","[-0.013393156, 0.05124579, 0.016177049, 0.0100..."
7,79c2c0d4-1557-43c8-9d29-5fce9c418069,MACRec: a Multi-Agent Collaboration Framework ...,"[-0.02555791, 0.009939187, 0.06459412, 0.02189..."
8,19d54e30-e5e8-40c1-b1c2-7642ac3adb95,arXiv preprint arXiv:2308.09904 (2023).\n12. P...,"[-0.0062998543, 0.022451159, 0.04597343, 0.013..."
9,5e1b00ab-6c49-417f-80ac-d812d56e0501,2022. Glm-130b: An open bilingual pre-trained ...,"[-0.0017179978, -0.0010304812, 0.04113033, 0.0..."


In [33]:
from openai import OpenAI
client = OpenAI()

def embed_query(query):
    query_embedding = client.embeddings.create(
            input=query,
            model="text-embedding-3-small"
        )
    return query_embedding.data[0].embedding

def retrieve_data(query):
    query_embedding = embed_query(query)
    results = table.search(vectors={'flat':[query_embedding]},n=5,filter=[('<>','document_id','4a9551df-5dec-4410-90bb-43d17d722918')])
    retrieved_data_for_RAG = []
    for index, row in results[0].iterrows():
      retrieved_data_for_RAG.append(row['text'])
    return retrieved_data_for_RAG

def RAG(query):
  question = "You will answer this question based on the provided reference material: " + query
  messages = "Here is the provided context: " + "\n"
  results = retrieve_data(query)
  if results:
    for data in results:
      messages += data + "\n"
  response = client.chat.completions.create(
      model="gpt-4o",
      messages=[
          {"role": "system", "content": question},
          {
          "role": "user",
          "content": [
              {"type": "text", "text": messages},
          ],
          }
      ],
      # max_tokens=300,
  )
  content = response.choices[0].message.content
  return content

In [36]:
print(RAG("이 논문의 핵심은 뭐야? 본문의 내용을 인용/발췌해서 설명해줘. 한글로 대답해."))

이 논문의 핵심은 MACRec이라는 새로운 프레임워크를 소개하는 것입니다. MACRec는 LLM 기반의 다중 에이전트 협업 프레임워크로, 추천 시스템에서의 멀티 에이전트 협업을 통해 추천 작업을 직접적으로 해결하는 것을 목표로 합니다. 이 프레임워크는 매니저, 사용자/아이템 분석가, 반성자, 탐색자, 작업 해석자와 같은 여러 전문화된 에이전트들의 협력을 통해 추천 작업을 수행합니다. MACRec는 평가 예측, 순차적 추천, 대화형 추천, 추천 결과 설명 생성과 같은 다양한 추천 작업에 쉽게 활용할 수 있습니다. 이 프레임워크는 특히 단일 에이전트가 수행하기 어려운 복잡한 의사결정 작업에서 유용하며, 인간의 작업 흐름에 가까운 다중 에이전트 협업이 이를 보다 효과적으로 수행할 수 있다고 설명합니다. "In this work, we propose a novel LLM-based multi-agent collaboration framework for recommendation, called MACRec. Unlike existing studies on using agents for user/item simulation, we directly tackle recommendation tasks through the collaboration of various agents."(본문에서 발췌)


# Reference 파싱

In [618]:
answer = RAG(f"""Find this paper's References. Give me that References with the given json form. Don't return any other comments except that References

EXAMPLE : 
{{
    1 : {{
            "from_paper : 
                            {{
                                "title" : "Language models are few-shot learners",
                                "authors" : "Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al",
                                "source" : "Advances in neural information processing systems 33 (2020), 1877–1901",
                                "year" : 2020
                            }}
    }},
    2 : {{
        ...
    }},
    ...
}}
""")

In [619]:
ref_dict = eval(answer.replace("```json\n", "").replace("```", ""))

In [620]:
ref_dict

{1: {'from_paper': {'title': 'Language models are few-shot learners',
   'authors': 'Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al',
   'source': 'Advances in neural information processing systems 33 (2020), 1877–1901',
   'year': 2020}},
 2: {'from_paper': {'title': 'Trends in distributed artificial intelligence',
   'authors': 'Brahim Chaib-Draa, Bernard Moulin, René Mandiau, and Patrick Millot',
   'source': 'Artificial Intelligence Review 6 (1992), 35–66',
   'year': 1992}},
 3: {'from_paper': {'title': 'Agentverse: Facilitating multi-agent collaboration and exploring emergent behaviors in agents',
   'authors': 'Weize Chen, Yusheng Su, Jingwei Zuo, Cheng Yang, Chenfei Yuan, Chen Qian, Chi-Min Chan, Yujia Qin, Yaxi Lu, Ruobing Xie, et al',
   'source': 'arXiv preprint arXiv:2308.10848 (2023)',
   'year': 2023}},
 4: {'from_paper': {'title': 'Improving Factuality and Rea

# get_citation_count

In [621]:
search_query = scholarly.search_pubs(ref_paper_title)

In [622]:
for result in search_query:
    break

In [623]:
result

{'container_type': 'Publication',
 'source': <PublicationSource.PUBLICATION_SEARCH_SNIPPET: 'PUBLICATION_SEARCH_SNIPPET'>,
 'bib': {'title': 'Gpt-4 technical report',
  'author': ['J Achiam', 'S Adler', 'S Agarwal', 'L Ahmad'],
  'pub_year': '2023',
  'venue': 'arXiv preprint arXiv …',
  'abstract': 'This technical report presents GPT-4, a large multimodal  To test its capabilities in such  scenarios, GPT-4 was evaluated on a  For example, on a simulated bar exam, GPT-4 achieves a'},
 'filled': False,
 'gsrank': 1,
 'pub_url': 'https://arxiv.org/abs/2303.08774',
 'author_id': ['', 'K8mpmWAAAAAJ', '8UZIqcoAAAAJ', ''],
 'url_scholarbib': '/scholar?hl=en&q=info:yMRuEJga_zIJ:scholar.google.com/&output=cite&scirp=0&hl=en',
 'url_add_sclib': '/citations?hl=en&oe=ASCII&xsrf=&continue=/scholar%3Fq%3DGPT-4%2BTechnical%2BReport%26hl%3Den%26oe%3DASCII%26as_sdt%3D0,33&citilm=1&update_op=library_add&info=yMRuEJga_zIJ&ei=nWibZ6KANI2l6rQPhef7qAs&json=',
 'num_citations': 7508,
 'citedby_url': '/schol

In [625]:
result['pub_url']

'https://arxiv.org/abs/2303.08774'

In [626]:
ref_dict[1]

{'from_paper': {'title': 'Language models are few-shot learners',
  'authors': 'Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al',
  'source': 'Advances in neural information processing systems 33 (2020), 1877–1901',
  'year': 2020}}

In [627]:
search_query = scholarly.search_pubs('Language models are few-shot learners')

In [628]:
for result in search_query:
    break

In [674]:
PAPER_COMPARE_PROMPT = """🔹 Task Instruction
Determine whether A paper and B paper refer to the same research work. If they do, respond with "YES"; otherwise, respond with "NO".

When making this judgment, apply the following considerations:

🔹 Considerations for Matching Papers
1. Title Matching (Minor Differences Allowed)
✅ Match the papers even if:

The capitalization, punctuation, or spacing is slightly different.
Example: "GPT-4 Technical Report" vs. "Gpt-4 technical report" → Match
There are minor wording differences that do not change the meaning.
Example: "Large-scale language model society" vs. "Large language model society" → Match
🚨 Do NOT match the papers if:

The core meaning of the title is different.
Example: "GPT-4 Overview" vs. "GPT-3.5 Architecture" → Not the same paper
2. Source Matching (Preprints, Conferences, Journals, DOI, URLs)
✅ Match the papers even if:

One is an arXiv preprint, and the other is a published conference/journal version of the same research.
Example: arXiv preprint arXiv:2303.17760 → NeurIPS 2023 proceedings link → Same research work
The URLs are different but point to the same DOI, arXiv ID, or official publisher repository.
Example:
"https://arxiv.org/abs/2303.08774"
"https://proceedings.neurips.cc/.../2303.08774"
→ Same paper
The conference/journal version is an extended version of an arXiv paper, unless there is major content divergence.
🚨 Do NOT match the papers if:

The DOI/arXiv ID is different, and there is no indication that one is a revision of the other.
One is from a completely different publisher (e.g., IEEE vs. ACL Anthology) without a clear link between them.
3. Author Name Variations (Abbreviations & Institutional Naming Allowed)
✅ Match the papers even if:

Authors use initials instead of full names.
Example: "Guohao Li" vs. "G Li" → Same author
Authors are listed differently between an arXiv preprint and a published paper.
Example: "OpenAI" vs. "J Achiam, S Adler, S Agarwal" → Match if source matches
A company name is used instead of individual authors.
🚨 Do NOT match the papers if:

A completely different research group is listed.
The list of authors has no significant overlap.
4. Edition or Version Differences (Preprint vs. Published Paper)
✅ Match the papers even if:

One version is an early preprint and the other is a peer-reviewed conference/journal version.
The published version contains minor updates or additional experiments but is still based on the same research.
🚨 Do NOT match the papers if:

The newer version substantially changes the research (e.g., different methodology, new experiments, different conclusions).
The preprint was not accepted by the listed conference/journal.

A paper title : {a_paper_title}
A paper authors : {a_paper_authors}
A paper source : {a_paper_source}

B paper title : {b_paper_title}
B paper authors : {b_paper_authors}
B paper source : {b_paper_source}"""


REQUEST_HTML_PARSING_PROMPT = """Parse the given HTML code like the given format. Never answer the other comments but formatted information.

HTML : {one_paper_box_html}

Format example :
{{
    "title" : "Language models are few-shot learners",
    "authors" : "T Brown, B Mann, N Ryder",
    "citation_count" : 39209,
    "link_description" : ""
}}"""

REQUEST_REAL_CITATION_PROMPT = """What is the real citation count of the below paper title and authors?
- Return with given format using only Candidates' information.
- If an exact match for the paper cannot be found in Candidates, say only 'NO'.

### The paper whose citation count I want to know
paper title : {ref_paper_title}
paper authors : {ref_paper_authors}

### Candidates
{request_box_collect}

### Return Format
{{
    "title" : ,
    "authors" : ,
    "citation_count" : {{
                        'value' : citation_count,
                        }}
}}"""


In [679]:
import threading
import time
from scholarly import scholarly

# 검색할 논문 제목
ref_paper_title = "GPT-4 Technical Report"

# 타임아웃 설정 (예: 10초)
TIMEOUT = 5

# 결과 저장 변수
search_result = None

# ✅ 함수 실행을 위한 쓰레드 클래스
class ScholaryThread(threading.Thread):
    def __init__(self):
        super().__init__()
        self.result = None

    def run(self):
        try:
            # search_pubs 실행
            self.result = scholarly.search_pubs(ref_paper_title)
        except Exception as e:
            self.result = None

# ✅ 실행 시간 측정 및 타임아웃 적용
def search_with_timeout():
    global search_result

    thread = ScholaryThread()
    thread.start()
    thread.join(TIMEOUT)  # 타임아웃 적용

    if thread.is_alive():
        print("⏳ Timeout exceeded! Moving to the next step.")
        thread.join(0)  # 강제 종료
    else:
        search_result = thread.result  # 검색 결과 저장

# ✅ 실행
search_with_timeout()

# ✅ 결과 확인
if search_result:
    print("✅ Search successful!")
else:
    print("⚠ No results found or timeout occurred.")


⏳ Timeout exceeded! Moving to the next step.
⚠ No results found or timeout occurred.


In [677]:
search_query = scholarly.search_pubs(ref_paper_title)

KeyboardInterrupt: 

In [681]:
from scholarly import scholarly
import requests
from bs4 import BeautifulSoup

import threading
import time
from scholarly import scholarly
from datetime import datetime

def get_citation_count_using_scholarly(
    ref_paper_title, 
    ref_paper_authors, 
    ref_paper_source,
    comp_try_limit = 20,
    TIMEOUT = 20):  # 타임아웃 설정 추가
    
    print(f"\tSearch using scholarly")

    # ✅ 검색 수행을 위한 쓰레드 클래스
    class ScholarlyThread(threading.Thread):
        def __init__(self):
            super().__init__()
            self.result = None

        def run(self):
            try:
                self.result = scholarly.search_pubs(ref_paper_title)
            except Exception as e:
                self.result = None

    # ✅ 실행 시간 측정 및 타임아웃 적용
    def search_with_timeout():
        thread = ScholarlyThread()
        thread.start()
        thread.join(TIMEOUT)  # 타임아웃 적용

        if thread.is_alive():
            print(f"⏳ Timeout {TIMEOUT} sec exceeded! Moving to the next step.")
            return None  # 타임아웃 발생 시 None 반환
        return thread.result  # 성공하면 결과 반환

    # ✅ Google Scholar에서 논문 검색 (타임아웃 적용)
    search_query = search_with_timeout()
    
    if search_query is None:
        print(f"\tscholarly search timed out or failed")
        return None
    
    cnt = 0
    if not len(search_query._rows):  # 검색 결과 없을 때
        print(f"\tscholarly no result")
        return None
        
    for result in search_query:
        cnt += 1
        b_paper_title = result['bib']['title']
        b_paper_authors = ', '.join(result['bib']['author'])
        b_paper_source = result.get('pub_url', '')

        # ✅ 논문 비교 (paper_compare 함수 사용)
        if paper_compare(ref_paper_title, ref_paper_authors, ref_paper_source, 
                         b_paper_title, b_paper_authors, b_paper_source) == 'YES':
            citation_count = result.get('num_citations', 0)  # 인용수 가져오기
            return {
                "title": b_paper_title,
                "authors": b_paper_authors,
                "citation_count": {
                    'value': citation_count,
                    'date': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
            }

        # 검색 횟수 제한 도달 시 종료
        if cnt >= comp_try_limit:
            print(f"scholarly couldn't find a match within {comp_try_limit} attempts")
            return None


In [709]:
    print(f"\tSearch using Request")
    url = f"https://scholar.google.com/scholar?q={ref_paper_title}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)

	Search using Request


In [710]:
(response.status_code == 200) and (response.text != "")

True

In [711]:
# HTML 파싱
soup = BeautifulSoup(response.text, "html.parser")

# 논문 제목과 관련 정보 추출
results = soup.select(".gs_ri")
print(f"\t# of results : {len(results)}")

	# of results : 0


In [712]:

def get_citation_count_using_request(ref_paper_title, ref_paper_authors):
    print(f"\tSearch using Request")
    url = f"https://scholar.google.com/scholar?q={ref_paper_title}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    
    # 상태 코드 확인
    if (response.status_code == 200) and (response.text != ""):
        # HTML 파싱
        soup = BeautifulSoup(response.text, "html.parser")
        
        # 논문 제목과 관련 정보 추출
        results = soup.select(".gs_ri")
        print(f"\t# of results : {len(results)}")
        if not len(results):
            return None
        request_box_collect = ""
        for one_paper_box_html in results:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": REQUEST_HTML_PARSING_PROMPT.format(one_paper_box_html=one_paper_box_html)
                    },
                ]
            )
            answer = response.choices[0].message.content
            request_box_collect += answer.replace("```json", "").replace("```", "") + "\n"
            
        print(f"\trequest_box_collect : \n\t{request_box_collect}")
        answer = llm.invoke(
            REQUEST_REAL_CITATION_PROMPT.format(
                ref_paper_title=ref_paper_title,
                ref_paper_authors=ref_paper_authors,
                request_box_collect=request_box_collect
            )
        )
        if answer.content != 'NO':
            answer_dict = eval(answer.content.replace("```json", "").replace("```", ""))
            answer_dict['citation_count']['date'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            return answer_dict
        else:
            return None

    else:
        print(f"get_citation_count_using_scholarly : BAD Response")
        return None


def paper_compare(ref_paper_title, ref_paper_authors, ref_paper_source, b_paper_title, b_paper_authors, b_paper_source):
    print(f"\tΓref_paper_title : {ref_paper_title}({ref_paper_authors[:20]}...)\n\tL  b_paper_title : {b_paper_title}({b_paper_authors[:20]}...)")
    prompt = PAPER_COMPARE_PROMPT.format(
        a_paper_title=ref_paper_title,
        a_paper_authors=ref_paper_authors,
        a_paper_source=ref_paper_source,
        b_paper_title=b_paper_title,  # 오타 수정
        b_paper_authors=b_paper_authors,
        b_paper_source=b_paper_source,
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": prompt
            },
        ]
    )
    answer = response.choices[0].message.content
    print(f"\t{answer}")
    return answer

## 실행

In [702]:
for i, one_ref_info in ref_dict.items():
    if 'from_scholary' in one_ref_info:
        del one_ref_info['from_scholary']
    if 'from_request' in one_ref_info:
        del one_ref_info['from_request']

In [698]:
for i, one_ref_info in ref_dict.items():
    
    ref_paper_title = one_ref_info['from_paper']['title']
    ref_paper_authors = one_ref_info['from_paper']['authors']
    ref_paper_source = one_ref_info['from_paper']['source']
    print(f"".ljust(120, '-'))
    break
    scholary_result = get_citation_count_using_scholarly(ref_paper_title, ref_paper_authors, ref_paper_source)

    if scholary_result is not None:
        ref_dict[i]['from_scholary'] = scholary_result
    else:
        request_result = get_citation_count_using_request(ref_paper_title, ref_paper_authors)
        if request_result is not None:
            ref_dict[i]['from_request'] = request_result

    pprint(f"from_scholary : {ref_dict[i]['from_scholary'] if 'from_scholary' in ref_dict[i] else ''}")
    pprint(f"from_request : {ref_dict[i]['from_request'] if 'from_request' in ref_dict[i] else ''}")

------------------------------------------------------------------------------------------------------------------------


## tavily_search

In [292]:
import os
from langchain_community.retrievers import TavilySearchAPIRetriever

In [421]:
PAPER_URL_SEARCH_PROMPT = """The Paper '{paper_title}'s official webpage"""

def tavily_search(paper_title, k=3):
    
    retriever = TavilySearchAPIRetriever(k=k)
    result = retriever.invoke(PAPER_URL_SEARCH_PROMPT.format(paper_title=paper_title))

    """
    result

    [
        Document(
            metadata={
                        'title': '[PDF] Intelligent agents: theory ...',
                        'source': 'https://www.semanticscholar.org...',
                        'score': 0.67885995,
                        'images': [],
                     },
            page_content='The aim of thi...'
                )
         Document(...)
         Document(...),
    ]

    """
    
    return result

In [323]:
import requests
from bs4 import BeautifulSoup

# URL 지정
url = "https://aclanthology.org/2024.naacl-long.24/"

# HTTP GET 요청
response = requests.get(url)

# 상태 코드 확인
if response.status_code == 200:
    # HTML 파싱
    soup = BeautifulSoup(response.text, "html.parser")
    
    # 제목 추출 (예: 페이지의 `<title>` 태그)
    title = soup.title.string
    
    # 본문 내용 추출
    # ACL Anthology의 논문 본문은 <div id="content"> 내부에 들어 있음
    content_div = soup.find("div", {"id": "content"})
    
    if content_div:
        content_text = content_div.get_text(separator="\n").strip()  # 줄바꿈으로 텍스트 구분
        print("Page Title:", title)
        print("\nContent:\n")
        print(content_text)
    else:
        print("Content section not found.")
else:
    print(f"Failed to fetch the page. Status code: {response.status_code}")


Content section not found.


## parse_conference_paper_info

In [414]:
from llama_index.core.node_parser import HTMLNodeParser
from llama_index.core import Document


CONFERENCE_PAPER_PARSING_PROMPT = """Based on the information from the given website, return it in the provided format. Do not say anything else.

### Example
{{
    "conference" : "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
    "conference_abbreviation" : "NAACL",
    "published_year" : 2024,
    "published_month" : 6
}}

### Website's text
{page_content}

### Answer
{{
    "title" : "",
    "conference" : "",
    "conference_abbreviation" : "",
    "published_year" : ,
    "published_month" : 
}}

"""

def parse_conference_paper_info(url):
    try:
        response = requests.get(url)
    except:
        return None

    # 상태 코드 확인
    if response.status_code == 200:
        html_content = response.text  # HTML 원본 가져오기
        document = Document(text=html_content)
        parser = HTMLNodeParser()
        nodes = parser.get_nodes_from_documents([document])
        parsed_page_content = ""
        for node in nodes:
            parsed_page_content += node.get_text() + "\n"
        answer = llm.invoke(CONFERENCE_PAPER_PARSING_PROMPT.format(page_content=parsed_page_content))
        return answer.content
    else:
        return None

In [431]:
tavily_result = tavily_search(paper_title=ref_paper_title, k=10)

In [439]:
COMPREHENSIVE_DECISION_PROMPT = """Based on the website below and the information retrieved from it, make a comprehensive judgment and determine the accurate Conference, Conference abbreviation, Published year, and Published month for the paper below, then return the results. Do not say anything else.

### Paper's title : {paper_title}

### Information from Website

{information_from_website}

### Format example
{{
    "title" : "",
    "conference" : "",
    "conference_abbreviation" : "",
    "published_year" : ,
    "published_month" : 
}}
"""

def comprehensive_decision(paper_title, )

information_from_website = ""
for d in tavily_result:
    information_from_website += '\n webpage : ' + d.metadata['title']
    information_from_website += '\n url : ' + d.metadata['source']
    information_from_website += '\n' + parse_conference_paper_info(url=d.metadata['source'])
    information_from_website += '\n'

answer = llm.invoke(
    COMPREHENSIVE_DECISION_PROMPT.format(
        paper_title=paper_title,
        information_from_website=information_from_website
    )
)

In [455]:
ref_dict[i]

{'title': 'Language models are few-shot learners',
 'author(s)': 'Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al',
 'conference': 'Advances in neural information processing systems 33 (2020), 1877–1901',
 'year': 2020,
 'citation_count': {'value': 39232, 'date': '2025-01-30 18:28:04'}}

In [454]:
eval(answer.content)

{'title': 'Language Models are Few-Shot Learners',
 'conference': 'NeurIPS',
 'conference_abbreviation': 'NeurIPS',
 'published_year': 2020,
 'published_month': 12}

In [440]:
print(information_from_website)


 webpage : Paper Summary: Language Models are Few-Shot Learners
 url : https://queirozf.com/entries/paper-summary-language-models-are-few-shot-learners
{
    "title" : "Language Models are Few-Shot Learners",
    "conference" : "",
    "conference_abbreviation" : "",
    "published_year" : 2020,
    "published_month" : 5
}

 webpage : Review for NeurIPS paper: Language Models are Few-Shot Learners
 url : https://proceedings.neurips.cc/paper_files/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-MetaReview.html
{
    "title" : "Language Models are Few-Shot Learners",
    "conference" : "NeurIPS",
    "conference_abbreviation" : "NeurIPS",
    "published_year" : 2020,
    "published_month" : 12
}

 webpage : [2005.14165] Language Models are Few-Shot Learners - arXiv.org
 url : https://arxiv.org/abs/2005.14165
{
    "title" : "Language Models are Few-Shot Learners",
    "conference" : "",
    "conference_abbreviation" : "",
    "published_year" : ,
    "published_month" : 
}

 webpage : 

In [281]:
ref_dict_with_cnt = {}
for i, one_ref in ref_dict.items():
    if ('citation_count' in one_ref) and (one_ref['citation_count']['value'] is not None):
        ref_dict_with_cnt[i] = one_ref

In [None]:
{
  'Title': 'MACRec: A Multi-Agent Collaboration Framework for Recommendation',
  'Author(s)': 'Z. Wang, Y. Yu, W. Zheng, W. Ma, M. Zhang',
  'Conference': 'Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval (2024, July), 2760-2764',
  'citation_count': {'value': 'unknown', 'date': '2025-01-22 16:50:59'}
}


In [282]:
ref_dict_with_cnt

{1: {'Title': 'Language models are few-shot learners',
  'Author(s)': 'Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al.',
  'Conference': 'Advances in neural information processing systems 33 (2020), 1877–1901',
  'citation_count': {'value': 39209, 'date': '2025-01-28 21:45:37'}},
 2: {'Title': 'Trends in distributed artificial intelligence',
  'Author(s)': 'Brahim Chaib-Draa, Bernard Moulin, René Mandiau, and Patrick Millot',
  'Conference': 'Artificial Intelligence Review 6 (1992), 35–66',
  'citation_count': {'value': 282, 'date': '2025-01-28 21:45:46'}},
 3: {'Title': 'Agentverse: Facilitating multi-agent collaboration and exploring emergent behaviors in agents',
  'Author(s)': 'Weize Chen, Yusheng Su, Jingwei Zuo, Cheng Yang, Chenfei Yuan, Chen Qian, Chi-Min Chan, Yujia Qin, Yaxi Lu, Ruobing Xie, et al.',
  'Conference': 'arXiv preprint arXiv:2308.10848 (2023)',
  'citat

# Requests

In [248]:
from pprint import pprint

In [245]:
import requests
from bs4 import BeautifulSoup

def get_citation_count_using_scholarly(ref_paper_title):
    url = f"https://scholar.google.com/scholar?q={ref_paper_title}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    
    # 상태 코드 확인
    if response.status_code == 200:
        # HTML 파싱
        soup = BeautifulSoup(response.text, "html.parser")
        
        # 논문 제목과 관련 정보 추출
        results = soup.select(".gs_ri")
        for one_paper_box_html in results:
            html_code = results
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": f"""Parse the given HTML code like the given format. Never answer the other comments but formatted information.
                        
                        HTML : {one_paper_box_html}
                        
                        Format example : 
                        {{
                            "title" : "Language models are few-shot learners",
                            "authors" : "T Brown, B Mann, N Ryder",
                            "citation_count" : 39209
                        }}
                        """
                    },
                ]
            )
            answer = response.choices[0].message.content
            llm_parsed_result = eval(answer.replace("```json", "").replace("```", ""))
            
    else:
        print(f"Failed to fetch the page. Status code: {response.status_code}")

# 테스트 실행
request_google_scholar_url("BERT language model")


Result 1:
Title: Patent classification by fine-tuning BERT language model
Authors and Year: JS Lee, J Hsiang - World Patent Information, 2020 - Elsevier
Snippet: … In this work we focus on fine-tuning a pre-trained BERT model and applying it to patent … Our 
contributions include: (1) a new state-of-the-art result based on pre-trained BERT model and …
Link: https://www.sciencedirect.com/science/article/pii/S0172219019300742
Citation Count: 0
--------------------------------------------------------------------------------
Result 2:
Title: BERT has a mouth, and it must speak: BERT as a Markov random field language model
Authors and Year: A Wang, K Cho - arXiv preprint arXiv:1902.04094, 2019 - arxiv.org
Snippet: … these questions by showing that BERT is a combination of a Markov random field language 
… BERT are well-formed and are assigned high probabilities by an off-theshelf language model…
Link: https://arxiv.org/abs/1902.04094
Citation Count: 0
---------------------------------------



In [232]:
query = ref_paper_title
query

'Trends in distributed artificial intelligence'

In [233]:
url = f"https://scholar.google.com/scholar?q={query}"

In [234]:
url

'https://scholar.google.com/scholar?q=Trends in distributed artificial intelligence'

In [235]:
response = requests.get(url, headers=headers)

In [236]:
response

<Response [200]>

In [237]:
soup = BeautifulSoup(response.text, "html.parser")

In [238]:
results = soup.select(".gs_ri")

In [240]:
for i, result in enumerate(results):
    title = result.select_one(".gs_rt").text
    authors_and_year = result.select_one(".gs_a").text
    snippet = result.select_one(".gs_rs").text if result.select_one(".gs_rs") else "No snippet available"
    link = result.select_one(".gs_rt a")["href"] if result.select_one(".gs_rt a") else "No link available"

    print(f"Result {i+1}:")
    print(f"Title: {title}")
    print(f"Authors and Year: {authors_and_year}")
    print(f"Snippet: {snippet}")
    print(f"Link: {link}")

Result 1:
Title: Trends in distributed artificial intelligence
Authors and Year: B Chaib-Draa, B Moulin, R Mandiau, P Millot - Artificial Intelligence Review, 1992 - Springer
Snippet: Distributed artificial intelligence (DAI) is a subfield of artificial intelligence that deals with interactions of intelligent agents. Precisely, DAI attempts to construct intelligent agents that make decisions that allow them to achieve their goals in a world populated by other intelligent agents with their own goals. This paper discusses major concepts used in DAI today. To do this, a taxonomy of DAI is presented, based on the social abilities of an individual agent, the organization of agents, and the dynamics of this organization through …
Link: https://link.springer.com/article/10.1007/BF00155579


In [205]:
import requests
from bs4 import BeautifulSoup

# Google Scholar 검색 URL
query = "MACRec: A Multi-Agent Collaboration Framework for Recommendation"
url = f"https://scholar.google.com/scholar?q={query}"

# 요청 헤더 설정 (실제 브라우저에서의 요청처럼 위장)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}

# GET 요청 보내기
response = requests.get(url, headers=headers)

# 상태 코드 확인
if response.status_code == 200:
    # HTML 파싱
    soup = BeautifulSoup(response.text, "html.parser")

    # 논문 제목과 관련 정보 추출
    results = soup.select(".gs_ri")
    for i, result in enumerate(results):
        title = result.select_one(".gs_rt").text
        authors_and_year = result.select_one(".gs_a").text
        snippet = result.select_one(".gs_rs").text if result.select_one(".gs_rs") else "No snippet available"
        link = result.select_one(".gs_rt a")["href"] if result.select_one(".gs_rt a") else "No link available"

        print(f"Result {i+1}:")
        print(f"Title: {title}")
        print(f"Authors and Year: {authors_and_year}")
        print(f"Snippet: {snippet}")
        print(f"Link: {link}")
        print("-" * 80)
else:
    print(f"Failed to fetch the page. Status code: {response.status_code}")


In [204]:
results

[]

In [206]:
# 응답 HTML 저장 및 확인
with open("response.html", "w", encoding="utf-8") as f:
    f.write(response.text)

print("Response saved to response.html. Open it in a browser to check if it's a Captcha page.")


Response saved to response.html. Open it in a browser to check if it's a Captcha page.


# References

https://medium.com/kx-systems/rag-llamaparse-advanced-pdf-parsing-for-retrieval-c393ab29891b

https://www.devkuma.com/docs/d3-js/append/