In [1]:
from llama_parse import LlamaParse
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.kdbai import KDBAIVectorStore
from getpass import getpass
import kdbai_client as kdbai

from dotenv import load_dotenv

load_dotenv()

True

In [2]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()

import os
# API access to llama-cloud
# os.environ["LLAMA_CLOUD_API_KEY"] = os.getenv('LLAMA_CLOUD_API_KEY')

KDBAI_ENDPOINT = (os.environ["KDBAI_ENDPOINT"] if "KDBAI_ENDPOINT" in os.environ else input("KDB.AI endpoint: "))
KDBAI_API_KEY = (os.environ["KDBAI_API_KEY"] if "KDBAI_API_KEY" in os.environ else getpass("KDB.AI API key: "))

#connect to KDB.AI
session = kdbai.Session(api_key=KDBAI_API_KEY, endpoint=KDBAI_ENDPOINT)

In [3]:
# Connect with kdbai database
db = session.database("default")

In [4]:
table = db.table("LlamaParse_Table")

In [5]:
table.query()

Unnamed: 0,document_id,text,embeddings


In [6]:
# The schema contains two metadata columns (document_id, text) and one embeddings column
schema = [
        dict(name="document_id", type="str"),
        dict(name="text", type="str"),
        dict(name="embeddings", type="float32s"),
    ]

# indexflat, define the index name, type, column to apply the index to (embeddings)
# and params which include thesearch metric (Euclidean distance), and dims
indexFlat = {
        "name": "flat",
        "type": "flat",
        "column": "embeddings",
        "params": {'dims': 1536, 'metric': 'L2'},
    }

KDBAI_TABLE_NAME = "LlamaParse_Table"

# First ensure the table does not already exist
try:
    db.table(KDBAI_TABLE_NAME).drop()
except kdbai.KDBAIException:
    pass

#Create the table
table = db.create_table(table=KDBAI_TABLE_NAME, schema=schema, indexes=[indexFlat])

In [7]:
EMBEDDING_MODEL  = "text-embedding-3-small"
GENERATION_MODEL = "gpt-4o"

llm = OpenAI(model=GENERATION_MODEL)
embed_model = OpenAIEmbedding(model=EMBEDDING_MODEL)

Settings.llm = llm
Settings.embed_model = embed_model

pdf_file_name = './MACRec.pdf'

In [11]:
# parsing_instructions = '''The document titled "LLM In-Context Recall is Prompt Dependent" is an academic preprint from April 2024, authored by Daniel Machlab and Rick Battle from the VMware NLP Lab. It explores the in-context recall capabilities of Large Language Models (LLMs) using a method called "needle-in-a-haystack," where a specific factoid is embedded in a block of unrelated text. The study investigates how the recall performance of various LLMs is influenced by the content of prompts and the biases in their training data. The research involves testing multiple LLMs with varying context window sizes to assess their ability to recall information accurately when prompted differently. The paper includes detailed methodologies, results from numerous tests, discussions on the impact of prompt variations and training data, and conclusions on improving LLM utility in practical applications. It contains many tables. Answer questions using the information in this article and be precise.'''
# print(parsing_instructions)

documents = LlamaParse(
    result_type="markdown", 
    # parsing_instructions=parsing_instructions
).load_data(pdf_file_name)
# print(documents[0].text[:1000])

Started parsing the file under job_id 4e55fd42-38b3-46d9-943c-a859f8a69d8b


In [12]:
type(documents)

list

In [13]:
len(documents)

5

In [14]:
for i, doc in enumerate(documents, start=1):
    print(f"{i}".ljust(100, '-'))
    print(doc.text)

1---------------------------------------------------------------------------------------------------
# MACRec: a Multi-Agent Collaboration Framework for Recommendation

Zhefan Wang∗, Yuanqing Yu∗, Wendi Zheng

DCST, Tsinghua University, Beijing 100084, China

wzf23@mails.tsinghua.edu.cn, yyq23@mails.tsinghua.edu.cn, zhengwd23@mails.tsinghua.edu.cn

Weizhi Ma†, Min Zhang†

AIR, Tsinghua University, Beijing 100084, China

mawz@tsinghua.edu.cn, z-m@tsinghua.edu.cn

arXiv:2402.15235v3 [cs.IR] 1 Nov 2024

# ABSTRACT

LLM-based agents have gained considerable attention for their decision-making skills and ability to handle complex tasks. Recognizing the current gap in leveraging agent capabilities for multi-agent collaboration in recommendation systems, we introduce MACRec, a novel framework designed to enhance recommendation systems through multi-agent collaboration. Unlike existing work on using agents for user/item simulation, we aim to deploy multi-agents to tackle recommendation tasks d

In [15]:
# Parse the documents using MarkdownElementNodeParser
node_parser = MarkdownElementNodeParser(llm=llm, num_workers=8).from_defaults()

In [16]:
# Retrieve nodes (text) and objects (table)
nodes = node_parser.get_nodes_from_documents(documents)

0it [00:00, ?it/s]
1it [00:00, 47662.55it/s]
1it [00:00, 23172.95it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [17]:
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [18]:
# insert the table markdown into the text of each table object
for i in range(len(objects)):
  objects[i].text = objects[i].obj.text[:]

In [19]:
vector_store = KDBAIVectorStore(table)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

#Create the index, inserts base_nodes and objects into KDB.AI
recursive_index = VectorStoreIndex(
    nodes= base_nodes + objects, storage_context=storage_context
)

# Query KDB.AI to ensure the nodes were inserted
table.query()

Unnamed: 0,document_id,text,embeddings
0,0aa38090-ee93-42fe-a6b7-bffd5fa179db,MACRec: a Multi-Agent Collaboration Framework ...,"[-0.00051903527, 0.04135387, 0.030073179, 0.02..."
1,60efd98c-1205-46ff-81aa-190e75e3090e,"SIGIR ’24, July 14–18, 2024, Washington, DC, U...","[-0.0015073667, 0.03517134, 0.057041712, 0.030..."
2,094e5cd1-cc75-4f87-9dd0-fa2affd3ec6d,Varying requirements for agents in different s...,"[-0.0076043373, 0.034892056, 0.04205691, 0.007..."
3,98cc21b6-3aae-4cba-abbf-60f9e0c86afb,MACRec: a Multi-Agent Collaboration Framework ...,"[-0.0117012225, 0.063337676, 0.025426285, 0.00..."
4,32082294-1570-4e6f-a9b7-6b3b869acd97,Code is available at https://github.com/wzf200...,"[-0.018466502, 0.047101013, 0.060254864, 0.020..."
5,ead7f736-b1fc-4126-a4e4-ddf08c6689ad,"SIGIR ’24, July 14–18, 2024, Washington, DC, U...","[-0.021346562, 0.053664234, 0.009454269, 0.010..."
6,06a73b29-f5da-4ba6-a9b3-a54b3c78d80e,MACRec: a Multi-Agent Collaboration Framework ...,"[-0.025487937, 0.009967705, 0.06465233, 0.0219..."
7,a86ac1a9-0246-4861-bec8-bfb60f0214ed,arXiv preprint arXiv:2308.09904 (2023).\n12. P...,"[-0.0062998543, 0.022451159, 0.04597343, 0.013..."
8,cffc1ed8-b272-415c-8673-b745fa9cd3cb,2022. Glm-130b: An open bilingual pre-trained ...,"[-0.0017375047, -0.0010577497, 0.041115653, 0...."
9,05716c1c-e476-4c1a-8de0-6be470dd7bfb,This table compares different models based on ...,"[-0.04859718, 0.03094295, 0.049393404, -0.0158..."


In [None]:
recursive_index

In [20]:
from openai import OpenAI
client = OpenAI()

def embed_query(query):
    query_embedding = client.embeddings.create(
            input=query,
            model="text-embedding-3-small"
        )
    return query_embedding.data[0].embedding

def retrieve_data(query):
    query_embedding = embed_query(query)
    results = table.search(vectors={'flat':[query_embedding]},n=5,filter=[('<>','document_id','4a9551df-5dec-4410-90bb-43d17d722918')])
    retrieved_data_for_RAG = []
    for index, row in results[0].iterrows():
        retrieved_data_for_RAG.append(row['text'])
    return retrieved_data_for_RAG

def RAG(query):
    question = "You will answer this question based on the provided reference material: " + query
    messages = "Here is the provided context: " + "\n"
    results = retrieve_data(query)
    if results:
        for data in results:
            messages += data + "\n"
    response = client.chat.completions.create(
      model="gpt-4o",
      messages=[
          {"role": "system", "content": question},
          {
          "role": "user",
          "content": [
              {"type": "text", "text": messages},
          ],
          }
      ],
      # max_tokens=300,
    )
    content = response.choices[0].message.content
    return content

In [21]:
print(RAG("이 논문의 핵심은 뭐야? 본문의 내용을 인용/발췌해서 설명해줘. 한글로 대답해."))

이 논문의 핵심은 MACRec라는 프레임워크를 제안하여 추천 시스템에서의 다중 에이전트 협업을 통해 개선을 이루는 것입니다. 기존의 연구들은 주로 에이전트를 사용자나 아이템의 시뮬레이션에 사용하여 사용자의 선호도를 분석하는 데 집중하지만, MACRec는 여러 전문화된 에이전트들이 각 추천 업무를 직접적으로 해결하는 방식으로 접근합니다. 이 프레임워크에서는 매니저, 사용자/아이템 분석가, 반사기능자(Reflector), 탐색자(Searcher), 임무 해석가(Task Interpreter) 등의 다양한 에이전트들이 협업하여 효율적으로 추천 업무를 수행합니다. MACRec는 평가 예측(Rating Prediction), 순차적 추천(Sequential Recommendation), 대화형 추천(Conversational Recommendation), 설명 생성(Explanation Generation)과 같은 다양한 추천 업무에 쉽게 적용될 수 있는 사례를 제공합니다.

"추천 작업은 매니저, 사용자/아이템 분석가, 반사기능자, 탐색자, 임무 해석가 등의 다양한 전문화된 에이전트들의 협력적 노력을 통해 해결됩니다."


# Reference 파싱

In [25]:
source_paper_answer = RAG(f"""Find this paper's title, authors like example.

EXAMPLE : 
{{
    "from_paper : 
                    {{
                        "title" : "Language models are few-shot learners",
                        "authors" : "Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al",
                    }}
}}

""")

In [31]:
source_dict = eval(source_paper_answer.replace("```json", "").replace("```", ""))

In [32]:
source_dict

{'from_paper': {'title': 'MACRec: a Multi-Agent Collaboration Framework for Recommendation',
  'authors': 'Zhefan Wang, Yuanqing Yu, Wendi Zheng, Weizhi Ma, Min Zhang'}}

In [33]:
answer = RAG(f"""Find this paper's References. Give me that References with the given json form. Don't return any other comments except that References

EXAMPLE : 
{{
    1 : {{
            "from_paper : 
                            {{
                                "title" : "Language models are few-shot learners",
                                "authors" : "Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al",
                                "source" : "Advances in neural information processing systems 33 (2020), 1877–1901",
                                "year" : 2020
                            }}
    }},
    2 : {{
        ...
    }},
    ...
}}
""")

In [23]:
ref_dict = eval(answer.replace("```json\n", "").replace("```", ""))

## neo4j 데이터 입력

In [41]:
from neo4j import GraphDatabase
import os

# Neo4j 연결 설정
uri = "neo4j+s://350e8633.databases.neo4j.io"
username = "neo4j"
password = os.getenv("NEO4J_PASSWORD")

# 드라이버 생성
driver = GraphDatabase.driver(uri, auth=(username, password))

# 쿼리 실행 예제
def print_nodes(tx):
    result = tx.run("MATCH (n) RETURN n LIMIT 10")
    for record in result:
        print(record)

# 세션 열고 쿼리 실행
with driver.session() as session:
    session.execute_read(print_nodes)  # ✅ 최신 방식으로 변경

# 드라이버 종료
driver.close()


In [44]:
source_dict

{'from_paper': {'title': 'MACRec: a Multi-Agent Collaboration Framework for Recommendation',
  'authors': 'Zhefan Wang, Yuanqing Yu, Wendi Zheng, Weizhi Ma, Min Zhang'}}

In [54]:
title = source_dict['from_paper']['title']

In [55]:
authors = source_dict['from_paper']['authors']

In [66]:
def _create_paper_if_not_exists(tx, title, authors):
    query = """
    MERGE (p:Paper {title: $title})
    ON CREATE SET p.authors = $authors
    RETURN p.title, p.authors
    """
    result = tx.run(query, title=title, authors=authors)
    print(f"result : {result}")
    # ✅ 실행 결과 확인 후 반환
    record = result.single()
    print(f"record : {record}")
    if record:
        return {"title": record["p.title"], "authors": record["p.authors"]}
    
    return None

In [67]:
with driver.session() as session:
    result = session.execute_write(
        _create_paper_if_not_exists, title, authors
    )




  with driver.session() as session:


result : <neo4j._sync.work.result.Result object at 0x305f42950>
record : <Record p.title='MACRec: a Multi-Agent Collaboration Framework for Recommendation' p.authors='Zhefan Wang, Yuanqing Yu, Wendi Zheng, Weizhi Ma, Min Zhang'>


In [65]:
result

{'title': 'MACRec: a Multi-Agent Collaboration Framework for Recommendation',
 'authors': 'Zhefan Wang, Yuanqing Yu, Wendi Zheng, Weizhi Ma, Min Zhang'}

In [None]:
query = """
MERGE (p:Paper {title: $title})
ON CREATE SET p.authors = $authors
RETURN p
"""
result = tx.run(query, title=title, authors=authors)

In [49]:
def add_node(tx, title, authors):
    query = "CREATE (p:Paper {title: $title, authors: $authors}) RETURN p"
    result = tx.run(query, title=title, authors=authors)
    return result.single()[0]  # 생성된 노드 반환

with driver.session() as session:
    person = session.execute_write(add_node, source_dict['from_paper']['title'], source_dict['from_paper']['authors'])
    print(person)


  with driver.session() as session:


<Node element_id='4:c3bad7c6-841c-4ced-8b4a-50bdf00a8cbe:0' labels=frozenset({'Paper'}) properties={'title': 'MACRec: a Multi-Agent Collaboration Framework for Recommendation', 'authors': 'Zhefan Wang, Yuanqing Yu, Wendi Zheng, Weizhi Ma, Min Zhang'}>


In [24]:
ref_dict

{1: {'from_paper': {'title': 'Language models are few-shot learners',
   'authors': 'Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al.',
   'source': 'Advances in neural information processing systems 33 (2020), 1877–1901',
   'year': 2020}},
 2: {'from_paper': {'title': 'Trends in distributed artificial intelligence',
   'authors': 'Brahim Chaib-Draa, Bernard Moulin, René Mandiau, and Patrick Millot',
   'source': 'Artificial Intelligence Review 6 (1992), 35–66',
   'year': 1992}},
 3: {'from_paper': {'title': 'Agentverse: Facilitating multi-agent collaboration and exploring emergent behaviors in agents',
   'authors': 'Weize Chen, Yusheng Su, Jingwei Zuo, Cheng Yang, Chenfei Yuan, Chen Qian, Chi-Min Chan, Yujia Qin, Yaxi Lu, Ruobing Xie, et al.',
   'source': 'arXiv preprint arXiv:2308.10848 (2023)',
   'year': 2023}},
 4: {'from_paper': {'title': 'Improving Factuality and R

In [51]:
driver

<neo4j._sync.driver.Neo4jDriver at 0x30528a350>

# 인용수 검색

## get_citation_count(인용수 검색)

In [21]:
PAPER_COMPARE_PROMPT = """🔹 Task Instruction
Determine whether A paper and B paper refer to the same research work. If they do, respond with "YES"; otherwise, respond with "NO".

When making this judgment, apply the following considerations:

🔹 Considerations for Matching Papers
1. Title Matching (Minor Differences Allowed)
✅ Match the papers even if:

The capitalization, punctuation, or spacing is slightly different.
Example: "GPT-4 Technical Report" vs. "Gpt-4 technical report" → Match
There are minor wording differences that do not change the meaning.
Example: "Large-scale language model society" vs. "Large language model society" → Match
🚨 Do NOT match the papers if:

The core meaning of the title is different.
Example: "GPT-4 Overview" vs. "GPT-3.5 Architecture" → Not the same paper
2. Source Matching (Preprints, Conferences, Journals, DOI, URLs)
✅ Match the papers even if:

One is an arXiv preprint, and the other is a published conference/journal version of the same research.
Example: arXiv preprint arXiv:2303.17760 → NeurIPS 2023 proceedings link → Same research work
The URLs are different but point to the same DOI, arXiv ID, or official publisher repository.
Example:
"https://arxiv.org/abs/2303.08774"
"https://proceedings.neurips.cc/.../2303.08774"
→ Same paper
The conference/journal version is an extended version of an arXiv paper, unless there is major content divergence.
🚨 Do NOT match the papers if:

The DOI/arXiv ID is different, and there is no indication that one is a revision of the other.
One is from a completely different publisher (e.g., IEEE vs. ACL Anthology) without a clear link between them.
3. Author Name Variations (Abbreviations & Institutional Naming Allowed)
✅ Match the papers even if:

Authors use initials instead of full names.
Example: "Guohao Li" vs. "G Li" → Same author
Authors are listed differently between an arXiv preprint and a published paper.
Example: "OpenAI" vs. "J Achiam, S Adler, S Agarwal" → Match if source matches
A company name is used instead of individual authors.
🚨 Do NOT match the papers if:

A completely different research group is listed.
The list of authors has no significant overlap.
4. Edition or Version Differences (Preprint vs. Published Paper)
✅ Match the papers even if:

One version is an early preprint and the other is a peer-reviewed conference/journal version.
The published version contains minor updates or additional experiments but is still based on the same research.
🚨 Do NOT match the papers if:

The newer version substantially changes the research (e.g., different methodology, new experiments, different conclusions).
The preprint was not accepted by the listed conference/journal.

A paper title : {a_paper_title}
A paper authors : {a_paper_authors}
A paper source : {a_paper_source}

B paper title : {b_paper_title}
B paper authors : {b_paper_authors}
B paper source : {b_paper_source}"""


REQUEST_HTML_PARSING_PROMPT = """Parse the given HTML code like the given format. Never answer the other comments but formatted information.

HTML : {one_paper_box_html}

Format example :
{{
    "title" : "Language models are few-shot learners",
    "authors" : "T Brown, B Mann, N Ryder",
    "citation_count" : 39209,
    "link_description" : ""
}}"""

REQUEST_REAL_CITATION_PROMPT = """What is the real citation count of the below paper title and authors?
- Return with given format using only Candidates' information.
- If an exact match for the paper cannot be found in Candidates, say only 'NO'.

### The paper whose citation count I want to know
paper title : {ref_paper_title}
paper authors : {ref_paper_authors}

### Candidates
{request_box_collect}

### Return Format
{{
    "title" : ,
    "authors" : ,
    "citation_count" : {{
                        'value' : citation_count,
                        }}
}}"""


In [22]:
import threading
import time
from scholarly import scholarly

# 검색할 논문 제목
ref_paper_title = "GPT-4 Technical Report"

# 타임아웃 설정 (예: 10초)
TIMEOUT = 5

# 결과 저장 변수
search_result = None

# ✅ 함수 실행을 위한 쓰레드 클래스
class ScholaryThread(threading.Thread):
    def __init__(self):
        super().__init__()
        self.result = None

    def run(self):
        try:
            # search_pubs 실행
            self.result = scholarly.search_pubs(ref_paper_title)
        except Exception as e:
            self.result = None

# ✅ 실행 시간 측정 및 타임아웃 적용
def search_with_timeout():
    global search_result

    thread = ScholaryThread()
    thread.start()
    thread.join(TIMEOUT)  # 타임아웃 적용

    if thread.is_alive():
        print("⏳ Timeout exceeded! Moving to the next step.")
        thread.join(0)  # 강제 종료
    else:
        search_result = thread.result  # 검색 결과 저장

# ✅ 실행
search_with_timeout()

# ✅ 결과 확인
if search_result:
    print("✅ Search successful!")
else:
    print("⚠ No results found or timeout occurred.")


✅ Search successful!


In [23]:
from scholarly import scholarly
import requests
from bs4 import BeautifulSoup

import threading
import time
from scholarly import scholarly
from datetime import datetime

def get_citation_count_using_scholarly(
    ref_paper_title, 
    ref_paper_authors, 
    ref_paper_source,
    comp_try_limit = 20,
    TIMEOUT = 20):  # 타임아웃 설정 추가
    
    print(f"\tSearch using scholarly")

    # ✅ 검색 수행을 위한 쓰레드 클래스
    class ScholarlyThread(threading.Thread):
        def __init__(self):
            super().__init__()
            self.result = None

        def run(self):
            try:
                self.result = scholarly.search_pubs(ref_paper_title)
            except Exception as e:
                self.result = None

    # ✅ 실행 시간 측정 및 타임아웃 적용
    def search_with_timeout():
        thread = ScholarlyThread()
        thread.start()
        thread.join(TIMEOUT)  # 타임아웃 적용

        if thread.is_alive():
            print(f"⏳ Timeout {TIMEOUT} sec exceeded! Moving to the next step.")
            return None  # 타임아웃 발생 시 None 반환
        return thread.result  # 성공하면 결과 반환

    # ✅ Google Scholar에서 논문 검색 (타임아웃 적용)
    search_query = search_with_timeout()
    
    if search_query is None:
        print(f"\tscholarly search timed out or failed")
        return None
    
    cnt = 0
    if not len(search_query._rows):  # 검색 결과 없을 때
        print(f"\tscholarly no result")
        return None
        
    for result in search_query:
        cnt += 1
        b_paper_title = result['bib']['title']
        b_paper_authors = ', '.join(result['bib']['author'])
        b_paper_source = result.get('pub_url', '')

        # ✅ 논문 비교 (paper_compare 함수 사용)
        if paper_compare(ref_paper_title, ref_paper_authors, ref_paper_source, 
                         b_paper_title, b_paper_authors, b_paper_source) == 'YES':
            citation_count = result.get('num_citations', 0)  # 인용수 가져오기
            return {
                "title": b_paper_title,
                "authors": b_paper_authors,
                "citation_count": {
                    'value': citation_count,
                    'date': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
            }

        # 검색 횟수 제한 도달 시 종료
        if cnt >= comp_try_limit:
            print(f"scholarly couldn't find a match within {comp_try_limit} attempts")
            return None


In [38]:
llm.invoke('hi')

AttributeError: 'OpenAI' object has no attribute 'invoke'

In [39]:
llm

<openai.OpenAI at 0x175974130>

In [41]:
llm.

SyntaxError: invalid syntax (670360929.py, line 1)

In [37]:
from openai import OpenAI

llm = OpenAI()

def get_citation_count_using_request(ref_paper_title, ref_paper_authors):
    print(f"\tSearch using Request")
    url = f"https://scholar.google.com/scholar?q={ref_paper_title}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    
    # 상태 코드 확인
    if (response.status_code == 200) and (response.text != ""):
        # HTML 파싱
        soup = BeautifulSoup(response.text, "html.parser")
        
        # 논문 제목과 관련 정보 추출
        results = soup.select(".gs_ri")
        print(f"\t# of results : {len(results)}")
        if not len(results):
            return None
        request_box_collect = ""
        for one_paper_box_html in results:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": REQUEST_HTML_PARSING_PROMPT.format(one_paper_box_html=one_paper_box_html)
                    },
                ]
            )
            answer = response.choices[0].message.content
            request_box_collect += answer.replace("```json", "").replace("```", "") + "\n"
            
        print(f"\trequest_box_collect : \n\t{request_box_collect}")
        answer = llm.invoke(
            REQUEST_REAL_CITATION_PROMPT.format(
                ref_paper_title=ref_paper_title,
                ref_paper_authors=ref_paper_authors,
                request_box_collect=request_box_collect
            )
        )
        if answer.content != 'NO':
            answer_dict = eval(answer.content.replace("```json", "").replace("```", ""))
            answer_dict['citation_count']['date'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            return answer_dict
        else:
            return None

    else:
        print(f"get_citation_count_using_scholarly : BAD Response")
        return None


def paper_compare(ref_paper_title, ref_paper_authors, ref_paper_source, b_paper_title, b_paper_authors, b_paper_source):
    print(f"\tΓref_paper_title : {ref_paper_title}({ref_paper_authors[:20]}...)\n\tL  b_paper_title : {b_paper_title}({b_paper_authors[:20]}...)")
    prompt = PAPER_COMPARE_PROMPT.format(
        a_paper_title=ref_paper_title,
        a_paper_authors=ref_paper_authors,
        a_paper_source=ref_paper_source,
        b_paper_title=b_paper_title,  # 오타 수정
        b_paper_authors=b_paper_authors,
        b_paper_source=b_paper_source,
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": prompt
            },
        ]
    )
    answer = response.choices[0].message.content
    print(f"\t{answer}")
    return answer

## 실행

In [25]:
for i, one_ref_info in ref_dict.items():
    if 'from_scholary' in one_ref_info:
        del one_ref_info['from_scholary']
    if 'from_request' in one_ref_info:
        del one_ref_info['from_request']

In [33]:
from pprint import pprint
for i, one_ref_info in ref_dict.items():
    
    ref_paper_title = one_ref_info['from_paper']['title']
    ref_paper_authors = one_ref_info['from_paper']['authors']
    ref_paper_source = one_ref_info['from_paper']['source']
    print(f"{i}/{len(ref_dict)}".ljust(120, '-'))
    print(f"ref_paper_title : {ref_paper_title}")
    print(f"ref_paper_authors : {ref_paper_authors}")
    print(f"ref_paper_source : {ref_paper_source}")

    scholary_result = get_citation_count_using_scholarly(ref_paper_title, ref_paper_authors, ref_paper_source)

    if scholary_result is not None:
        ref_dict[i]['from_scholary'] = scholary_result
    else:
        request_result = get_citation_count_using_request(ref_paper_title, ref_paper_authors)
        if request_result is not None:
            ref_dict[i]['from_request'] = request_result
    if 'from_scholary' in ref_dict[i]:
        print(f"from_scholary : ")
        pprint(ref_dict[i]['from_scholary'])
    if 'from_request' in ref_dict[i]:
        print(f"from_request : ")
        pprint(ref_dict[i]['from_request'])

1/25--------------------------------------------------------------------------------------------------------------------
ref_paper_title : Language models are few-shot learners
ref_paper_authors : Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al
ref_paper_source : Advances in neural information processing systems 33 (2020), 1877–1901
	Search using scholarly
	Γref_paper_title : Language models are few-shot learners(Tom Brown, Benjamin ...)
	L  b_paper_title : Language models are few-shot learners(T Brown, B Mann, N R...)
	YES
from_scholary : 
{'authors': 'T Brown, B Mann, N Ryder',
 'citation_count': {'date': '2025-02-06 17:20:26', 'value': 39609},
 'title': 'Language models are few-shot learners'}
2/25--------------------------------------------------------------------------------------------------------------------
ref_paper_title : Trends in distributed artificial intelligen

AttributeError: 'OpenAI' object has no attribute 'invoke'

In [762]:
ref_dict[21]

{'from_paper': {'title': 'React: Synergizing reasoning and acting in language models',
  'authors': 'Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao',
  'source': 'arXiv preprint arXiv:2210.03629 (2022)',
  'year': 2022}}

In [763]:
i = 21

# 학회 & 출판 연/월 검색

## tavily_search

In [292]:
import os
from langchain_community.retrievers import TavilySearchAPIRetriever

In [764]:
PAPER_URL_SEARCH_PROMPT = """The Paper '{paper_title}'s official webpage"""

def tavily_search(paper_title, k=3):    
    retriever = TavilySearchAPIRetriever(k=k)
    result = retriever.invoke(PAPER_URL_SEARCH_PROMPT.format(paper_title=paper_title))
    """
    result
    [
        Document(
            metadata={
                        'title': '[PDF] Intelligent agents: theory ...',
                        'source': 'https://www.semanticscholar.org...',
                        'score': 0.67885995,
                        'images': [],
                     },
            page_content='The aim of thi...'
                )
         Document(...)
         Document(...),
    ]
    """
    
    return result

In [765]:
tavily_result = tavily_search(paper_title=ref_paper_title, k=10)

In [884]:
from llama_index.core.node_parser import HTMLNodeParser
from llama_index.core import Document


CONFERENCE_PAPER_PARSING_PROMPT = """Based on the information from the given website, return it in the provided format. Do not say anything else.

### Example
{{
    "conference" : "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
    "conference_abbreviation" : "NAACL",
    "published_year" : 2024,
    "published_month" : 6
}}

### Website's text
{page_content}

### Answer
{{
    "title" : "",
    "conference" : "",
    "conference_abbreviation" : "",
    "published_year" : ,
    "published_month" : 
}}

"""

COMPREHENSIVE_DECISION_PROMPT = """Based on the website below and the information retrieved from it, make a comprehensive judgment and determine the accurate Conference, Conference abbreviation, Published year, and Published month for the paper below, then return the results. Do not say anything else.
Note: If any value is missing, exclude the corresponding key from the JSON output.

### Paper's title : {paper_title}

### Information from Website
{information_from_website}

### Format example
{{
    "title" : "",
    "conference" : "",
    "conference_abbreviation" : "",
    "published_year" : ,
    "published_month" : 
}}
"""

## comprehensive_decision

In [796]:
paper_title

'Intelligent agents: Theory and practice'

In [798]:
information_from_website = ""
for d in tavily_result:
    information_from_website += '\n webpage : ' + d.metadata['title']
    information_from_website += '\n url : ' + d.metadata['source']
    information_from_website += '\n' + parse_conference_paper_info(url=d.metadata['source'])
    information_from_website += '\n'

In [799]:
information_from_website

'\n webpage : Paper Summary: Language Models are Few-Shot Learners\n url : https://queirozf.com/entries/paper-summary-language-models-are-few-shot-learners\n{\n    "title" : "Language Models are Few-Shot Learners",\n    "conference" : "",\n    "conference_abbreviation" : "",\n    "published_year" : 2020,\n    "published_month" : 5\n}\n\n webpage : [2005.14165] Language Models are Few-Shot Learners - arXiv.org\n url : https://arxiv.org/abs/2005.14165\n{\n    "title" : "Language Models are Few-Shot Learners",\n    "conference" : "",\n    "conference_abbreviation" : "",\n    "published_year" : ,\n    "published_month" : \n}\n\n webpage : Review for NeurIPS paper: Language Models are Few-Shot Learners\n url : https://proceedings.neurips.cc/paper_files/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Review.html\n{\n    "title" : "Language Models are Few-Shot Learners",\n    "conference" : "NeurIPS 2020",\n    "conference_abbreviation" : "NeurIPS",\n    "published_year" : 2020,\n    "publis

In [800]:
    prompt = COMPREHENSIVE_DECISION_PROMPT.format(
        paper_title=ref_paper_title,
        information_from_website=information_from_website
    )

In [801]:
print(f"{prompt}")

Based on the website below and the information retrieved from it, make a comprehensive judgment and determine the accurate Conference, Conference abbreviation, Published year, and Published month for the paper below, then return the results. Do not say anything else.

### Paper's title : Language models are few-shot learners

### Information from Website


 webpage : Paper Summary: Language Models are Few-Shot Learners
 url : https://queirozf.com/entries/paper-summary-language-models-are-few-shot-learners
{
    "title" : "Language Models are Few-Shot Learners",
    "conference" : "",
    "conference_abbreviation" : "",
    "published_year" : 2020,
    "published_month" : 5
}

 webpage : [2005.14165] Language Models are Few-Shot Learners - arXiv.org
 url : https://arxiv.org/abs/2005.14165
{
    "title" : "Language Models are Few-Shot Learners",
    "conference" : "",
    "conference_abbreviation" : "",
    "published_year" : ,
    "published_month" : 
}

 webpage : Review for NeurIPS pa

In [802]:
answer = llm.invoke(prompt)

In [804]:
answer.content

'{\n    "title" : "Language Models are Few-Shot Learners",\n    "conference" : "NeurIPS 2020",\n    "conference_abbreviation" : "NeurIPS",\n    "published_year" : 2020,\n    "published_month" : 12\n}'

In [805]:
comprehensive_decision(tavily_result, ref_paper_title)

{'title': 'Language Models are Few-Shot Learners',
 'conference': 'NeurIPS 2020',
 'conference_abbreviation': 'NeurIPS',
 'published_year': 2020,
 'published_month': 12}

In [809]:
information_from_website = ""
for d in tavily_result:
    information_from_website += '\n webpage : ' + d.metadata['title']
    information_from_website += '\n url : ' + d.metadata['source']
    parsed = parse_conference_paper_info(url=d.metadata['source'])
    if parsed is not None:
        information_from_website += '\n' + parsed
    else:
        information_from_website += '\n' + 'Parsing failed.'
    information_from_website += '\n'

In [811]:
print(information_from_website)


 webpage : Trends in distributed artificial intelligence | Artificial Intelligence ...
 url : https://link.springer.com/article/10.1007/BF00155579
{
    "title" : "Trends in distributed artificial intelligence",
    "conference" : "Artificial Intelligence Review",
    "conference_abbreviation" : "AI Review",
    "published_year" : 1992,
    "published_month" : 3
}

 webpage : Trends in distributed artificial intelligence. - APA PsycNet
 url : https://psycnet.apa.org/record/1993-23882-001
Parsing failed.

 webpage : Trends in distributed artificial intelligence - ResearchGate
 url : https://www.researchgate.net/publication/220637937_Trends_in_distributed_artificial_intelligence
Parsing failed.

 webpage : Trends in distributed artificial intelligence - Academia.edu
 url : https://www.academia.edu/3271393/Trends_in_distributed_artificial_intelligence
{
    "title" : "Trends in distributed artificial intelligence",
    "conference" : "",
    "conference_abbreviation" : "",
    "published

In [None]:
information_from_website = ""
for di, d in enumerate(tavily_result, start=1):
    print(f"[{di}/{len(tavily_result)}]{d.metadata['source']}")
    parsed = parse_tavily_searched_url(url=d.metadata['source'])
    if parsed is None:
        continue
    information_from_website += ' webpage : ' + d.metadata['title']
    information_from_website += '\n url : ' + d.metadata['source']
    information_from_website += '\n' + parsed
    information_from_website += '\n\n'

In [881]:
def comprehensive_decision(tavily_result, ref_paper_title):
    print(f"Iterating tavily_result...")
    information_from_website = ""
    for di, d in enumerate(tavily_result, start=1):
        print(f"[{di}/{len(tavily_result)}]{d.metadata['source']}")
        parsed = parse_tavily_searched_url(url=d.metadata['source'])
        if parsed is None:
            continue
        information_from_website += ' webpage : ' + d.metadata['title']
        information_from_website += '\n url : ' + d.metadata['source']
        information_from_website += '\n' + parsed
        information_from_website += '\n\n'
    prompt = COMPREHENSIVE_DECISION_PROMPT.format(
        paper_title=ref_paper_title,
        information_from_website=information_from_website
    )
    print(f"prompt : \n{prompt}")
    answer = llm.invoke(prompt)
    print(f"answer : \n{answer.content}")
    return eval(answer.content)      


def parse_tavily_searched_url(url):

    if 'pdf' in url:
        print(f"pass. pdf")
        return None
    try:
        response = requests.get(url)
    except:
        print(f"pass. Request Error Exception")
        return None

    # 상태 코드 확인
    if response.status_code == 200:
        html_content = response.text  # HTML 원본 가져오기
        document = Document(text=html_content)
        parser = HTMLNodeParser()
        nodes = parser.get_nodes_from_documents([document])
        parsed_page_content = ""
        for node in nodes:
            parsed_page_content += node.get_text() + "\n"
        print(f"LLM is processing parsed page contents...")
        answer = llm.invoke(CONFERENCE_PAPER_PARSING_PROMPT.format(page_content=parsed_page_content))
        return answer.content
    else:
        print(f"pass. BAD Response")
        return None

## 실행

In [885]:
ref_paper_title


"Camel: Communicative agents for 'mind' exploration of large scale language model society"

In [886]:
comprehensive_decision_result = comprehensive_decision(tavily_result, ref_paper_title)

Iterating tavily_result...
[1/7]https://arxiv.org/pdf/2303.17760
pass. pdf
[2/7]https://ghli.org/publication/neurips2023camel/
LLM is processing parsed page contents...
[3/7]https://dblp.org/rec/journals/corr/abs-2303-17760
LLM is processing parsed page contents...
[4/7]https://github.com/joeccane/camelagents
LLM is processing parsed page contents...
[5/7]https://arxiv.org/abs/2303.17760
LLM is processing parsed page contents...
[6/7]https://neurips.cc/virtual/2023/poster/72905
LLM is processing parsed page contents...
[7/7]https://repository.kaust.edu.sa/handle/10754/692560
LLM is processing parsed page contents...
prompt : 
Based on the website below and the information retrieved from it, make a comprehensive judgment and determine the accurate Conference, Conference abbreviation, Published year, and Published month for the paper below, then return the results. Do not say anything else.
Note: If any value is missing, exclude the corresponding key from the JSON output.

### Paper's ti

1 ~ 18까지 했음

In [887]:
for i, one_ref_info in ref_dict.items():
    ref_paper_title = one_ref_info['from_paper']['title']
    print(f"[{i}/{len(ref_dict)}]{ref_paper_title}".ljust(120, '-'))
    tavily_result = tavily_search(paper_title=ref_paper_title, k=7)
    comprehensive_decision_result = comprehensive_decision(tavily_result, ref_paper_title)
    ref_dict[i]['from_tavily'] = comprehensive_decision_result

[1/25]Language models are few-shot learners-----------------------------------------------------------------------------
Iterating tavily_result...
[1/7]https://proceedings.neurips.cc/paper_files/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-MetaReview.html
LLM is processing parsed page contents...
[2/7]https://arxiv.org/abs/2005.14165
LLM is processing parsed page contents...
[3/7]https://plaintext.psu.edu/2024/03/21/paper-review-language-models-are-few-shot-learners-presented-by-rupak-das/
LLM is processing parsed page contents...
[4/7]https://proceedings.neurips.cc/paper_files/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Review.html
LLM is processing parsed page contents...
[5/7]https://medium.com/@dataturka/paper-summary-language-models-are-few-shot-learner-gpt-3-paper-a435d3052b4a
LLM is processing parsed page contents...
[6/7]https://papers.nips.cc/paper_files/paper/2020/hash/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html
LLM is processing parsed page contents...
[7/7]http

ReadTimeout: HTTPSConnectionPool(host='api.tavily.com', port=443): Read timed out. (read timeout=100)

In [896]:
for i in range(19, 26):
    ref_paper_title = ref_dict[i]['from_paper']['title']
    print(f"[{i}/{len(ref_dict)}]{ref_paper_title}".ljust(120, '-'))
    tavily_result = tavily_search(paper_title=ref_paper_title, k=7)
    comprehensive_decision_result = comprehensive_decision(tavily_result, ref_paper_title)
    ref_dict[i]['from_tavily'] = comprehensive_decision_result

[19/25]Intelligent agents: Theory and practice--------------------------------------------------------------------------
Iterating tavily_result...
[1/7]https://forohistorico.coit.es/index.php/biblioteca/articulos-seminales/item/intelligent-agents-theory-and-practice
LLM is processing parsed page contents...
[2/7]https://consensus.app/papers/agents-practice-wooldridge/26ff4396023d54dc9a67d22e31c4935f/
LLM is processing parsed page contents...
[3/7]https://www.academia.edu/8890604/Intelligent_Agents_Theory_and_Practice
LLM is processing parsed page contents...
[4/7]https://chatbots.org/paper/intelligent_agents_theory_and_practice
LLM is processing parsed page contents...
[5/7]https://www.semanticscholar.org/paper/Intelligent-agents:-theory-and-practice-Wooldridge-Jennings/d621786b597687f555fae83dc1a021fd21713d90
pass. BAD Response
[6/7]https://www.cs.ox.ac.uk/people/michael.wooldridge/pubs/ker95/ker95-html.html
LLM is processing parsed page contents...
[7/7]https://oa.mg/work/10.1017/s0

In [889]:
ref_paper_title

'Intelligent agents: Theory and practice'

In [888]:
tavily_result = tavily_search(paper_title=ref_paper_title, k=7)

In [898]:
pprint(ref_dict)

{1: {'from_paper': {'authors': 'Tom Brown, Benjamin Mann, Nick Ryder, Melanie '
                               'Subbiah, Jared D Kaplan, Prafulla Dhariwal, '
                               'Arvind Neelakantan, Pranav Shyam, Girish '
                               'Sastry, Amanda Askell, et al',
                    'source': 'Advances in neural information processing '
                              'systems 33 (2020), 1877–1901',
                    'title': 'Language models are few-shot learners',
                    'year': 2020},
     'from_scholary': {'authors': 'T Brown, B Mann, N Ryder',
                       'citation_count': {'date': '2025-01-31 15:17:02',
                                          'value': 39328},
                       'title': 'Language models are few-shot learners'},
     'from_tavily': {'conference': 'NeurIPS',
                     'conference_abbreviation': 'NeurIPS',
                     'published_month': 12,
                     'published_year': 2020,


# js 전송

In [281]:
ref_dict_with_cnt = {}
for i, one_ref in ref_dict.items():
    if ('citation_count' in one_ref) and (one_ref['citation_count']['value'] is not None):
        ref_dict_with_cnt[i] = one_ref

In [None]:
{
  'Title': 'MACRec: A Multi-Agent Collaboration Framework for Recommendation',
  'Author(s)': 'Z. Wang, Y. Yu, W. Zheng, W. Ma, M. Zhang',
  'Conference': 'Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval (2024, July), 2760-2764',
  'citation_count': {'value': 'unknown', 'date': '2025-01-22 16:50:59'}
}


In [282]:
ref_dict_with_cnt

{1: {'Title': 'Language models are few-shot learners',
  'Author(s)': 'Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al.',
  'Conference': 'Advances in neural information processing systems 33 (2020), 1877–1901',
  'citation_count': {'value': 39209, 'date': '2025-01-28 21:45:37'}},
 2: {'Title': 'Trends in distributed artificial intelligence',
  'Author(s)': 'Brahim Chaib-Draa, Bernard Moulin, René Mandiau, and Patrick Millot',
  'Conference': 'Artificial Intelligence Review 6 (1992), 35–66',
  'citation_count': {'value': 282, 'date': '2025-01-28 21:45:46'}},
 3: {'Title': 'Agentverse: Facilitating multi-agent collaboration and exploring emergent behaviors in agents',
  'Author(s)': 'Weize Chen, Yusheng Su, Jingwei Zuo, Cheng Yang, Chenfei Yuan, Chen Qian, Chi-Min Chan, Yujia Qin, Yaxi Lu, Ruobing Xie, et al.',
  'Conference': 'arXiv preprint arXiv:2308.10848 (2023)',
  'citat

# Requests

In [248]:
from pprint import pprint

In [245]:
import requests
from bs4 import BeautifulSoup

def get_citation_count_using_scholarly(ref_paper_title):
    url = f"https://scholar.google.com/scholar?q={ref_paper_title}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    
    # 상태 코드 확인
    if response.status_code == 200:
        # HTML 파싱
        soup = BeautifulSoup(response.text, "html.parser")
        
        # 논문 제목과 관련 정보 추출
        results = soup.select(".gs_ri")
        for one_paper_box_html in results:
            html_code = results
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": f"""Parse the given HTML code like the given format. Never answer the other comments but formatted information.
                        
                        HTML : {one_paper_box_html}
                        
                        Format example : 
                        {{
                            "title" : "Language models are few-shot learners",
                            "authors" : "T Brown, B Mann, N Ryder",
                            "citation_count" : 39209
                        }}
                        """
                    },
                ]
            )
            answer = response.choices[0].message.content
            llm_parsed_result = eval(answer.replace("```json", "").replace("```", ""))
            
    else:
        print(f"Failed to fetch the page. Status code: {response.status_code}")

# 테스트 실행
request_google_scholar_url("BERT language model")


Result 1:
Title: Patent classification by fine-tuning BERT language model
Authors and Year: JS Lee, J Hsiang - World Patent Information, 2020 - Elsevier
Snippet: … In this work we focus on fine-tuning a pre-trained BERT model and applying it to patent … Our 
contributions include: (1) a new state-of-the-art result based on pre-trained BERT model and …
Link: https://www.sciencedirect.com/science/article/pii/S0172219019300742
Citation Count: 0
--------------------------------------------------------------------------------
Result 2:
Title: BERT has a mouth, and it must speak: BERT as a Markov random field language model
Authors and Year: A Wang, K Cho - arXiv preprint arXiv:1902.04094, 2019 - arxiv.org
Snippet: … these questions by showing that BERT is a combination of a Markov random field language 
… BERT are well-formed and are assigned high probabilities by an off-theshelf language model…
Link: https://arxiv.org/abs/1902.04094
Citation Count: 0
---------------------------------------



In [232]:
query = ref_paper_title
query

'Trends in distributed artificial intelligence'

In [233]:
url = f"https://scholar.google.com/scholar?q={query}"

In [234]:
url

'https://scholar.google.com/scholar?q=Trends in distributed artificial intelligence'

In [235]:
response = requests.get(url, headers=headers)

In [236]:
response

<Response [200]>

In [237]:
soup = BeautifulSoup(response.text, "html.parser")

In [238]:
results = soup.select(".gs_ri")

In [240]:
for i, result in enumerate(results):
    title = result.select_one(".gs_rt").text
    authors_and_year = result.select_one(".gs_a").text
    snippet = result.select_one(".gs_rs").text if result.select_one(".gs_rs") else "No snippet available"
    link = result.select_one(".gs_rt a")["href"] if result.select_one(".gs_rt a") else "No link available"

    print(f"Result {i+1}:")
    print(f"Title: {title}")
    print(f"Authors and Year: {authors_and_year}")
    print(f"Snippet: {snippet}")
    print(f"Link: {link}")

Result 1:
Title: Trends in distributed artificial intelligence
Authors and Year: B Chaib-Draa, B Moulin, R Mandiau, P Millot - Artificial Intelligence Review, 1992 - Springer
Snippet: Distributed artificial intelligence (DAI) is a subfield of artificial intelligence that deals with interactions of intelligent agents. Precisely, DAI attempts to construct intelligent agents that make decisions that allow them to achieve their goals in a world populated by other intelligent agents with their own goals. This paper discusses major concepts used in DAI today. To do this, a taxonomy of DAI is presented, based on the social abilities of an individual agent, the organization of agents, and the dynamics of this organization through …
Link: https://link.springer.com/article/10.1007/BF00155579


In [205]:
import requests
from bs4 import BeautifulSoup

# Google Scholar 검색 URL
query = "MACRec: A Multi-Agent Collaboration Framework for Recommendation"
url = f"https://scholar.google.com/scholar?q={query}"

# 요청 헤더 설정 (실제 브라우저에서의 요청처럼 위장)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}

# GET 요청 보내기
response = requests.get(url, headers=headers)

# 상태 코드 확인
if response.status_code == 200:
    # HTML 파싱
    soup = BeautifulSoup(response.text, "html.parser")

    # 논문 제목과 관련 정보 추출
    results = soup.select(".gs_ri")
    for i, result in enumerate(results):
        title = result.select_one(".gs_rt").text
        authors_and_year = result.select_one(".gs_a").text
        snippet = result.select_one(".gs_rs").text if result.select_one(".gs_rs") else "No snippet available"
        link = result.select_one(".gs_rt a")["href"] if result.select_one(".gs_rt a") else "No link available"

        print(f"Result {i+1}:")
        print(f"Title: {title}")
        print(f"Authors and Year: {authors_and_year}")
        print(f"Snippet: {snippet}")
        print(f"Link: {link}")
        print("-" * 80)
else:
    print(f"Failed to fetch the page. Status code: {response.status_code}")


In [204]:
results

[]

In [206]:
# 응답 HTML 저장 및 확인
with open("response.html", "w", encoding="utf-8") as f:
    f.write(response.text)

print("Response saved to response.html. Open it in a browser to check if it's a Captcha page.")


Response saved to response.html. Open it in a browser to check if it's a Captcha page.


# References

https://medium.com/kx-systems/rag-llamaparse-advanced-pdf-parsing-for-retrieval-c393ab29891b

https://www.devkuma.com/docs/d3-js/append/

https://velog.io/@woody_ahn/Tavily-LLM%EC%9D%84-%EC%9C%84%ED%95%9C-%EC%B5%9C%EC%A0%81%ED%99%94%EB%90%9C-%EA%B2%80%EC%83%89-API