In [13]:
from llama_parse import LlamaParse
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.kdbai import KDBAIVectorStore
from getpass import getpass
import kdbai_client as kdbai

from dotenv import load_dotenv

load_dotenv()

True

In [90]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()

import os
# API access to llama-cloud
# os.environ["LLAMA_CLOUD_API_KEY"] = os.getenv('LLAMA_CLOUD_API_KEY')

KDBAI_ENDPOINT = (os.environ["KDBAI_ENDPOINT"] if "KDBAI_ENDPOINT" in os.environ else input("KDB.AI endpoint: "))
KDBAI_API_KEY = (os.environ["KDBAI_API_KEY"] if "KDBAI_API_KEY" in os.environ else getpass("KDB.AI API key: "))

#connect to KDB.AI
session = kdbai.Session(api_key=KDBAI_API_KEY, endpoint=KDBAI_ENDPOINT)

In [92]:
# Connect with kdbai database
db = session.database("default")

In [93]:
# The schema contains two metadata columns (document_id, text) and one embeddings column
schema = [
        dict(name="document_id", type="str"),
        dict(name="text", type="str"),
        dict(name="embeddings", type="float32s"),
    ]

# indexflat, define the index name, type, column to apply the index to (embeddings)
# and params which include thesearch metric (Euclidean distance), and dims
indexFlat = {
        "name": "flat",
        "type": "flat",
        "column": "embeddings",
        "params": {'dims': 1536, 'metric': 'L2'},
    }

KDBAI_TABLE_NAME = "LlamaParse_Table"

# First ensure the table does not already exist
try:
    db.table(KDBAI_TABLE_NAME).drop()
except kdbai.KDBAIException:
    pass

#Create the table
table = db.create_table(KDBAI_TABLE_NAME, schema, indexes=[indexFlat])

In [94]:
EMBEDDING_MODEL  = "text-embedding-3-small"
GENERATION_MODEL = "gpt-4o"

llm = OpenAI(model=GENERATION_MODEL)
embed_model = OpenAIEmbedding(model=EMBEDDING_MODEL)

Settings.llm = llm
Settings.embed_model = embed_model

pdf_file_name = './MACRec.pdf'

In [95]:
parsing_instructions = '''The document titled "LLM In-Context Recall is Prompt Dependent" is an academic preprint from April 2024, authored by Daniel Machlab and Rick Battle from the VMware NLP Lab. It explores the in-context recall capabilities of Large Language Models (LLMs) using a method called "needle-in-a-haystack," where a specific factoid is embedded in a block of unrelated text. The study investigates how the recall performance of various LLMs is influenced by the content of prompts and the biases in their training data. The research involves testing multiple LLMs with varying context window sizes to assess their ability to recall information accurately when prompted differently. The paper includes detailed methodologies, results from numerous tests, discussions on the impact of prompt variations and training data, and conclusions on improving LLM utility in practical applications. It contains many tables. Answer questions using the information in this article and be precise.'''
print(parsing_instructions)

The document titled "LLM In-Context Recall is Prompt Dependent" is an academic preprint from April 2024, authored by Daniel Machlab and Rick Battle from the VMware NLP Lab. It explores the in-context recall capabilities of Large Language Models (LLMs) using a method called "needle-in-a-haystack," where a specific factoid is embedded in a block of unrelated text. The study investigates how the recall performance of various LLMs is influenced by the content of prompts and the biases in their training data. The research involves testing multiple LLMs with varying context window sizes to assess their ability to recall information accurately when prompted differently. The paper includes detailed methodologies, results from numerous tests, discussions on the impact of prompt variations and training data, and conclusions on improving LLM utility in practical applications. It contains many tables. Answer questions using the information in this article and be precise.


In [96]:
documents = LlamaParse(
    result_type="markdown", 
    parsing_instructions=parsing_instructions
).load_data(pdf_file_name)
# print(documents[0].text[:1000])

Started parsing the file under job_id 22a50657-dbcc-4c0b-9b0f-b8c13410cbe4


In [97]:
# for i, doc in enumerate(documents, start=1):
#     print(f"{i}".ljust(100, '-'))
#     print(doc.text)

In [98]:
# Parse the documents using MarkdownElementNodeParser
node_parser = MarkdownElementNodeParser(llm=llm, num_workers=8).from_defaults()

# Retrieve nodes (text) and objects (table)
nodes = node_parser.get_nodes_from_documents(documents)

base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

# insert the table markdown into the text of each table object
for i in range(len(objects)):
  objects[i].text = objects[i].obj.text[:]

0it [00:00, ?it/s]
1it [00:00, 23831.27it/s]
1it [00:00, 17772.47it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [99]:
vector_store = KDBAIVectorStore(table)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

#Create the index, inserts base_nodes and objects into KDB.AI
recursive_index = VectorStoreIndex(
    nodes= base_nodes + objects, storage_context=storage_context
)

# Query KDB.AI to ensure the nodes were inserted
table.query()

Unnamed: 0,document_id,text,embeddings
0,356031b0-c77b-4d03-b2b9-555873bffdcc,MACRec: a Multi-Agent Collaboration Framework ...,"[-0.0006405428, 0.043710317, 0.023421837, 0.02..."
1,6d6f1a20-f4c9-405f-bd93-2845a76bcc49,"SIGIR ’24, July 14–18, 2024, Washington, DC, U...","[-0.0015050154, 0.035147697, 0.057046, 0.03038..."
2,e7342dbe-cc04-462e-9bfb-eacd86ac6aca,Varying requirements for agents in different s...,"[-0.00798005, 0.036461584, 0.043508563, 0.0049..."
3,8caf5599-cf3b-45b6-9362-48371014407c,MACRec: a Multi-Agent Collaboration Framework ...,"[-0.015942669, 0.057838522, 0.025760388, 0.002..."
4,4ec65281-8b81-4c32-a253-f618d4f27438,3.2.4 Searcher\n\nThe Searcher is responsible ...,"[-0.018861674, 0.062203117, 0.015368288, -0.02..."
5,bc48bf62-58de-48d1-b58c-07e1f2ddaf76,Table 2: The agents’ selection for four applic...,"[-0.016511856, 0.039699007, 0.07511063, 0.0198..."
6,24918e1a-a405-4731-a68f-bcb70d9fa171,"SIGIR ’24, July 14–18, 2024, Washington, DC, U...","[-0.025750577, 0.043229405, -0.009879337, 0.00..."
7,229db0a6-d1c8-4e52-ada9-1f90cbacb251,The number of relevant items in the sequence i...,"[-0.0059832996, 0.05207999, 0.013652036, 0.005..."
8,2d87fe8c-fec1-48e5-9b8e-d6095f712c89,MACRec: a Multi-Agent Collaboration Framework ...,"[-0.025487937, 0.009967705, 0.06465233, 0.0219..."
9,b28069b7-3869-4ec2-9657-8c82ebdb404b,arXiv preprint arXiv:2308.09904 (2023).\n12. P...,"[-0.0062998543, 0.022451159, 0.04597343, 0.013..."


In [100]:
from openai import OpenAI
client = OpenAI()

def embed_query(query):
    query_embedding = client.embeddings.create(
            input=query,
            model="text-embedding-3-small"
        )
    return query_embedding.data[0].embedding

def retrieve_data(query):
    query_embedding = embed_query(query)
    results = table.search(vectors={'flat':[query_embedding]},n=5,filter=[('<>','document_id','4a9551df-5dec-4410-90bb-43d17d722918')])
    retrieved_data_for_RAG = []
    for index, row in results[0].iterrows():
      retrieved_data_for_RAG.append(row['text'])
    return retrieved_data_for_RAG

def RAG(query):
  question = "You will answer this question based on the provided reference material: " + query
  messages = "Here is the provided context: " + "\n"
  results = retrieve_data(query)
  if results:
    for data in results:
      messages += data + "\n"
  response = client.chat.completions.create(
      model="gpt-4o",
      messages=[
          {"role": "system", "content": question},
          {
          "role": "user",
          "content": [
              {"type": "text", "text": messages},
          ],
          }
      ],
      # max_tokens=300,
  )
  content = response.choices[0].message.content
  return content

In [31]:
print(RAG("describe the needle in a haystack method only using the provided information"))

The "needle in a haystack" method is not specifically described in the provided context. However, based on the Searcher's role described in section 3.2.4 and the Search process outlined under "APPLICATIONS ON RECOMMENDATION SCENARIOS," one could infer that a "needle in a haystack" type process involves meticulously searching through a large volume of data (like Wikipedia or other databases) to find the specific, relevant information needed to fulfill a query or recommendation task. In the described framework, the Searcher is tasked with executing targeted queries to retrieve relevant passages or entries, and further narrowing down the search results to directly relevant information to summarize for the Manager. This process resembles finding a "needle" (specific information) in a "haystack" (large database or set of documents).


In [101]:
answer = RAG(f"""Find this paper's References. Give me that References with the given json form. Don't return any other comments except that References

EXAMPLE : 
{{
    1 : {{
    "Title" : "Language models are few-shot learners",
    "Author(s)" : "Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al",
    "Conference" : "Advances in neural information processing systems 33 (2020), 1877–1901"
    }},
    2 : {{
        ...
    }},
    ...
}}
""")

In [103]:
ref_dict = eval(answer.replace("```json\n", "").replace("```", ""))

In [1]:
from scholarly import scholarly

In [3]:
search_query = scholarly.search_pubs('MACRec')

KeyboardInterrupt: 

In [196]:
for result in search_query:
    break

In [201]:
b_paper_title = result['bib']['title']
b_paper_authors = ', '.join(result['bib']['author'])

In [216]:
result

{'container_type': 'Publication',
 'source': <PublicationSource.PUBLICATION_SEARCH_SNIPPET: 'PUBLICATION_SEARCH_SNIPPET'>,
 'bib': {'title': 'Language models are few-shot learners',
  'author': ['T Brown', 'B Mann', 'N Ryder'],
  'pub_year': '2020',
  'venue': 'Advances in neural …',
  'abstract': 'We demonstrate that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even becoming competitive with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text'},
 'filled': False,
 'gsrank': 1,
 'pub_url': 'https://proceedings.neurips.cc/paper/2020/hash/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html',
 'author_id': ['RLvsC94AAAAJ'

In [12]:
b_paper_title

NameError: name 'b_paper_title' is not defined

In [204]:
b_paper_authors

'T Brown, B Mann, N Ryder'

In [205]:
a_paper_title

'Language models are few-shot learners'

In [206]:
a_paper_authors

'Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al'

In [220]:
paper_compare(a_paper_title, a_paper_authors, b_paper_title, b_paper_authors)

'YES'

In [None]:
cnt = 0
for result in search_query:
    cnt += 1
    b_paper_title = result['bib']['title']
    b_paper_authors = ', '.join(result['bib']['author'])
    if paper_compare(a_paper_title, a_paper_authors, b_paper_title, b_paper_authors) == 'YES':
        citation_count = result.get('num_citations', 0)  # 인용수 가져오기

    print(f"cnt : {cnt}")
    if cnt >= 20:

In [222]:
ref_paper_title

'Agentverse: Facilitating multi-agent collaboration and exploring emergent behaviors in agents'

In [223]:
ref_paper_authors

'Weize Chen, Yusheng Su, Jingwei Zuo, Cheng Yang, Chenfei Yuan, Chen Qian, Chi-Min Chan, Yujia Qin, Yaxi Lu, Ruobing Xie, et al'

In [1]:

from scholarly import ProxyGenerator, scholarly

# ProxyGenerator 설정
pg = ProxyGenerator()
pg.proxies = {
    'http': 'http://43.202.154.212:80',  # 프록시 주소
    'https': 'http://43.202.154.212:80',
}
scholarly.use_proxy(pg)

In [2]:
search_query = scholarly.search_pubs('Perception of physical stability and center of mass of 3D objects')

KeyboardInterrupt: 

In [5]:


# Scholarly 테스트
try:
    search_query = scholarly.search_pubs('Perception of physical stability and center of mass of 3D objects')
    result = next(search_query)
    scholarly.pprint(result)
except Exception as e:
    print(f"Scholarly error: {e}")


KeyboardInterrupt: 

In [1]:
from scholarly import scholarly

In [2]:
from scholarly import scholarly, ProxyGenerator
pg = ProxyGenerator()

In [3]:
success = pg.FreeProxies()

In [4]:
success

True

In [5]:


scholarly.use_proxy(pg)

In [7]:
author = next(scholarly.search_author('Steven A Cholewiak'))

KeyboardInterrupt: 

In [6]:


author = next(scholarly.search_author('Steven A Cholewiak'))
scholarly.pprint(author)

MaxTriesExceededException: Cannot Fetch from Google Scholar.

In [2]:
from scholarly import ProxyGenerator

# Set up a ProxyGenerator object to use free proxies
# This needs to be done only once per session
pg = ProxyGenerator()
pg.FreeProxies()
scholarly.use_proxy(pg)

In [8]:
search_query = scholarly.search_pubs('Perception of physical stability and center of mass of 3D objects')

MaxTriesExceededException: Cannot Fetch from Google Scholar.

In [3]:
from scholarly import ProxyGenerator

# Set up a ProxyGenerator object to use free proxies
# This needs to be done only once per session
pg = ProxyGenerator()
pg.FreeProxies()
scholarly.use_proxy(pg)

# Now search Google Scholar from behind a proxy
search_query = scholarly.search_pubs('Perception of physical stability and center of mass of 3D objects')
scholarly.pprint(next(search_query))

MaxTriesExceededException: Cannot Fetch from Google Scholar.

In [236]:
for i, one_ref_info in ref_dict.items():
    ref_paper_title = one_ref_info['Title']
    ref_paper_authors = one_ref_info['Author(s)']
    citation_cnt = get_citation_count(ref_paper_title, ref_paper_authors)
    print(f"[{citation_cnt:10,}] [{i}] {ref_paper_title}")

MaxTriesExceededException: Cannot Fetch from Google Scholar.

In [225]:
ref_paper_title

'Recommender ai agent: Integrating large language models for interactive recommendations'

In [226]:
ref_paper_authors

'Xu Huang, Jianxun Lian, Yuxuan Lei, Jing Yao, Defu Lian, and Xing Xie'

In [195]:
search_query = scholarly.search_pubs(a_paper_title)

# get_citation_count


In [209]:
prompt = PAPER_COMPARE_PROMPT.format(
                a_paper_title=a_paper_title,
                a_paper_authors=a_paper_authors,
                b_paper_title=b_paper_title,  # 오타 수정
                b_paper_authors=b_paper_authors)

In [213]:
print(prompt)

Compare the titles and authors of the two papers, and if they are the same, respond with YES; otherwise, respond with NO.
          
          A paper title : Language models are few-shot learners,
          A paper authors : Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al
          
          B paper title : Language models are few-shot learners,
          B paper authors : T Brown, B Mann, N Ryder


In [234]:
from scholarly import scholarly

def get_citation_count(a_paper_title, a_paper_authors, author=None):
    # Google Scholar에서 논문 검색
    search_query = scholarly.search_pubs(a_paper_title)
    cnt = 0
    for result in search_query:
        cnt += 1
        b_paper_title = result['bib']['title']
        b_paper_authors = ', '.join(result['bib']['author'])
        if paper_compare(a_paper_title, a_paper_authors, b_paper_title, b_paper_authors) == 'YES':
            citation_count = result.get('num_citations', 0)  # 인용수 가져오기
            return citation_count
        if cnt >= 20:
            return None

PAPER_COMPARE_PROMPT = """Compare the titles and authors of the two papers, and if they are the same, respond with YES; otherwise, respond with NO. Take into account that the authors' names might be abbreviated.
          
          A paper title : {a_paper_title}
          A paper authors : {a_paper_authors}
          
          B paper title : {b_paper_title}
          B paper authors : {b_paper_authors}"""

def paper_compare(a_paper_title, a_paper_authors, b_paper_title, b_paper_authors):
    print(f"\ta_paper_title : {a_paper_title}\n\tb_paper_title : {b_paper_title}")
    prompt = PAPER_COMPARE_PROMPT.format(
                    a_paper_title=a_paper_title,
                    a_paper_authors=a_paper_authors,
                    b_paper_title=b_paper_title,  # 오타 수정
                    b_paper_authors=b_paper_authors
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": prompt
            },
        ]
    )
    return response.choices[0].message.content


# Requests

In [3]:
import requests
from bs4 import BeautifulSoup

# Google Scholar 검색 URL
query = "recommendation"
url = f"https://scholar.google.com/scholar?q={query}"

# 요청 헤더 설정 (실제 브라우저에서의 요청처럼 위장)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}

# GET 요청 보내기
response = requests.get(url, headers=headers)

# 상태 코드 확인
if response.status_code == 200:
    # HTML 파싱
    soup = BeautifulSoup(response.text, "html.parser")

    # 논문 제목과 관련 정보 추출
    results = soup.select(".gs_ri")
    for i, result in enumerate(results):
        title = result.select_one(".gs_rt").text
        authors_and_year = result.select_one(".gs_a").text
        snippet = result.select_one(".gs_rs").text if result.select_one(".gs_rs") else "No snippet available"
        link = result.select_one(".gs_rt a")["href"] if result.select_one(".gs_rt a") else "No link available"

        print(f"Result {i+1}:")
        print(f"Title: {title}")
        print(f"Authors and Year: {authors_and_year}")
        print(f"Snippet: {snippet}")
        print(f"Link: {link}")
        print("-" * 80)
else:
    print(f"Failed to fetch the page. Status code: {response.status_code}")


In [4]:
# 응답 HTML 저장 및 확인
with open("response.html", "w", encoding="utf-8") as f:
    f.write(response.text)

print("Response saved to response.html. Open it in a browser to check if it's a Captcha page.")


Response saved to response.html. Open it in a browser to check if it's a Captcha page.
