In [None]:
import os
import sys

# Expose the directory one level above the working directory to the import
# system, so modules stored there can be imported from this notebook.
current_working_directory = os.getcwd()
parent_directory = os.path.join(current_working_directory, "..")
print(parent_directory)
sys.path.append(parent_directory)

In [2]:
from decouple import AutoConfig
# Read settings from the project's configuration file. `search_path` is the
# directory in which AutoConfig starts looking for `settings.ini` / `.env`, so
# it is given the parent directory itself rather than the path of the `.env`
# file (a file path only resolves through AutoConfig's parent-directory fallback).
config = AutoConfig(search_path='./..')

# Publish the Azure OpenAI credentials as environment variables.
# NOTE(review): presumably these are the variables AzureChatOpenAI reads by
# default; note the endpoint is stored under `AZURE_ENDPOINT` in the config file.
os.environ["AZURE_OPENAI_API_KEY"] = config('AZURE_OPENAI_API_KEY')
os.environ["AZURE_OPENAI_ENDPOINT"] = config('AZURE_ENDPOINT')

In [None]:
from langchain_community.document_loaders import WebBaseLoader
import bs4
from typing import List, Tuple

def load_doc_from_urls(urls: List[str],
                       tags: List[str],
                       tag_classes: List[str]) -> list:
    """Download web pages and return the text of selected HTML elements as split documents.

    Args:
        urls: Addresses of the pages to download.
        tags: HTML tag names to keep while parsing (e.g. ``['div']``). An empty
            list places no restriction on the tag name.
        tag_classes: ``class`` attribute values to keep. An empty list places no
            restriction on the class attribute.

    Returns:
        The documents produced by ``WebBaseLoader.load_and_split()``.
    """
    # Only add a class filter when one was requested. Passing an empty
    # collection as `class_` is not the same as "no filter": the strainer then
    # matches against an empty set of classes, which can silently drop elements
    # (or behave differently between BeautifulSoup versions).
    strainer_kwargs = {}
    if tag_classes:
        strainer_kwargs["class_"] = tuple(tag_classes)

    # `None` as the tag name means "any tag".
    tag_names = tuple(tags or ()) or None
    parse_only = bs4.SoupStrainer(tag_names, **strainer_kwargs)

    # Parse only the selected elements of each page, then load and split them.
    loader = WebBaseLoader(
        web_paths=tuple(urls),
        bs_kwargs={"parse_only": parse_only},
    )
    return loader.load_and_split()

In [4]:
from pydantic import BaseModel, Field
class Overview(BaseModel):
    """Overview of a section of text."""

    # NOTE(review): this model is converted into an OpenAI function definition
    # further down; presumably the docstring and the field descriptions are
    # part of what the model sees, so their wording is kept unchanged — confirm.

    # Detailed summary of the content.
    summary: str = Field(
        description="Provide a detailed summary of the content.",
    )
    # Highlights of the content.
    highlights: str = Field(
        description="Provide the highlights of the content.",
    )
    # Keywords related to the content.
    keywords: str = Field(
        description="Provide keywords related to the content.",
    )

In [12]:
from langchain.prompts import ChatPromptTemplate
# System instruction shared by both prompts below.
_EXTRACTION_SYSTEM_MESSAGE = (
    "Extract the relevant information, if not explicitly provided do not guess. "
    "Extract partial info"
)

# Human message of the trend prompt. The line breaks, the leading spaces and the
# spelling are reproduced exactly so the text sent to the model is unchanged.
# NOTE(review): "confernce" is a typo in the prompt text.
_TREND_HUMAN_MESSAGE = (
    "Analyse and Extract the key themes/topics from the provided accepted papers in the confernce. \n"
    "     Also provide the papers and keywords related to the topic.\n"
    "     Make sure all the papers are covered. If they do not fall into any theme or topic, add them to Others.\n"
    "     {input}"
)

# Generic extraction prompt: the user content is passed through as-is.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _EXTRACTION_SYSTEM_MESSAGE),
        ("human", "{input}"),
    ]
)

# Prompt asking for themes/topics, with the related papers and keywords.
trend_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _EXTRACTION_SYSTEM_MESSAGE),
        ("human", _TREND_HUMAN_MESSAGE),
    ]
)

In [6]:
from langchain_openai import AzureChatOpenAI
# Chat model used by both chains in this notebook. The API version and the
# deployment name are read from the project configuration; the temperature and
# the max_tokens value are fixed here.
llm = AzureChatOpenAI(
    openai_api_version=config('AZURE_CHAT_OPENAI_API_VERSION'),
    azure_deployment=config('AZURE_GPT4o_mini_CHAT_OPENAI_DEPLOYMENT'),
    temperature=0,
    max_tokens=4096,
)

In [None]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain.utils.openai_functions import convert_pydantic_to_openai_function

# Function definitions offered to the model: just the Overview schema.
overview_tagging_function = [convert_pydantic_to_openai_function(Overview)]

# Bind the function list to the model and name "Overview" as the function to call.
tagging_model = llm.bind(
    function_call={"name": "Overview"},
    functions=overview_tagging_function,
)

# Extraction chain: prompt -> function-bound model -> JSON output parser.
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()

# Trend chain: the trend prompt piped straight into the unbound model.
trend_chain = trend_prompt | llm

In [8]:
def conference_trend_analysis():
    """Print an overview and a trend analysis of the EMNLP, ACL and NAACL 2024 paper lists.

    Loads the three accepted-paper pages, keeping only the text inside
    ``<strong>`` elements (presumably the paper titles — confirm against the
    pages), then prints:

    1. every key/value pair returned by ``tagging_chain``;
    2. the ``content`` of the message returned by ``trend_chain``.

    Returns:
        None. Results are printed; nothing is printed besides a notice when no
        content could be loaded.
    """
    docs = load_doc_from_urls(
        urls=[
            'https://2024.emnlp.org/program/accepted_main_conference/',
            'https://2024.aclweb.org/program/main_conference_papers/',
            'https://2024.naacl.org/program/accepted_papers/',
        ],
        tags=['strong'],
        tag_classes=[],
    )

    # Nothing to analyse: skip the LLM calls instead of sending an empty prompt.
    if not docs:
        print("No content could be loaded from the conference pages.")
        return

    # The prompts interpolate `input` as text. Passing the Document list itself
    # would insert its Python repr (metadata, escaped newlines) into the prompt,
    # so hand over the extracted page text instead.
    papers_text = "\n".join(doc.page_content for doc in docs)

    overview = tagging_chain.invoke({"input": papers_text})
    for key, value in overview.items():
        print(f"{key}: {value}")

    trends = trend_chain.invoke({"input": papers_text})
    print(trends.content)

In [9]:
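# Disabled by default: uncomment to run the conference analysis (downloads the pages and calls the LLM).
# conference_trend_analysis()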

In [14]:
def arxiv_daily_trend_analysis():
    """Print an overview and a trend analysis of recent arXiv listings.

    Loads the "recent" listing pages of cs.AI, cs.CL, cs.CV and cs.IR (up to
    2000 entries each, per the ``show`` query parameter), keeping only
    ``<div class="list-title mathjax">`` elements (presumably the paper titles
    — confirm against the pages), then prints:

    1. every key/value pair returned by ``tagging_chain``;
    2. the ``content`` of the message returned by ``trend_chain``.

    Returns:
        None. Results are printed; nothing is printed besides a notice when no
        content could be loaded.
    """
    docs = load_doc_from_urls(
        urls=[
            'https://arxiv.org/list/cs.AI/recent?skip=0&show=2000',
            'https://arxiv.org/list/cs.CL/recent?skip=0&show=2000',
            'https://arxiv.org/list/cs.CV/recent?skip=0&show=2000',
            'https://arxiv.org/list/cs.IR/recent?skip=0&show=2000',
        ],
        tags=['div'],
        tag_classes=['list-title mathjax'],
    )

    # Nothing to analyse: skip the LLM calls instead of sending an empty prompt.
    if not docs:
        print("No content could be loaded from the arXiv listing pages.")
        return

    # The prompts interpolate `input` as text. Passing the Document list itself
    # would insert its Python repr (metadata, escaped newlines) into the prompt,
    # so hand over the extracted page text instead.
    listings_text = "\n".join(doc.page_content for doc in docs)

    overview = tagging_chain.invoke({"input": listings_text})
    for key, value in overview.items():
        print(f"{key}: {value}")

    trends = trend_chain.invoke({"input": listings_text})
    print(trends.content)

In [None]:
# Run the arXiv analysis: loads the listing pages and prints the output of both chains.
arxiv_daily_trend_analysis()