# **Run LLMUtils script**

In [1]:
import re
import pandas as pd

from typing import List

In [2]:
# make pandas dataframe fit the entire width of the page, and display all content in each cell
pd.set_option("display.max_colwidth", None)
pd.set_option("display.width", 1000)
# make rows display in full
pd.set_option("display.max_rows", None)

In [3]:
# import modules from utils.py
from utils import *
import os

In [4]:
authors = [
    "Shi Feng",
    "Zhijing Jin",
    "Jacob Foerster",
    "Elad Hazan",
    "Florian Tramèr",
    "He He",
    "Ilija Bogunovic",
    "Yoav Artzi",
    "Stefano Ermon",
    "Jacob Andreas",
    "Dylan Hadfield-Menell",
    "Pierre-Luc Bacon",
    "Yejin Choi",
    "Natasha Jacques",
    "Zico Kotler",
    "Dawn Song",
    "Yuekai Sun",
    "Scott Niekum",
    "Kyunghyun Cho",
    "Rajesh Ranganath",
    "Fei Fang",
    "Steven Wu",
    "Andrea Zanette",
    "Hima Lakkaraju",
    "Matt Fredrikson",
    "Sham Kakade",
    "Diyi Yang",
    "Micah Goldblum",
    "Yu Su",
    "Daniel Kang",
    "Pavel Izmailov",
    "Ding Zhao",
    "Tatsunori Hashimoto",
    "Sewon Min",
    "Bo Li",
    "Gagandeep Singh",
    "Lionel Levine",
    "Tanya Goyal",
    "Brad Knox",
    "Roger Grosse",
    "Chris Callison-Burch",
    "Edgar Dobriban",
    "Eric Wong",
    "Yu Zhou",
    "Sanjeev Arora",
    "Danqi Chen",
    "Boaz Barak",
    "Kiantè Brantley",
    "Sheila McIllraith",
    "Chi Jin",
    "Andreea Bobu",
    "Yoon Kim",
    "John Hewitt",
    "Tim Althoff",
    "Tim Rocktaschel",
    "Alessandro Abate",
    "Buck Shlegeris",
    "Ethan Perez",
    "Samuel R. Bowman",
    "Akbir Khan",
]

In [5]:
# Open file for reading contents

# path to txts
path = "/Users/joanvelja/Documents/neurips_posters"

# Merge all contents of txt files starting in 'session' into one raw string variable
raw = r""
for file in os.listdir(path):
    if file.startswith("session"):
        with open(os.path.join(path, file), "r") as f:
            raw += f.read()

In [6]:
# Open file for interested papers extracted via Gemini

interests = r""
for file in os.listdir(path):
    if file.startswith("interests"):
        with open(os.path.join(path, file), "r") as f:
            interests += f.read()

In [None]:
# Regex pattern to extract paper titles and split them from shortened abstracts (abstract contained within parentheses)
# Papers are separated by a double newline character

interest_pattern = r"(.+?)\((.+?)\)\n\n"
interests_matches = re.findall(interest_pattern, interests, re.DOTALL)

# Strip leading and trailing whitespace from each tuple in the list
interests_matches = [
    (title.strip(), abstract.strip()) for title, abstract in interests_matches
]

# create a dataframe from the extracted papers
interests_df = pd.DataFrame(interests_matches, columns=["title", "abstract"])


In [8]:
interests_df["title"].tolist()

['RepLiQA: A Question-Answering Dataset for Benchmarking LLMs on Unseen Reference Content\n\nAgentBoard: An Analytical Evaluation Board of Multi-turn LLM Agents\n\nFindingEmo: An Image Dataset for Emotion Recognition in the Wild',
 'Do causal predictors generalize better to new domains?',
 'Can LLMs Implicitly Learn Numeric Parameter Constraints in Data Science APIs?\n\nRAGChecker: A Fine-grained Framework for Diagnosing Retrieval-Augmented Generation\n\nRoleAgent: Building, Interacting, and Benchmarking High-quality Role-Playing Agents from Scripts\n\nSubjECTive-QA: A dataset for the subjective evaluation of answers in Earnings Call Transcripts',
 'Evaluating Numerical Reasoning in Text-to-Image Models',
 'Benchmarking LLMs via Uncertainty Quantification\n\nSpreadsheetBench: Towards Challenging Real World Spreadsheet Manipulation\n\nBLEnD: A Benchmark for LLMs on Everyday Knowledge in Diverse Cultures and Languages\n\nEmbodied Agent Interface: Benchmarking LLMs for Embodied Decision M

In [9]:
def extract_session_info(text: str, paper_start_pos: int) -> dict | None:
    # Pattern to match session details above paper
    session_pattern = r"### Poster Session\n\nPoster Session (\d+)([A-Za-z]+)\n---------------------\n\n##### ([^\n]+)\n\n([^\n]+)"

    # Find all session headers
    sessions = list(re.finditer(session_pattern, text))

    # Find the closest session header before paper
    closest_session = None
    for session in sessions:
        if session.end() < paper_start_pos:
            closest_session = session
        else:
            break

    if closest_session:
        return {
            "session_number": closest_session.group(1),
            "session_wing": closest_session.group(2),
            "location": closest_session.group(3),
            "time": closest_session.group(4),
        }
    return None


def extract_papers(
    text: str, authors_of_interest: List[str], titles_crossref: List[str] = []
) -> pd.DataFrame:
    # Updated pattern to capture URL
    paper_pattern = (
        r"#{5}\s+\*\*\[([^\]]+)\]\(([^\)]+)\)\*\*\n\n(.*?)\n\n(.*?)(?=\n#{5}|\Z)"
    )

    papers = []

    for match in re.finditer(paper_pattern, text, re.DOTALL):
        title = match.group(1)
        url = f"www.neurips.cc{match.group(2)}"
        authors = match.group(3)
        abstract = match.group(4)

        # Get session info for this paper
        session_info = extract_session_info(text, match.start())

        if any(
            author.lower() in authors.lower() for author in authors_of_interest
        ) or any(
            title.lower() in title_crossref.lower()
            for title_crossref in titles_crossref
            if titles_crossref is not None
        ):
            paper_data = {
                "title": title,
                "url": url,
                "authors": authors,
                "abstract": abstract,
            }

            # Add session info if found
            if session_info:
                paper_data.update(
                    {
                        "session_number": session_info["session_number"],
                        "session_wing": session_info["session_wing"],
                        "location": session_info["location"],
                        "time": session_info["time"],
                    }
                )

            papers.append(paper_data)

    # Create DataFrame
    df = pd.DataFrame(papers)
    return df

In [10]:
# Extract matching papers
df = extract_papers(raw, authors)

In [11]:
df

Unnamed: 0,title,url,authors,abstract
0,NewTerm: Benchmarking Real-Time New Terms for Large Language Models with Annual Updates,www.neurips.cc/virtual/2024/poster/97724,Hexuan Deng · Wenxiang Jiao · Xuebo Liu · Min Zhang · Zhaopeng Tu,"Despite their remarkable abilities in various tasks, large language models (LLMs) still struggle with real-time information (e.g., new facts and terms) due to the knowledge cutoff in their development process. However, existing benchmarks focus on outdated content and limited fields, facing difficulties in real-time updating and leaving new terms unexplored. To address this problem, we propose an adaptive benchmark, NewTerm, for real-time evaluation of new terms. We design a highly automated construction method to ensure high-quality benchmark construction with minimal human effort, allowing flexible updates for real-time information. Empirical results on various LLMs demonstrate over 20% performance reduction caused by new terms. Additionally, while updates to the knowledge cutoff of LLMs can cover some of the new terms, they are unable to generalize to more distant new terms. We also analyze which types of terms are more challenging and why LLMs struggle with new terms, paving the way for future research. Finally, we construct NewTerm 2022 and 2023 to evaluate the new terms updated each year and will continue updating annually. The benchmark and codes can be found at https://anonymous.4open.science/r/NewTerms.\n\n \n\nPoster\n\n#5105\n"
1,TorchSpatial: A Location Encoding Framework and Benchmark for Spatial Representation Learning,www.neurips.cc/virtual/2024/poster/97807,Nemin Wu · Qian Cao · Zhangyu Wang · Zeping Liu · Yanlin Qi · Jielu Zhang · Joshua Ni · X. Yao · Hongxu Ma · Lan Mu · Stefano Ermon · Tanuja Ganu · Akshay Nambi · Ni Lao · Gengchen Mai,"Spatial representation learning (SRL) aims at learning general-purpose neural network representations from various types of spatial data (e.g., points, polylines, polygons, networks, images, etc.) in their native formats. Learning good spatial representations is a fundamental problem for various downstream applications such as species distribution modeling, weather forecasting, trajectory generation, geographic question answering, etc. Even though SRL has become the foundation of almost all geospatial artificial intelligence (GeoAI) research, we have not yet seen significant efforts to develop an extensive deep learning framework and benchmark to support SRL model development and evaluation. To fill this gap, we propose TorchSpatial, a learning framework and benchmark· for location (point) encoding, which is one of the most fundamental data types of spatial representation learning. TorchSpatial contains three key components: 1) a unified location encoding framework that consolidates 15 commonly recognized location encoders, ensuring scalability and reproducibility of the implementations; 2) the LocBench benchmark tasks encompassing 7 geo-aware image classification and 4 geo-aware image regression datasets; 3) a comprehensive suite of evaluation metrics to quantify geo-aware models’ overall performance as well as their geographic bias, with a novel Geo-Bias Score metric. Finally, we provide a detailed analysis and insights into the model performance and geographic bias of different location encoders. We believe TorchSpatial will foster future advancement of spatial representation learning and spatial fairness in GeoAI research. The TorchSpatial model framework, LocBench, and Geo-Bias Score evaluation framework are available at https://github.com/seai-lab/TorchSpatial.\n\n \n\nPoster\n\n#5109\n"
2,DataComp-LM: In search of the next generation of training sets for language models,www.neurips.cc/virtual/2024/poster/97814,Amro Abbas · Alon Albalak · Kushal Arora · Hritik Bansal · Yonatan Bitton · Yair Carmon · Khyathi Chandu · Mayee Chen · Giannis Daras · Achal Dave · Alex Dimakis · Alaaeldin El-Nouby · Fartash Faghri · Alex Fang · Samir Yitzhak Gadre · Josh Gardner · Saurabh Garg · Dhruba Ghosh · Aaron Gokaslan · Dirk Groeneveld · Etash Guha · Suchin Gururangan · Reinhard Heckel · Cheng-Yu Hsieh · Gabriel Ilharco · Maor Ivgi · Jenia Jitsev · Matt Jordan · Sham Kakade · Sedrick Scott Keh · Maciej Kilian · Pang Wei Koh · Thomas Kollar · Jeffrey Li · Kyle Lo · Kalyani Marathe · Jean Mercat · Niklas Muennighoff · Marianna Nezhurina · Thao Nguyen · Sewoong Oh · Hadi Pouransari · Sarah Pratt · Sunny Sanyal · Ludwig Schmidt · Vaishaal Shankar · Rulin Shao · Georgios Smyrnis · Luca Soldaini · Shuran Song · Alexander Toshev · Igor Vasiljevic · Stephanie Wang · Mitchell Wortsman · Rui Xin · Luke Zettlemoyer · Hanlin Zhang · Jieyu Zhang,"We introduce DataComp for Language Models, a testbed for controlled dataset experiments with the goal of improving language models.As part of DCLM, we provide a standardized corpus of 240T tokens extracted from Common Crawl, effective pretraining recipes based on the OpenLM framework, and a broad suite of 53 downstream evaluations.Participants in the DCLM benchmark can experiment with data curation strategies such as deduplication, filtering, and data mixing atmodel scales ranging from 412M to 7B parameters.As a baseline for DCLM, we conduct extensive experiments and find that model-based filtering is key to assembling a high-quality training set.The resulting dataset, DCLM-Baseline, enables training a 7B parameter language model from scratch to 63% 5-shot accuracy on MMLU with 2T training tokens.Compared to MAP-Neo, the previous state-of-the-art in open-data language models, DCLM-Baseline represents a 6 percentage point improvement on MMLU while being trained with half the compute.Our results highlight the importance of dataset design for training language models and offer a starting point for further research on data curation.\n\n \n\nSpotlight Poster\n\n#5110\n"
3,Web2Code: A Large-scale Webpage-to-Code Dataset and Evaluation Framework for Multimodal LLMs,www.neurips.cc/virtual/2024/poster/97572,Sukmin Yun · haokun lin · Rusiru Thushara · Mohammad Bhat · Yongxin Wang · zutao jiang · Mingkai Deng · Jinhong Wang · Tianhua Tao · Junbo Li · Haonan Li · Preslav Nakov · Timothy Baldwin · Zhengzhong Liu · Eric Xing · Xiaodan Liang · Zhiqiang Shen,"Multimodal large language models (MLLMs) have shown impressive success across modalities such as image, video, and audio in a variety of understanding and generation tasks. However, current MLLMs are surprisingly poor at understanding webpage screenshots and generating their corresponding HTML code. To address this problem, we propose Web2Code, a benchmark consisting of a new large-scale webpage-to-code dataset for instruction tuning and an evaluation framework for the webpage understanding and HTML code translation abilities of MLLMs. For dataset construction, we leveraging pretrained LLMs to enhance existing webpage-to-code datasets as well as generate a diverse pool of new webpages rendered into images. Specifically, the inputs are webpage images and instructions, while the responses are the webpage’s HTML code. We further include diverse natural language QA pairs about the webpage content in the responses to enable more comprehensive understanding of the web content. To evaluate model performance in these tasks, we develop an evaluation framework for testing MLLMs’ abilities in webpage understanding and web-to-code generation. Extensive experiments show that our proposed dataset is beneficial not only to our proposed tasks but also in the general visual domain, while previous datasets result in worse performance. We hope our work will contribute to the development of general MLLMs suitable for web-based content generation and task automation. Our data and code are available at https://github.com/MBZUAI-LLM/web2code\n\n \n\nPoster\n\n#5400\n"
4,Global Rewards in Restless Multi-Armed Bandits,www.neurips.cc/virtual/2024/poster/96734,Naveen Raman · Zheyuan Shi · Fei Fang,"Restless multi-armed bandits (RMAB) extend multi-armed bandits so arm pulls impact future arm states. Despite the success of RMABs, a key limiting assumption is the separability of rewards into a sum across arms. We address this deficiency by proposing restless-multi-armed bandit with global rewards (RMAB-G), a generalization of RMABs to global non-separable rewards. To solve RMAB-G, we develop the Linear-Whittle and Shapley-Whittle indices, which extend Whittle indices from RMABs to RMAB-Gs. We prove approximation bounds which demonstrate how Linear and Shapley-Whittle indices fail for non-linear rewards. To overcome this limitation, we propose two sets of adaptive policies: the first computes indices iteratively and the second combines indices with Monte-Carlo Tree Search (MCTS). Empirically, we demonstrate that adaptive policies outperform both pre-computed index policies and baselines in synthetic and real-world food rescue datasets.\n\n \n\nPoster\n\n#5501\n"
5,TorchOpt: An Efficient Library for Differentiable Optimization,www.neurips.cc/virtual/2024/poster/98326,Jie Ren · Xidong Feng · Bo Liu · Xuehai Pan · Yao Fu · Luo Mai · Yaodong Yang,"Differentiable optimization algorithms often involve expensive computations of various meta-gradients. To address this, we design and implement TorchOpt, a new PyTorch-based differentiable optimization library. TorchOpt provides an expressive and unified programming interface that simplifies the implementation of explicit, implicit, and zero-order gradients. Moreover, TorchOpt has a distributed execution runtime capable of parallelizing diverse operations linked to differentiable optimization tasks across CPU and GPU devices. Experimental results demonstrate that TorchOpt achieves a 5.2× training time speedup in a cluster. TorchOpt is open-sourced at https://github.com/metaopt/torchopt and has become a PyTorch Ecosystem project.\n\n \n\nPoster\n\n#5802\n"
6,Communication Efficient Distributed Training with Distributed Lion,www.neurips.cc/virtual/2024/poster/93167,Bo Liu · Lemeng Wu · Lizhang Chen · Kaizhao Liang · Jiaxu Zhu · Chen Liang · Raghuraman Krishnamoorthi · Qiang Liu,"The Lion optimizer has been a promising competitor with the AdamW for training large AI models, with advantages in memory, computation, and sample efficiency. In this paper, we introduce Distributed Lion, an innovative adaptation of Lion for distributed training environments. Leveraging the sign operator in Lion, our Distributed Lion only requires to communicate binary or lower-precision vectorsbetween workers to the center server, significantly reducing the communication cost. Our theoretical analysis confirms Distributed Lion's convergence properties. Empirical results demonstrate its robustness across a range of tasks, worker counts, and batch sizes, on both vision and language problems. Notably, Distributed Lion attains comparable performance to standard Lion or AdamW optimizers applied on aggregated gradients, but with significantly reduced communication bandwidth. This feature is particularly advantageous for training large models. In addition, we also demonstrate that \\mavolion{} presents a more favorable performance-bandwidth balance compared to existing efficient distributed methods such as deep gradient compression and ternary gradients.\n\n \n\nPoster\n\n#5911\n"
7,The Limits of Differential Privacy in Online Learning,www.neurips.cc/virtual/2024/poster/96120,Bo Li · Wei Wang · Peng Ye,"Differential privacy (DP) is a formal notion that restricts the privacy leakage of an algorithm when running on sensitive data, in which privacy-utility trade-off is one of the central problems in private data analysis. In this work, we investigate the fundamental limits of differential privacy in online learning algorithms and present evidence that separates three types of constraints: no DP, pure DP, and approximate DP. We first describe a hypothesis class that is online learnable under approximate DP but not online learnable under pure DP under the adaptive adversarial setting. This indicates that approximate DP must be adopted when dealing with adaptive adversaries. We then prove that any private online learner must make an infinite number of mistakes for almost all hypothesis classes. This essentially generalizes previous results and shows a strong separation between private and non-private settings since a finite mistake bound is always attainable (as long as the class is online learnable) when there is no privacy requirement.\n\n \n\nPoster\n\n#6001\n"
8,OASIS: Conditional Distribution Shaping for Offline Safe Reinforcement Learning,www.neurips.cc/virtual/2024/poster/96709,Yihang Yao · Zhepeng Cen · Wenhao Ding · Haohong Lin · Shiqi Liu · Tingnan Zhang · Wenhao Yu · DING ZHAO,"Offline safe reinforcement learning (RL) aims to train a policy that satisfies con- straints using a pre-collected dataset. Most current methods struggle with the mismatch between imperfect demonstrations and the desired safe and rewarding performance. In this paper, we mitigate this issue from a data-centric perspective and introduce OASIS (cOnditionAl diStributIon Shaping), a new paradigm in offline safe RL designed to overcome these critical limitations. OASIS utilizes a conditional diffusion model to synthesize offline datasets, thus shaping the data dis- tribution toward a beneficial target domain. Our approach makes compliance with safety constraints through effective data utilization and regularization techniques to benefit offline safe RL training. Comprehensive evaluations on public benchmarks and varying datasets showcase OASIS’s superiority in benefiting offline safe RL agents to achieve high-reward behavior while satisfying the safety constraints, out- performing established baselines. Furthermore, OASIS exhibits high data efficiency and robustness, making it suitable for real-world applications, particularly in tasks where safety is imperative and high-quality demonstrations are scarce. More details are available at the website https://sites.google.com/view/saferl-oasis/home.\n\n \n\nPoster\n\n#6007\n"
9,"Multi-Agent Imitation Learning: Value is Easy, Regret is Hard",www.neurips.cc/virtual/2024/poster/95215,Jingwu Tang · Gokul Swamy · Fei Fang · Steven Wu,"We study a multi-agent imitation learning (MAIL) problem where we take the perspective of a learner attempting to _coordinate_ a group of agents based on demonstrations of an expert doing so. Most prior work in MAIL essentially reduces the problem to matching the behavior of the expert _within_ the support of the demonstrations. While doing so is sufficient to drive the _value gap_ between the learner and the expert to zero under the assumption that agents are non-strategic, it does not guarantee robustness to deviations by strategic agents. Intuitively, this is because strategic deviations can depend on a counterfactual quantity: the coordinator's recommendations outside of the state distribution their recommendations induce. In response, we initiate the study of an alternative objective for MAIL in Markov Games we term the _regret gap_ that explicitly accounts for potential deviations by agents in the group. We first perform an in-depth exploration of the relationship between the value and regret gaps. First, we show that while the value gap can be efficiently minimized via a direct extension of single-agent IL algorithms, even _value equivalence_ can lead to an arbitrarily large regret gap. This implies that achieving regret equivalence is harder than achieving value equivalence in MAIL. We then provide a pair of efficient reductions to no-regret online convex optimization that are capable of minimizing the regret gap _(a)_ under a coverage assumption on the expert (MALICE) or _(b)_ with access to a queryable expert (BLADES).\n\n \n\nPoster\n\n#6102\n"
