# Homework 4
Name: Grace Su

## Data Collection

In [3]:
import json
import os
import random
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta

import requests
from tqdm import tqdm

# arXiv API endpoint
ARXIV_API = "http://export.arxiv.org/api/query"
MAX_PAPERS = 1000
# Namespace prefix carried by every Atom element in the API response.
ATOM_NS = "{http://www.w3.org/2005/Atom}"

# Calculate date range (past 3 months)
end_date = datetime.now()
start_date = end_date - timedelta(days=90)
start_date_str = start_date.strftime("%Y%m%d")
end_date_str = end_date.strftime("%Y%m%d")

# Search query for cs.CL papers in date range
query = f"search_query=cat:cs.CL+AND+submittedDate:[{start_date_str}0000+TO+{end_date_str}2359]&sortBy=lastUpdatedDate&sortOrder=descending&max_results={MAX_PAPERS}"

# Fetch papers
response = requests.get(f"{ARXIV_API}?{query}")
# Fail loudly on an HTTP error instead of trying to parse an error page as Atom XML.
response.raise_for_status()
root = ET.fromstring(response.content)

# Collect metadata for every returned entry, keyed by title.
# (Entries sharing a title overwrite each other, so len(papers) can be below MAX_PAPERS.)
papers = dict()
for entry in tqdm(root.findall(f".//{ATOM_NS}entry"), desc="Getting papers"):
    # Titles and abstracts arrive hard-wrapped; flatten newlines/tabs to single spaces.
    title = entry.find(f"{ATOM_NS}title").text.strip().replace("\n", " ").replace("\t", " ")
    # The author elements live in the same Atom namespace as everything else;
    # a mismatched namespace here silently yields an empty author list.
    authors = [author.find(f"{ATOM_NS}name").text for author in entry.findall(f".//{ATOM_NS}author")]
    summary = entry.find(f"{ATOM_NS}summary").text.strip().replace("\n", " ").replace("\t", " ")
    published = entry.find(f"{ATOM_NS}published").text
    pdf_link = entry.find(f"{ATOM_NS}link[@title='pdf']").get("href")

    papers[title] = {
        'authors': authors,
        'summary': summary,
        'published': published,
        'pdf_link': pdf_link
    }


Getting papers: 100%|██████████| 1000/1000 [00:00<00:00, 180144.48it/s]


In [None]:
# Randomly keep 50 papers; the fixed seed makes the sample reproducible.
random.seed(42)
titles = random.sample(list(papers.keys()), 50)

# Sort the sampled titles and re-key the collection by position in that order.
titles.sort()
# Build a fresh dict instead of mutating in place: re-keying only the sampled
# entries would leave every unsampled paper behind and break the size check below.
papers = {i: {"title": title, **papers[title]} for i, title in enumerate(titles)}

assert len(papers) == 50
    

In [None]:
# save pdf files to pdfs folder
os.makedirs("pdfs", exist_ok=True)

# `titles` holds paper titles, but `papers` may be keyed by index (with the title
# stored in each record) or directly by title; resolve both through one lookup.
papers_by_title = {paper.get("title", key): paper for key, paper in papers.items()}

for title in tqdm(titles, desc="Saving pdfs"):
    response = requests.get(papers_by_title[title]['pdf_link'])
    # Do not write an HTTP error body to disk under a .pdf name.
    response.raise_for_status()
    with open(f"pdfs/{title}.pdf", "wb") as f:
        f.write(response.content)

# save papers metadata to papers.json
with open("papers.json", "w") as f:
    json.dump(papers, f, indent=4)

Saving pdfs: 100%|██████████| 50/50 [00:11<00:00,  4.19it/s]


## Text Extraction

In [None]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path: str, header_height: int = 72, footer_height: int = 72) -> str:
    """
    Open a PDF and extract its text as a single string.

    On every page, only text inside a clip rectangle is read: the rectangle
    spans the full page width and excludes a band of `header_height` units at
    the top and `footer_height` units at the bottom. Newlines and tabs are
    replaced by spaces, and pages are joined with a single space.

    Args:
        pdf_path: Path of the PDF file to read.
        header_height: Height of the band skipped at the top of each page.
        footer_height: Height of the band skipped at the bottom of each page.

    Returns:
        The concatenated text of all pages.
    """
    pages = []
    # The context manager closes the document even if extraction fails part-way.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # clip header and footer
            rect = page.rect
            clip = fitz.Rect(0, header_height, rect.width, rect.height - footer_height)
            page_text = page.get_text(clip=clip)  # get raw text from page
            page_text = page_text.replace("\n", " ")
            page_text = page_text.replace("\t", " ")
            pages.append(page_text)
    return " ".join(pages)


In [23]:
import os

l_pdfs = os.listdir("pdfs")

# PDF files are named after paper titles, while `papers` may be keyed by index
# (title stored in each record) or by title; resolve both through one lookup.
papers_by_title = {paper.get("title", key): paper for key, paper in papers.items()}

# attach the extracted text to each paper record (in memory; written to disk later)
for pdf_path in tqdm(l_pdfs, desc="Extracting text"):
    # Strip only the extension: a blanket replace would also mangle titles
    # that contain ".pdf" somewhere in the middle.
    title, extension = os.path.splitext(pdf_path)
    if extension.lower() != ".pdf":
        continue  # ignore stray non-PDF files in the folder
    text = extract_text_from_pdf(os.path.join("pdfs", pdf_path))
    papers_by_title[title].update({'text': text})
    

Extracting text: 100%|██████████| 50/50 [00:02<00:00, 19.45it/s]


In [None]:
# Write the paper records (now including extracted text) back to papers.json.
with open("papers.json", "w") as papers_file:
    papers_file.write(json.dumps(papers, indent=4))

## Get and Save Embeddings 

In [2]:
def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> list:
    """
    Split text into overlapping chunks of whitespace-separated tokens.

    Consecutive chunks start `max_tokens - overlap` tokens apart, so each chunk
    shares its first `overlap` tokens with the end of the previous one. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing chunk
    is produced that lies entirely inside the previous window.

    "Tokens" here are words produced by `str.split()`, not model subword tokens.
    NOTE(review): an encoder with a smaller input limit than a chunk will
    truncate it — confirm `max_tokens` against the embedding model in use.

    Args:
        text: Text to split.
        max_tokens: Maximum number of whitespace-separated tokens per chunk.
        overlap: Number of tokens shared between consecutive chunks.

    Returns:
        List of chunk strings, tokens re-joined with single spaces; empty if
        the text contains no tokens.

    Raises:
        ValueError: If `max_tokens` is not positive or `overlap` is not in
            the range [0, max_tokens).
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be positive")
    if not 0 <= overlap < max_tokens:
        # A non-positive step would either crash range() or silently yield nothing.
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    tokens = text.split()
    step = max_tokens - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + max_tokens]))
        if start + max_tokens >= len(tokens):
            # This window already covers the tail; a further window would only
            # repeat tokens that are fully contained in this chunk.
            break
    return chunks


In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np
import json

# No explicit device: sentence-transformers then picks a GPU when one is usable
# and falls back to CPU otherwise, so the notebook also runs on machines without CUDA.
model = SentenceTransformer('all-MiniLM-L6-v2')


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [20]:
from tqdm import tqdm


with open("papers.json", "r") as f:
    papers = json.load(f)

# Map every global chunk id to its source paper and text. Ids are assigned in
# processing order and double as the .npy file names.
chunk_data = dict()
global_chunk_id = 0
for paper_id in tqdm(papers, desc="Embedding papers"):
    chunks = chunk_text(papers[paper_id]['text'])
    embeddings = model.encode(chunks)

    # One output folder per paper; create it once rather than once per chunk.
    # NOTE: .npy files left over from an earlier run are not removed here.
    paper_dir = f"embeddings/{paper_id}"
    os.makedirs(paper_dir, exist_ok=True)

    for chunk, embedding in zip(chunks, embeddings):
        # save each chunk embedding to its own file
        np.save(f"{paper_dir}/{global_chunk_id}.npy", embedding)
        chunk_data[str(global_chunk_id)] = {
            "paper_id": paper_id,
            "chunk_text": chunk
        }
        global_chunk_id += 1


# save chunk_data to a file
with open("chunks.json", "w") as f:
    json.dump(chunk_data, f, indent=4)



Embedding papers:   0%|          | 0/50 [00:00<?, ?it/s]

Embedding papers: 100%|██████████| 50/50 [00:01<00:00, 34.33it/s]


In [6]:
# `embeddings` still holds the array from the final iteration of the embedding
# loop, so this shows the shape for the last paper processed only.
embeddings.shape

(12, 384)

## Save FAISS index

In [26]:
import faiss
import glob

def _chunk_id(path):
    """Return the integer chunk id encoded in an embedding file name (e.g. '.../17.npy' -> 17)."""
    # os.path handles the platform's path separator, unlike splitting on "/".
    return int(os.path.splitext(os.path.basename(path))[0])


# Load every saved chunk embedding in ascending chunk-id order so that row i of
# the stacked matrix corresponds to chunk id i.
embedding_files = sorted(glob.glob("embeddings/**/*.npy"), key=_chunk_id)
embeddings = np.vstack([np.load(path) for path in embedding_files])
embeddings.shape


(1078, 384)

In [None]:
# `embeddings` is expected to be a 2D numpy array of shape (num_chunks, dim).
# Scale each row to unit L2 length so inner products equal cosine similarities.
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / row_norms

# Flat inner-product index sized to the embedding width.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
# Add all chunk vectors in row order.
index.add(np.array(embeddings))

In [37]:
# Save the index to "embeddings.index" so it can be reloaded later with faiss.read_index.
faiss.write_index(index, "embeddings.index")

### Test FAISS query code

In [54]:
index = faiss.read_index("embeddings.index")

query_embedding = model.encode("What is the main idea of the Psyche-R1 paper?")
query_embedding = query_embedding.reshape(1, -1)
# normalize query vector so the inner product with the unit-norm chunk vectors
# is a cosine similarity
query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)

k = 5
# The index is an inner-product index, so the first return value holds
# similarity scores (higher is better), not distances.
scores, indices = index.search(query_embedding, k)
# indices[0] holds the top-k chunk indices

# get chunk text
with open("chunks.json", "r") as f:
    chunks = json.load(f)

print(scores)
for rank, idx in enumerate(indices[0]):
    if idx < 0:
        continue  # FAISS pads with -1 when fewer than k vectors are available
    chunk = chunks[str(idx)]
    print('-'*100)
    print(f"Paper title: {papers[chunk['paper_id']]['title']}")
    # print similarity score
    print(f"Cosine similarity: {scores[0][rank]:.3f}")
    print(chunk["chunk_text"])


[[0.45846927 0.4022849  0.37142125 0.3577277  0.34684435]]
----------------------------------------------------------------------------------------------------
Paper title: Psyche-R1: Towards Reliable Psychological LLMs through Unified Empathy,   Expertise, and Reasoning
Distance: 0.458
scientific research, yet is concerned about others' perceptions of her own attitude, which indicates that their cognition has become neurotic, leading to internal contradictions. In summary, this client's types of psycho-logical conflict include: B. Approach-avoidance, A. Neurotic. Therefore, the correct answers are B and A. Figure 3: A Qualitative example from the CPsyExam test set comparing Psyche-R1 and Qwen2.5-72B-Instruct. tions, thereby yielding more accurate and emotionally in- formed responses within relevant contexts. Case Study. We present a case study examining how Psyche-R1 and Qwen2.5-72B-Instruct (Team 2024b) derive conclusions from narrative evidence, as shown in Figure 3. This case invol

## FastAPI run instructions

Launch the FastAPI server with the following command:

```bash
uvicorn main:app --port 6096
```


Then, you can query the API and get the response as a JSON with the following Python code:

```python
query = "What is a paper that uses agents?"
response = requests.get(f"http://localhost:6096/search?q={query}")
print(response.json())
```






## Retrieval Report on 5 Sample Queries

In [None]:
import requests

def print_query_result(query, base_url="http://localhost:6096"):
    """
    Send a search query to the retrieval API and print each returned result.

    Args:
        query: Free-text query; sent as the `q` query-string parameter.
        base_url: Root URL of the running search service.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    # Pass the query via `params` so characters such as "&", "#" or "+" are
    # URL-encoded instead of corrupting the query string.
    response = requests.get(f"{base_url}/search", params={"q": query})
    response.raise_for_status()
    for i, result in enumerate(response.json()["results"]):
        print(f"{i}: {result}")


In [65]:
# Sample query 1: papers that use agents.
# NOTE: the doubled "is is" is a typo in the query text; it is the string actually sent.
query = "What is is a paper that uses agents?"
print_query_result(query)

0: agent performance. 2 RELATED WORK Evaluating LLMs in Executive Environments As LLMs advance in tackling real-world challenges (Hurst et al., 2024; Jaech et al., 2024; OpenAI, 2025; Anthropic, 2025b;a; Comanici et al., 2025), there is a growing shift toward evaluating their capabilities in dynamic, executive environments rather than static datasets. Beyond text-based games (Cˆot´e et al., 2018; Shridhar et al., 2020), recent research increasingly simulates realistic scenarios to assess agents’ proficiency in tool use (Deng et al., 2023; Qin et al., 2023a; Zhuang et al., 2023; Qin et al., 2023b; L`u et al., 2024; Wang et al., 2024b; Shen et al., 2024; Xu et al., 2024a; Sutela & Lindstr¨om, 2024). Current benchmarks, such as WebArena (Zhou et al., 2023), AgentBench (Paranjape et al., 2023), WindowsArena (Bonatti et al., 2024), and OfficeBench (Wang et al., 2024d), provide valuable evaluation settings focused on web and office environments. However, these platforms primarily measure ato

In [66]:
# Sample query 2: diffusion-based text generation.
query = "What papers use diffusion to generate text?"
print_query_result(query)

0: Bhavya Kailkhura, and Ferdinando Fioretto. 2025. Speculative Diffusion Decoding: Accelerating Language Generation through Diffusion. Annual Conference of the Nations of the Americas Chapter of the Association .... [26] Cheng Da, Peng Wang, and Cong Yao. 2022. Levenshtein OCR. arXiv:2209.03594 [cs.CV] https://arxiv.org/abs/2209. 03594 [27] Tri Dao. [n.d.]. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. In The Twelfth International Conference on Learning Representations. [28] Valentin De Bortoli, Alexandre Galashov, Arthur Gretton, and Arnaud Doucet. [n.d.]. Accelerated Diffusion Models via Speculative Sampling. In Forty-second International Conference on Machine Learning. [29] Justin Deschenaux and Caglar Gulcehre. 2024. Beyond Autoregression: Fast LLMs via Self-Distillation Through Time. arXiv preprint arXiv:2410.21035 (2024). [30] Liang Ding, Longyue Wang, Xuebo Liu, Derek F Wong, Dacheng Tao, and Zhaopeng Tu. 2021. Rejuvenating low- frequency wor

In [67]:
# Sample query 3: biases in LLMs.
query = "What biases do LLMs commonly have?"
print_query_result(query)

0: number of biased thoughts. bias they have. Table A.3 in Appendix C provides more details on the bias level of each score. The output of LLM-as-a-judge method has five ordinal bias categories, which are binarized after applying a threshold (as explained in Appendix C). Similarly, the output of other methods (to be discussed in the coming sections) is also binarized to describe whether or not the thoughts are biased. The binarized scores of each method are then compared with the ground truth to compute the F1-scores, which reflect the performance of each method. It is important to note that Llama 70b is used as an annotator for all the baselines, including the LLM-as-a-judge. 4.2 Confidence score This method quantifies the bias in the thoughts as the degree of confidence of an external classifier in the biased answer, using the thoughts of the model to be assessed as input. More specifically, we first train an external model (DeBERTa-large by He et al. (2021) in our case) that uses th

In [70]:
# Sample query 4: techniques for improving LLM reasoning.
query = "What are techniques to improve LLM reasoning?"
print_query_result(query)

0: been used to entirely train new models (DeepSeek-AI et al., 2025) or improve model prompting (Pternea et al., 2024), showing great potential for the future. Despite these advances, LLMs remain prone to hallucinations—generating fluent but factu- ally incorrect or logically inconsistent outputs (Huang et al., 2025), (Srivastava et al., 2023), (Ji et al., 2023b). This is especially problematic in knowledge-intensive tasks requiring factual ground- ing, multi-hop reasoning, or domain-specific exper- tise (Ji et al. (2023a), Opsahl (2024)). These issues stem in part from the implicit nature of knowledge storage in model parameters, which limits their abil- ity to verify facts or reason explicitly over external knowledge (Petroni et al. (2019), Bommasani et al. (2021)). Recent work has explored augmenting LLMs with tool use, such as code interpreters (Pi et al., 2022), equation solvers (He-Yueya et al., 2023) or symbolic solvers (Lam et al., 2024) (Pan et al., 2023), to externalize and v

In [None]:
# Sample query 5: improving LLM alignment.
query = "How do we improve LLM model alignment?"
print_query_result(query)

0: with a base model and perform forward analysis by computing a breadth-first search (BFS) following the outgoing edges to get that from our LLM supply chain graph. This leads to a forward subgraph, which denotes all the models that depend on the base model, including fine-tuned, adapted, quantized, or merged models. Table 3 shows the top-10 base models sorted by the forward sub- graph size, which is the number of impacted task-specific models. We make two interesting observations. (i) A base model can sig- nificantly impact the LLM supply chain ecosystem. For example, Llama-3.1-8B is a base model from Meta used for efficient text gen- eration, code assistance, and research [19]. Due to its relatively small 5 Table 3: Top-10 base models sorted by forward subgraph size. Base model Total Fine- tune Adapter Quanti- zation Merge Level Llama-3.1-8B 7544 1710 1542 3473 1693 25 Mistral-7B-v0.1 6744 2105 2187 1435 1254 27 Qwen2.5-7B 6733 1972 1764 2516 1132 11 Meta-Llama-3-8B 5633 967 1511 22

: 