# Ollama Introduction
This notebook is a brief introduction to Ollama, a tool for interacting with open LLMs deployed on an Ollama server (the server can also run locally).

In [1]:
from ollama import Client

# Initialize the Ollama client with the specified host.
# Swap in the commented-out localhost URL below when the server runs on this machine.
ollama_host = "http://10.167.31.201:11434/"
# ollama_host = "http://localhost:11434/"
# Shared client object: the helper functions defined in later cells call
# `client.chat(...)` through this module-level name.
client = Client(host=ollama_host)

In [2]:
# Get a list of all models that are currently downloaded and print the raw
# model records returned by the server.
models = client.list()
print(models.models)

# Model name passed as `model=` to every `client.chat(...)` call below.
# Swap with the commented-out alternative to try a different model.
MODEL_NAME = "gemma3:27b"
#MODEL_NAME = "llama3.3:latest"


[Model(model='test:latest', modified_at=datetime.datetime(2025, 5, 6, 10, 7, 2, 955865, tzinfo=TzInfo(UTC)), digest='cffa12cd509c35382b142562f4c786471a2f8f72044c490b85d47304f8b545e1', size=47415724883, details=ModelDetails(parent_model='', format='gguf', family='qwen2', families=['qwen2'], parameter_size='72.7B', quantization_level='Q4_K_M')), Model(model='qwen2.5:3b', modified_at=datetime.datetime(2025, 4, 28, 15, 39, 44, 731776, tzinfo=TzInfo(UTC)), digest='357c53fb659c5076de1d65ccb0b397446227b71a42be9d1603d46168015c9e4b', size=1929912432, details=ModelDetails(parent_model='', format='gguf', family='qwen2', families=['qwen2'], parameter_size='3.1B', quantization_level='Q4_K_M')), Model(model='qwen2.5:7b', modified_at=datetime.datetime(2025, 4, 28, 15, 39, 21, 364334, tzinfo=TzInfo(UTC)), digest='845dbda0ea48ed749caafd9e6037047aa19acfcfd82e704d7ca97d631a0b697e', size=4683087332, details=ModelDetails(parent_model='', format='gguf', family='qwen2', families=['qwen2'], parameter_size='7.

# Question extraction

In [3]:
import json
import os
from pydantic import BaseModel
from typing import List
import re
import unicodedata


def clean_text_for_model(text: str) -> str:
    """
    Reduce raw review/rebuttal text to clean printable ASCII for the model.

    Steps, in order:
      1. Drop Unicode replacement characters (U+FFFD).
      2. Apply NFKC normalization so compatibility characters (for example a
         non-breaking space) become their plain equivalents.
      3. Convert CR and CRLF line endings to LF.
      4. Delete every character that is not printable ASCII, a newline or a tab.
      5. Collapse each run of spaces/tabs into one space and each run of three
         or more newlines into exactly two.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # 1. Remove Unicode replacement characters
    text = text.replace('\uFFFD', '')

    # 2. Normalize Unicode (removes weird byte leftovers)
    text = unicodedata.normalize('NFKC', text)

    # 3. Normalize line endings first; otherwise the filter below would delete
    #    a lone carriage return and glue the neighbouring lines together.
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # 4. Remove non-ASCII and control characters, but keep newlines and tabs.
    #    Tabs must survive this step: deleting them would join the words they
    #    separate ("a\tb" -> "ab") before the whitespace collapse can run.
    text = re.sub(r'[^\x20-\x7E\n\t]+', '', text)

    # 5. Collapse excessive whitespace
    text = re.sub(r'[ \t]+', ' ', text)     # any run of spaces/tabs -> one space
    text = re.sub(r'\n{3,}', '\n\n', text)  # 3+ newlines -> 2

    return text.strip()


# Structured-output schema for the question prompts: an object with a single
# "questions" field holding a list of strings. Its JSON schema is passed as
# `format=` to client.chat, and the model reply is validated against it.
class QuestionResponse(BaseModel):
    questions: List[str]

def get_questions_from_review(review: str) -> List[str]:
    """
    Extract scientific, information-seeking questions from one peer review.

    The review is embedded in a rule-based prompt and sent to the Ollama model
    `MODEL_NAME`. The reply is constrained to the `QuestionResponse` JSON
    schema and validated against it.

    Args:
        review: Text of a single peer review.

    Returns:
        The extracted (rephrased) questions with blank entries removed. An
        empty list is returned, without contacting the model, when `review`
        is empty or whitespace only.

    Raises:
        pydantic.ValidationError: If the model reply does not match the schema.
    """
    # A blank review contains no questions; asking the model anyway only
    # invites made-up output.
    if not review or not review.strip():
        return []

    response = client.chat(
        model=MODEL_NAME,
        messages=[
            {
                "role": "system",
                "content": (
                    "You extract and rephrase scientific questions from peer reviews. "
                    "Follow the INCLUDE/EXCLUDE rules and rephrasing style from the examples below."
                )
            },
            {
                "role": "user",
                "content": (
                    "### Task:\n"
                    # Trailing blank line added: without it the task text ran
                    # straight into the "### Rules:" heading on the same line.
                    "Extract information-seeking scientific questions explicitly posed by the reviewer in the given peer review text.\n\n"

                    "### Rules:\n"
                    "1. **INCLUDE**: Scientific questions about methods, assumptions, data, or interpretations.\n"
                    "2. **EXCLUDE**: Figures/tables related, editorial (e.g. structural, grammatical), or rhetorical comments.\n"
                    "3. **REPHRASE**: Make questions standalone and precise. Remove reference to any mention of line numbers, paragraph numbers, table numbers, or figure numbers. (e.g., 'L12', 'lines 300-309', 'paragraph 3', 'Table 2'). If the question becomes incomplete without the reference, DISCARD the whole question.\n\n"

                    "### Examples of valid extractions:\n"
                    "'Why was X method used?' → 'What was the rationale for using X method?'\n"
                    "'The data in Fig. 3 seems inconsistent.' → '' \n"
                    "'It might be important to demonstrate this phenomenon.' → '' \n\n"

                    "### Peer Review to Analyze:\n"
                    f"{review}\n\n"
                )
            }
        ],
        options={
            "temperature": 0.3,
            "num_predict": 1024,
        },
        format=QuestionResponse.model_json_schema()
    )
    structured_response = QuestionResponse.model_validate_json(response.message.content)

    # The prompt examples map excluded comments to '', so blank entries are an
    # expected part of the reply and are dropped here.
    return [question for question in structured_response.questions if question.strip()]

def filter_questions(questions: List[str]) -> List[str]:
    """
    Ask the model to drop questions that are not scientific or information-seeking.

    The prompt also asks the model to drop questions about figures and to
    remove references to line, paragraph, table or figure numbers.

    Args:
        questions: Questions previously extracted from a peer review.

    Returns:
        The questions kept by the model, stripped of surrounding whitespace,
        without blank entries and without duplicates, in the order the model
        returned them. An empty input list yields an empty list without
        contacting the model.

    Raises:
        pydantic.ValidationError: If the model reply does not match the schema.
    """
    # Nothing to filter: skip the model call.
    if not questions:
        return []

    response = client.chat(
        model=MODEL_NAME,
        messages=[
            {"role": "user",
            "content": (
                '''
                You are given a list of questions extracted from a peer review of a scientific article. Your task is to 
                - filter out any questions that are not scientific or information-seeking.
                - filter out any questions about figures.
                - remove reference to line numbers, paragraph numbers, table numbers, or figure numbers. Rephrase and keep the question if it is still understandable without the reference.

                Here is the list of questions:\n\n
                ''' + json.dumps(questions, indent=2) + '''
                If no questions remain, return an empty array.
                '''
            )
            }
        ],
        options={
            "temperature": 0.2,
            "num_predict": 1024,
        },
        format=QuestionResponse.model_json_schema()
    )
    structured_response = QuestionResponse.model_validate_json(response.message.content)

    # Remove blank entries and duplicates. dict.fromkeys keeps the first
    # occurrence of each question in the model's order; a set would return the
    # questions in a different order from one run to the next.
    non_blank = (q.strip() for q in structured_response.questions if q.strip())
    return list(dict.fromkeys(non_blank))


In [16]:


base_dir = 'test-set'

# For every paper directory under `base_dir`: extract questions from each
# review, save them, then filter them with a second model pass and save the
# filtered list next to the first one.
# Sorted so the papers are processed in the same order on every run
# (os.listdir returns entries in arbitrary order).
for entry in sorted(os.listdir(base_dir)):
    dir_path = os.path.join(base_dir, entry)
    if not os.path.isdir(dir_path):
        continue
    print(f"Processing directory: {dir_path}")

    questions_json = os.path.join(dir_path, 'questions.json')
    # if os.path.exists(questions_json):
    #     print(f"Questions file already exists: {questions_json}")
    #     continue

    # Load the peer reviews from a JSON file; a directory without one is
    # reported and skipped so it does not abort the remaining papers.
    review_file = os.path.join(dir_path, 'all_reviews.json')
    if not os.path.isfile(review_file):
        print(f"Skipping {dir_path}: {review_file} not found")
        continue

    with open(review_file, 'r', encoding='utf-8') as file:
        reviews = json.load(file)

    result = []
    # Process each review
    for review in reviews:
        review = clean_text_for_model(review)
        questions = get_questions_from_review(review)
        result.extend(questions)

    # Save the results
    with open(questions_json, 'w', encoding='utf-8') as file:
        json.dump(result, file, ensure_ascii=False, indent=4)
    print(f"Questions saved to {questions_json}")

    # Filter the questions. `result` is exactly what was just written, so it
    # is passed directly instead of re-opening the file (the previous
    # json.load(open(...)) also left the file handle unclosed).
    filtered_questions = filter_questions(result)
    filtered_questions_json = os.path.join(dir_path, 'filtered_questions.json')
    with open(filtered_questions_json, 'w', encoding='utf-8') as file:
        json.dump(filtered_questions, file, ensure_ascii=False, indent=4)
    print(f"Filtered questions saved to {filtered_questions_json}")

Processing directory: test-set\1-10


KeyboardInterrupt: 

In [None]:
# def get_questions_from_review(review: str) -> List[str]:
#     """
#     Extracts scientific questions from a peer review.
#     """
#     response = client.chat(
#         model=MODEL_NAME,
#         messages=[
#             {"role": "user", 
#              "content": (
#                     "You are given a text which contain peer review of a scientific article.\n\n"
#                     "Your task is to identify and extract all **scientific, information-seeking questions** posed by the reviewer.\n\n"
#                     "Only include questions related to the **scientific content** of the article — such as methodology, experimental design, data analysis, results, interpretation, assumptions, or scientific relevance.\n\n"
#                     "**Exclude** any questions about grammar, spelling, formatting, or writing style.\n\n"
#                     "If necessary, **rephrase the questions** so they are self-contained and preserve relevant scientific context. If a question is based on a specific sentence or result, include that sentence as context in the question.\n\n"
#                     "Return the extracted questions as JSON.\n\n"
#                     f"Peer review:\n{review}"
#             )}
#         ],
#         options={
#             "temperature": 0.5,
#             "num_predict": 1024,
#         },
#         format=QuestionResponse.model_json_schema()
#     )
#     structured_response = QuestionResponse.model_validate_json(response.message.content)
#     return structured_response.questions



import json

# Manual check on a single review: load the reviews file from the working
# directory, clean the first review, print it, and show the questions the
# model extracts from it as the cell output.
with open("all_reviews.json", mode='r', encoding='utf-8') as reviews_file:
    reviews = json.load(reviews_file)

first_review = reviews[0]
clean_text = clean_text_for_model(first_review)
print(clean_text)

get_questions_from_review(clean_text)

*This file contains all reviewer reports in order by version, followed by all author rebuttals in order by*

*version.*

# **** ****

**** 

 

The authors sincerely thank the reviewers for their thorough and highly positive reviews. Please
see our point-by-point responses below. **All page and paragraph numbers described in my**
**responses refer to the marked up (changes tracked) version of the manuscript.**

*Reviewer #1 (Remarks to the Author):*

This manuscript describes the discovery of a potent Kv7.3-selective potassium channel openers
from a polynesian traditional botanical anticonvulsant. The primary active component appears to
be gentisic acid which is an extremely potent activator and very selective at least when
compared to other KV7 channels and a small number of other KV channel types, effective at nM
concentrations. As such, the bulk of the manuscript focuses on the actions of gentisic acid in
isolation.

The content of the manuscript is fairly straightforward but it is 

['What is the primary focus of this research report?',
 'What is the significance of the discovery of gentisic acid (GA) as a Kv7.2/3 channel activator?',
 'What is the mechanism of action of GA on Kv7.2/3 channels, and what specific residue is involved?',
 'What are the potential clinical implications of GA as an anticonvulsant agent?',
 'What are the limitations of the current study, and what future research directions are suggested?',
 'What is the rationale behind the counter-screen against Kv1.3?',
 'What is the significance of the comparison between functional and physical selectivity in Kv7 channel opener discovery?',
 'What is the role of alanine scanning mutagenesis and MD simulations in understanding Kv7 channel opener selectivity?',
 'What is the relevance of the comparison between GA and GABA in terms of their binding to Kv7 channels?',
 "What is the final recommendation regarding the journal's scope based on the research findings?"]

# Answer extraction


In [None]:
from pydantic import BaseModel
from typing import Dict

# Container for question/answer pairs: `QAPairs` maps a question string to
# the answer string extracted for it. find_answer_in_rebuttal builds one
# instance per question and returns its `QAPairs` dict.
class AnswerResponse(BaseModel):
    QAPairs: Dict[str, str]

def find_answer_in_rebuttal(question, rebuttal_text):
    """
    Ask the model for the authors' answer to `question` within `rebuttal_text`.

    Args:
        question: The scientific question to look up.
        rebuttal_text: Rebuttal text (or a chunk of it) to search.

    Returns:
        A one-entry dict ``{question: answer}``. The answer is an empty string
        when the rebuttal text is blank (no model call is made in that case)
        or when the model replies with nothing or with a bare pair of quotes.
    """
    # Nothing to search: skip the model call instead of letting the model
    # invent an answer.
    if not rebuttal_text or not rebuttal_text.strip():
        return {question: ""}

    response = client.chat(
        model=MODEL_NAME,
        messages=[
            {
                "role": "user",
                "content": (
                    "You are given a scientific question and a rebuttal text.\n\n"
                    "Your task is to extract the **corresponding response** given by the authors in the rebuttal text. \n\n"
                    "The rebuttal text might contain both the **reviewer’s comments** and the **authors’ replies**. You should first find the part of the rebuttal where the **reviewer raises the issue described in the question** in order to find the **authors' response**.\n\n"
                    "If the authors give a clear answer, return only that answer.\n"
                    "If the authors do **not** address the question or if the issue is not mentioned in the rebuttal, return an **empty** string.\n\n"
                    "Do **not** summarize. Do **not** return the question or any explanation. Only return the answer as plain text.\n\n"
                    f"Question: {question}\n\n"
                    f"Rebuttal text:\n{rebuttal_text}\n\n"
                    "Answer:"
                )
            }
        ],
        options={
            "temperature": 0.2,
            "num_predict": 1024,
        }
    )
    answer = response.message.content.strip()
    # The prompt asks for an "empty string" when there is no answer; a reply
    # consisting only of a pair of quotes is that signal written out
    # literally, so it must not count as a found answer.
    if answer in ('""', "''"):
        answer = ""

    structured_response = AnswerResponse(QAPairs={question: answer})
    return structured_response.QAPairs


In [12]:
import spacy
from textwrap import wrap
import json

# One-time setup: uncomment to download the spaCy model loaded below.
# spacy.cli.download("en_core_web_md")
# spaCy pipeline; chunk_text_by_sentence calls it to split text into sentences.
nlp = spacy.load("en_core_web_md") 

# Default size limit, in characters, used by the chunking helpers below.
MAX_CHARS = 5500  

def chunk_text_by_sentence(text, max_tokens=MAX_CHARS):
    """
    Split a long text into chunks of at most `max_tokens` characters without
    breaking sentences.

    Despite its name, `max_tokens` is a character budget (it defaults to
    MAX_CHARS); the parameter name is kept so existing calls keep working.

    Args:
        text: Text to split. Sentence boundaries come from the module-level
            spaCy pipeline `nlp`.
        max_tokens: Maximum chunk length in characters. A single sentence
            longer than this still becomes a chunk of its own.

    Returns:
        A list of strings; each one is one or more whole sentences joined by
        single spaces.
    """
    chunks = []
    current_chunk = []
    current_len = 0  # always equals len(" ".join(current_chunk))

    for sent in nlp(text).sents:
        sentence = sent.text.strip()
        if not sentence:
            # Whitespace-only "sentences" would only add stray spaces.
            continue

        # Cost of appending this sentence: its full length (inner spaces
        # included) plus the joining space when the chunk is not empty.
        added_len = len(sentence) + (1 if current_chunk else 0)

        # If adding this sentence exceeds the limit, start a new chunk
        if current_chunk and current_len + added_len > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_len = len(sentence)
        else:
            current_chunk.append(sentence)
            current_len += added_len

    # Add the last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks



def chunk_text(text, max_chars=MAX_CHARS):
    """
    Split `text` into chunks of at most `max_chars` characters on whitespace.

    Words are never split, so a single word longer than `max_chars` becomes
    an over-long chunk of its own.

    Args:
        text: The text to split.
        max_chars: Maximum chunk length in characters.

    Returns:
        A list of chunk strings; empty when `text` is empty or whitespace only.
    """
    # replace_whitespace=False keeps newlines inside a chunk. With the default
    # (True) every newline becomes a space, which flattens the paragraph
    # structure separating reviewer comments from author replies.
    return wrap(
        text,
        max_chars,
        break_long_words=False,
        break_on_hyphens=False,
        replace_whitespace=False,
    )

def find_answer_from_long_rebuttal(question, long_rebuttal_text):
    """
    Search a long rebuttal chunk by chunk for the answer to `question`.

    The text is split with chunk_text and each chunk is passed to
    find_answer_in_rebuttal in order. The first non-blank answer wins.

    Returns:
        The ``{question: answer}`` dict from the first chunk that produced a
        non-blank answer, or ``{question: ""}`` when no chunk did.
    """
    for piece in chunk_text(long_rebuttal_text):
        candidate = find_answer_in_rebuttal(question, piece)
        if candidate[question].strip():
            # First chunk with a real answer: stop searching.
            return candidate
    # No chunk contained an answer.
    return {question: ""}


def extract_answer(paper_dir):
    """
    Find the authors' answers to a paper's questions and save them.

    Reads `questions.json` and `all_rebuttals.json` from `paper_dir`. Each
    question is searched for in the rebuttals in file order; the first
    non-blank answer is kept. The result is written to `final_qa.json` in the
    same directory.

    Args:
        paper_dir: Directory holding the paper's JSON files.

    Returns:
        A dict mapping each answered question to its answer. Questions for
        which no rebuttal yields an answer are omitted.
    """
    # NOTE(review): this reads questions.json, not the filtered_questions.json
    # written by the question-extraction loop - confirm that is intended.
    questions_json = os.path.join(paper_dir, 'questions.json')
    with open(questions_json, 'r', encoding='utf-8') as file:
        questions = json.load(file)

    # Load and clean the rebuttals once, not once per question: neither the
    # file nor the cleaning depends on the question. Skipped when there are no
    # questions, so the rebuttal file is only required when it is used.
    rebuttal_texts = []
    if questions:
        rebuttal_json = os.path.join(paper_dir, 'all_rebuttals.json')
        with open(rebuttal_json, "r", encoding="utf-8") as f:
            rebuttal_texts = [clean_text_for_model(rebuttal) for rebuttal in json.load(f)]

    final_qa = {}

    for question in questions:
        print(f"Finding answer for question: {question}")

        for rebuttal_text in rebuttal_texts:
            qa = find_answer_from_long_rebuttal(question, rebuttal_text)
            print(qa)
            if qa[question].strip():
                print(f"Answer found: {qa[question]}")
                final_qa[question] = qa[question]
                # Stop at the first rebuttal that answers the question.
                break

    # Save the final Q&A pairs to a JSON file
    final_qa_json = os.path.join(paper_dir, 'final_qa.json')
    with open(final_qa_json, 'w', encoding='utf-8') as file:
        json.dump(final_qa, file, ensure_ascii=False, indent=4)

    return final_qa



In [13]:


# Run the answer extraction for a single paper directory; the returned
# question -> answer dict is shown as the cell output.
paper_dir = 'test-set/s43247-025-02414-x'
extract_answer(paper_dir)

Finding answer for question: What was the specific reason for discarding 40% of transects from non-sandy beaches?
{'What was the specific reason for discarding 40% of transects from non-sandy beaches?': 'Sandy beaches are highly dynamic systems that experience frequent changes due to erosion and accretion over short timescales (episodic, seasonal, interannual) up to geological timescales. In contrast, rocky coasts (main type of non-sandy beach along the NAWC) change more gradually over geological timescales. Since our study focuses on the influence of seasonal to interannual climate variability on coastal change, we excluded non-sandy coasts to maintain consistency in the timescales analyzed. Furthermore, the detection methods used in this study were specifically designed to monitor sandy beaches, and non-sandy coasts exhibit much lower rates of change, which would likely result in a high noise-to-signal ratio and reduced reliability of'}
Answer found: Sandy beaches are highly dynamic 

{'What was the specific reason for discarding 40% of transects from non-sandy beaches?': 'Sandy beaches are highly dynamic systems that experience frequent changes due to erosion and accretion over short timescales (episodic, seasonal, interannual) up to geological timescales. In contrast, rocky coasts (main type of non-sandy beach along the NAWC) change more gradually over geological timescales. Since our study focuses on the influence of seasonal to interannual climate variability on coastal change, we excluded non-sandy coasts to maintain consistency in the timescales analyzed. Furthermore, the detection methods used in this study were specifically designed to monitor sandy beaches, and non-sandy coasts exhibit much lower rates of change, which would likely result in a high noise-to-signal ratio and reduced reliability of',
 'What is the reference source for the wave power formula used in the study?': 'The actual formula used to generate the data is] *P* *wave* *g* [2] *H* *s* [2] *

## Example

In [None]:
# Send a simple prompt to the model.
# model: select a model from the list of models obtained from client.list().
# messages: a list of messages containing the conversation history. Some models
# also have a system message; to add one, make it the first message:
# {"role": "system", "content": "Your system message here"}
# Gemma3 does not have a system message, so we can start with the user message.
# To add responses from the model, use the "assistant" role, i.e.:
# {"role": "assistant", "content": "The capital of France is Paris."}

response = client.chat(
    model=MODEL_NAME,
    messages=[
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."},
        {"role": "user", "content": "And who many people live there?"},
    ],
)
# The reply text is in response.message.content.
print(response.message.content)

<think>
Alright, let me break down what's happening here. The user previously asked about the capital of France and I told them it's Paris. Now they're following up with a question about how many people live there. 

Hmm, the user wrote "who many people" instead of "how many." That's a common typo, so I should make sure to correct that in my response without pointing out the mistake explicitly.

They’re probably looking for population statistics. Since Paris is a major city, they might be planning a trip, doing research, or just curious about its size relative to other cities. 

I remember that Paris has around 2.165 million people as of recent estimates. But it's also part of a larger metropolitan area called Île-de-France, which has over 12 million residents. That makes the metro area one of the largest in Europe.

I should provide both numbers because the user might be interested in either the city proper or the broader area. It gives them a clearer picture depending on their needs.

In [None]:
# To control the decoding parameters, such as temperature, maximum number of tokens, etc.,
# you can pass additional parameters to the chat method.
# For a complete list of options, check the Ollama API documentation at:
# https://github.com/ollama/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values
x = [
    "How does the unit of wave power translate to W/m (Watts per meter) from the provided formula, which is the product of squared significant height and wave period?",
    "What is the reference source for the wave power formula used in the study?",
    "How can wave power be negative, given the observed range of -2000 to 2000 W/m in Figure 2?",
    "Why does the latitude scale on the y-axis in Figures 2b, c, d, and e not maintain a fixed distance, as it does in Figure 2a?",
    "What is the rationale behind the chosen color palette, where higher wave power and sea level are represented in reddish tones and lower values in bluish tones, while this representation is reversed for wave direction and waterline position?",
    "What was the specific reason for removing 40% of transects from non-sandy beaches?",
    "What is the unit of measurement for wave energy in Figure S3?",
    "Why is the wave energy formula in Figure S3 the same as the wave power formula in Figure 2?",
    "What was the basis for dividing the North American West Coast (NAWC) into five subregions?",
    "What is the rationale for using a rectangular boundary in Figure 1?",
    "Were any additional studies conducted to determine if parameters such as beach slope, substrate lithology,or riverine sediment inputs significantly affect waterline positions on a seasonal scale?"]

# Build the prompt from the list instead of pasting the questions a second
# time. The pasted copy joined the questions as adjacent string literals with
# no separator, so the model received them as one run-on line. Only the first
# four questions are sent, the same four that were hard-coded before.
question_block = "\n".join(f"- {question}" for question in x[:4])

response = client.chat(
    model=MODEL_NAME,
    messages=[
        {"role": "user",
         "content": (
             "From the following list of questions, extract only the information-seeking "
             "questions that contain no reference to figures, lines, etc.:\n"
             + question_block
         )
         },
    ],
    options={
        "temperature": 0.5,
        "num_predict": 1024,
    }
)
print(response.message.content)

Here are the information-seeking questions from the list, excluding those referencing figures/lines/specific locations:

*   **What is the reference source for the wave power formula used in the study?**
*   **How does the unit of wave power translate to W/m (Watts per meter) from the provided formula, which is the product of squared significant height and wave period?**



The other two questions specifically ask about elements *within* figures (Figure 2, Figures 2b,c,d,e) and are therefore excluded based on your criteria.


In [None]:
# To force the model to generate a structured response, i.e. a JSON object,
# you can define a schema for the response and pass it. The schema can be defined using 
# Pydantic models and the built-in python types (more complex types are also supported, 
# check the pydantic documentation for more details).

from pydantic import BaseModel
# Schema for the structured reply: a city name and its population as an integer.
class PopulationResponse(BaseModel):
    city: str
    population: int

# Same conversation as in the simple example, but `format=` carries the JSON
# schema of PopulationResponse so that the reply is a JSON object.
response = client.chat(
    model=MODEL_NAME,
    messages=[
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."},
        {"role": "user", "content": "And who many people live there?"},
    ],
    format=PopulationResponse.model_json_schema()
)
# Parse and validate the JSON reply into a PopulationResponse instance.
structured_response = PopulationResponse.model_validate_json(response.message.content)
print(structured_response)

city='Paris' population=2148000


In [None]:
# Prompt template for question extraction. The adjacent string literals are
# concatenated into a single string; the literal "{REVIEW}" at the end is a
# placeholder that the caller substitutes with the review text via
# str.replace (this is not an f-string).
prompt = (
    "Identify and extract all **scientific, information-seeking questions** posed by the reviewer.\n\n"
    "Each question must:\n"
    "- Seek clarification or justification about methods, data, assumptions, results, or interpretations\n"
    "- Be answerable by the authors with scientific explanation\n"
    "Do NOT include:\n"
    "- Editorial question about grammar, style, or formatting\n"
    "- Rhetorical comment\n"
    "- Question regarding figures\n"
    
    "Rephrase the question to be self-contained and precise. Remove reference to any line or table number. If context loses after removal, discard the question.\n\n"

    # Two worked examples: one comment that yields a question, one that yields none.
    "### Examples:\n"
    "Input: 'In Table 2, the authors claim the signal-to-noise ratio improved. Was it over multiple trials or one run?'\n"
    "Output: [\"How was the signal-to-noise ratio measured in the modified setup—over multiple trials or a single run?\"]\n\n"

    "Input: 'The manuscript overinterprets the data.'\n"
    "Output: []\n\n"

    "Now extract the questions from this review:\n\n{REVIEW}"
)

In [None]:
import openai
from openai import OpenAI
from pydantic import BaseModel

import os

# Read the API key from the environment instead of writing it into the
# notebook; a missing variable fails here with a clear KeyError.
openai.api_key = os.environ["OPENAI_API_KEY"]

review_text = "I am now satisfied with the changes that have been made by the authors, and these revisions have made an improvement from the first version of the article. \nThere are a few other minor concerns that the author may want to be consider; 1. All the data presented in this manuscript are generated by T cell lines or T cell \u201cclone\u201d. D10 cells are T lymphoblasts that continuously proliferate without stimulation, thereby exhibiting cancer cell properties. It is important to demonstrate this phenomenon occuring in vivo using primary T cells. \n2. It isn\u2019t very unclear how TCR internalization was measured by using anti-TCR antibody in the beginning and the end of incubation; this could be explained better. \n3. In the abstract, it would read better if the word \u201cinterestingly\u201d was deleted from the start of the second paragraph. "


# Deliberately NOT named `client`: the Ollama helper functions defined earlier
# call `client.chat(...)` through the module-level name, so rebinding it to an
# OpenAI client would break them for the rest of the session.
openai_client = OpenAI(api_key=openai.api_key)

# Schema for the structured reply: a list of extracted questions.
class ScientificQuestions(BaseModel):
    questions: list[str]

# `prompt` is the template defined in the previous cell; its "{REVIEW}"
# placeholder is replaced with the review text.
response = openai_client.responses.parse(
    model="o3-2025-04-16",
    input=[
        {"role": "system", "content": "Extract scientific questions from given peer reviews and return them as a structured list."},
        {"role": "user", "content": prompt.replace("{REVIEW}", review_text)}
    ],
    text_format=ScientificQuestions
)

# Access the parsed output
questions = response.output_parsed
print(questions)

NotFoundError: Error code: 404 - {'error': {'message': 'Your organization must be verified to use the model `o3-2025-04-16`. Please go to: https://platform.openai.com/settings/organization/general and click on Verify Organization. If you just verified, it can take up to 15 minutes for access to propagate.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}