# Ollama Introduction
This little notebook is a brief introduction to Ollama, a tool for interacting with open LLMs deployed on Ollama server (the server can also run locally).

In [1]:
from ollama import Client

# initialize the Ollama client with the specified host
ollama_host = "http://10.167.31.201:11434/"
# ollama_host = "http://localhost:11434/"
client = Client(host=ollama_host)

In [2]:
# Get a list of all models that are currently downloaded
models = client.list()
print(models.models)

MODEL_NAME = "llama3.3:latest"


[Model(model='test:latest', modified_at=datetime.datetime(2025, 5, 6, 10, 7, 2, 955865, tzinfo=TzInfo(UTC)), digest='cffa12cd509c35382b142562f4c786471a2f8f72044c490b85d47304f8b545e1', size=47415724883, details=ModelDetails(parent_model='', format='gguf', family='qwen2', families=['qwen2'], parameter_size='72.7B', quantization_level='Q4_K_M')), Model(model='qwen2.5:3b', modified_at=datetime.datetime(2025, 4, 28, 15, 39, 44, 731776, tzinfo=TzInfo(UTC)), digest='357c53fb659c5076de1d65ccb0b397446227b71a42be9d1603d46168015c9e4b', size=1929912432, details=ModelDetails(parent_model='', format='gguf', family='qwen2', families=['qwen2'], parameter_size='3.1B', quantization_level='Q4_K_M')), Model(model='qwen2.5:7b', modified_at=datetime.datetime(2025, 4, 28, 15, 39, 21, 364334, tzinfo=TzInfo(UTC)), digest='845dbda0ea48ed749caafd9e6037047aa19acfcfd82e704d7ca97d631a0b697e', size=4683087332, details=ModelDetails(parent_model='', format='gguf', family='qwen2', families=['qwen2'], parameter_size='7.

In [4]:
import json
import os
from pydantic import BaseModel
from typing import List

class QuestionResponse(BaseModel):
    questions: List[str]

def get_questions_from_review(review: str) -> List[str]:
    """
    Extracts scientific questions from a peer review.
    """
    response = client.chat(
        model=MODEL_NAME,
        messages=[
            {"role": "user",
            "content": (
                "You are given a peer review of a scientific article.\n\n"
                "Your task is to extract all **scientific, information-seeking questions** that the reviewer asked.\n\n"

                "These questions should:\n"
                "- Be explicitly posed by the reviewer, not implied or inferred. Not a comment. Must be a question.\n"
                "- Seek clarification or justification about methods, data, results etc \n"
                "- Be grounded in the article’s scientific content\n"
                "- Be answerable by the authors with more scientific explanation or evidence\n\n"

                "**DO NOT include** questions that are:\n"
                "- Editorial such as grammar, spelling, formatting, or structure\n"
                "- Rhetorical, evaluative, or based on the reviewer’s opinions \n"
                "- Referring to figures(e.g., “Figure 2, Figs. 3”)\n\n"

                "Rephrase valid questions to be self-contained and precise. Each question should focus on a single scientific subject. Add any missing context that is necessary to make the question understandable on its own. \n\n"

                "### Example 1:\n"
                "Input: \"In Table 2, he authors claim the signal-to-noise ratio improved significantly , but they don’t explain how it was measured. Was this ratio calculated over multiple trials or just a single run?\"\n"
                "Output: How was the signal-to-noise ratio measured in the modified setup—over multiple trials or a single run?\n\n"
                "### Example 2:\n"
                "Input: \"In line 32-35, why is f(\\nu_1) being biquadrate exponential distribution function, etc\"\n"
                "Output: What is the reasoning behind choosing a biquadrate exponential distribution function for f(ν₁)?\n\n"
                "### Example 3:\n"
                "Input: \"It is important to demonstrate this phenomenon occuring in vivo using primary T cells. The manuscript over interprets some of the data. \"\n"
                "Output: [] (No scientific, information-seeking question found.) \n\n"
                "### Example 4:\n"
                "Input: \"L300-309. This seems to be a footnote, or possibly, an endnote. Please be clear about how it links to the other material.\"\n"
                "Output: [] (No scientific, information-seeking question found.) \n\n"
                "### Example 5:\n"
                "Input: \"The authors state that the results are statistically significant, but they do not provide the p-values. What are them?\"\n"
                "Output: What are the p-values for the key results presented in the study?\n\n"
                
                f"Now apply the same process to the following review:\n\n{review}"
            )
            }
        ],
        options={
            "temperature": 0.4,
            "num_predict": 1024,
        },
        format=QuestionResponse.model_json_schema()
    )
    structured_response = QuestionResponse.model_validate_json(response.message.content)
    return structured_response.questions

def filter_questions(questions: List[str]) -> List[str]:
    """
    Filters out any questions that are not scientific or information-seeking.
    """
    response = client.chat(
        model=MODEL_NAME,
        messages=[
            {"role": "user",
            "content": (
                '''
                You are given a list of questions extracted from a peer review of a scientific article. Your task is to 
                - filter out any questions that are not scientific or information-seeking.
                - filter out any questions about figures.
                - remove reference to line numbers, paragraph numbers, table numbers, or figure numbers. Rephrase and keep the question if it is still understandable without the reference.

                Here is the list of questions:\n\n
                ''' + json.dumps(questions, indent=2) + '''
                If no questions remain, return an empty array.
                '''
            )
            }   
        ],
        options={
            "temperature": 0.2, 
            "num_predict": 1024,
        },
        format=QuestionResponse.model_json_schema()
    )
    structured_response = QuestionResponse.model_validate_json(response.message.content)

    # remove any empty questions
    structured_response.questions = [q for q in structured_response.questions if q.strip()]

    # ensure each question is unique
    structured_response.questions = list(set(structured_response.questions))

    return structured_response.questions


In [None]:


base_dir = 'test-set'

for dir in os.listdir(base_dir):
    dir_path = os.path.join(base_dir, dir)
    if os.path.isdir(dir_path):
        print(f"Processing directory: {dir_path}")

        questions_json = os.path.join(dir_path, 'questions.json')
        # if os.path.exists(questions_json):
        #     print(f"Questions file already exists: {questions_json}")
        #     continue

        # Load the peer reviews from a JSON file
        review_file = os.path.join(dir_path, 'all_reviews.json')

        with open(review_file, 'r', encoding='utf-8') as file:
            reviews = json.load(file)
        
        result = []
        # Process each review
        for review in reviews:
            questions = get_questions_from_review(review)
            result.extend(questions)

        # Save the results 
        with open(questions_json, 'w', encoding='utf-8') as file:
            json.dump(result, file, ensure_ascii=False, indent=4)
        print(f"Questions saved to {questions_json}")

        # Filter the questions
        filtered_questions = filter_questions(
            json.load(open(questions_json, 'r', encoding='utf-8'))
        )
        filtered_questions_json = os.path.join(dir_path, 'filtered_questions.json')
        with open(filtered_questions_json, 'w', encoding='utf-8') as file:
            json.dump(filtered_questions, file, ensure_ascii=False, indent=4)
        print(f"Filtered questions saved to {filtered_questions_json}")

Processing directory: test-set\1-10


In [4]:
# def get_questions_from_review(review: str) -> List[str]:
#     """
#     Extracts scientific questions from a peer review.
#     """
#     response = client.chat(
#         model=MODEL_NAME,
#         messages=[
#             {"role": "user", 
#              "content": (
#                     "You are given a text which contain peer review of a scientific article.\n\n"
#                     "Your task is to identify and extract all **scientific, information-seeking questions** posed by the reviewer.\n\n"
#                     "Only include questions related to the **scientific content** of the article — such as methodology, experimental design, data analysis, results, interpretation, assumptions, or scientific relevance.\n\n"
#                     "**Exclude** any questions about grammar, spelling, formatting, or writing style.\n\n"
#                     "If necessary, **rephrase the questions** so they are self-contained and preserve relevant scientific context. If a question is based on a specific sentence or result, include that sentence as context in the question.\n\n"
#                     "Return the extracted questions as JSON.\n\n"
#                     f"Peer review:\n{review}"
#             )}
#         ],
#         options={
#             "temperature": 0.5,
#             "num_predict": 1024,
#         },
#         format=QuestionResponse.model_json_schema()
#     )
#     structured_response = QuestionResponse.model_validate_json(response.message.content)
#     return structured_response.questions


import re
import unicodedata

def clean_text_for_model(text):
    # 1. Remove Unicode replacement characters (�)
    text = text.replace('\uFFFD', '')
    
    # 2. Normalize Unicode (removes weird byte leftovers)
    text = unicodedata.normalize('NFKC', text)
    
    # 3. Remove non-ASCII characters, but keep newlines
    text = re.sub(r'[^\x20-\x7E\n]+', '', text)
    
    # 4. Collapse excessive whitespace
    text = re.sub(r'[ \t]{2,}', ' ', text)  # multiple spaces/tabs → one space
    text = re.sub(r'\n{3,}', '\n\n', text)  # 3+ newlines → 2

    return text.strip()


import json
with open("all_reviews.json", 'r', encoding='utf-8') as file:
    reviews = json.load(file)

clean_text = clean_text_for_model(reviews[0])
print(clean_text)

get_questions_from_review(clean_text)

*This file contains all reviewer reports in order by version, followed by all author rebuttals in order by*

*version.*

# **** ****

**** 

 

The authors sincerely thank the reviewers for their thorough and highly positive reviews. Please
see our point-by-point responses below. **All page and paragraph numbers described in my**
**responses refer to the marked up (changes tracked) version of the manuscript.**

*Reviewer #1 (Remarks to the Author):*

This manuscript describes the discovery of a potent Kv7.3-selective potassium channel openers
from a polynesian traditional botanical anticonvulsant. The primary active component appears to
be gentisic acid which is an extremely potent activator and very selective at least when
compared to other KV7 channels and a small number of other KV channel types, effective at nM
concentrations. As such, the bulk of the manuscript focuses on the actions of gentisic acid in
isolation.

The content of the manuscript is fairly straightforward but it is 

['What is the primary focus of this research report?',
 'What is the significance of the discovery of gentisic acid (GA) as a Kv7.2/3 channel activator?',
 'What is the mechanism of action of GA on Kv7.2/3 channels, and what specific residue is involved?',
 'What are the potential clinical implications of GA as an anticonvulsant agent?',
 'What are the limitations of the current study, and what future research directions are suggested?',
 'What is the rationale behind the counter-screen against Kv1.3?',
 'What is the significance of the comparison between functional and physical selectivity in Kv7 channel opener discovery?',
 'What is the role of alanine scanning mutagenesis and MD simulations in understanding Kv7 channel opener selectivity?',
 'What is the relevance of the comparison between GA and GABA in terms of their binding to Kv7 channels?',
 "What is the final recommendation regarding the journal's scope based on the research findings?"]

## Example

In [None]:
# Send a simple prompt to the model
# model: select a model from the list of models obtained from client.list()
# messages: a list of messages containing the conversation history. Some models also 
# have a system message, to add this, make the first message:
# {"role": "system", "content": "Your system message here"}
# Gemma3 does not have a system message, so we can start with the user message.
# To add responses from the model, you can use the "assistant" role, i.e.:
# {"role": "assistant", "content": "The capital of France is Paris."}

response = client.chat(
    model=MODEL_NAME,
    messages=[
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."},
        {"role": "user", "content": "And who many people live there?"},
    ],
)
print(response.message.content)

As of 2023, the population of Paris is approximately **2.1 million** people.

It’s important to note that this refers to the city proper (the administrative limits). The Greater Paris metropolitan area, which includes surrounding suburbs, has a much larger population – over 11 million! 

Would you like to know more about the population of Paris or its surrounding area?


In [43]:
# To control the decoding parameters, such as temperature, maxium number of tokens, etc.,
# you can pass additional parameters to the chat method.
# For a complete list of options, check the Ollama API documentation at:
# https://github.com/ollama/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values
x = [
    "How does the unit of wave power translate to W/m (Watts per meter) from the provided formula, which is the product of squared significant height and wave period?",
    "What is the reference source for the wave power formula used in the study?",
    "How can wave power be negative, given the observed range of -2000 to 2000 W/m in Figure 2?",
    "Why does the latitude scale on the y-axis in Figures 2b, c, d, and e not maintain a fixed distance, as it does in Figure 2a?",
    "What is the rationale behind the chosen color palette, where higher wave power and sea level are represented in reddish tones and lower values in bluish tones, while this representation is reversed for wave direction and waterline position?",
    "What was the specific reason for removing 40% of transects from non-sandy beaches?",
    "What is the unit of measurement for wave energy in Figure S3?",
    "Why is the wave energy formula in Figure S3 the same as the wave power formula in Figure 2?",
    "What was the basis for dividing the North American West Coast (NAWC) into five subregions?",
    "What is the rationale for using a rectangular boundary in Figure 1?",
    "Were any additional studies conducted to determine if parameters such as beach slope, substrate lithology,or riverine sediment inputs significantly affect waterline positions on a seasonal scale?"]
    
response = client.chat(
    model=MODEL_NAME,
    messages=[
        {"role": "user", 
         "content": "From the following list of questions, extract only information seeking question and question with no reference to figrues, line etc : "
         "How does the unit of wave power translate to W/m (Watts per meter) from the provided formula, which is the product of squared significant height and wave period?"
         "What is the reference source for the wave power formula used in the study?"
         "How can wave power be negative, given the observed range of -2000 to 2000 W/m in Figure 2?"
         "Why does the latitude scale on the y-axis in Figures 2b, c, d, and e not maintain a fixed distance, as it does in Figure 2a?"
         },
    ],
    options={
        "temperature": 0.5,
        "num_predict": 1024,
    }
)
print(response.message.content)

Here are the information-seeking questions from the list, excluding those referencing figures/lines/specific locations:

*   **What is the reference source for the wave power formula used in the study?**
*   **How does the unit of wave power translate to W/m (Watts per meter) from the provided formula, which is the product of squared significant height and wave period?**



The other two questions specifically ask about elements *within* figures (Figure 2, Figures 2b,c,d,e) and are therefore excluded based on your criteria.


In [None]:
# To force the model to generate a structured response, i.e. a JSON object,
# you can define a schema for the response and pass it. The schema can be defined using 
# Pydantic models and the built-in python types (more complex types are also supported, 
# check the pydantic documentation for more details).

from pydantic import BaseModel
class PopulationResponse(BaseModel):
    city: str
    population: int

response = client.chat(
    model=MODEL_NAME,
    messages=[
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."},
        {"role": "user", "content": "And who many people live there?"},
    ],
    format=PopulationResponse.model_json_schema()
)
structured_response = PopulationResponse.model_validate_json(response.message.content)
print(structured_response)

city='Paris' population=2141000
