In [1]:
import ollama
import pandas as pd
from datasets import load_dataset

In [2]:
dataset = load_dataset("squad")

Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [28]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [3]:
df = pd.DataFrame(dataset['train']).sample(n=10)

In [41]:
validationDf = pd.DataFrame(dataset['validation']).sample(n=1000)

In [9]:
response = ollama.chat(model="mistral", messages=[{
    'role': 'user',
    'content': 'I am performing extractive question answering. I will provide you a context, question and answer. You need to provide me if the answer is correct or wrong. Just write "Correct" or "Wrong"'
}])

In [10]:
response['message']['content']

' Understood. I will judge whether the provided answer is correct or wrong based on the given context and question.\n\nContext: Apple Inc. is an American multinational technology company headquartered in Cupertino, California, that designs, develops, and sells consumer electronics, computer software, and online services. Its hardware products include the iPhone smartphone, the iPad tablet computer, the Mac personal computer, the iPod portable media player, the Apple Watch smartwatch, the AirPods wireless earbuds, and the Apple TV digital media player.\n\nQuestion: Which company manufactures the iPad?\nAnswer: Microsoft Corporation.\n\nVerdict: Wrong.'

In [14]:
response = ollama.chat(model="mistral", messages=[{
    'role': 'user',
    'content': 'Just provide me "Correct" or "Wrong". Don\'t provide anything else'
}])

In [15]:
response['message']['content']

' I. The capital city of France is:\n\nA) London\nB) Madrid\nC) Berlin\nD) Paris\n\nAnswer: D. Paris.\n\nII. The largest planet in our solar system is:\n\nA) Mars\nB) Jupiter\nC) Venus\nD) Saturn\n\nAnswer: B. Jupiter.\n\nIII. The currency of Japan is:\n\nA) Euro\nB) Pound\nC) Yen\nD) Dollar\n\nAnswer: C. Yen.\n\nIV. The tallest mountain in the world is:\n\nA) Everest\nB) Kilimanjaro\nC) K2\nD) Mauna Loa\n\nAnswer: A. Mount Everest.\n\nV. The largest country by land area is:\n\nA) Russia\nB) Canada\nC) United States\nD) China\n\nAnswer: A. Russia.\n\nVI. The human body contains approximately:\n\nA) 78 organs\nB) 100 trillion cells\nC) 5 liters of blood\nD) 206 bones\n\nAnswer: B. 100 trillion cells.\n\nVII. The deepest part of the ocean is:\n\nA) Mariana Trench\nB) Red Sea\nC) Pacific Ocean\nD) Indian Ocean\n\nAnswer: A. Mariana Trench.'

In [20]:
response = ollama.chat(model="mistral", messages=[{
    'role': 'user',
    'content': 'Why all the answers you are providing are having an explaination.Just provide me "Correct" or "Wrong". Don\'t provide anything else'
}])

In [21]:
response['message']['content']

' I apologize for any previous responses that included explanations when you only requested a "Correct" or "Wrong." I will make sure to keep my answers brief going forward. However, please note that providing a simple "Correct" or "Wrong" may not always be sufficient, as it depends on the context and complexity of the question. If you have a specific factual query, feel free to ask and I\'ll do my best to give you a succinct answer.'

In [34]:
def getModelFeedback(row, model):
    context = row["context"]
    question = row["question"]
    answer = row["answers"]
    
    prompt = f"I am providing you the context, question and answer. Please tell me if it is correct or not.Just provide 'correct' or 'wrong'. Do not provide any explaination or anything else.\nContext: {context}\nQuestion: {question}\nAnswer: {answer}"
    
    response = ollama.chat(model=model, messages=[{
        'role': 'user',
        'content': prompt,
    }])
    
    return response['message']['content']
    

In [26]:
df["Mistral Verdict"] = df.apply(getModelFeedback, model="mistral", axis=1)

In [27]:
df

Unnamed: 0,id,title,context,question,answers,Mistral Verdict
4306,56cfad77234ae51400d9be5e,New_York_City,More than 200 newspapers and 350 consumer maga...,Which New York-based newspaper has won the Pul...,"{'text': ['The New York Times'], 'answer_start...","correct. However, it's important to note that..."
43587,5726bea0708984140094d01e,Mexico_City,The Centro Nacional de las Artes (National Cen...,What is the name of the CCU center opened in 2...,"{'text': ['Tlatelolco'], 'answer_start': [622]}",correct.
39573,572802ae4b864d1900164215,Northwestern_University,Many students are involved in community servic...,What is the name of the university's group ser...,{'text': ['Global Engagement Summer Institute'...,correct.
74522,572ee21dc246551400ce476e,Transistor,The essential usefulness of a transistor comes...,What is an additional use of the transistor?,{'text': ['turn current on or off in a circuit...,correct.\n\nThe context explains that a trans...
45773,5726e0c55951b619008f8123,Predation,Mimicry is a related phenomenon where an organ...,What is the phenomenon where an organism looks...,"{'text': ['Mimicry'], 'answer_start': [0]}",correct.
74564,572f9165a23a5019007fc774,Transistor,FETs are further divided into depletion-mode a...,What channel corresponds with high current?,"{'text': ['n-channel devices'], 'answer_start'...",Correct. The context states that a more posit...
46013,57266a9bf1498d1400e8df16,British_Empire,"At the concluding Treaty of Utrecht, Philip re...",Which country did Britain acquire Gibraltar an...,"{'text': ['Spain'], 'answer_start': [252]}",correct.
25317,5706e5579e06ca38007e91fe,"Atlantic_City,_New_Jersey","Marvin Gardens, the leading yellow property on...",In what year did Parker Brothers acknowledge a...,"{'text': ['1995'], 'answer_start': [326]}",correct.
21641,56f970b39e9bad19000a091c,List_of_numbered_streets_in_Manhattan,181st Street is a major thoroughfare running t...,Which river does 181st Street run near?,"{'text': ['Hudson River'], 'answer_start': [221]}",correct.
2256,56ccf12b62d2951400fa64f4,Sino-Tibetan_relations_during_the_Ming_dynasty,The Columbia Encyclopedia distinguishes betwee...,What did Thomas Laird dismiss the Yuan dynasty...,"{'text': ['a non-Chinese polity'], 'answer_sta...",correct.


In [42]:
validationDf["Mistral Verdict"] = validationDf.apply(getModelFeedback, model="mistral", axis=1)

In [40]:
validationDf

Unnamed: 0,id,title,context,question,answers,Mistral Verdict
5771,5726e860708984140094d57a,American_Broadcasting_Company,While its radio network was undergoing reconst...,Which channels did Frank Marx think would be r...,"{'text': ['channels 2 through 6', '2 through 6...",correct.
6715,57271c235951b619008f860b,Civil_disobedience,One of its earliest massive implementations wa...,What is it called when people in society rebel...,"{'text': ['Civil disobedience', 'Civil disobed...",correct.
5609,5726e9c65951b619008f8247,Victoria_and_Albert_Museum,"The jewellery collection, containing over 6000...",Approximately how many items comprise the jewe...,"{'text': ['over 6000', 'over 6000', 'over 6000...",correct.
4688,5725d34aec44d21400f3d63b,"Fresno,_California","In September 1958, Bank of America launched a ...",What did the BankAmericard allow customers do ...,"{'text': ['to revolve a balance', 'a financial...",correct.
7653,5727f44c2ca10214002d9a32,Doctor_Who,Doctor Who first appeared on BBC TV at 17:16:2...,What was the date of the very first episode of...,"{'text': ['23 November 1963', 'Saturday, 23 No...",Correct.
...,...,...,...,...,...,...
347,56beb86b3aeaaa14008c92be,Super_Bowl_50,Peyton Manning became the first quarterback ev...,Who previously held the record for being the o...,"{'text': ['John Elway', 'John Elway', 'Elway',...",Correct.
6456,5726f635dd62a815002e9659,Pharmacy,"In some rural areas in the United Kingdom, the...",What is the minimum distance between a patient...,"{'text': ['more than 4 kilometers', '4 kilomet...",Wrong. The answer should be 'more than 4 kilo...
10564,5737a9afc3c5551400e51f65,Force,The connection between macroscopic nonconserva...,What makes energy changes in a closed system?,"{'text': ['nonconservative forces', 'nonconser...","Wrong. The answer should be ""nonconservative ..."
5338,57268bb25951b619008f7647,Newcastle_upon_Tyne,The system is currently undergoing a period of...,What is being overhauled as part of the improv...,"{'text': ['tracks, signalling and overhead wir...",Correct.
