# 🤖 The Turing Apex Challenge – Scientific MCQ Solver (RAG-based)
This notebook presents a pipeline for answering scientific multiple-choice questions using Retrieval-Augmented Generation (RAG) and an open-source LLM. No model weights are trained: the LLM is prompted in context with passages retrieved from Wikipedia, and the labelled training split is used only to measure accuracy.

## Setup Environment

In [None]:
# install required libraries
!pip install -qU keybert wikipedia langchain langchain_community sentence-transformers faiss-cpu tqdm

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m79.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.7/345.7 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m437.7/437.7 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00

In [None]:
# mount Google Drive to access data
# The CSV inputs read below and every artifact saved later in this notebook
# live under /content/drive/MyDrive, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Data

In [None]:
import pandas as pd

# folder on the mounted Drive that holds the challenge CSV files
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/DCL/Task/data'

# read the training and test splits
train_data = pd.read_csv(DATA_DIR + '/train_data.csv')
test_data = pd.read_csv(DATA_DIR + '/test_data.csv')

# preview the first training row
train_data.head(1)

Unnamed: 0,prompt,A,B,C,D,E,answer
0,What is the main sequence in astronomy?,The main sequence is a type of galaxy that con...,The main sequence is a type of black hole that...,The main sequence is a continuous and distinct...,The main sequence is a group of planets that o...,The main sequence is a type of nebula that is ...,C


## Build Optimized Queries for Retrieval

In [None]:
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer

# function to extract optimized keywords from a question using KeyBERT and TF-IDF
def build_optimized_query(question, kw_model=None):
    """Build a list of search phrases for one question.

    The phrases are the top 5 KeyBERT keyphrases (1- to 3-grams) followed by
    the single highest-scoring TF-IDF n-gram of the question, with duplicates
    removed.

    Args:
        question: The question text.
        kw_model: Optional KeyBERT instance to use. When omitted, one shared
            instance is created on the first call and reused afterwards.

    Returns:
        A list of unique keyword strings in a stable order: KeyBERT phrases
        first (in the order KeyBERT returned them), then the TF-IDF phrase.
    """
    # Constructing KeyBERT loads an embedding model, so doing it once per
    # question is very slow; cache a single instance on the function instead.
    if kw_model is None:
        kw_model = getattr(build_optimized_query, "_kw_model", None)
        if kw_model is None:
            kw_model = KeyBERT()
            build_optimized_query._kw_model = kw_model

    # extract top 5 keywords using KeyBERT
    keywords = kw_model.extract_keywords(question, keyphrase_ngram_range=(1, 3), top_n=5)
    keybert_kws = [k[0] for k in keywords]

    # fit TF-IDF and get top keyword from TF-IDF score
    tfidf = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')
    try:
        tfidf_matrix = tfidf.fit_transform([question])
    except ValueError:
        # scikit-learn raises ValueError on an empty vocabulary, e.g. when the
        # question consists only of stop words; fall back to KeyBERT alone.
        tfidf_kws = []
    else:
        feature_names = tfidf.get_feature_names_out()
        # cast to plain str so the result does not carry numpy string objects
        tfidf_kws = [str(feature_names[i]) for i in tfidf_matrix.toarray().argsort()[0][-1:]]

    # combine keywords and remove duplicates; dict.fromkeys keeps first-seen
    # order, unlike set(), so the query list is reproducible between runs
    all_keywords = list(dict.fromkeys(keybert_kws + tfidf_kws))

    return all_keywords

In [None]:
# keyword queries for every training question (one list of phrases per question)
train_data_question_queries = [
    build_optimized_query(train_data['prompt'][row])
    for row in range(len(train_data['prompt']))
]

# keyword queries for every test question (one list of phrases per question)
test_data_question_queries = [
    build_optimized_query(test_data['prompt'][row])
    for row in range(len(test_data['prompt']))
]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
train_data_question_queries[0]

['sequence astronomy',
 'main sequence',
 'main sequence astronomy',
 'sequence',
 'astronomy']

## Retrieve Wikipedia Context Using LangChain

In [None]:
from langchain.utilities import WikipediaAPIWrapper

# Wikipedia lookup through LangChain's WikipediaAPIWrapper
def langchain_wiki_search(query: str, lang: str = "en", top_k: int = 1) -> str:
    """Search Wikipedia for `query` and return the wrapper's text output.

    Args:
        query: Search phrase.
        lang: Wikipedia language code passed to the wrapper.
        top_k: Number of results requested from the wrapper.

    Returns:
        The string produced by the wrapper (content capped at 1000 characters
        via `doc_content_chars_max`), or "Error: <message>" if the lookup
        raised an exception.
    """
    client = WikipediaAPIWrapper(
        lang=lang,
        top_k_results=top_k,
        doc_content_chars_max=1000,
    )
    try:
        found = client.run(query)
    except Exception as e:
        # best-effort: report the failure as text instead of raising
        return f"Error: {str(e)}"
    return found

In [None]:
# Text that marks a lookup with no usable article.
# NOTE(review): this is the message the LangChain Wikipedia wrapper is expected
# to return when nothing matches -- confirm against the installed version.
NO_WIKI_RESULT = "No good Wikipedia Search Result was found"


def collect_wikipedia_results(query_lists, seen):
    """Run a Wikipedia search for every query and keep only new, usable results.

    Args:
        query_lists: One list of query strings per question.
        seen: Set of result texts already collected; updated in place so that
            the same text is stored only once across all questions.

    Returns:
        A list with one entry per question (same order as `query_lists`); each
        entry is the list of result texts kept for that question and may be
        empty.
    """
    collected = []
    for queries in query_lists:
        sub_results = []
        for query in queries:
            result = langchain_wiki_search(query)
            # Skip failures: "Error: ..." strings come from langchain_wiki_search's
            # exception handler. Storing them (or the no-result message) would
            # put junk text into the retrieval corpus.
            if not result or result.startswith("Error: ") or result.startswith(NO_WIKI_RESULT):
                continue
            if result not in seen:
                sub_results.append(result)
                seen.add(result)
        collected.append(sub_results)
    return collected


# result texts already stored (shared by train and test so the corpus has no duplicates)
seen_titles = set()

# store Wikipedia results for training questions
train_data_wikipedia_results = collect_wikipedia_results(train_data_question_queries, seen_titles)

# store Wikipedia results for test questions
test_data_wikipedia_results = collect_wikipedia_results(test_data_question_queries, seen_titles)



  lis = BeautifulSoup(html).find_all('li')


In [None]:
train_data_wikipedia_results[0]

["Page: Main sequence\nSummary: In astronomy, the main sequence is a classification of stars which appear on plots of stellar color versus brightness as a continuous and distinctive band. Stars on this band are known as main-sequence stars or dwarf stars, and positions of stars on and off the band are believed to indicate their physical properties, as well as their progress through several types of star life-cycles. These are the most numerous true stars in the universe and include the Sun. Color-magnitude plots are known as Hertzsprung–Russell diagrams after Ejnar Hertzsprung and Henry Norris Russell. \nAfter condensation and ignition of a star, it generates thermal energy in its dense core region through nuclear fusion of hydrogen into helium. During this stage of the star's lifetime, it is located on the main sequence at a position determined primarily by its mass but also based on its chemical composition and age. The cores of main-sequence stars are in hydrostatic equilibrium, whe

In [None]:
# combine train and test Wikipedia results into one list
# (training entries come first, so a test question's position in wiki_results
# is offset by the number of training questions)
wiki_results = train_data_wikipedia_results + test_data_wikipedia_results

In [None]:
import gc
import torch

# delete variables to free up memory
# globals().pop(name, None) is used instead of `del` so that re-running this
# cell (when the names are already gone) does not raise NameError.
for _name in (
    "train_data_question_queries",
    "test_data_question_queries",
    "train_data_wikipedia_results",
    "test_data_wikipedia_results",
    "seen_titles",
):
    globals().pop(_name, None)
del _name

gc.collect()

# release cached GPU memory held by torch, if a GPU is present
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
from langchain.schema import Document

wiki_docs = []

# wrap every Wikipedia search result in a LangChain Document; the metadata
# records the position of the originating entry in wiki_results
for query_idx, entry in enumerate(wiki_results):
    if isinstance(entry, str):
        # a bare string becomes a single Document
        wiki_docs.append(
            Document(page_content=entry, metadata={"query_index": query_idx})
        )
    elif isinstance(entry, list):
        # a list of strings becomes one Document per element, which also
        # records the element's position within the list
        wiki_docs.extend(
            Document(
                page_content=text,
                metadata={"query_index": query_idx, "result_index": pos},
            )
            for pos, text in enumerate(entry)
        )

In [None]:
wiki_docs[0]

Document(metadata={'query_index': 0, 'result_index': 0}, page_content="Page: Main sequence\nSummary: In astronomy, the main sequence is a classification of stars which appear on plots of stellar color versus brightness as a continuous and distinctive band. Stars on this band are known as main-sequence stars or dwarf stars, and positions of stars on and off the band are believed to indicate their physical properties, as well as their progress through several types of star life-cycles. These are the most numerous true stars in the universe and include the Sun. Color-magnitude plots are known as Hertzsprung–Russell diagrams after Ejnar Hertzsprung and Henry Norris Russell. \nAfter condensation and ignition of a star, it generates thermal energy in its dense core region through nuclear fusion of hydrogen into helium. During this stage of the star's lifetime, it is located on the main sequence at a position determined primarily by its mass but also based on its chemical composition and age.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# break the Wikipedia documents into smaller, overlapping chunks;
# separators are tried in the listed order (paragraph, line, sentence end, space)
CHUNK_SEPARATORS = ["\n\n", "\n", ".", "!", "?", " "]

splitter = RecursiveCharacterTextSplitter(
    separators=CHUNK_SEPARATORS,
    chunk_overlap=100,
    chunk_size=500,
)

splitted_wiki_docs = splitter.split_documents(wiki_docs)

In [None]:
splitted_wiki_docs[0]

Document(metadata={'query_index': 0, 'result_index': 0}, page_content='Page: Main sequence')

In [None]:
from sentence_transformers import SentenceTransformer

# generate embeddings for the Wikipedia document chunks
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all chunks in one batched call instead of one forward pass per chunk;
# the result is a 2-D array with one row per chunk, in the same order as
# splitted_wiki_docs.
wiki_embeddings = embedding_model.encode(
    [doc.page_content for doc in splitted_wiki_docs],
    batch_size=64,
    show_progress_bar=True,
)

In [None]:
len(wiki_embeddings)

1744

## Build FAISS Index of Wikipedia Embeddings

In [None]:
import numpy as np
import faiss

# build a FAISS index for efficient vector search using Wikipedia embeddings.
# FAISS expects a contiguous float32 matrix, so make both properties explicit
# rather than relying on whatever dtype the embeddings happen to have.
wiki_embeddings = np.ascontiguousarray(wiki_embeddings, dtype=np.float32)
if wiki_embeddings.ndim != 2 or wiki_embeddings.shape[0] == 0:
    raise ValueError("wiki_embeddings must be a non-empty 2-D array (one row per chunk)")

dimension = wiki_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # exact (brute-force) L2 search
index.add(wiki_embeddings)

# Retrieval maps index positions back to splitted_wiki_docs, so the two must
# stay the same length and order.
assert index.ntotal == len(splitted_wiki_docs), "index and chunk list are out of sync"

## Save Documents

In [None]:
faiss.write_index(index, "/content/drive/MyDrive/Colab Notebooks/DCL/Task/data/wiki_embeddings_index.faiss")

In [None]:
import json

# --- JSON export of the (unsplit) Wikipedia documents ---
wiki_docs_serializable = [
    {"page_content": doc.page_content, "metadata": doc.metadata}
    for doc in wiki_docs
]

with open("/content/drive/MyDrive/Colab Notebooks/DCL/Task/data/wiki_docs.json", "w") as f:
    json.dump({"docs": wiki_docs_serializable}, f, indent=4)


# --- CSV export of the same documents (one row per document) ---
docs_data = [
    {"page_content": doc.page_content, "metadata": doc.metadata}
    for doc in wiki_docs
]

df_wiki_docs = pd.DataFrame(docs_data)

df_wiki_docs.to_csv("/content/drive/MyDrive/Colab Notebooks/DCL/Task/data/wiki_docs.csv", index=False)

In [None]:
# --- JSON export of the chunked Wikipedia documents ---
splitted_wiki_docs_serializable = [
    {"page_content": doc.page_content, "metadata": doc.metadata}
    for doc in splitted_wiki_docs
]

with open("/content/drive/MyDrive/Colab Notebooks/DCL/Task/data/splitted_wiki_docs.json", "w") as f:
    json.dump({"docs": splitted_wiki_docs_serializable}, f, indent=4)

# --- CSV export of the same chunks (one row per chunk) ---
splitted_docs_data = [
    {"page_content": doc.page_content, "metadata": doc.metadata}
    for doc in splitted_wiki_docs
]

df_splitted_wiki_docs = pd.DataFrame(splitted_docs_data)
df_splitted_wiki_docs.to_csv("/content/drive/MyDrive/Colab Notebooks/DCL/Task/data/splitted_wiki_docs.csv", index=False)


## Build Question Answering System with RAG

In [None]:
from transformers import pipeline

# text-generation pipeline used to answer the questions (PHI-2 model);
# device_map="auto" leaves device placement to the library
LLM_MODEL_ID = "microsoft/phi-2"

llm = pipeline(
    task="text-generation",
    model=LLM_MODEL_ID,
    device_map="auto",
)

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# del llm

# gc.collect()
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()

In [None]:
# answer the question using only the context provided above. Reply with a single letter (A, B, C, D, or E). No explanation. If unsure, reply with 'no answer'.

In [None]:
def answer_question_with_rag(question, choices, k=8):
    """Answer one multiple-choice question using retrieved Wikipedia context.

    The question is embedded, the `k` nearest chunks are fetched from the
    FAISS index, and the language model is prompted with that context, the
    question and the answer options.

    Args:
        question: The question text.
        choices: Mapping of option letter to option text; only `.items()` is
            called on it, so a dict or a pandas Series both work.
        k: Number of chunks to retrieve as context (default 8).

    Returns:
        The predicted option letter ("A"-"E"), or "no answer" if the model
        output contains no standalone option letter.
    """
    import re  # local import so this cell does not depend on another cell's imports

    # embed the question
    question_embedding = embedding_model.encode([question])

    # search for relevant context in the FAISS index
    _distances, neighbor_ids = index.search(np.array(question_embedding), k=k)
    # FAISS fills missing neighbours with -1 when the index holds fewer than k
    # vectors; without the filter, -1 would silently pick the last chunk.
    context = "\n".join(
        splitted_wiki_docs[i].page_content for i in neighbor_ids[0] if i >= 0
    )

    # prepare a prompt by formatting the question and choices
    options = "\n".join([f"{key}. {val}" for key, val in choices.items()])
    prompt = f""" You are a multiple-choice question solver. Select the correct answer based ONLY on the provided context and respond with only the corresponding letter (A, B, C, D, or E). Do not provide any explanation or justification. If you cannot determine the correct answer, return 'no answer'.
    {context}\n\nQuestion: {question}\nChoices:\n{options}\nAnswer:"""

    # generate an answer using the language model; passing pad_token_id
    # explicitly (the value the library falls back to anyway) avoids one
    # "Setting `pad_token_id`..." log line per question
    result = llm(
        prompt,
        max_new_tokens=5,
        do_sample=False,
        return_full_text=False,
        pad_token_id=llm.tokenizer.eos_token_id,
    )
    output = result[0]['generated_text'].strip()

    # Extract the first option letter that stands on its own. Scanning for the
    # first A-E character would misread outputs such as "Answer: C" (-> "A")
    # or "Based on ..." (-> "B").
    match = re.search(r"\b([A-E])\b", output)
    if match:
        return match.group(1)
    return "no answer"

In [None]:
# print the first training question, its five options and the correct letter
for column in ['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']:
    print(train_data[column][0])

What is the main sequence in astronomy?
The main sequence is a type of galaxy that contains a large number of stars.
The main sequence is a type of black hole that is formed from the collapse of a massive star.
The main sequence is a continuous and distinctive band of stars that appears on plots of stellar color versus brightness. Stars on this band are known as main-sequence stars or dwarf stars.
The main sequence is a group of planets that orbit around a star in a solar system.
The main sequence is a type of nebula that is formed from the explosion of a supernova.
C


In [None]:
# testing the model on first sample of train data
# The options are passed as the row slice itself rather than a dict; this works
# because answer_question_with_rag only calls .items() on `choices`.
temp = answer_question_with_rag(train_data['prompt'][0], train_data.loc[0, ['A', 'B', 'C', 'D', 'E']])
temp

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'C'

## Evaluate Model Accuracy on Train Data

In [None]:
# columns that hold the five answer options
ANSWER_COLUMNS = ['A', 'B', 'C', 'D', 'E']

# predicted letter (or "no answer") for every training question, in row order
train_answers = [
    answer_question_with_rag(
        train_data.loc[row, 'prompt'],
        train_data.loc[row, ANSWER_COLUMNS].to_dict(),
    )
    for row in range(len(train_data))
]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [None]:
# convert predicted answers to a DataFrame
train_answers = pd.DataFrame(train_answers, columns=['predicted_answer'])

In [None]:
train_answers

Unnamed: 0,predicted_answer
0,C
1,B
2,A
3,A
4,B
5,E
6,D
7,B
8,D
9,A


In [None]:
train_answers.to_csv('/content/drive/MyDrive/Colab Notebooks/DCL/Task/data/train_predictions.csv', index=False)

In [None]:
# accuracy: share of training questions whose predicted letter equals the label
# (a "no answer" prediction never matches a label, so it counts as wrong)
is_match = train_answers['predicted_answer'] == train_data['answer']
correct = is_match.sum()
total = len(train_data)
accuracy = correct / total

print(f"Accuracy on train data: {accuracy * 100:.2f}%")

Accuracy on train data: 74.00%


## Generate Predictions for Test Data

In [None]:
# columns that hold the five answer options
ANSWER_COLUMNS = ['A', 'B', 'C', 'D', 'E']

# predicted letter (or "no answer") for every test question, in row order
test_answers = [
    answer_question_with_rag(
        test_data.loc[row, 'prompt'],
        test_data.loc[row, ANSWER_COLUMNS].to_dict(),
    )
    for row in range(len(test_data))
]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [None]:
test_answers = pd.DataFrame(test_answers, columns=['predicted_answer'])

In [None]:
test_answers

Unnamed: 0,predicted_answer
0,B
1,C
2,B
3,B
4,C
...,...
145,A
146,B
147,C
148,C


In [None]:
test_answers.to_csv('/content/drive/MyDrive/Colab Notebooks/DCL/Task/data/test_predictions.csv', index=False)

## Explore

### Datasets from Kaggle Hub

In [None]:
# import kagglehub

# jjinho_wikipedia_20230701_path = kagglehub.dataset_download('jjinho/wikipedia-20230701')
# jjinho_wikipedia_2023_07_faiss_index_path = kagglehub.dataset_download('jjinho/wikipedia-2023-07-faiss-index')

### paraphrase-MiniLM-L6-v2

In [None]:
# from sentence_transformers import SentenceTransformer

# generate embeddings for the Wikipedia document chunks
# embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# wiki_embeddings = [embedding_model.encode(doc.page_content) for doc in splitted_wiki_docs]

### google/flan-t5-base
### Accuracy: 42%

In [None]:
# from transformers import pipeline

# llm = pipeline("text-generation",
#                model="google/flan-t5-base",
#                device_map="auto")

# def answer_question_with_rag(question, choices):
#     question_embedding = embedding_model.encode([question])

#     D, I = index.search(np.array(question_embedding), k=7)
#     context = "\n".join([splitted_wiki_docs[i].page_content for i in I[0]])
#     # print(context)

#     options = "\n".join([f"{opt}: {text}" for opt, text in choices.items()])
#     prompt = f"""
# You are a multiple-choice question solver.
# Select the correct answer based ONLY on the provided context and respond with only the corresponding letter (A, B, C, D, or E).
# Do not provide any explanation or justification.
# If you cannot determine the correct answer, return 'no answer'.

# Context:
# {context}

# Question:
# {question}

# Options:
# {options}

# Answer:"""

#     # Step 4: Generate answer
#     result = llm(prompt, max_new_tokens=1)
#     output = result[0]['generated_text'].strip()
#     return output[0] if output and output[0] in "ABCDE" else "no answer"

### TheBloke/Mistral-7B-Instruct-v0.1-GGUF
### Accuracy: 62%

In [None]:
# from llama_cpp import Llama

# llm = Llama(model_path="mistral.gguf",
#             n_ctx=2048,  # context length
#             n_threads=4, # depends on CPU cores
#             n_gpu_layers=50)


# def ask_mistral(question, choices):

#     question_embedding = embedding_model.encode([question])

#     D, I = index.search(np.array(question_embedding), k=7)
#     context = "\n".join([splitted_wiki_docs[i].page_content for i in I[0]])

#     options = "\n".join([f"{key}. {val}" for key, val in choices.items()])
#     # prompt = f"{context}\n\nQuestion: {question}\nChoices:\n{options}\nAnswer:"
#     prompt = f"""


# Context:
# {context}

# Question:
# {question}

# Choices:
# {options}

# Answer:"""

#     response = llm(prompt, max_tokens=5, stop=["\n"])
#     text = response['choices'][0]['text'].strip()

#     for c in text:
#         if c in "ABCDE":
#             return c
#     return "no answer"
