# Dataset for out-of-scope negative finetuning

## Imports

In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import json
import pandas as pd

In [None]:
# Base directory (relative path) prepended to every CSV/TXT file this notebook reads or writes.
data_path = "../data/domain_bound_data/"

## stanford-question-answering-dataset - Generate Q & A dataset

In [None]:

# File inside the Kaggle dataset to load
file_path = "train-v1.1.json"

# No version is pinned in the handle, so the latest dataset version is loaded
# into a pandas DataFrame.
# NOTE(review): newer kagglehub releases appear to rename this entry point to
# `dataset_load` — confirm against the installed version.
general_qna_df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "stanfordu/stanford-question-answering-dataset",
    file_path,
)


In [None]:
# Preview the first rows of the freshly loaded frame.
general_qna_df.head()

In [None]:
# Print every article title in sorted order so the medical ones can be picked out by eye
article_titles = general_qna_df["data"].apply(lambda entry: entry["title"])
for article_title in article_titles.sort_values():
    print(article_title)

In [None]:
# Article titles treated as medical/healthcare content (kept alphabetical for
# easy scanning; a set, so membership tests are O(1) and order is irrelevant).
medical_healthcare_topics = {
    "Antibiotics",
    "Asthma",
    "Bacteria",
    "Brain",
    "Circadian_rhythm",
    "Diarrhea",
    "Digestion",
    "Gene",
    "Genome",
    "Human_Development_Index",
    "Immunology",
    "Infection",
    "Myocardial_infarction",
    "Nutrition",
    "Pain",
    "Pharmaceutical_industry",
    "Tuberculosis",
}

In [None]:
# Boolean mask: True for articles whose title is NOT in the medical/healthcare set
is_non_medical = general_qna_df["data"].apply(
    lambda entry: entry["title"] not in medical_healthcare_topics
)
general_qna_df_filtered = general_qna_df[is_non_medical]

In [None]:
# Spot-check the nesting: first row, first column -> first paragraph -> first QA pair -> question text.
general_qna_df_filtered.iloc[0, 0]["paragraphs"][0]["qas"][0]["question"]

In [None]:
# Module-level accumulator that find_questions() fills in place
questions = []


def find_questions(data):
    """Append every question string found in ``data`` to ``questions``.

    ``data`` must expose a "paragraphs" list; each paragraph holds a "qas"
    list whose items carry a "question" entry. Nothing is returned — the
    function works purely through its side effect on ``questions``.
    """
    questions.extend(
        qa["question"]
        for paragraph in data["paragraphs"]
        for qa in paragraph["qas"]
    )

# Visit every remaining article and let find_questions() add its questions to the shared list
for article in general_qna_df_filtered["data"]:
    find_questions(article)
questions

In [None]:
# Number of questions collected so far.
len(questions)

In [None]:
# One-column frame holding the collected question strings
non_med_df = pd.DataFrame(questions, columns=["question"])

In [None]:
# (rows, columns) of the question frame just built.
non_med_df.shape

In [None]:
# Shuffle all rows with a fixed seed (reproducible), then renumber the index from 0
shuffled_non_med = non_med_df.sample(frac=1, random_state=32)
non_med_df = shuffled_non_med.reset_index(drop=True)
non_med_df.head()

In [None]:
# Save without the positional index: writing it would make a later read_csv()
# of this file produce a spurious "Unnamed: 0" column.
non_med_df.to_csv(data_path + "general_full.csv", index=False)

In [None]:
# Reload the saved questions from disk, replacing the in-memory frame.
non_med_df = pd.read_csv(data_path + "general_full.csv")

In [None]:
# (rows, columns) after the reload — an unexpected extra column here means an index column was saved into the CSV.
non_med_df.shape

## Coding questions

Add some coding questions to the negative (out-of-scope) samples.

In [None]:
# Load the coding questions and preview the first rows.
coding_df = pd.read_csv(data_path + "coding_questions.csv")
coding_df.head()

In [None]:
# Lower-case the column name so it matches the "question" column of non_med_df.
coding_df = coding_df.rename(columns={"Question": "question"})

In [None]:
# Combine the first 2000 general questions with all coding questions.
# Both sides are narrowed to the single "question" column so that a stray
# index column picked up from a CSV round-trip cannot leak into the result,
# and so concat receives two DataFrames rather than a DataFrame and a Series.
non_med_qna_df = pd.concat(
    [non_med_df[["question"]].iloc[:2000], coding_df[["question"]]]
)
# Seeded shuffle so general and coding questions are interleaved reproducibly
non_med_qna_df = non_med_qna_df.sample(frac=1, random_state=32).reset_index(drop=True)
non_med_qna_df.head(30)

In [None]:
# (rows, columns) of the combined general + coding question frame.
non_med_qna_df.shape

In [None]:
# NOTE(review): non_med_responses_df is not referenced again in the visible cells;
# the responses actually used below are parsed from the .txt file — confirm
# whether this CSV load is still needed.
non_med_responses_df = pd.read_csv(data_path + "non_med_responses.csv")

In [None]:
# Responses are stored as blank-line-separated chunks. Each chunk is stripped
# and empty ones are dropped, so a trailing newline or an extra blank line in
# the file cannot produce an empty or whitespace-padded response.
with open(data_path + "non_med_responses.txt", "r") as file:
    raw_chunks = file.read().split("\n\n")
non_med_responses = pd.Series(
    [chunk.strip() for chunk in raw_chunks if chunk.strip()]
)
non_med_responses

In [None]:
# Draw one random response to eyeball the parsed text.
non_med_responses.sample().iloc[0]

In [None]:
# Pair each non-medical question with a randomly chosen out-of-scope response.
# A single seeded draw with replacement keeps the dataset reproducible (the
# shuffles in this notebook use random_state=32 as well) and replaces one
# sample() call per row. to_numpy() drops the sampled index so values are
# assigned by position rather than aligned by label.
non_med_qna_df["answer"] = non_med_responses.sample(
    n=len(non_med_qna_df), replace=True, random_state=32
).to_numpy()

In [None]:
# Label every row as out-of-scope; a scalar is broadcast to the whole column,
# so no per-row lambda is needed.
non_med_qna_df["class"] = "non_med"

In [None]:
# Preview the first rows with the newly added columns.
non_med_qna_df.head()

In [None]:
# Save without the positional index so reading the file back does not add an
# "Unnamed: 0" column.
non_med_qna_df.to_csv(data_path + "v7/non_med_qna.csv", index=False)

In [None]:
# Reload from the v7 folder — the location the non-medical Q&A frame is saved
# to and where the final splits are exported. Reading a different version
# folder here would silently replace the frame built above with older data.
non_med_qna_df = pd.read_csv(data_path + "v7/non_med_qna.csv")

In [None]:
# (rows, columns) of the reloaded non-medical Q&A frame.
non_med_qna_df.shape

In [None]:
# Inspect the first 30 reloaded rows.
non_med_qna_df.head(30)

## Medical domain questions

In [None]:
# Load the medical Q&A pairs, keeping only rows labelled 112300 and later and
# only the two text columns.
# NOTE(review): the 112300 cut-off is not explained here — presumably earlier
# rows are reserved for another purpose; confirm.
med_qna_df = pd.read_csv(data_path + "qna_long_seq_filtered.csv").loc[
    112300:, ["question", "answer"]
]
med_qna_df.head()

In [None]:
# (rows, columns) of the medical slice before it is truncated.
med_qna_df.shape

In [None]:
# Keep at most the first 5000 medical pairs and renumber the index from 0
first_med_rows = med_qna_df.head(5000)
med_qna_df = first_med_rows.reset_index(drop=True)
med_qna_df.head()

In [None]:
# (rows, columns) after truncation.
med_qna_df.shape

In [None]:
# Label every row as in-domain; a scalar is broadcast to the whole column,
# so no per-row lambda is needed.
med_qna_df["class"] = "med"
med_qna_df.head()

In [None]:
# Save without the positional index so reading the file back does not add an
# "Unnamed: 0" column.
med_qna_df.to_csv(data_path + "med_qna.csv", index=False)

In [None]:
# Reload the saved medical Q&A pairs from disk and preview them.
med_qna_df = pd.read_csv(data_path + "med_qna.csv")
med_qna_df.head()

## Combined data set

In [None]:
# Stack the out-of-scope and medical examples. Both frames are restricted to
# the three dataset columns so that index columns picked up from CSV
# round-trips (e.g. "Unnamed: 0") never reach the exported splits.
dataset_columns = ["question", "answer", "class"]
domain_bound_qna_df = pd.concat(
    [non_med_qna_df[dataset_columns], med_qna_df[dataset_columns]]
)

In [None]:
# Inspect the first 30 rows of the combined (not yet shuffled) frame.
domain_bound_qna_df.head(30)

In [None]:
# Seeded shuffle so both classes are mixed reproducibly, then renumber the index from 0
shuffled_domain_bound = domain_bound_qna_df.sample(frac=1, random_state=32)
domain_bound_qna_df = shuffled_domain_bound.reset_index(drop=True)

In [None]:
# (rows, columns) of the shuffled dataset — the split below slices at rows 6500 and 7000.
domain_bound_qna_df.shape

In [None]:
# Positional split of the shuffled frame: the first 6500 rows train, the next
# 500 test, and whatever remains is used for validation.
train_end, test_end = 6500, 7000
train_df = domain_bound_qna_df.iloc[:train_end]
test_df = domain_bound_qna_df.iloc[train_end:test_end]
val_df = domain_bound_qna_df.iloc[test_end:]

In [None]:
# (rows, columns) of each split, in train / test / val order.
train_df.shape, test_df.shape, val_df.shape

In [None]:
# Write each split to its CSV without the index column
split_files = {
    "v7/train.csv": train_df,
    "v7/test.csv": test_df,
    "v7/val.csv": val_df,
}
for relative_path, split_df in split_files.items():
    split_df.to_csv(data_path + relative_path, index=False)

In [None]:
# Negative n: show every training row except the last 50.
train_df.head(-50)