# preprocess data

In [93]:
import os
import pandas as pd
from pathlib import Path
import argparse
from datasets import Dataset
from omegaconf import OmegaConf

In [101]:
wd = Path.cwd().parent

"""-------------------------- options --------------------------"""
parser = argparse.ArgumentParser(description="Preprocess data for finetuning")
parser.add_argument("--config", "-c", type=str, default="finetune_base_config")

args, _ = parser.parse_known_args()

config = OmegaConf.load(
    f"{wd}/configs/{args.config}.yaml"
)

In [102]:
df = pd.read_csv(os.path.join(wd, config.data.rawdata_path))

In [103]:
df.head()

Unnamed: 0,index,PMID,text,questions,answers
0,0,932188,Title: Hormone-fuel concentrations in anephric...,1. Can you provide a one-word answer for the h...,1. hormone that showed elevated levels in the ...
1,1,1000028,Title: Plasma prolactin response to L-dopa TRH...,1. Can L-dopa be used as a treatment for thyro...,1. Treatment for thyrotoxicosis to suppress pl...
2,2,100045,Title: The Morquio syndrome: neuropathology an...,Instructions:\n\n1. What is the specific enzym...,1. Specific enzymatic activity decreased in Mo...
3,3,1000469,Title: Assessment of serial carcinoembryonic a...,1. Can serial CEA assays alone be used as a su...,1. Can serial CEA assays alone be used as a su...
4,4,1000475,Title: Carcinoembryonic antigen (CEA) activity...,1. Can you provide the exact CEA activity leve...,1. CEA activity levels in plasma and pancreati...


In [104]:
df.iloc[216]["questions"]

'1. What are the specific mediators released by activated alveolar macrophages in lung injury associated with experimental acute pancreatitis?\n2. Can you provide a one-word answer to describe the role of the liver in activating alveolar macrophages during acute pancreatitis-induced lung injury?\n3. How can the activation of alveolar macrophages be prevented in the context of experimental acute pancreatitis?\n4. Please describe the time frame within which neutrophil infiltration into the lungs is observed after induction of acute pancreatitis.\n5. Is leukotriene B4 released by alveolar macrophages during lung injury in acute pancreatitis? (Yes/No question)'

In [71]:
if "index" in df.columns:
    df.drop("index", axis=1, inplace=True)
if "PMID" in df.columns:
    df.drop("PMID", axis=1, inplace=True)


df["questions"] = df["questions"].apply(lambda x: x.split("2.")[0].replace("1. ", "").replace("\n", ""))
df["answers"] = df["answers"].apply(lambda x: x.split("2.")[0].replace("1. ", "").replace("\n", ""))
df["text"] = df["text"].apply(lambda x: x.replace("Title: ", "").replace("Abstract: ", "").replace("\n", ""))

df.rename(columns={"text":"context"}, inplace=True)
df.rename(columns={"questions":"question"}, inplace=True)
df.rename(columns={"answers":"answer"}, inplace=True)

In [72]:
df.head()

Unnamed: 0,context,question,answer
0,Hormone-fuel concentrations in anephric subjec...,Can you provide a one-word answer for the horm...,hormone that showed elevated levels in the art...
1,Plasma prolactin response to L-dopa TRH and me...,Can L-dopa be used as a treatment for thyrotox...,Treatment for thyrotoxicosis to suppress plasm...
2,The Morquio syndrome: neuropathology and bioch...,Instructions:What is the specific enzymatic ac...,Specific enzymatic activity decreased in Morqu...
3,Assessment of serial carcinoembryonic antigen ...,Can serial CEA assays alone be used as a subst...,Can serial CEA assays alone be used as a subst...
4,Carcinoembryonic antigen (CEA) activity in pan...,Can you provide the exact CEA activity levels ...,CEA activity levels in plasma and pancreatic j...


In [89]:
if not os.path.exists(os.path.join(wd, "data/preprocessed")):
    os.makedirs(os.path.join(wd, "data/preprocessed"))

df.to_csv(os.path.join(wd, "data/preprocessed", config.data.rawdata_path.split("/")[-1].replace(".csv", "") + "_preprocessed.csv"), index=False)

# upload dataset on huggingface

In [98]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [91]:
df = pd.read_csv(wd / "data" / "preprocessed" / "evidence_1_preprocessed.csv")

In [94]:
dataset = Dataset.from_pandas(df)

In [99]:
dataset.push_to_hub("StoneSeller/OpenTarget_pubmed_qa_sample", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]