In [4]:
import pandas as pd
import re
import json

In [2]:
def clean_text(text):
    """Normalize one dataset cell into plain, single-spaced ASCII text.

    Args:
        text: Raw cell value. Missing values (``None``, ``NaN``, ``pd.NA``)
            and non-string scalars such as ints or floats are accepted.

    Returns:
        str: ``""`` for a missing value. Otherwise the value as text with
        every character other than ASCII letters, digits, ``.``, ``,``,
        ``-`` and whitespace removed, each whitespace run collapsed to a
        single space, and leading/trailing whitespace stripped.
    """
    if pd.isna(text):
        return ""
    # A cell is not guaranteed to be a str (pandas infers numeric dtypes for
    # numeric-looking CSV columns) and re.sub only accepts str/bytes.
    text = str(text)
    # Drop everything outside the allowed character set.
    text = re.sub(r"[^a-zA-Z0-9.,\-\s]", "", text)
    # Collapse whitespace runs (including newlines and tabs) to one space.
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [3]:
def preprocess_dataset(file_path, output_path):
    """Clean a QA dataset CSV and write the result to a new CSV file.

    Steps, in order:
      1. Read ``file_path`` with ``pd.read_csv``.
      2. Run ``clean_text`` over the ``instruction``, ``input`` and
         ``output`` columns.
      3. Prepend "According to research, " to every non-empty answer that
         does not already start with "According to".
      4. Drop fully duplicated rows.
      5. Write the frame to ``output_path`` without the index column.

    Args:
        file_path: Path of the source CSV. It must contain the columns
            ``instruction``, ``input`` and ``output``.
        output_path: Path the cleaned CSV is written to.

    Returns:
        None. A confirmation message is printed once the file is written.

    Raises:
        KeyError: If one of the three expected columns is missing.
    """
    df = pd.read_csv(file_path)

    for column in ("instruction", "input", "output"):
        df[column] = df[column].apply(clean_text)

    def _with_prefix(answer):
        # An answer that is empty after cleaning stays empty. Prefixing it
        # would produce the content-free string "According to research, "
        # (with a trailing space) instead of a recognisably missing answer.
        if not answer or answer.startswith("According to"):
            return answer
        return "According to research, " + answer

    df["output"] = df["output"].apply(_with_prefix)

    # De-duplicate after cleaning so rows that only differed in stripped
    # characters or whitespace collapse into one.
    df = df.drop_duplicates()

    df.to_csv(output_path, index=False)
    print(f"Preprocessed dataset saved to {output_path}")

In [5]:
# Clean the raw QA dataset CSV and write the cleaned copy to a second CSV.
preprocess_dataset(
    file_path="/Users/priyal/Documents/honours/datasets/qa_dataset_big.csv",
    output_path="/Users/priyal/Documents/honours/datasets/qa_dataset_cleaned.csv",
)

Preprocessed dataset saved to /Users/priyal/Documents/honours/datasets/qa_dataset_cleaned.csv


In [5]:
def csv_to_json(csv_path, json_path):
    """Convert a CSV file into a JSON file holding a list of row objects.

    Each CSV row becomes one JSON object keyed by column name. Missing
    cells are written as ``null``.

    Args:
        csv_path: Path of the CSV file to read.
        json_path: Path of the JSON file to write (UTF-8, 4-space indent,
            non-ASCII characters written unescaped).

    Returns:
        None. A confirmation message is printed once the file is written.
    """
    df = pd.read_csv(csv_path)

    # read_csv loads empty cells as NaN, and json.dump would emit those as
    # the bare token NaN, which is not valid JSON. Map every missing value
    # to None so it is serialized as null instead.
    records = [
        {key: (None if pd.isna(value) else value) for key, value in row.items()}
        for row in df.to_dict(orient="records")
    ]

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4, ensure_ascii=False)

    print(f"Converted dataset saved to {json_path}")

In [6]:
# Export the cleaned CSV as a JSON list of records.
csv_to_json(
    csv_path="/Users/priyal/Documents/honours/datasets/qa_dataset_cleaned.csv",
    json_path="/Users/priyal/Documents/honours/json/qa_dataset_cleaned.json",
)

Converted dataset saved to /Users/priyal/Documents/honours/json/qa_dataset_cleaned.json


In [9]:
# Load the exported JSON file into a DataFrame and print each column's dtype.
df = pd.read_json(
    path_or_buf="/Users/priyal/Documents/honours/json/qa_dataset_cleaned.json"
)
print(df.dtypes)

instruction    object
input          object
output         object
dtype: object
