In [None]:
# Generate the instruction dataset

import json
import os
import random
import shutil

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

# Output directory for every parquet file written by this notebook.
data_folder="data/0516code"

# Reset the output folder: create it (including any missing parent directory)
# and delete the visible entries left over from a previous run.
# Pure-Python calls replace the former `mkdir` / `rm -rf` shell commands, so a
# missing parent directory no longer fails silently and no path is
# interpolated into a shell command line.
os.makedirs(data_folder, exist_ok=True)
for _entry in list(os.scandir(data_folder)):
    if _entry.name.startswith("."):
        # The former `rm -rf <dir>/*` glob never matched dotfiles; keep that.
        continue
    if _entry.is_dir(follow_symlinks=False):
        shutil.rmtree(_entry.path)
    else:
        os.remove(_entry.path)


# Maps a dataset label to its list of formatted prompt/response texts.
ds_dict={}

def clean_autogen(text):
    """Normalise a raw dataset field.

    Returns ``""`` when ``text`` is ``None``; otherwise returns ``text`` with
    leading and trailing whitespace stripped.
    """
    return "" if text is None else text.strip()

# Prompt prefix placed in front of every question. The Japanese text reads
# roughly: "Below is a combination of an instruction describing a task and an
# input with context. Write a response that appropriately fulfils the request."
# followed by an "### 指示:" ("### Instruction:") header.
question_template="以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n### 指示:\n"
# Separator inserted between the question and the answer
# ("### 応答:" = "### Response:").
answer_template="\n\n### 応答:\n"

# Commented-out rapidfuzz dependency (it provides `cdist`); kept for reference.
#!pip install rapidfuzz
#from rapidfuzz.process import cdist

# Benchmark questions, taken from the "問い" column of the reference CSV.
jmt_bench_df=pd.read_csv("reference_data/jmtbench.csv")
bench_questions=jmt_bench_df["問い"].tolist()

def check_jmt_similarity(q,bench_questions,enabled=False):
    """Return a similarity score between ``q`` and the closest benchmark question.

    The check is disabled by default: the data was verified once and found
    clean, so the function short-circuits to ``1`` (the historical behaviour).

    Args:
        q: Question text to test.
        bench_questions: Iterable of benchmark question strings.
        enabled: When ``True``, actually compute the score. The computation
            uses the standard library's ``difflib`` and is an approximation of
            the rapidfuzz ``cdist`` call that used to sit (unreachable and
            un-imported) after the early return. It is slow on large corpora.

    Returns:
        ``1`` when disabled; otherwise the highest similarity on a 0-100
        scale, or ``0.0`` when ``bench_questions`` is empty.
    """
    if not enabled:
        return 1  # checked once already; skip the expensive comparison

    # Local import: only needed on the opt-in path.
    import difflib

    return max(
        (
            difflib.SequenceMatcher(None, q, bench_q, autojunk=False).ratio()*100
            for bench_q in bench_questions
        ),
        default=0.0,  # empty benchmark list: nothing can be similar
    )


# Accumulator for formatted prompt/response texts.
records=[]

# # Q&A generated automatically with Mixtral

# Records whose integer "score" is below this value are skipped.
score_threshold=4
# Records whose benchmark-similarity score exceeds this value are skipped.
sim_threshold=80


# A record is dropped when its question or answer contains any of these substrings.
ng_words=[
          # Phrases typical of refusals / non-answers
          # (roughly: "sorry", "don't know", "I don't know", "excuse me")
          "申し訳","分からない","分かりません","すみません",
          # References to figures or tables ("figure", "table")
          "図","表",
          # Japan-related topics are removed because they hallucinate a lot
          # ("Japan", "Kyoto", "Tokyo", "sushi")
          "日本","京都","東京","寿司", 
          ]


In [None]:

# Formatted texts for the Mixtral-generated Q&A corpora. Initialised in this
# cell so that re-running it does not append duplicates to a list created in an
# earlier cell.
records=[]
# Number of records dropped because of an NG word (each record counted once).
exclude_count=0

datasets=[
    load_dataset("hatakeyama-llm-team/AutoGeneratedJapaneseQA",split="train"),
    load_dataset("kanhatakeyama/OrcaJaMixtral8x22b",split="train"),
    load_dataset("kanhatakeyama/ChatbotArenaJaMixtral8x22b",split="train"),

]
for dataset in datasets:
    # Iterate the dataset object directly (not iter(...)) so tqdm can read its
    # length for the progress total when the object exposes one.
    for original_record in tqdm(dataset):
        q=clean_autogen(original_record["question"])
        a=clean_autogen(original_record["answer"])
        if q=="" or a=="":
            continue

        # Rows that carry a quality score: drop unscored and low-scored ones.
        if "score" in original_record:
            if original_record["score"] is None:
                continue
            if int(original_record["score"])<score_threshold:
                continue

        if check_jmt_similarity(q,bench_questions)>sim_threshold:
            print("too similar to jmt bench",q)
            continue

        # Drop records whose question or answer contains any NG word.
        # `any` stops at the first hit, so the counter goes up once per record
        # (the previous inner loop used `continue` where `break` was meant and
        # counted every matching word).
        if any(ng_word in a or ng_word in q for ng_word in ng_words):
            exclude_count+=1
            continue

        records.append(f"{question_template}{q}{answer_template}{a}")


ds_dict["auto_gen_mixtral"]=records


In [None]:

# %% [markdown]
# # hachiさんのalpaca + mixtral dataset

# %%

hachi_datasets=[
    load_dataset("HachiML/Hachi-Alpaca",split='v1.0_cleaned'),
    load_dataset("HachiML/Evol-Alpaca-gen3-500",split='train'),
]

# %%
records=[]
for hachi_ds in hachi_datasets:
    for record in tqdm(hachi_ds):
        q=record["instruction"]
        # The "input" column is optional; treat a missing or None value as
        # empty so the concatenation below cannot raise a TypeError.
        inp=record.get("input") or ""
        if inp!="":
            q+="\n"+inp
        a=record["output"]
        if q=="" or a=="":
            continue
        records.append(f"{question_template}{q}{answer_template}{a}")

# Register the combined list once, after every dataset has been processed
# (this assignment used to sit inside the dataset loop).
ds_dict["hachi_alpaca"]=records


In [None]:

# %% [markdown]
# # Bumpo dataset

#文法理解に関するデータセット
ds2=load_dataset("hatakeyama-llm-team/BumpoRikai",split="train")
# %%
# Format every row that has both a question and an answer.
# NOTE(review): the "instruction" column used to be read into an unused
# variable and never reached the prompt; the dead read is removed here.
# Confirm whether the instruction should be part of the prompt.
records=[]
for original_record in ds2:
    q=original_record["question"]
    a=original_record["answer"]
    if q=="" or a=="":
        continue
    records.append(f"{question_template}{q}{answer_template}{a}")
ds_dict["bumpo_rikai"]=records
# Show one formatted sample as the cell output.
records[1]



In [None]:

#minnade
m_ds=load_dataset("minnade/chat-daily",split="train")

# Index every message body by its id so a reply can look up its parent.
id_to_content={record["id"]:record["body"] for record in m_ds}

# Formatted texts built from (parent message, assistant reply) pairs.
questions=[]
for record in m_ds:
    if record["role"]!="assistant":
        continue
    a=record["body"]
    # Skip missing or very short (< 4 characters) replies.
    if a is None or len(a)<4:
        continue
    # A reply whose parent is absent from the split, or whose parent body is
    # None/empty, is skipped. Previously this raised KeyError or put the
    # literal text "None" into the prompt.
    q=id_to_content.get(record["parent_id"])
    if not q:
        continue
    questions.append(f"{question_template}{q}{answer_template}{a}")

ds_dict["minnade"]=questions

In [None]:


# %%
# Flatten every per-dataset list in ds_dict into one list, keeping the
# insertion order of the dict.
all_recrds=[text for texts in ds_dict.values() for text in texts]

# %%

def write_jsonl(records,
    output_path="data/all.jsonl",
    n_eval=500,
    n_train=10**7,
    seed=None,
    ):
    """Shuffle ``records`` and write a train and an eval parquet file.

    Despite its name, this function writes parquet files via
    ``DataFrame.to_parquet``.

    Args:
        records: Sequence of texts. It is copied before shuffling, so the
            caller's list is left untouched.
        output_path: Path of the train file. The eval file is written next to
            it with ``_eval`` inserted before the extension
            (``x.parquet`` -> ``x_eval.parquet``).
        n_eval: Number of shuffled records held out for evaluation. Values
            larger than ``len(records)`` put everything in the eval set;
            ``0`` (or a negative value) holds out nothing.
        n_train: Upper bound on the number of training records.
        seed: Optional seed for a reproducible shuffle. ``None`` uses the
            module-level ``random`` state, as before.

    Returns:
        The eval ``DataFrame`` (columns ``index`` and ``text``).
    """
    shuffled=list(records)  # never mutate the caller's list
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    # Split by explicit index: the former `records[:-n_eval]` slice produced an
    # empty train set for n_eval == 0.
    n_eval=min(max(n_eval,0),len(shuffled))
    split=len(shuffled)-n_eval
    train_texts=shuffled[:split][:n_train]
    eval_texts=shuffled[split:]

    # Derive the eval path from the extension so it can never equal the train
    # path (str.replace(".parquet", ...) was a no-op for other extensions).
    root,ext=os.path.splitext(output_path)
    eval_path=f"{root}_eval{ext}"

    train_df=pd.DataFrame({"text":train_texts})
    train_df["text"]=train_df["text"].astype(str)
    train_df=train_df.reset_index()
    train_df.to_parquet(output_path)

    eval_df=pd.DataFrame({"text":eval_texts})
    eval_df["text"]=eval_df["text"].astype(str)
    eval_df=eval_df.reset_index()
    eval_df.to_parquet(eval_path)
    return eval_df

In [None]:
# Training-row cap passed to write_jsonl; the value is also embedded in the
# output file name.
n_train=10**10
# Write the combined general-purpose records; `df` receives write_jsonl's
# return value (the eval split).
df=write_jsonl(all_recrds,f"{data_folder}/all_{n_train}.parquet",n_train=n_train)

# code dataset

In [None]:

def count_half_width_ratio(text):
    """Return the percentage (0-100) of characters in ``text`` whose code
    point is below 128.

    An empty ``text`` yields ``0``.
    """
    length = len(text)
    if length == 0:
        return 0  # avoid dividing by zero on empty input
    ascii_count = len([ch for ch in text if ord(ch) < 128])
    return ascii_count / length * 100
# Substrings that mark a record as potentially code-related.
# NOTE(review): short entries such as "def" and "list" also match inside
# ordinary English words; confirm this breadth is intended.
code_keywords=[
    "Python","python","code","コード","JSON","Java","XML","csv","CSV","def","list","html","HTML",
    "プログラム","スクリプト","script","Script"
]
# A record must have more than this percentage of ASCII characters to count
# as code-like.
min_half_width_ratio=12

# Keep a record when it contains at least one keyword AND is ASCII-heavy
# enough. The ratio does not depend on the keyword, so it is evaluated at most
# once per record instead of once per matching keyword.
code_records=[
    record
    for record in all_recrds
    if any(keyword in record for keyword in code_keywords)
    and count_half_width_ratio(record)>min_half_width_ratio
]

len(code_records)


In [None]:
# Maps a dataset label to its list of formatted texts for the code / math corpora.
code_ds_dict={}

In [None]:
# OpenMathInstruct dataset ("-ja" variant), "train" split.
openmath_ds=load_dataset("kunishou/OpenMathInstruct-1-1.8m-ja",split="train")

In [None]:
# Format every OpenMathInstruct row whose Japanese question and solution are
# both non-empty.
records=[
    f"{question_template}{row['question_ja']}{answer_template}{row['generated_solution_ja']}"
    for row in openmath_ds
    if row["question_ja"]!="" and row["generated_solution_ja"]!=""
]
code_ds_dict["openmathja"]=records
# Show one formatted sample as the cell output.
records[1]



In [None]:

ds=load_dataset("HachiML/alpaca_jp_python",split="v0.2")

# Build one prompt/response text per row; a non-empty "input" is appended to
# the instruction on a new line.
records=[]
for row in ds:
    prompt=row["instruction"]
    response=row["output"]
    extra=row["input"]
    if extra!="":
        prompt=prompt+"\n"+extra
    if prompt!="" and response!="":
        records.append(f"{question_template}{prompt}{answer_template}{response}")
code_ds_dict["code_hachi"]=records
# Show one formatted sample as the cell output.
records[1]



In [None]:
ds=load_dataset("kunishou/amenokaku-code-instruct",split="train")

def _amenokaku_texts(rows):
    """Yield one formatted prompt/response text per row that has both a
    non-empty question (instruction plus optional input) and a non-empty
    output."""
    for row in rows:
        question=row["instruction"]
        answer=row["output"]
        context=row["input"]
        if context!="":
            question+="\n"+context
        if question=="" or answer=="":
            continue
        yield f"{question_template}{question}{answer_template}{answer}"

records=list(_amenokaku_texts(ds))
code_ds_dict["amenokaku"]=records
# Show one formatted sample as the cell output.
records[1]



In [None]:

# Load by Hub repository id ("namespace/name"), the documented form for
# load_dataset, instead of the full https://huggingface.co/datasets/... URL.
ds=load_dataset("saldra/sakura_japanese_dataset",split="train")

# Build one prompt/response text per row; a non-empty "input" is appended to
# the instruction on a new line.
records=[]
for original_record in ds:
    q=original_record["instruction"]
    a=original_record["output"]
    inp=original_record["input"]
    if inp!="":
        q+="\n"+inp
    if q=="" or a=="":
        continue
    records.append(f"{question_template}{q}{answer_template}{a}")
code_ds_dict["sakura"]=records
# Show one formatted sample as the cell output.
records[1]



In [None]:
# meta math

ds=load_dataset("meta-math/MetaMathQA",split="train")

# Pair up query/response lazily, then format the pairs in which neither side
# is empty.
qa_pairs=((row["query"],row["response"]) for row in ds)
records=[
    f"{question_template}{query}{answer_template}{response}"
    for query,response in qa_pairs
    if not (query=="" or response=="")
]
code_ds_dict["meta_math"]=records
# Show one formatted sample as the cell output.
records[1]



In [None]:

ds=load_dataset("microsoft/orca-math-word-problems-200k",split="train")

# Format each row that has both a non-empty question and a non-empty answer.
records=[]
for row in ds:
    question,answer=row["question"],row["answer"]
    if question!="" and answer!="":
        records.append(f"{question_template}{question}{answer_template}{answer}")
code_ds_dict["orca_math"]=records
# Show one formatted sample as the cell output.
records[1]



In [None]:


ds=load_dataset("m-a-p/CodeFeedback-Filtered-Instruction",split="train")

# Format every row whose query and answer are both non-empty.
records=[
    f"{question_template}{row['query']}{answer_template}{row['answer']}"
    for row in ds
    if row["query"]!="" and row["answer"]!=""
]
code_ds_dict["codefeedback"]=records
# Show one formatted sample as the cell output.
records[1]



In [None]:
# %%
# Flatten every list in code_ds_dict (in insertion order), then append the
# keyword-filtered code_records.
all_code_records=[text for texts in code_ds_dict.values() for text in texts]+code_records
len(all_code_records)

In [None]:
# Training-row cap passed to write_jsonl; the value is also embedded in the
# output file name.
n_train=10**10
# Write the code / math records; the returned frame is discarded.
_=write_jsonl(all_code_records,f"{data_folder}/code_{n_train}.parquet",n_train=n_train)

In [None]:

# Training-row cap passed to write_jsonl; the value is also embedded in the
# output file name.
n_train=10**10
# Write the general-purpose and code / math records together; the returned
# frame is discarded.
_=write_jsonl(all_recrds+all_code_records,f"{data_folder}/code_all_{n_train}.parquet",n_train=n_train)