In [16]:
import json
import re
import pandas as pd
from mlxtend.frequent_patterns import apriori
from io import StringIO

# Input data: JSON file holding the random CSV prompts produced by ChatGPT
# (read line by line further below).
input_file = "empty_synthetic.json"
# Destination file for the same records once each has a "response" attached.
output_file = "synthetic.json"

# Definition of the function used to fill in the dataset
def process_prompt(prompt: str) -> str:
    """Compute the frequent-itemset answer for a single prompt.

    The prompt is scanned for three things:

    * the data, wrapped in ``<csv>...</csv>``;
    * the minimum support, written as ``support <number>``;
    * optionally the maximum itemset length, written as ``maxlen... <integer>``
      (``maxlength``, ``max_len``, ``max length`` and the misspelling
      ``maxlenght`` all match). When absent, itemset length is not limited.

    Every CSV row becomes one transaction whose items are named
    ``<column>_<value>``; empty cells are skipped.

    Relies on ``re``, ``pd``, ``StringIO`` and ``apriori`` imported at the top
    of the cell.

    Args:
        prompt: Full prompt text.

    Returns:
        A text table whose first line is ``support<TAB>itemsets`` and whose
        remaining lines hold one frequent itemset each (support with three
        decimals, then the sorted items joined by ", "). If something goes
        wrong, a short message is returned instead: "CSV data not found.",
        "Support value not found.", "CSV read error: ..." or
        "Apriori error: ...".
    """
    # Data block: everything between <csv> and </csv> (DOTALL: it spans lines).
    csv_match = re.search(r"<csv>(.*?)</csv>", prompt, re.DOTALL)
    if not csv_match:
        return "CSV data not found."
    csv_text = csv_match.group(1).strip()

    # Minimum support: the number that follows the word "support".
    support_match = re.search(r"support\s+([0-9]*\.?[0-9]+)", prompt)
    if not support_match:
        return "Support value not found."
    support_value = float(support_match.group(1))

    # Optional cap on itemset length. Without it the answer could contain
    # itemsets longer than the prompt asked for.
    maxlen_match = re.search(r"max[ _]?len\w*\s+(\d+)", prompt, re.IGNORECASE)
    max_len = int(maxlen_match.group(1)) if maxlen_match else None
    if max_len is not None and max_len < 1:
        max_len = None  # a non-positive cap is meaningless; treat as "no limit"

    # Read every cell as text so item labels match the CSV literally.
    # With type inference, an integer column next to a float/NaN column would be
    # rendered as "Col_1.0" instead of "Col_1".
    try:
        df = pd.read_csv(StringIO(csv_text), dtype=str)
    except Exception as e:
        return f"CSV read error: {e}"

    # One transaction per row: the set of "Column_Value" items for non-empty cells.
    transactions = [
        {f"{col}_{val}" for col, val in zip(df.columns, row) if pd.notna(val)}
        for row in df.itertuples(index=False, name=None)
    ]

    # One-hot (boolean) encoding, the input format expected by apriori.
    all_items = sorted(set().union(*transactions))
    encoded_df = pd.DataFrame(
        [{item: item in trans for item in all_items} for trans in transactions]
    )

    try:
        freq_items = apriori(
            encoded_df,
            min_support=support_value,
            use_colnames=True,
            max_len=max_len,
        )

        # apriori returns frozensets; render each as a sorted, comma-separated string.
        table_lines = ["support\titemsets"]
        for support, itemset in zip(freq_items["support"], freq_items["itemsets"]):
            items_text = ", ".join(sorted(itemset))
            table_lines.append(f"{support:.3f}\t{items_text}")
        return "\n".join(table_lines)
    except Exception as e:
        # Report the failure as the response text instead of raising.
        return f"Apriori error: {e}"



# Walk through the input one line at a time (NDJSON: one JSON object per line).
# Every record that parses gets a "response" computed from its "prompt";
# a record that fails is reported on stdout and left out of the output.
output_lines = []
with open(input_file, "r", encoding="utf-8") as infile:
    for line in infile:
        line = line.strip()
        if line:  # blank lines carry no record
            try:
                entry = json.loads(line)
                prompt = entry.get("prompt", "")
                entry["response"] = process_prompt(prompt)
                serialized = json.dumps(entry, ensure_ascii=False)
                output_lines.append(serialized)
            except Exception as err:
                print(f"Chyba při zpracování řádku: {err}")

# Write the results: records separated by newlines, no trailing newline.
with open(output_file, "w", encoding="utf-8") as outfile:
    ndjson_text = "\n".join(output_lines)
    outfile.write(ndjson_text)



In [17]:
from datasets import load_dataset

# Load the file written by the processing cell (`output_file`). The previous
# hard-coded "output.json" is not produced by any visible cell, so a fresh
# Restart & Run All would read a stale or missing file.
dataset = load_dataset("json", data_files=output_file)
# Show the first record as a sanity check of the prompt/response pairing.
print(dataset['train'][0])

Generating train split: 0 examples [00:00, ? examples/s]

{'prompt': 'I will give you CSV and your job is to find frequent itemsets of maxlenght 5 with support 0.143. Do not use any code, it is your job to do! <csv>Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8\nE,E,D,E,C,D,A,B\nD,E,A,D,A,E,C,A\nC,E,A,D,E,B,B,A\nE,D,D,A,D,D,D,B\nA,C,B,A,B,B,A,C\nD,C,E,D,D,E,D,D\nD,D,B,E,E,A,B,B\nA,D,B,D,C,B,C,B\nA,A,B,D,E,D,C,A\nE,C,B,B,B,C,E,D\nB,A,E,D,A,C,C,A\nD,A,D,C,C,C,B,B\nA,D,D,E,B,E,A,C\nD,C,E,B,A,B,B,B\nD,B,C,A,B,C,B,A\nD,D,E,E,D,C,E,D\nE,E,A,D,C,A,C,E\nE,D,C,E,A,E,A,B\nC,E,E,A,E,B,E,C\nC,A,C,A,A,A,B,D\nD,D,A,A,A,A,B,D\nB,C,A,E,A,D,E,B\nA,D,C,D,B,C,B,E\nB,B,D,D,D,B,E,B</csv>', 'response': 'support\titemsets\n0.208\tCol1_A\n0.333\tCol1_D\n0.208\tCol1_E\n0.167\tCol2_A\n0.208\tCol2_C\n0.333\tCol2_D\n0.208\tCol2_E\n0.208\tCol3_A\n0.208\tCol3_B\n0.167\tCol3_C\n0.208\tCol3_D\n0.208\tCol3_E\n0.250\tCol4_A\n0.375\tCol4_D\n0.250\tCol4_E\n0.292\tCol5_A\n0.208\tCol5_B\n0.167\tCol5_C\n0.167\tCol5_D\n0.167\tCol5_E\n0.167\tCol6_A\n0.250\tCol6_B\n0.250\tCol6_C\n0.167\tCol