In [29]:
import re
import os
import sys
import tqdm
import json
import pandas as pd
from langchain import PromptTemplate, FewShotPromptTemplate

In [None]:
# TRANSFORMS THE DATA FROM JSONL TO CSV WITH SRC, TGT, OUTPUT
def jsonl_to_csv(jsonl_path : str, csv_path : str):
    """Convert a JSONL file of input/output records into a src/tgt/output CSV.

    Every non-blank line of ``jsonl_path`` is parsed as a JSON object. The
    object's ``"input"`` text is searched for a ``Source: ...`` line that is
    immediately followed by a ``Target: ...`` line; the two captured values
    are stripped and written out together with the object's ``"output"``
    value. Records whose input does not match are printed and skipped.

    Args:
        jsonl_path: Path of the JSONL file to read (decoded as UTF-8).
        csv_path: Path of the CSV file to write. Missing parent directories
            are created. The file always has the header ``src,tgt,output``,
            even when no record matched.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"input"`` key, or a matching record
            has no ``"output"`` key.
    """
    # Iterate the file lazily and ignore blank lines (a trailing empty line is
    # common in JSONL and would otherwise make json.loads raise).
    with open(jsonl_path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    # NOTE(review): `.` does not match newlines, so a multi-line source never
    # matches and only the first line of the target is captured. Confirm the
    # inputs are single-line before relying on this.
    pattern_srctgt = re.compile(r"Source: (.*)\nTarget: (.*)")

    rows = []
    for sample in tqdm.tqdm(data, desc="Processing jsonl"):
        match = pattern_srctgt.search(sample["input"])
        if not match:
            print(f"Skipping {sample['input']}")
            continue
        rows.append({
            "src": match.group(1).strip(),
            "tgt": match.group(2).strip(),
            "output": sample["output"],
        })

    # Make sure the destination directory exists; to_csv does not create it.
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Explicit columns keep the header present when `rows` is empty, so the
    # resulting file can still be loaded with pd.read_csv.
    pd.DataFrame(rows, columns=["src", "tgt", "output"]).to_csv(csv_path, index=False)
# Directory holding the input JSONL splits (machine-specific absolute path).
data_path = "/fs/startiger0/nmoghe/data/llama/pilot/classification/exp1/ref-free"

# Convert each split to ./data/<split>.csv: dev first, then train.
for split_name in ("dev", "train"):
    jsonl_to_csv(f"{data_path}/{split_name}.jsonl", f"./data/{split_name}.csv")

In [None]:
# Formats one (src, tgt, output) example as three labelled lines.
example_prompt = PromptTemplate(
    template="Source: {src}\nTarget: {tgt}\nOutput: {output}\n",
    input_variables=["src", "tgt", "output"],
)

fewshot_template = FewShotPromptTemplate(
    examples=[],
    example_prompt