In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from mecab import MeCab

# Shared MeCab tagger instance; get_top_n_structure_similar calls mecab.pos() on it.
mecab = MeCab()

# Memo of sentence -> space-joined POS tag string. The same DataFrame is searched
# once per input sentence, so without this every corpus sentence would be
# re-tagged on every call.
_POS_TAG_CACHE = {}


def _pos_tag_string(sentence):
    """Return the MeCab POS tags of ``sentence`` joined by single spaces (memoised)."""
    cached = _POS_TAG_CACHE.get(sentence)
    if cached is None:
        cached = " ".join(tag for _, tag in mecab.pos(sentence))
        _POS_TAG_CACHE[sentence] = cached
    return cached


def get_top_n_structure_similar(df, target_sentence, n=15):
    """Return the rows of ``df`` whose corrected sentence best matches the target's POS make-up.

    Every ``cor_sentence`` and the target sentence are converted to their MeCab
    POS tags; the tag strings are vectorised with ``CountVectorizer`` and ranked
    by cosine similarity to the target's vector. The vectors are tag counts, so
    the order of tags inside a sentence does not influence the score.

    Args:
        df: DataFrame with ``"err_sentence"`` and ``"cor_sentence"`` columns.
        target_sentence: Sentence whose POS make-up is used as the query.
        n: Maximum number of rows to return.

    Returns:
        A list of ``(err_sentence, cor_sentence)`` tuples, most similar first.
        An empty list is returned when ``df`` has no rows.
    """
    err_sentence_list = df["err_sentence"].tolist()
    sentence_list = df["cor_sentence"].tolist()

    # cosine_similarity rejects an empty corpus, so answer "no matches" directly.
    if not sentence_list:
        return []

    sentence_list_pos = [_pos_tag_string(s) for s in sentence_list]
    target_pos = _pos_tag_string(target_sentence)

    # Row 0 of X is the target; rows 1.. line up with sentence_list.
    # NOTE(review): CountVectorizer's default tokenizer lowercases and splits on
    # non-word characters, so a compound tag such as "VV+EC" (if the tagger emits
    # one) is counted as two separate tags - confirm this is intended.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform([target_pos] + sentence_list_pos)
    sims = cosine_similarity(X[0], X[1:])[0]

    # Indices of the n highest similarities, best first.
    top_idxs = sims.argsort()[::-1][:n]
    return [(err_sentence_list[idx], sentence_list[idx]) for idx in top_idxs]
    

In [None]:
# Quick manual check of the retriever: load the training pairs and print the
# default number of neighbours for one sample sentence.
df = pd.read_csv("data/train_dataset.csv")

print(
    get_top_n_structure_similar(
        df,
        "칭찬을 받으니 부끄럽자 얼굴이 빨개졌다.",
    )
)


In [None]:
import os
import argparse

import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI
from prompts import baseline_prompt
import promptTemplate
import re
import importlib


# Reload promptTemplate so that changes made to the module since it was first
# imported take effect in this session.
importlib.reload(promptTemplate)

def extract_answer(text):
    """Pull the answer that follows the first ``[Answer]:`` marker in ``text``.

    Whitespace (including line breaks) right after the marker is skipped, and the
    rest of that line is returned with surrounding whitespace stripped.

    Args:
        text: Raw model response.

    Returns:
        The extracted answer, or an empty string when the marker is absent.
    """
    found = re.search(r"\[Answer\]:\s*(.*)", text)
    return found.group(1).strip() if found else ""


def main():
    """Correct every ``err_sentence`` of the input CSV with few-shot prompting and save the result.

    For each row, structurally similar (err, cor) pairs are retrieved from the
    same CSV with ``get_top_n_structure_similar`` and passed to
    ``promptTemplate.process`` together with the sentence. The answer is taken
    from the response with ``extract_answer``. The original sentence is kept
    whenever the call raises or no answer can be extracted.

    The output CSV has the columns ``err_sentence`` and ``cor_sentence``.

    Raises:
        ValueError: If the input CSV lacks the ``err_sentence`` or
            ``cor_sentence`` column, or ``UPSTAGE_API_KEY`` is not set.
    """
    args = {"input":"data/train_dataset.csv","model":"solar-pro2","output":"eval_submission.csv"}

    # Number of few-shot pairs handed to the prompt for each sentence.
    n_examples = 14

    # Load environment variables
    load_dotenv()

    # Load data
    df = pd.read_csv(args['input'])

    if "err_sentence" not in df.columns:
        raise ValueError("Input CSV must contain 'err_sentence' column")
    # The same frame is the few-shot example pool, which also reads 'cor_sentence'.
    if "cor_sentence" not in df.columns:
        raise ValueError("Input CSV must contain 'cor_sentence' column")

    # Setup Upstage client
    # Only the presence of the key is verified here; it is not used directly in
    # this function (presumably promptTemplate reads it - verify).
    api_key = os.getenv("UPSTAGE_API_KEY")
    if not api_key:
        raise ValueError("UPSTAGE_API_KEY not found in environment variables")

    print(f"Model: {args['model']}")
    print(f"Output: {args['output']}")

    err_sentences = []
    cor_sentences = []

    # Process each sentence
    for row in tqdm(df.itertuples(), total=len(df), desc="Generating"):

        text = row.err_sentence

        err_sentences.append(text)

        # The sentence being corrected is itself a row of the example pool, and
        # it is not guaranteed to be the top-ranked hit. Ask for one extra
        # candidate and drop the sentence's own pair(s) by value, so its gold
        # correction is never shown to the model.
        candidates = get_top_n_structure_similar(df, text, n=n_examples + 1)
        examples = [pair for pair in candidates if pair[0] != text][:n_examples]

        try:
            resp = promptTemplate.process(text, examples)
            corrected = extract_answer(resp)
            if not corrected:
                # No "[Answer]:" marker in the response: keep the original
                # sentence rather than writing an empty prediction.
                print(f"No answer extracted: {text[:50]}...")
                corrected = text
            cor_sentences.append(corrected)

        except Exception as e:
            print(f"Error processing: {text[:50]}... - {e}")
            cor_sentences.append(text)  # fallback to original

    # Save results with required column names
    out_df = pd.DataFrame({"err_sentence": err_sentences, "cor_sentence": cor_sentences})
    out_df.to_csv(args['output'], index=False)
    print(f"Wrote {len(out_df)} rows to {args['output']}")


In [None]:
import time
import subprocess
import pandas as pd

if __name__ == "__main__":

    # Generate eval_submission.csv.
    main()

    # NOTE(review): main() has already written and closed the CSV by the time it
    # returns, so this pause looks unnecessary - confirm before removing.
    time.sleep(3)

    # Run the evaluation script and show its console output
    cmd = "uv run python evaluate.py --true_df data/train_dataset.csv --pred_df eval_submission.csv --output analysis.csv"
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    print("콘솔 출력:")
    print(result.stdout)

    # A failed evaluation would otherwise go unnoticed and the code below would
    # read a stale (or missing) analysis.csv: show stderr and stop.
    if result.returncode != 0:
        print(result.stderr)
        result.check_returncode()

    # Inspect the results file
    df = pd.read_csv("analysis.csv")
    print("analysis.csv 결과:")
    print(df)
    
