## Original version

In [3]:
import openai
import json
import os
import pprint
from dotenv import load_dotenv
from typing import Dict, List
import glob
import time

# Called before the API key is read from the environment below
# (presumably it loads OPENAI_API_KEY from a local .env file - confirm the .env location).
load_dotenv()

# Chat model name passed to every completion request in this cell
MODEL = "gpt-4o-mini"
# os.getenv returns None when the variable is unset; no check is made here
openai.api_key = os.getenv("OPENAI_API_KEY")

# `client` is only an alias for the openai module itself, not a separate client object
client = openai
# Bare expression: it has no effect here because it is not the last statement of the cell
client

def _participant_updates(reply):
    """Yield ``(slot, fields)`` for each participant entry found in the model reply.

    Key spellings are matched loosely ("Participant 1", "participant_1", ...)
    and a ``participant_persona`` wrapper is unwrapped, because the prompt asks
    for "the original JSON format" without fixing the exact key names.
    """
    if not isinstance(reply, dict):
        return
    for key, value in reply.items():
        if not isinstance(value, dict):
            continue  # only a mapping can carry persona fields
        tag = "".join(ch for ch in str(key).lower() if ch.isalnum())
        if tag in ("participant1", "participant2"):
            yield "participant_" + tag[-1], value
        elif tag == "participantpersona":
            yield from _participant_updates(value)


def _fill_missing(persona, candidate):
    """Copy values from ``candidate`` into ``persona`` only where ``persona`` is empty."""
    known = {str(name).strip().lower(): name for name in persona}
    for raw_name, value in candidate.items():
        # Map e.g. "Name" onto the existing "name" key instead of adding a duplicate
        name = known.get(str(raw_name).strip().lower(), raw_name)
        if persona.get(name) or value in ("", None):
            continue
        persona[name] = value


def fill_empty_fields(data: Dict, messages: List[Dict]) -> Dict:
    """Ask the chat model to fill the empty persona fields of one conversation record.

    Args:
        data: Record containing ``participant_persona`` with ``participant_1`` and
            ``participant_2`` dicts (name, age, gender, personality, background).
            It is updated in place.
        messages: Conversation turns; each needs ``role`` and ``content['text']``.

    Returns:
        ``data`` with the missing persona fields filled, or ``None`` when the
        model never produced parseable JSON within the allowed attempts.

    Raises:
        Exception: If every attempt timed out.

    Only fields that are currently empty (falsy) are written, so values already
    present are never overwritten by the model's reply.
    """
    messages_str = "\n".join([f"{msg['role']}: {msg['content']['text']}" for msg in messages])

    personas = data['participant_persona']
    participant_1 = personas['participant_1']
    participant_2 = personas['participant_2']

    prompt = f"""Analyze the given Conversation Log and Participant information and fill in ONLY the missing fields in the original JSON format. Do not modify any existing information.:

    Participant 1:
    name: {participant_1['name']}
    age: {participant_1['age']}
    gender: {participant_1['gender']}
    personality: {participant_1['personality']}
    background: {participant_1['background']}

    Participant 2:
    name: {participant_2['name']}
    age: {participant_2['age']}
    gender: {participant_2['gender']}
    personality: {participant_2['personality']}
    background: {participant_2['background']}

    # Conversation Log:
    {messages_str}

    # Guidelines:
    1. Infer age, gender, and other details based on the text content and writing style.
    2. Generate diverse and unique names and personalities for each participant. Use various expressions, not using the same expressions repeatedly.
    3. Use str sentences for the personality and background fields.
    4. Keep use the original fields text if it exists."""

    retries = 3
    got_unparseable_reply = False
    for attempt in range(retries):
        # NOTE(review): ChatCompletion / openai.error look like the pre-1.0 SDK
        # surface - confirm the pinned openai version.
        try:
            response = client.ChatCompletion.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": "You are an AI assistant that helps to build conversation data set."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.8,
                response_format={"type": "json_object"}
            )
        except openai.error.Timeout:
            print(f"Attempt {attempt + 1} of {retries} failed with timeout. Retrying...")
            time.sleep(3)  # Wait for 3 seconds before retrying
            continue

        # A missing content is treated like any other unparseable reply
        response_content = response.choices[0].message.content or ""
        try:
            filled_data = json.loads(response_content)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            # Sampling is non-deterministic (temperature 0.8), so another attempt may succeed
            got_unparseable_reply = True
            continue

        for slot, fields in _participant_updates(filled_data):
            _fill_missing(personas[slot], fields)
        return data

    if got_unparseable_reply:
        # Keep the original contract: a parse failure yields None rather than an exception
        return None
    raise Exception("All retry attempts failed due to timeout.")



# SPC-test: fill the first 10 renamed conversation files and save the results

json_files = sorted(glob.glob('/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/02_renamed_data/SPC-test/*.json'))

# The output location does not depend on the loop, so resolve (and create) it once
output_dir = '/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/SPC-test'
os.makedirs(output_dir, exist_ok=True)

for i, json_file in enumerate(json_files[:10]):
    # Explicit UTF-8 so reading does not depend on the platform's default encoding
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)
    messages = data['messages']

    filled_data = fill_empty_fields(data, messages)
    print(f"filled_data_SPC-test_{i + 1}:", filled_data)
    print(f"Function end time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}")
    print("=====================================")

    # fill_empty_fields returns None when the reply is not valid JSON. Dumping
    # that would write a bare `null` file, so save the unmodified record instead;
    # its still-empty fields can then be found and re-filled later.
    record = filled_data if filled_data is not None else data

    output_file = os.path.join(output_dir, f'filled_data_SPC-test_{i + 1}.json')
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(record, file, ensure_ascii=False, indent=4)


filled_data_SPC-test_1: {'participant_persona': {'participant_1': {'name': 'Emily Johnson', 'age': 20, 'gender': 'Female', 'personality': "Enthusiastic and caring, Emily loves to connect with others and share her passions. She's optimistic about her future.", 'background': 'I am learning jujitsu but am still new to it. I run to relieve stress. I am taking college and hoping to be a teacher. Poker is my favorite card game. I am a huge Ed Sheeran fan.'}, 'participant_2': {'name': 'Michael Smith', 'age': 25, 'gender': 'Male', 'personality': 'Michael is laid-back and friendly, with a great sense of humor. He often looks for joy in small things and values his relationships.', 'background': "Its a dead end job so i am looking for something different. I work at McDonald's. My parents did not want me. I am considering going to college. I was raised by my uncle and aunt."}}, 'messages': [{'role': 'participant_1', 'content': {'emotion_scores': {}, 'text': 'What do you do for a living?'}}, {'role

## 240930 missing fields debugging
- 전체 파일 filling 완료 후 field 값 비어있는 파일만 디버깅

### test

In [15]:
import os
import json

# Directory holding the filled persona files to audit
directory_path = "/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/SPC-train"

PARTICIPANT_KEYS = ("participant_1", "participant_2")
FIELD_NAMES = ("name", "age", "gender", "personality", "background")

# counters[participant][field] = number of files in which that field is empty
counters = {who: dict.fromkeys(FIELD_NAMES, 0) for who in PARTICIPANT_KEYS}

# Visit every *.json file in the directory
for filename in os.listdir(directory_path):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(directory_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    participant_persona = data.get("participant_persona", {})

    for participant in PARTICIPANT_KEYS:
        persona = participant_persona.get(participant, {})
        for field in FIELD_NAMES:
            # Any falsy value (missing key, "", 0, None) counts as empty
            if persona.get(field):
                continue
            counters[participant][field] += 1

# Report the per-field totals
for participant, fields in counters.items():
    print(f"{participant}:")
    for field, count in fields.items():
        print(f"  {field}: {count} files")

participant_1:
  name: 152 files
  age: 152 files
  gender: 152 files
  personality: 152 files
  background: 0 files
participant_2:
  name: 152 files
  age: 152 files
  gender: 152 files
  personality: 152 files
  background: 0 files


In [67]:
import os
import json

# Directory holding the filled persona files to audit
directory_path = "/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/SPC-train"

PARTICIPANT_KEYS = ("participant_1", "participant_2")
FIELD_NAMES = ("name", "age", "gender", "personality", "background")

# counters[participant][field] = number of files in which that field is empty
counters = {who: dict.fromkeys(FIELD_NAMES, 0) for who in PARTICIPANT_KEYS}

# Names of files with at least one empty field, in discovery order
files_with_empty_fields = []

# Visit every *.json file in the directory
for filename in os.listdir(directory_path):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(directory_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    participant_persona = data.get("participant_persona", {})

    for participant in PARTICIPANT_KEYS:
        persona = participant_persona.get(participant, {})
        for field in FIELD_NAMES:
            # Any falsy value (missing key, "", 0, None) counts as empty
            if persona.get(field):
                continue
            counters[participant][field] += 1
            if filename not in files_with_empty_fields:
                files_with_empty_fields.append(filename)

# Show the affected files in alphabetical order
for filename in sorted(files_with_empty_fields):
    print(filename)

filled_data_SPC-train_02965.json


In [66]:
import os
import json
from typing import List

def get_files_with_empty_fields(directory_path: str) -> List[str]:
    """Return the sorted names of JSON files whose persona has any empty field.

    Args:
        directory_path: Directory to scan (non-recursive); only ``*.json``
            entries are read.

    Returns:
        Alphabetically sorted file names (not paths) in which at least one of
        name/age/gender/personality/background is empty for ``participant_1``
        or ``participant_2``.

    A field is "empty" when it is missing or falsy, so ``""``, ``None`` and
    ``0`` all count. A missing or ``null`` persona/participant entry is
    treated as having every field empty.
    """
    participants = ("participant_1", "participant_2")
    fields = ("name", "age", "gender", "personality", "background")

    incomplete = []
    for filename in os.listdir(directory_path):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # `or {}` also covers an explicit JSON null, which .get(key, {}) would pass through
        persona = data.get("participant_persona") or {}
        if any(
            not (persona.get(participant) or {}).get(field)
            for participant in participants
            for field in fields
        ):
            # Each file is visited once, so no duplicate check is needed
            incomplete.append(filename)

    return sorted(incomplete)

# Example usage: scan the SPC-train output directory and print the sorted list
# of file names that still have at least one empty persona field
directory_path = "/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/SPC-train"
files_with_empty_fields = get_files_with_empty_fields(directory_path)
print(files_with_empty_fields)

['filled_data_SPC-train_02965.json']


In [None]:
import os
import json

def count_empty_fields_in_files(directory_path):
    """Print, one per line and sorted, the JSON files that have an empty persona field.

    Args:
        directory_path: Directory to scan (non-recursive); only ``*.json``
            entries are read.

    Returns:
        None. The file names are written to stdout.

    A field is "empty" when it is missing or falsy, so ``""``, ``None`` and
    ``0`` all count. A missing or ``null`` persona/participant entry is
    treated as having every field empty.
    """
    participants = ("participant_1", "participant_2")
    fields = ("name", "age", "gender", "personality", "background")

    incomplete = []
    for filename in os.listdir(directory_path):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # `or {}` also covers an explicit JSON null, which .get(key, {}) would pass through
        persona = data.get("participant_persona") or {}
        if any(
            not (persona.get(participant) or {}).get(field)
            for participant in participants
            for field in fields
        ):
            # Each file is visited once, so no duplicate check is needed
            incomplete.append(filename)

    # Print filenames with empty fields
    for filename in sorted(incomplete):
        print(filename)

# Example usage: the split name is interpolated into the output directory path,
# so changing folder_name is enough to scan another split
folder_name = "SPC-train"
directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/{folder_name}"
count_empty_fields_in_files(directory_path)

### SPC-train
- [X] 다섯자리 넘버링 버전 파일로 SPC-train 디버깅 최종 코드
- [X] good

In [71]:
# Check which files still have empty values, and how many
# count: 885 -> 152 -> 32 -> 8 -> 4 -> 1 -> 0

folder_name = "SPC-train"

import os
import json

# Directory holding the filled persona files for this split
directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/{folder_name}"

PARTICIPANT_KEYS = ("participant_1", "participant_2")
FIELD_NAMES = ("name", "age", "gender", "personality", "background")

# counters[participant][field] = number of files in which that field is empty
counters = {who: dict.fromkeys(FIELD_NAMES, 0) for who in PARTICIPANT_KEYS}

# Names of files with at least one empty field, in discovery order
files_with_empty_fields = []

# Visit every *.json file in the directory
for filename in os.listdir(directory_path):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(directory_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    participant_persona = data.get("participant_persona", {})

    for participant in PARTICIPANT_KEYS:
        persona = participant_persona.get(participant, {})
        for field in FIELD_NAMES:
            # Any falsy value (missing key, "", 0, None) counts as empty
            if persona.get(field):
                continue
            counters[participant][field] += 1
            if filename not in files_with_empty_fields:
                files_with_empty_fields.append(filename)

# Show the affected files in alphabetical order
for filename in sorted(files_with_empty_fields):
    print(filename)

participant_1:
  name: 0 files
  age: 0 files
  gender: 0 files
  personality: 0 files
  background: 0 files
participant_2:
  name: 0 files
  age: 0 files
  gender: 0 files
  personality: 0 files
  background: 0 files


In [70]:
# Fill in the empty values

# Split name; interpolated into the directory path used at the end of this cell
folder_name = 'SPC-train'

import openai
import json
import os
import pprint
from dotenv import load_dotenv
from typing import Dict, List
import glob
import time

# Called before the API key is read from the environment below
# (presumably it loads OPENAI_API_KEY from a local .env file - confirm the .env location).
load_dotenv()

# Chat model name passed to every completion request in this cell
MODEL = "gpt-4o-mini"
# os.getenv returns None when the variable is unset; no check is made here
openai.api_key = os.getenv("OPENAI_API_KEY")

# `client` is only an alias for the openai module itself, not a separate client object
client = openai
# Bare expression: it has no effect here because it is not the last statement of the cell
client

def _participant_updates(reply):
    """Yield ``(slot, fields)`` for each participant entry found in the model reply.

    Key spellings are matched loosely ("Participant 1", "participant_1", ...)
    and a ``participant_persona`` wrapper is unwrapped, because the prompt asks
    for "the original JSON format" without fixing the exact key names.
    """
    if not isinstance(reply, dict):
        return
    for key, value in reply.items():
        if not isinstance(value, dict):
            continue  # only a mapping can carry persona fields
        tag = "".join(ch for ch in str(key).lower() if ch.isalnum())
        if tag in ("participant1", "participant2"):
            yield "participant_" + tag[-1], value
        elif tag == "participantpersona":
            yield from _participant_updates(value)


def _fill_missing(persona, candidate):
    """Copy values from ``candidate`` into ``persona`` only where ``persona`` is empty."""
    known = {str(name).strip().lower(): name for name in persona}
    for raw_name, value in candidate.items():
        # Map e.g. "Name" onto the existing "name" key instead of adding a duplicate
        name = known.get(str(raw_name).strip().lower(), raw_name)
        if persona.get(name) or value in ("", None):
            continue
        persona[name] = value


def fill_empty_fields(data: Dict, messages: List[Dict]) -> Dict:
    """Ask the chat model to fill the empty persona fields of one conversation record.

    Args:
        data: Record containing ``participant_persona`` with ``participant_1`` and
            ``participant_2`` dicts (name, age, gender, personality, background).
            It is updated in place.
        messages: Conversation turns; each needs ``role`` and ``content['text']``.

    Returns:
        ``data`` with the missing persona fields filled, or ``None`` when the
        model never produced parseable JSON within the allowed attempts.

    Raises:
        Exception: If every attempt timed out.

    Only fields that are currently empty (falsy) are written, so values already
    present are never overwritten by the model's reply.
    """
    messages_str = "\n".join([f"{msg['role']}: {msg['content']['text']}" for msg in messages])

    personas = data['participant_persona']
    participant_1 = personas['participant_1']
    participant_2 = personas['participant_2']

    prompt = f"""Analyze the given Conversation Log and Participant information and fill in ONLY the missing fields in the original JSON format. Do not modify any existing information.:

    Participant 1:
    name: {participant_1['name']}
    age: {participant_1['age']}
    gender: {participant_1['gender']}
    personality: {participant_1['personality']}
    background: {participant_1['background']}

    Participant 2:
    name: {participant_2['name']}
    age: {participant_2['age']}
    gender: {participant_2['gender']}
    personality: {participant_2['personality']}
    background: {participant_2['background']}

    # Conversation Log:
    {messages_str}

    # Guidelines:
    1. Infer age, gender, and other details based on the text content and writing style.
    2. Generate diverse and unique names and personalities for each participant. Use various expressions, not using the same expressions repeatedly.
    3. Use str sentences for the personality and background fields.
    4. Keep use the original fields text if it exists."""

    retries = 3
    got_unparseable_reply = False
    for attempt in range(retries):
        # NOTE(review): ChatCompletion / openai.error look like the pre-1.0 SDK
        # surface - confirm the pinned openai version.
        try:
            response = client.ChatCompletion.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": "You are an AI assistant that helps to build conversation data set."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.8,
                response_format={"type": "json_object"}
            )
        except openai.error.Timeout:
            print(f"Attempt {attempt + 1} of {retries} failed with timeout. Retrying...")
            time.sleep(3)
            continue

        # A missing content is treated like any other unparseable reply
        response_content = response.choices[0].message.content or ""
        try:
            filled_data = json.loads(response_content)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            # Sampling is non-deterministic (temperature 0.8), so another attempt may succeed
            got_unparseable_reply = True
            continue

        for slot, fields in _participant_updates(filled_data):
            _fill_missing(personas[slot], fields)
        return data

    if got_unparseable_reply:
        # Keep the original contract: a parse failure yields None rather than an exception
        return None
    raise Exception("All retry attempts failed due to timeout.")

def process_files(file_list: List[str], directory_path: str):
    """Re-run persona filling for the given files and rewrite each one in place.

    Args:
        file_list: File names (not paths) located inside ``directory_path``.
        directory_path: Directory that holds the JSON files.

    A file is left untouched when ``fill_empty_fields`` returns a falsy value.
    Exceptions raised by ``fill_empty_fields`` propagate and stop the batch.
    """
    for filename in file_list:
        file_path = os.path.join(directory_path, filename)
        # Explicit UTF-8 so reading does not depend on the platform's default encoding
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        filled_data = fill_empty_fields(data, data.get('messages', []))
        if not filled_data:
            continue

        # Serialize before opening for write: a serialization error must not
        # truncate the existing file. ensure_ascii=False keeps non-ASCII text
        # readable instead of \uXXXX escapes.
        payload = json.dumps(filled_data, ensure_ascii=False, indent=4)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(payload)

# files_with_empty_fields is not defined in this cell; it must already exist
# from a previously executed cell that builds the list for the same split.
directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/{folder_name}"
process_files(files_with_empty_fields, directory_path)

### SPC-test

In [25]:
# Check for empty values
# 12 files -> 3 files -> 0 files

folder_name = 'SPC-test'

import os
import json

# Directory holding the filled persona files for this split
directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/{folder_name}"

PARTICIPANT_KEYS = ("participant_1", "participant_2")
FIELD_NAMES = ("name", "age", "gender", "personality", "background")

# counters[participant][field] = number of files in which that field is empty
counters = {who: dict.fromkeys(FIELD_NAMES, 0) for who in PARTICIPANT_KEYS}

# Visit every *.json file in the directory
for filename in os.listdir(directory_path):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(directory_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    participant_persona = data.get("participant_persona", {})

    for participant in PARTICIPANT_KEYS:
        persona = participant_persona.get(participant, {})
        for field in FIELD_NAMES:
            # Any falsy value (missing key, "", 0, None) counts as empty
            if persona.get(field):
                continue
            counters[participant][field] += 1

# Report the per-field totals
for participant, fields in counters.items():
    print(f"{participant}:")
    for field, count in fields.items():
        print(f"  {field}: {count} files")

participant_1:
  name: 0 files
  age: 0 files
  gender: 0 files
  personality: 0 files
  background: 0 files
participant_2:
  name: 0 files
  age: 0 files
  gender: 0 files
  personality: 0 files
  background: 0 files


In [24]:
# Fill in the 12 files that still have empty persona fields

folder_name = 'SPC-test'

import os
import json
from typing import List

def get_files_with_empty_fields(directory_path: str) -> List[str]:
    """Return the sorted names of JSON files whose persona has any empty field.

    Args:
        directory_path: Directory to scan (non-recursive); only ``*.json``
            entries are read.

    Returns:
        Alphabetically sorted file names (not paths) in which at least one of
        name/age/gender/personality/background is empty for ``participant_1``
        or ``participant_2``.

    A field is "empty" when it is missing or falsy, so ``""``, ``None`` and
    ``0`` all count. A missing or ``null`` persona/participant entry is
    treated as having every field empty.
    """
    participants = ("participant_1", "participant_2")
    fields = ("name", "age", "gender", "personality", "background")

    incomplete = []
    for filename in os.listdir(directory_path):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # `or {}` also covers an explicit JSON null, which .get(key, {}) would pass through
        persona = data.get("participant_persona") or {}
        if any(
            not (persona.get(participant) or {}).get(field)
            for participant in participants
            for field in fields
        ):
            # Each file is visited once, so no duplicate check is needed
            incomplete.append(filename)

    return sorted(incomplete)

# Collect (and print) the files of this split that still have empty persona fields
directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/{folder_name}"
files_with_empty_fields = get_files_with_empty_fields(directory_path)
print(files_with_empty_fields)

import openai
import json
import os
import pprint
from dotenv import load_dotenv
from typing import Dict, List
import glob
import time

# Called before the API key is read from the environment below
# (presumably it loads OPENAI_API_KEY from a local .env file - confirm the .env location).
load_dotenv()

# Chat model name passed to every completion request in this cell
MODEL = "gpt-4o-mini"
# os.getenv returns None when the variable is unset; no check is made here
openai.api_key = os.getenv("OPENAI_API_KEY")

# `client` is only an alias for the openai module itself, not a separate client object
client = openai
# Bare expression: it has no effect here because it is not the last statement of the cell
client


def _participant_updates(reply):
    """Yield ``(slot, fields)`` for each participant entry found in the model reply.

    Key spellings are matched loosely ("Participant 1", "participant_1", ...)
    and a ``participant_persona`` wrapper is unwrapped, because the prompt asks
    for "the original JSON format" without fixing the exact key names.
    """
    if not isinstance(reply, dict):
        return
    for key, value in reply.items():
        if not isinstance(value, dict):
            continue  # only a mapping can carry persona fields
        tag = "".join(ch for ch in str(key).lower() if ch.isalnum())
        if tag in ("participant1", "participant2"):
            yield "participant_" + tag[-1], value
        elif tag == "participantpersona":
            yield from _participant_updates(value)


def _fill_missing(persona, candidate):
    """Copy values from ``candidate`` into ``persona`` only where ``persona`` is empty."""
    known = {str(name).strip().lower(): name for name in persona}
    for raw_name, value in candidate.items():
        # Map e.g. "Name" onto the existing "name" key instead of adding a duplicate
        name = known.get(str(raw_name).strip().lower(), raw_name)
        if persona.get(name) or value in ("", None):
            continue
        persona[name] = value


def fill_empty_fields(data: Dict, messages: List[Dict]) -> Dict:
    """Ask the chat model to fill the empty persona fields of one conversation record.

    Args:
        data: Record containing ``participant_persona`` with ``participant_1`` and
            ``participant_2`` dicts (name, age, gender, personality, background).
            It is updated in place.
        messages: Conversation turns; each needs ``role`` and ``content['text']``.

    Returns:
        ``data`` with the missing persona fields filled, or ``None`` when the
        model never produced parseable JSON within the allowed attempts.

    Raises:
        Exception: If every attempt timed out.

    Only fields that are currently empty (falsy) are written, so values already
    present are never overwritten by the model's reply.
    """
    messages_str = "\n".join([f"{msg['role']}: {msg['content']['text']}" for msg in messages])

    personas = data['participant_persona']
    participant_1 = personas['participant_1']
    participant_2 = personas['participant_2']

    prompt = f"""Analyze the given Conversation Log and Participant information and fill in ONLY the missing fields in the original JSON format. Do not modify any existing information.:

    Participant 1:
    name: {participant_1['name']}
    age: {participant_1['age']}
    gender: {participant_1['gender']}
    personality: {participant_1['personality']}
    background: {participant_1['background']}

    Participant 2:
    name: {participant_2['name']}
    age: {participant_2['age']}
    gender: {participant_2['gender']}
    personality: {participant_2['personality']}
    background: {participant_2['background']}

    # Conversation Log:
    {messages_str}

    # Guidelines:
    1. Infer age, gender, and other details based on the text content and writing style.
    2. Generate diverse and unique names and personalities for each participant. Use various expressions, not using the same expressions repeatedly.
    3. Use str sentences for the personality and background fields.
    4. Keep use the original fields text if it exists."""

    retries = 3
    got_unparseable_reply = False
    for attempt in range(retries):
        # NOTE(review): ChatCompletion / openai.error look like the pre-1.0 SDK
        # surface - confirm the pinned openai version.
        try:
            response = client.ChatCompletion.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": "You are an AI assistant that helps to build conversation data set."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.8,
                response_format={"type": "json_object"}
            )
        except openai.error.Timeout:
            print(f"Attempt {attempt + 1} of {retries} failed with timeout. Retrying...")
            time.sleep(3)
            continue

        # A missing content is treated like any other unparseable reply
        response_content = response.choices[0].message.content or ""
        try:
            filled_data = json.loads(response_content)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            # Sampling is non-deterministic (temperature 0.8), so another attempt may succeed
            got_unparseable_reply = True
            continue

        for slot, fields in _participant_updates(filled_data):
            _fill_missing(personas[slot], fields)
        return data

    if got_unparseable_reply:
        # Keep the original contract: a parse failure yields None rather than an exception
        return None
    raise Exception("All retry attempts failed due to timeout.")

def process_files(file_list: List[str], directory_path: str):
    """Re-run persona filling for the given files and rewrite each one in place.

    Args:
        file_list: File names (not paths) located inside ``directory_path``.
        directory_path: Directory that holds the JSON files.

    A file is left untouched when ``fill_empty_fields`` returns a falsy value.
    Exceptions raised by ``fill_empty_fields`` propagate and stop the batch.
    """
    for filename in file_list:
        file_path = os.path.join(directory_path, filename)
        # Explicit UTF-8 so reading does not depend on the platform's default encoding
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        filled_data = fill_empty_fields(data, data.get('messages', []))
        if not filled_data:
            continue

        # Serialize before opening for write: a serialization error must not
        # truncate the existing file. ensure_ascii=False keeps non-ASCII text
        # readable instead of \uXXXX escapes.
        payload = json.dumps(filled_data, ensure_ascii=False, indent=4)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(payload)


# Re-fill, in place, every file collected in files_with_empty_fields earlier in this cell
directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/{folder_name}"
process_files(files_with_empty_fields, directory_path)

['filled_data_SPC-test_00324.json', 'filled_data_SPC-test_00408.json', 'filled_data_SPC-test_00595.json']


### SPC-valid

In [46]:
# Check for empty values
# 18 files -> 4 -> 3 -> 2 -> 1 -> 0 (the last one was filled in manually)

folder_name = 'SPC-valid'

import os
import json

# Directory holding the filled persona files for this split
directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/{folder_name}"

PARTICIPANT_KEYS = ("participant_1", "participant_2")
FIELD_NAMES = ("name", "age", "gender", "personality", "background")

# counters[participant][field] = number of files in which that field is empty
counters = {who: dict.fromkeys(FIELD_NAMES, 0) for who in PARTICIPANT_KEYS}

# Names of files with at least one empty field, in discovery order
files_with_empty_fields = []

# Visit every *.json file in the directory
for filename in os.listdir(directory_path):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(directory_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    participant_persona = data.get("participant_persona", {})

    for participant in PARTICIPANT_KEYS:
        persona = participant_persona.get(participant, {})
        for field in FIELD_NAMES:
            # Any falsy value (missing key, "", 0, None) counts as empty
            if persona.get(field):
                continue
            counters[participant][field] += 1
            if filename not in files_with_empty_fields:
                files_with_empty_fields.append(filename)

# Report the per-field totals
for participant, fields in counters.items():
    print(f"{participant}:")
    for field, count in fields.items():
        print(f"  {field}: {count} files")

# Show the affected files in alphabetical order
for filename in sorted(files_with_empty_fields):
    print(filename)

participant_1:
  name: 0 files
  age: 0 files
  gender: 0 files
  personality: 0 files
  background: 0 files
participant_2:
  name: 0 files
  age: 0 files
  gender: 0 files
  personality: 0 files
  background: 0 files


In [36]:
# Fill in the files that still have empty persona fields

folder_name = 'SPC-valid'

import os
import json
from typing import List

def get_files_with_empty_fields(directory_path: str) -> List[str]:
    """Return the sorted names of JSON files whose persona has any empty field.

    Args:
        directory_path: Directory to scan (non-recursive); only ``*.json``
            entries are read.

    Returns:
        Alphabetically sorted file names (not paths) in which at least one of
        name/age/gender/personality/background is empty for ``participant_1``
        or ``participant_2``.

    A field is "empty" when it is missing or falsy, so ``""``, ``None`` and
    ``0`` all count. A missing or ``null`` persona/participant entry is
    treated as having every field empty.
    """
    participants = ("participant_1", "participant_2")
    fields = ("name", "age", "gender", "personality", "background")

    incomplete = []
    for filename in os.listdir(directory_path):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # `or {}` also covers an explicit JSON null, which .get(key, {}) would pass through
        persona = data.get("participant_persona") or {}
        if any(
            not (persona.get(participant) or {}).get(field)
            for participant in participants
            for field in fields
        ):
            # Each file is visited once, so no duplicate check is needed
            incomplete.append(filename)

    return sorted(incomplete)

# Collect (and print) the files of this split that still have empty persona fields
directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/{folder_name}"
files_with_empty_fields = get_files_with_empty_fields(directory_path)
print(files_with_empty_fields)

import openai
import json
import os
import pprint
from dotenv import load_dotenv
from typing import Dict, List
import glob
import time

# Called before the API key is read from the environment below
# (presumably it loads OPENAI_API_KEY from a local .env file - confirm the .env location).
load_dotenv()

# Chat model name passed to every completion request in this cell
MODEL = "gpt-4o-mini"
# os.getenv returns None when the variable is unset; no check is made here
openai.api_key = os.getenv("OPENAI_API_KEY")

# `client` is only an alias for the openai module itself, not a separate client object
client = openai
# Bare expression: it has no effect here because it is not the last statement of the cell
client


def _participant_updates(reply):
    """Yield ``(slot, fields)`` for each participant entry found in the model reply.

    Key spellings are matched loosely ("Participant 1", "participant_1", ...)
    and a ``participant_persona`` wrapper is unwrapped, because the prompt asks
    for "the original JSON format" without fixing the exact key names.
    """
    if not isinstance(reply, dict):
        return
    for key, value in reply.items():
        if not isinstance(value, dict):
            continue  # only a mapping can carry persona fields
        tag = "".join(ch for ch in str(key).lower() if ch.isalnum())
        if tag in ("participant1", "participant2"):
            yield "participant_" + tag[-1], value
        elif tag == "participantpersona":
            yield from _participant_updates(value)


def _fill_missing(persona, candidate):
    """Copy values from ``candidate`` into ``persona`` only where ``persona`` is empty."""
    known = {str(name).strip().lower(): name for name in persona}
    for raw_name, value in candidate.items():
        # Map e.g. "Name" onto the existing "name" key instead of adding a duplicate
        name = known.get(str(raw_name).strip().lower(), raw_name)
        if persona.get(name) or value in ("", None):
            continue
        persona[name] = value


def fill_empty_fields(data: Dict, messages: List[Dict]) -> Dict:
    """Ask the chat model to fill the empty persona fields of one conversation record.

    Args:
        data: Record containing ``participant_persona`` with ``participant_1`` and
            ``participant_2`` dicts (name, age, gender, personality, background).
            It is updated in place.
        messages: Conversation turns; each needs ``role`` and ``content['text']``.

    Returns:
        ``data`` with the missing persona fields filled, or ``None`` when the
        model never produced parseable JSON within the allowed attempts.

    Raises:
        Exception: If every attempt timed out.

    Only fields that are currently empty (falsy) are written, so values already
    present are never overwritten by the model's reply.
    """
    messages_str = "\n".join([f"{msg['role']}: {msg['content']['text']}" for msg in messages])

    personas = data['participant_persona']
    participant_1 = personas['participant_1']
    participant_2 = personas['participant_2']

    prompt = f"""Analyze the given Conversation Log and Participant information and fill in ONLY the missing fields in the original JSON format. Do not modify any existing information.:

    Participant 1:
    name: {participant_1['name']}
    age: {participant_1['age']}
    gender: {participant_1['gender']}
    personality: {participant_1['personality']}
    background: {participant_1['background']}

    Participant 2:
    name: {participant_2['name']}
    age: {participant_2['age']}
    gender: {participant_2['gender']}
    personality: {participant_2['personality']}
    background: {participant_2['background']}

    # Conversation Log:
    {messages_str}

    # Guidelines:
    1. Infer age, gender, and other details based on the text content and writing style.
    2. Generate diverse and unique names and personalities for each participant. Use various expressions, not using the same expressions repeatedly.
    3. Use str sentences for the personality and background fields.
    4. Keep use the original fields text if it exists."""

    retries = 3
    got_unparseable_reply = False
    for attempt in range(retries):
        # NOTE(review): ChatCompletion / openai.error look like the pre-1.0 SDK
        # surface - confirm the pinned openai version.
        try:
            response = client.ChatCompletion.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": "You are an AI assistant that helps to build conversation data set."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.8,
                response_format={"type": "json_object"}
            )
        except openai.error.Timeout:
            print(f"Attempt {attempt + 1} of {retries} failed with timeout. Retrying...")
            time.sleep(3)
            continue

        # A missing content is treated like any other unparseable reply
        response_content = response.choices[0].message.content or ""
        try:
            filled_data = json.loads(response_content)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            # Sampling is non-deterministic (temperature 0.8), so another attempt may succeed
            got_unparseable_reply = True
            continue

        for slot, fields in _participant_updates(filled_data):
            _fill_missing(personas[slot], fields)
        return data

    if got_unparseable_reply:
        # Keep the original contract: a parse failure yields None rather than an exception
        return None
    raise Exception("All retry attempts failed due to timeout.")

def process_files(file_list: List[str], directory_path: str):
    """Re-run persona filling for the given files and rewrite each one in place.

    Args:
        file_list: File names (not paths) located inside ``directory_path``.
        directory_path: Directory that holds the JSON files.

    A file is left untouched when ``fill_empty_fields`` returns a falsy value.
    Exceptions raised by ``fill_empty_fields`` propagate and stop the batch.
    """
    for filename in file_list:
        file_path = os.path.join(directory_path, filename)
        # Explicit UTF-8 so reading does not depend on the platform's default encoding
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        filled_data = fill_empty_fields(data, data.get('messages', []))
        if not filled_data:
            continue

        # Serialize before opening for write: a serialization error must not
        # truncate the existing file. ensure_ascii=False keeps non-ASCII text
        # readable instead of \uXXXX escapes.
        payload = json.dumps(filled_data, ensure_ascii=False, indent=4)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(payload)


# Re-fill, in place, every file collected in files_with_empty_fields earlier in this cell
directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/{folder_name}"
process_files(files_with_empty_fields, directory_path)

['filled_data_SPC-valid_00776.json']


filled_data_SPC-valid_00776: {'participant_persona': {'participant_1': {'name': '', 'age': '', 'gender': '', 'personality': '', 'background': 'I was raised in a single parent household.\nI only eat kosher.\nI am a stunt double as my second job.\nI read twenty books a year.\nI never broke a bone in my body ever in my life.'}, 'participant_2': {'name': '', 'age': '', 'gender': '', 'personality': '', 'background': 'I have a german shepherd dog.\nI live is a rural farming community.\nI enjoy coloring books.\nI like to watch nhl hockey.'}}, 'messages': [{'role': 'participant_1', 'content': {'emotion_scores': {}, 'text': 'Hi! What do you like to do?'}}, {'role': 'participant_2', 'content': {'emotion_scores': {}, 'text': 'I like to color books and watch hockey.'}}, {'role': 'participant_1', 'content': {'emotion_scores': {}, 'text': 'That sounds cool! I like to read books and watch movies.'}}, {'role': 'participant_2', 'content': {'emotion_scores': {}, 'text': 'What kind of books do you like t

### New-SPC


#### 합치기


In [4]:
# Check for empty values

folder_name = 'New-SPC'

import os
import json

# Directory holding the filled persona files for this split
directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/{folder_name}"

PARTICIPANT_KEYS = ("participant_1", "participant_2")
FIELD_NAMES = ("name", "age", "gender", "personality", "background")

# counters[participant][field] = number of files in which that field is empty
counters = {who: dict.fromkeys(FIELD_NAMES, 0) for who in PARTICIPANT_KEYS}

# Names of files with at least one empty field, in discovery order
files_with_empty_fields = []

# Visit every *.json file in the directory
for filename in os.listdir(directory_path):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(directory_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    participant_persona = data.get("participant_persona", {})

    for participant in PARTICIPANT_KEYS:
        persona = participant_persona.get(participant, {})
        for field in FIELD_NAMES:
            # Any falsy value (missing key, "", 0, None) counts as empty
            if persona.get(field):
                continue
            counters[participant][field] += 1
            if filename not in files_with_empty_fields:
                files_with_empty_fields.append(filename)

# Report the per-field totals
for participant, fields in counters.items():
    print(f"{participant}:")
    for field, count in fields.items():
        print(f"  {field}: {count} files")

# Show the affected files in alphabetical order
for filename in sorted(files_with_empty_fields):
    print(filename)

participant_1:
  name: 0 files
  age: 0 files
  gender: 0 files
  personality: 0 files
  background: 0 files
participant_2:
  name: 0 files
  age: 0 files
  gender: 0 files
  personality: 0 files
  background: 0 files


In [3]:
import openai
import json
import os
from dotenv import load_dotenv
from typing import Dict, List
import time

# python-dotenv: presumably loads variables from a local .env file so that
# OPENAI_API_KEY can be read from the environment below (confirm the .env location).
load_dotenv()

# Chat model name passed to every completion request in this cell.
MODEL = "gpt-4o-mini"
openai.api_key = os.getenv("OPENAI_API_KEY")
# The module itself is used as the client: fill_empty_fields calls
# client.ChatCompletion.create(...), i.e. the module-level API.
client = openai



# New-SPC
# Inputs are read from 02_renamed_data/<folder_name>; results are written to
# 03_filled_data/<folder_name>.
folder_name = "New-SPC"
input_directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/02_renamed_data/{folder_name}"
output_directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/{folder_name}"

# # test
# folder_name = "test"
# input_directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/02_renamed_data/{folder_name}"
# output_directory_path = f"/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/{folder_name}"



def _merge_missing_fields(persona: Dict, proposed) -> None:
    """Copy values from ``proposed`` into ``persona`` only where ``persona`` has none."""
    if not isinstance(proposed, dict):
        # The model answered with something other than an object for this
        # participant; there is nothing usable to merge.
        return
    for field, value in proposed.items():
        # Never overwrite an existing value, and never store an empty one.
        if value and not persona.get(field):
            persona[field] = value


def fill_empty_fields(data: Dict, messages: List[Dict]) -> Dict:
    """Ask the chat model to fill in the empty persona fields of one conversation.

    A prompt is built from both participants' current persona fields and the
    conversation log, then sent through ``client.ChatCompletion.create`` with a
    JSON-object response format. Values returned under the reply keys
    ``'Participant 1'`` and ``'Participant 2'`` are merged into the matching
    persona, but only into fields that are currently empty, so existing
    information is preserved as the prompt itself demands.

    Args:
        data: Conversation record holding
            ``data['participant_persona']['participant_1' / 'participant_2']``,
            each with ``name``, ``age``, ``gender``, ``personality`` and
            ``background`` keys. The persona dicts are updated in place.
        messages: Conversation turns; each needs ``msg['role']`` and
            ``msg['content']['text']``.

    Returns:
        The same ``data`` object after merging, or ``None`` when the model
        reply cannot be parsed as JSON.

    Raises:
        KeyError: If a required persona or message key is missing.
        Exception: If every attempt ends in ``openai.error.Timeout``; the last
            timeout is attached as the cause.
    """
    messages_str = "\n".join([f"{msg['role']}: {msg['content']['text']}" for msg in messages])

    participant_1 = data['participant_persona']['participant_1']
    participant_2 = data['participant_persona']['participant_2']

    prompt = f"""Analyze the given Conversation Log and Participant information and fill in ONLY the missing fields in the original JSON format. Do not modify any existing information.:

    Participant 1:
    name: {participant_1['name']}
    age: {participant_1['age']}
    gender: {participant_1['gender']}
    personality: {participant_1['personality']}
    background: {participant_1['background']}

    Participant 2:
    name: {participant_2['name']}
    age: {participant_2['age']}
    gender: {participant_2['gender']}
    personality: {participant_2['personality']}
    background: {participant_2['background']}

    # Conversation Log:
    {messages_str}

    # Guidelines:
    1. Infer age, gender, and other details based on the text content and writing style.
    2. Generate diverse and unique names and personalities for each participant. Use various expressions, not using the same expressions repeatedly.
    3. Use str sentences for the personality and background fields.
    4. Keep use the original fields text if it exists."""

    retries = 3
    last_timeout = None
    for attempt in range(retries):
        # Only the network call can time out, so only it sits inside the try.
        try:
            response = client.ChatCompletion.create(
                        model=MODEL,
                        messages=[
                            {"role": "system", "content": "You are an AI assistant that helps to build conversation data set."},
                            {"role": "user", "content": prompt}
                        ],
                        temperature=0.8,
                        response_format={"type": "json_object"}
            )
        except openai.error.Timeout as e:
            last_timeout = e
            print(f"Attempt {attempt + 1} of {retries} failed with timeout. Retrying...")
            # No point in waiting when there is no further attempt.
            if attempt + 1 < retries:
                time.sleep(3)
            continue

        response_content = response.choices[0].message.content

        try:
            filled_data = json.loads(response_content)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            # Callers treat a falsy result as "skip this file".
            return None

        if isinstance(filled_data, dict):
            _merge_missing_fields(participant_1, filled_data.get('Participant 1'))
            _merge_missing_fields(participant_2, filled_data.get('Participant 2'))

        # participant_1/2 are the dicts stored inside `data`, so `data`
        # already reflects the merge.
        return data

    raise Exception("All retry attempts failed due to timeout.") from last_timeout



def process_files(file_list: List[str], input_directory_path: str, output_directory_path: str):
    """Fill the empty persona fields of each input file and save the result.

    Args:
        file_list: File names (not full paths) to process.
        input_directory_path: Directory the source JSON files are read from.
        output_directory_path: Directory the filled JSON files are written to,
            under the same file name.

    No output file is written for an input when ``fill_empty_fields`` returns
    a falsy value (it returns ``None`` when the model reply is not valid JSON).
    """
    for filename in file_list:
        input_file_path = os.path.join(input_directory_path, filename)

        # Explicit UTF-8 so decoding does not depend on the machine's locale,
        # consistent with count_empty_fields, which reads the outputs as UTF-8.
        with open(input_file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        filled_data = fill_empty_fields(data, data.get('messages', []))
        if not filled_data:
            continue

        output_file_path = os.path.join(output_directory_path, filename)
        with open(output_file_path, 'w', encoding='utf-8') as file:
            json.dump(filled_data, file, indent=4)




def count_empty_fields(output_directory_path):
    """Return the sorted names of JSON files that still have an empty persona field.

    Every ``*.json`` file directly inside ``output_directory_path`` is loaded
    and its ``participant_persona`` entry inspected. A file is reported when
    any of ``name``, ``age``, ``gender``, ``personality`` or ``background`` is
    missing or falsy for ``participant_1`` or ``participant_2``. A persona (or
    the whole ``participant_persona`` entry) that is absent or ``null`` counts
    as entirely empty.

    Args:
        output_directory_path: Directory to scan (non-recursive).

    Returns:
        Alphabetically sorted list of file names (not full paths).
    """
    participants = ("participant_1", "participant_2")
    fields = ("name", "age", "gender", "personality", "background")

    files_with_empty_fields = []
    for filename in os.listdir(output_directory_path):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(output_directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # `or {}` also covers an explicit null, which .get()'s default would not.
        participant_persona = data.get("participant_persona") or {}
        has_empty_field = any(
            not (participant_persona.get(participant) or {}).get(field)
            for participant in participants
            for field in fields
        )
        if has_empty_field:
            files_with_empty_fields.append(filename)

    return sorted(files_with_empty_fields)




def re_process_files(file_list: List[str], output_directory_path: str):
    """Re-run the filling step on already-written output files, in place.

    Unlike ``process_files``, the files are both read from and written back to
    ``output_directory_path``, so previously filled values are kept as the
    starting point for the next attempt.

    Args:
        file_list: File names (not full paths) inside ``output_directory_path``.
        output_directory_path: Directory holding the files to re-process.

    A file is left untouched when ``fill_empty_fields`` returns a falsy value.
    """
    for filename in file_list:
        file_path = os.path.join(output_directory_path, filename)

        # Explicit UTF-8 so decoding does not depend on the machine's locale,
        # consistent with count_empty_fields, which reads these files as UTF-8.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        filled_data = fill_empty_fields(data, data.get('messages', []))
        if not filled_data:
            continue

        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(filled_data, file, indent=4)


# Main execution loop
if __name__ == "__main__":
    # # Step 1: Run process_files once
    # all_files = [f for f in os.listdir(input_directory_path) if f.endswith(".json")]
    # process_files(all_files, input_directory_path, output_directory_path)

    # Step 2: Count empty fields and re-process the affected files until none remain.
    # Each round issues API calls and the model is not guaranteed to ever fill
    # every field, so the number of rounds is capped instead of looping forever.
    max_rounds = 10
    for _ in range(max_rounds):
        files_with_empty_fields = count_empty_fields(output_directory_path)
        if not files_with_empty_fields:
            print("All fields are filled.")
            break

        print(f"Processing {len(files_with_empty_fields)} files with empty fields...")
        re_process_files(files_with_empty_fields, output_directory_path)
        print("Processing complete. Checking for remaining empty fields...")
    else:
        # The cap was reached; report the final state after the last round.
        files_with_empty_fields = count_empty_fields(output_directory_path)
        if files_with_empty_fields:
            print(f"Stopped after {max_rounds} rounds; {len(files_with_empty_fields)} files still have empty fields.")
        else:
            print("All fields are filled.")


Processing 223 files with empty fields...
Processing complete. Checking for remaining empty fields...
Processing 24 files with empty fields...
Processing complete. Checking for remaining empty fields...
Processing 4 files with empty fields...
Processing complete. Checking for remaining empty fields...
Processing 1 files with empty fields...
Processing complete. Checking for remaining empty fields...
All fields are filled.


#### 이미 돌린 파일 번호는 제외하고 돌리기

In [6]:
import os
import shutil

# Source and destination directories
filled_data_dir = "/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/03_filled_data/New-SPC"
renamed_data_dir = "/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/02_renamed_data/New-SPC"
output_dir = "/home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/02_renamed_data/New-SPC_2"

# Create the destination directory if it is not there yet
os.makedirs(output_dir, exist_ok=True)

# JSON files that already exist in the filled-data directory
existing_files = {entry for entry in os.listdir(filled_data_dir) if entry.endswith('.json')}

# Numbers in 1..11001 that have no filled file yet
missing_numbers = [
    number
    for number in range(1, 11002)
    if f"SPC-filled_data_New-SPC_{number:05d}.json" not in existing_files
]

# Copy the renamed-data file of each missing number, skipping sources that do not exist
for num in missing_numbers:
    src_filename = f"SPC_renamed_data_new-spc_{num:05d}.json"
    src_path = os.path.join(renamed_data_dir, src_filename)
    if not os.path.exists(src_path):
        continue
    shutil.copy(src_path, os.path.join(output_dir, src_filename))
    print(f"Copied {src_filename} to {output_dir}")

print("Completed copying missing files.")

Copied SPC_renamed_data_new-spc_00001.json to /home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/02_renamed_data/New-SPC_2
Copied SPC_renamed_data_new-spc_00003.json to /home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/02_renamed_data/New-SPC_2
Copied SPC_renamed_data_new-spc_00004.json to /home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/02_renamed_data/New-SPC_2
Copied SPC_renamed_data_new-spc_00005.json to /home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/02_renamed_data/New-SPC_2
Copied SPC_renamed_data_new-spc_00006.json to /home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/02_renamed_data/New-SPC_2
Copied SPC_renamed_data_new-spc_00009.json to /home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/02_renamed_data/New-SPC_2
Copied SPC_renamed_data_new-spc_00011.json to /home/user1/conversation-data/dataset-02-SPC/Synthetic-Persona-Chat/data/02_renamed_