In [1]:
# Use the %pip magic rather than !pip so the packages are installed into the
# environment of the running kernel, not whichever pip is first on the shell PATH.
%pip install transformers torch

[0m

In [8]:
# ## 영어 대화 생성
# import pandas as pd
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# import torch
# import random

# # Specify GPT-2 model
# model_name = "gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPT2LMHeadModel.from_pretrained(model_name)

# # Set device to GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Function to generate conversations
# def generate_conversations(num_conversations=1000, min_length=20, max_length=520, avg_length=219):
#     prompts = [
#         "Let's talk about something interesting.",
#         "What do you think about life these days?",
#         "Tell me about your favorite hobbies.",
#         "Let's discuss travel and favorite destinations.",
#         "What are your thoughts on technology and its impact?",
#         "How do you usually spend your weekends?",
#         "What is your favorite childhood memory?",
#         "Have you read any good books recently?",
#         "What kind of music do you like to listen to?",
#         "Do you enjoy cooking? If so, what's your favorite dish to make?",
#         "What is a goal you are currently working toward?",
#         "If you could live anywhere in the world, where would it be?",
#         "What do you think makes a good friend?",
#         "What is your favorite way to relax after a busy day?",
#         "What are your thoughts on climate change and how we can address it?",
#         "What inspires you to keep going when things get tough?",
#         "What’s the most interesting thing you’ve learned recently?",
#         "If you could meet any historical figure, who would it be and why?",
#         "What do you usually do when you feel stressed?",
#         "Have you ever had a life-changing experience you'd like to share?",
#     ]

#     conversations = []

#     for i in range(num_conversations):
#         prompt = random.choice(prompts)
#         input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

#         # Determine the length for this conversation
#         conversation_length = random.randint(min_length, max_length)
#         if random.random() < 0.5:  # Adjust length bias toward the average
#             conversation_length = int(avg_length + random.uniform(-50, 50))

#         # Generate text
#         output = model.generate(
#             input_ids,
#             max_length=conversation_length,
#             num_return_sequences=1,
#             do_sample=True,
#             temperature=0.7,
#             top_k=50,
#             top_p=0.9,
#             pad_token_id=tokenizer.eos_token_id
#         )

#         # Decode the generated text
#         generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
#         # Ensure text meets length criteria
#         generated_text = " ".join(generated_text.split()[:conversation_length])
#         conversations.append(generated_text)

#         print(f"Generated conversation {i+1}/{num_conversations}")  # Progress log

#     return conversations

# # Generate 1,000 conversations
# num_conversations = 1000
# generated_conversations = generate_conversations(num_conversations=num_conversations)

# # Create DataFrame with idx, class, and conversation columns
# data = {
#     "idx": list(range(num_conversations)),
#     "class": ['일반 대화'] * num_conversations,  # All entries belong to class '4'
#     "conversation": generated_conversations,
# }

# df = pd.DataFrame(data)

# # Save to a CSV file
# output_file = "general_conversations.csv"
# df.to_csv(output_file, index=False, encoding="utf-8-sig") # 한글깨짐 방지
# print(f"Generated conversations saved to {output_file}")


In [1]:
## 한글대화 생성
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import random

# Load the Korean GPT-2 checkpoint.
model_name = "skt/kogpt2-base-v2"  # Korean model
# The special tokens are passed explicitly, following the checkpoint's published
# usage example. NOTE(review): without them the tokenizer may load with no
# pad/eos token, which leaves `tokenizer.pad_token_id` as None — confirm against
# the installed transformers version.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    bos_token="</s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 대화 생성 함수
def _sample_conversation_length(min_length, max_length, avg_length):
    """Pick the generation length budget for one sample, always within [min_length, max_length]."""
    # Uniform draw over the whole allowed range.
    length = random.randint(min_length, max_length)
    # About half of the samples are instead drawn from a +/-50 window around
    # avg_length, which pulls the overall distribution towards the average.
    if random.random() < 0.5:
        length = int(avg_length + random.uniform(-50, 50))
    # The biased draw does not know about the bounds, so clamp it; otherwise a
    # caller-supplied range could be exceeded.
    return max(min_length, min(max_length, length))


# Conversation generation function
def generate_korean_conversations(num_conversations = 1000, min_length = 40, max_length = 993, avg_length = 238):
    """Generate Korean texts by sampling continuations of randomly chosen prompts.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        num_conversations: Number of texts to generate.
        min_length: Smallest value that may be passed as ``max_length`` to
            ``model.generate`` for a single sample.
        max_length: Largest value that may be passed as ``max_length`` to
            ``model.generate`` for a single sample.
        avg_length: Centre of the +/-50 window from which roughly half of the
            per-sample lengths are drawn (clamped to the two bounds above).

    Returns:
        A list of ``num_conversations`` strings. Each string starts from the
        decoded model output, has its whitespace collapsed to single spaces and
        is capped at the sampled length counted in whitespace-separated words.

    Raises:
        ValueError: Raised by ``random.randint`` when ``min_length`` is greater
            than ``max_length`` and at least one sample is requested.
    """
    prompts = [
        "오늘 하루는 어땠나요?",
        "가장 좋아하는 취미는 무엇인가요?",
        "주말에 주로 무엇을 하시나요?",
        "가보고 싶은 여행지는 어디인가요?",
        "요즘 관심 있는 기술에 대해 이야기해보아요.",
        "어릴 적 가장 기억에 남는 일은 무엇인가요?",
        "좋아하는 음식은 무엇인가요?",
        "최근에 읽은 책 중에서 추천하고 싶은 책은 무엇인가요?",
        "스트레스를 받을 때 주로 무엇을 하시나요?",
        "오늘 배운 가장 흥미로운 것은 무엇인가요?",
    ]

    # Resolve the padding id once. Some tokenizers define no pad token, in which
    # case the end-of-sequence id is used instead of passing None to generate().
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    conversations = []

    for i in range(num_conversations):
        prompt = random.choice(prompts)
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

        # Length budget for this sample (passed to generate() as max_length).
        conversation_length = _sample_conversation_length(min_length, max_length, avg_length)

        # Sample one continuation of the prompt.
        output = model.generate(
            input_ids,
            max_length=conversation_length,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            pad_token_id=pad_token_id
        )

        # Decode the generated ids back to text.
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

        # Collapse all whitespace (including newlines) to single spaces and keep
        # at most conversation_length whitespace-separated words.
        generated_text = " ".join(generated_text.split()[:conversation_length])
        conversations.append(generated_text)

        print(f"Generated conversation {i+1}/{num_conversations}")  # progress log
    return conversations

# Seed the RNGs used during sampling (Python's `random` for prompt/length
# choice, torch for token sampling) so that re-running this cell on the same
# software/hardware stack regenerates the same corpus.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Generate the conversation data.
num_conversations = 1000
generated_conversations = generate_korean_conversations(num_conversations=num_conversations)

# Build the DataFrame. The idx and class columns are sized from the rows that
# were actually returned, so the three columns can never disagree in length.
num_generated = len(generated_conversations)
data = {
    "idx": list(range(num_generated)),
    "class": ['일반 대화'] * num_generated,  # every row is labelled '일반 대화'
    "conversation": generated_conversations,
}

df = pd.DataFrame(data)

# Save to CSV; utf-8-sig is used to prevent garbled Korean text when the file is opened.
output_file = "general_conversations_korean.csv"
df.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"Generated conversations saved to {output_file}")

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.69M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/490M [00:00<?, ?B/s]

Generated conversation 1/1000
Generated conversation 2/1000
Generated conversation 3/1000
Generated conversation 4/1000
Generated conversation 5/1000
Generated conversation 6/1000
Generated conversation 7/1000
Generated conversation 8/1000
Generated conversation 9/1000
Generated conversation 10/1000
Generated conversation 11/1000
Generated conversation 12/1000
Generated conversation 13/1000
Generated conversation 14/1000
Generated conversation 15/1000
Generated conversation 16/1000
Generated conversation 17/1000
Generated conversation 18/1000
Generated conversation 19/1000
Generated conversation 20/1000
Generated conversation 21/1000
Generated conversation 22/1000
Generated conversation 23/1000
Generated conversation 24/1000
Generated conversation 25/1000
Generated conversation 26/1000
Generated conversation 27/1000
Generated conversation 28/1000
Generated conversation 29/1000
Generated conversation 30/1000
Generated conversation 31/1000
Generated conversation 32/1000
Generated convers