# 일반 대화 - [모두의 말뭉치: 일상 대화 말뭉치 2022](https://kli.korean.go.kr/corpus/request/corpusRegist.do)

## 데이터 로드

In [None]:
import os
import json
import pandas as pd

class ConversationData:
    """Read-only accessor around one parsed NIKL dialogue-corpus JSON object.

    The wrapped dict must provide ``metadata.category`` and a ``document``
    list whose first element carries ``metadata.topic`` and an ``utterance``
    list of dicts with ``form`` and ``speaker_id`` string fields.
    """

    def __init__(self, json_data):
        """Store the already-parsed JSON dict; nothing is validated here."""
        self.json_data = json_data

    def is_normal_conversation(self):
        """Return True when the file-level category contains "일상대화"."""
        return "일상대화" in self.json_data["metadata"]["category"]

    def get_topic_type(self):
        """Return the top-level topic, i.e. the text before the first " > "."""
        document = self.__get_document()
        full_topic = document["metadata"]["topic"]
        return full_topic.split(" > ")[0]

    def get_conversation(self):
        """Return the dialogue as a list of turns, one string per speaker turn.

        Consecutive utterances by the same speaker are merged into a single
        turn joined by one space. Utterances whose text is empty after
        stripping are ignored. A document without any non-empty utterance
        yields an empty list.
        """
        document = self.__get_document()

        conversation = []
        curr_speaker = None
        curr_parts = []  # stripped utterances of the turn being accumulated

        for utterance in document["utterance"]:
            sentence = utterance["form"].strip()
            speaker_id = utterance["speaker_id"].strip()
            if sentence == "":
                continue

            # The previous speaker keeps talking: extend the current turn.
            if curr_parts and curr_speaker == speaker_id:
                curr_parts.append(sentence)
                continue

            # First utterance, or a different speaker: close the previous
            # turn (if there is one) and start a new one.
            if curr_parts:
                conversation.append(" ".join(curr_parts))
            curr_speaker = speaker_id
            curr_parts = [sentence]

        # Flush the last turn; skipped when there was nothing to say at all,
        # so callers never receive a None entry.
        if curr_parts:
            conversation.append(" ".join(curr_parts))

        return conversation

    def __get_document(self):
        """Return the first document entry.

        NOTE(review): the original author states that ``document`` is always
        a one-element list in this corpus; only index 0 is ever read.
        """
        return self.json_data["document"][0]

def load_json(path):
    """Parse the JSON file at ``path`` and return the resulting object.

    The file is always decoded as UTF-8 so that the Korean text is read
    correctly regardless of the platform's default encoding.
    """
    with open(path, encoding="utf-8") as file:
        return json.load(file)

def load_nikl_normal_conversation_df(data_dir_path="data/NIKL_DIALOGUE_2022_v1.0"):
    """Load the everyday-conversation files of a corpus directory.

    Every ``*.json`` file directly inside ``data_dir_path`` is parsed; files
    that ``ConversationData.is_normal_conversation`` rejects are skipped.

    Args:
        data_dir_path: Directory that contains the corpus JSON files.

    Returns:
        A DataFrame with one row per kept file and two columns:
        ``topic_type`` (top-level topic) and ``conversation`` (speaker
        turns joined with newlines).
    """
    topic_types = []
    conversations = []

    # os.listdir() returns entries in arbitrary order. Sorting makes the row
    # order, and therefore any later positional sampling, reproducible.
    for file_name in sorted(os.listdir(data_dir_path)):
        if not file_name.endswith(".json"):
            continue

        json_data = load_json(os.path.join(data_dir_path, file_name))
        conversation_data = ConversationData(json_data)

        if not conversation_data.is_normal_conversation():
            continue

        topic_types.append(conversation_data.get_topic_type())
        conversations.append(conversation_data.get_conversation())

    return pd.DataFrame({
        "topic_type": topic_types,
        "conversation": ["\n".join(c) for c in conversations],
    })


In [None]:
# Load the everyday-conversation rows from the default corpus directory.
nikl_normal_conversation_df = load_nikl_normal_conversation_df()

## 데이터 수 확인

In [None]:
# Total number of rows in the loaded data
nikl_normal_conversation_df.info()

In [None]:
# Number of rows per topic
nikl_normal_conversation_df["topic_type"].value_counts()

In [None]:
# Inspect 20 randomly chosen rows
nikl_normal_conversation_df.sample(20)

## 전처리

In [None]:
# Preprocess a copy so the loaded DataFrame itself is not modified.
processed_df = nikl_normal_conversation_df.copy()

### 10문장 미만 데이터 제거

In [None]:
# Number of newline-separated lines (speaker turns) in each conversation.
processed_df["conversation_len"] =\
    processed_df["conversation"].apply(lambda x: len(x.split("\n")))

# Keep conversations with at least 10 lines. The explicit .copy() makes the
# filtered frame independent, so assigning columns to it later does not raise
# pandas' SettingWithCopyWarning.
processed_df = processed_df[processed_df["conversation_len"] >= 10].copy()

processed_df

### 대화마다 앞 10문장만 남기기

In [None]:
def remove_out_of_n(conversation, n):
    """Return ``conversation`` cut down to its first ``n`` newline-separated lines."""
    lines = conversation.split("\n")
    kept = lines[:n]
    return "\n".join(kept)

# Cut every conversation down to its first 10 lines, then display the column.
processed_df["conversation"] = processed_df["conversation"].apply(remove_out_of_n, n=10)
processed_df["conversation"]

### 주제별로 62~63개씩, 총 1,000개 데이터 뽑기

In [None]:
# Per-topic quota of conversations to keep:
# 8 topics x 63 + 8 topics x 62 = 1,000 rows in total.
topic_quotas = [
    ("대중교통", 63),
    ("방송/연예", 63),
    ("취직", 63),
    ("건강/다이어트", 63),
    ("경제/재테크", 63),
    ("먹거리", 63),
    ("회사/학교", 63),
    ("휴가", 63),
    ("스포츠/레저/취미", 62),
    ("가족/관혼상제", 62),
    ("쇼핑", 62),
    ("생활/주거환경", 62),
    ("우정", 62),
    ("반려동물", 62),
    ("음악", 62),
    ("기타", 62),
]

# Take the first `data_num` rows of each topic (fewer if the topic has fewer).
# All slices are concatenated in one call rather than growing a frame inside a
# loop: that avoids repeated copying and keeps the source dtypes, which
# concatenating onto an empty, float-typed placeholder frame would upcast.
topic_dfs = [
    processed_df[processed_df["topic_type"] == topic_type].head(data_num)
    for topic_type, data_num in topic_quotas
]
new_df = pd.concat(topic_dfs, axis=0, ignore_index=True)

new_df

## 데이터 저장

In [None]:
# Keep only the topic and conversation columns and write them to CSV
# without the index column.
data_df = new_df[["topic_type", "conversation"]]
data_df.to_csv("data/normal_conversation_nikl.csv", index=False)

### train 데이터와 합치기

In [None]:
# Load the existing training set and display it.
train_df = pd.read_csv("data/train.csv")
train_df

In [None]:
# Reload the sampled NIKL conversations from the CSV written earlier in this notebook.
normal_conversation_nikl_df = pd.read_csv("data/normal_conversation_nikl.csv")
normal_conversation_nikl_df

In [None]:
# Append the NIKL conversations to the training set under the class
# "일반 대화", continuing the `idx` numbering right after the current maximum.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame for
# every row, so the new rows are built once and concatenated in one call.
# NOTE: re-running this cell appends the same rows again; reload train_df first.
curr_idx = int(train_df["idx"].max()) + 1

nikl_rows = pd.DataFrame({
    "idx": range(curr_idx, curr_idx + len(normal_conversation_nikl_df)),
    "class": "일반 대화",
    "conversation": normal_conversation_nikl_df["conversation"].to_numpy(),
})
train_df = pd.concat([train_df, nikl_rows], ignore_index=True)

train_df

In [None]:
# Write the merged training set to CSV without the index column.
train_df.to_csv("data/train_with_normal_nikl.csv", index=False)

In [None]:
# Read the merged CSV back and display it.
train_with_normal_nikl_df = pd.read_csv("data/train_with_normal_nikl.csv")
train_with_normal_nikl_df

In [None]:
# Number of rows per class in the merged training set
train_with_normal_nikl_df["class"].value_counts()