# 일반 대화 - [한국어 멀티세션 대화](https://www.aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=data&dataSetSn=71630)

## 데이터 로드

In [None]:
import os
import json
import pandas as pd
from tqdm import tqdm

def txt_file_to_json(txt_file_path):
    """Parse the JSON document stored in a text file.

    The file is decoded explicitly instead of relying on the platform's
    locale encoding, so Korean text loads the same way on every OS.
    ``utf-8-sig`` reads plain UTF-8 and also tolerates a leading BOM.
    NOTE(review): assumes the dataset files are UTF-8 encoded -- confirm.

    Args:
        txt_file_path: Path of the file containing a JSON document.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(txt_file_path, encoding="utf-8-sig") as txt_file:
        return json.load(txt_file)

def extract_session_data(json_data):
    """Flatten one parsed dataset file into per-session records.

    Args:
        json_data: Parsed JSON with ``["topicInfo"]["topicType"]`` (a
            ``>``-separated topic string) and ``["sessionInfo"]`` (a list of
            sessions, each holding a ``"dialog"`` list of items with an
            ``"utterance"`` string).

    Returns:
        A list with one dict per session: ``"topic_type"`` is the part of
        the topic string before the first ``>`` with all spaces removed, and
        ``"conversation"`` is the session's utterances joined by newlines.
    """
    raw_topic = json_data["topicInfo"]["topicType"]
    # Keep only the top-level category and drop every space from it.
    topic_label = raw_topic.partition(">")[0].replace(" ", "")

    return [
        {
            "topic_type": topic_label,
            "conversation": "\n".join(
                turn["utterance"] for turn in session["dialog"]
            ),
        }
        for session in json_data["sessionInfo"]
    ]

def load_session_data(dir_path):
    """Load and flatten every dataset file found directly in a directory.

    Args:
        dir_path: Directory whose entries are all JSON-formatted text files.

    Returns:
        A single list with the session records of every file, in sorted
        file-name order.
    """
    session_data = []

    # os.listdir() yields entries in arbitrary order. Later steps keep the
    # first N rows per topic, so sort to make the result reproducible.
    for txt_file_name in tqdm(sorted(os.listdir(dir_path))):
        json_data = txt_file_to_json(os.path.join(dir_path, txt_file_name))
        session_data.extend(extract_session_data(json_data))

    return session_data

def load_normal_conversations_df():
    """Build a DataFrame of all sessions from the three session directories.

    Returns:
        A DataFrame with the columns ``topic_type`` and ``conversation``,
        one row per session, in directory order.
    """
    source_dirs = ("data/TS_session2", "data/TS_session3", "data/TS_session4")

    all_sessions = []
    for source_dir in source_dirs:
        all_sessions.extend(load_session_data(source_dir))

    return pd.DataFrame({
        "topic_type": [record["topic_type"] for record in all_sessions],
        "conversation": [record["conversation"] for record in all_sessions],
    })

In [None]:
# Build the DataFrame of conversations (one row per session)
normal_conversations_df = load_normal_conversations_df()

## 데이터 수 확인

In [None]:
# Total number of rows (plus column dtypes and non-null counts)
normal_conversations_df.info()

In [None]:
# Number of rows per topic
normal_conversations_df["topic_type"].value_counts()

In [None]:
# Inspect 20 randomly sampled rows
normal_conversations_df.sample(20)

## 전처리

### 인사말 제거 (각 대화의 첫 두 발화)

In [None]:
# Work on a copy so the loaded DataFrame itself is left unmodified
processed_df = normal_conversations_df.copy()

def remove_greeting(conversation, num_greeting_lines=2):
    """Drop the leading greeting lines of a newline-separated conversation.

    Args:
        conversation: Utterances joined by ``"\\n"``.
        num_greeting_lines: How many leading lines to discard. Defaults to
            2, the value that used to be hard-coded.

    Returns:
        The remaining lines joined by ``"\\n"``; an empty string when the
        conversation has no more than ``num_greeting_lines`` lines.
    """
    utterances = conversation.split("\n")
    return "\n".join(utterances[num_greeting_lines:])

# Drop the greeting lines from every conversation
processed_df["conversation"] = processed_df["conversation"].apply(remove_greeting)

# Display the result
processed_df

### 10문장 미만 데이터 제거

In [None]:
# Number of newline-separated lines (utterances) in each conversation
processed_df["conversation_len"] = processed_df["conversation"].apply(
    lambda text: len(text.split("\n"))
)

# Keep only conversations with at least 10 lines. The explicit .copy() makes
# the filtered result an independent DataFrame, so the column assignments in
# later cells do not raise pandas' SettingWithCopyWarning.
processed_df = processed_df[processed_df["conversation_len"] >= 10].copy()

processed_df

### 각 대화에서 앞 10문장만 남기기

In [None]:
def remove_out_of_n(conversation, n):
    """Return only the first ``n`` lines of a newline-separated conversation.

    Args:
        conversation: Utterances joined by ``"\\n"``.
        n: Number of leading lines to keep.

    Returns:
        The kept lines joined by ``"\\n"``.
    """
    utterances = conversation.split("\n")
    kept = utterances[:n]
    return "\n".join(kept)

# Truncate every conversation to its first 10 lines; the extra keyword
# argument is forwarded by Series.apply to remove_out_of_n.
processed_df["conversation"] = processed_df["conversation"].apply(
    remove_out_of_n, n=10
)
processed_df

### 채팅에서만 사용되는 표현 삭제

In [None]:
import re

# ASCII punctuation/symbols to strip from chat text. The class leaves
# `!`, `.`, `?` and `"` untouched; note that `~` IS listed and is removed.
special_char_pattern = re.compile(r'[#\$%&\'()*+,\-/:;<=>~@\[\]\\^_`{|}]')

# Runs of standalone Hangul compatibility jamo (e.g. "ㅋㅋ", "ㅠㅠ"),
# compiled once instead of being passed as a string on every call.
jamo_pattern = re.compile(r"[ㄱ-ㅎㅏ-ㅣ]+")


def remove_chat_expression(sentence):
    """Remove chat-only expressions from a piece of text.

    Two passes are applied in order: runs of standalone jamo are deleted,
    then every character matched by ``special_char_pattern`` is deleted.
    Nothing is inserted in place of removed characters, so surrounding
    whitespace is kept as-is.

    Args:
        sentence: Text to clean; may contain several newline-separated lines.

    Returns:
        The cleaned text.
    """
    sentence = jamo_pattern.sub("", sentence)
    return special_char_pattern.sub("", sentence)

# Apply the chat-expression cleanup to every conversation
processed_df["conversation"] = processed_df["conversation"].apply(remove_chat_expression)

### 주제별로 균등하게 나누어 총 1,000개 데이터 뽑기 (주제당 76~77개)

In [None]:
# (topic, number of conversations to keep) pairs. The quotas add up to
# 1,000 rows in total: 12 topics x 77 + 1 topic x 76.
topic_quotas = [
    ("개인및관계", 77),
    ("여가와오락", 77),
    ("미용과건강", 77),
    ("시사/사회", 77),
    ("일과직업", 77),
    ("교육", 77),
    ("예술문화생활", 77),
    ("상거래전반", 77),
    ("식음료", 77),
    ("기후", 77),
    ("교통", 77),
    ("주거와생활", 77),
    ("패션", 76),
]

# Collect the first `data_num` rows of each topic, then concatenate once.
# A single concat avoids re-copying the accumulated frame on every iteration
# and avoids seeding the result with an empty DataFrame, which would affect
# the resulting column dtypes.
topic_frames = []
for topic_type, data_num in topic_quotas:
    topic_df = processed_df[processed_df["topic_type"] == topic_type]
    topic_df = topic_df.iloc[:data_num, :]
    topic_frames.append(topic_df)

new_df = pd.concat(topic_frames, axis=0, ignore_index=True)

new_df

## 데이터 저장

In [None]:
# Drop the helper length column (drop() returns a new DataFrame, so new_df
# is left untouched) and write the sampled conversations to CSV.
data_df = new_df.drop(columns=["conversation_len"])
data_df.to_csv("data/normal_conversation_aihub.csv", index=False)

### train 데이터와 합치기

In [None]:
# Load data/train.csv and display it
train_df = pd.read_csv("data/train.csv")
train_df

In [None]:
# Load the sampled conversations saved to CSV earlier and display them
normal_conversation_df = pd.read_csv("data/normal_conversation_aihub.csv")
normal_conversation_df

In [None]:
# New rows continue the existing idx numbering right after the current max.
first_new_idx = train_df["idx"].max() + 1
num_new_rows = len(normal_conversation_df)

# Build all new rows at once and concatenate a single time.
# DataFrame.append was removed in pandas 2.0, and appending row by row
# re-copies the whole frame on every iteration.
normal_rows_df = pd.DataFrame({
    "idx": [first_new_idx + offset for offset in range(num_new_rows)],
    "class": ["일반 대화"] * num_new_rows,
    "conversation": normal_conversation_df["conversation"].tolist(),
})
train_df = pd.concat([train_df, normal_rows_df], ignore_index=True)

# Next unused idx, kept under the same name the row-by-row loop left behind.
curr_idx = first_new_idx + num_new_rows

train_df

In [None]:
# Write the merged data to CSV without the DataFrame index
train_df.to_csv("data/train_with_normal_aihub.csv", index=False)

In [None]:
# Read the merged file back in and display it
train_with_normal_df = pd.read_csv("data/train_with_normal_aihub.csv")
train_with_normal_df

In [None]:
# Number of rows per class in the merged data
train_with_normal_df["class"].value_counts()