# 일상대화 말뭉치 Corpus

In [11]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import seaborn as sns

## 분석 대상 정의

In [12]:
# Consonants of interest (the original comment labels them "plosive consonants").
# NOTE(review): 'ㅅ' is a fricative and 'ㅈ' an affricate, so this list is broader
# than plosives — confirm the intended label (e.g. lenis obstruents).
plosive_consonant = ['ㄱ', 'ㄷ', 'ㅂ', 'ㅅ', 'ㅈ']
# Target vowels used by the vowel filter further down in this notebook.
target_vowel = ['ㅏ', 'ㅔ', 'ㅣ', 'ㅗ', 'ㅜ']

## 파일 로드

In [13]:
# Corpus 1: word-frequency table stored as CSV.
corpus1Path = "./word_frequency.csv"
corpus1 = pd.read_csv(corpus1Path)

# Corpus 2: general-vocabulary statistics stored as an Excel workbook.
corpus2Path = "./Korean Language Usage Frequency Study/일반어휘통계.xlsx"
corpus2 = pd.read_excel(corpus2Path)

# Show (rows, columns) of both tables as a quick sanity check.
corpus1.shape, corpus2.shape

((71698, 2), (82501, 5))

In [14]:
# Sort corpus1 in place from the least to the most frequent word and
# renumber the rows from 0 (ignore_index=True).
corpus1.sort_values(by='빈도', ascending=True, inplace=True, ignore_index=True)
corpus1

Unnamed: 0,단어,빈도
0,나대진가,1
1,담갔어요,1
2,들어가겠네,1
3,들어오겠지,1
4,보러도,1
...,...,...
71693,사람,18997
71694,하는,23080
71695,가지,25581
71696,생각,33607


In [15]:
# Sort corpus2 in place from the most to the least frequent entry and
# renumber the rows from 0 (ignore_index=True).
corpus2.sort_values(by='빈도', ascending=False, inplace=True, ignore_index=True)
corpus2

Unnamed: 0,순위,빈도,어휘,풀이,품사
0,1,97499,이다,,지
1,2,50558,것01,,의
2,3,42900,하다01,,동
3,4,39290,있다01,,보
4,5,37028,있다01,,형
...,...,...,...,...,...
82496,1171,1,전사되다01,傳寫-,동
82497,1171,1,부책임자,副責任者,명
82498,1171,1,부처,,명
82499,1171,1,전사14,傳寫,명


## Utils

In [17]:
def extract_korean(text):
    """Return only the Hangul characters of ``text``, joined in their original order.

    Characters in the ranges ㄱ-ㅎ (consonant jamo), ㅏ-ㅣ (vowel jamo) and
    가-힣 (precomposed syllables) are kept; everything else (digits, Latin
    letters, punctuation, whitespace, ...) is discarded.
    """
    hangul_runs = re.compile('[ㄱ-ㅎㅏ-ㅣ가-힣]+')
    return ''.join(match.group() for match in hangul_runs.finditer(text))


def syllable_count(df: pd.DataFrame, col_name: str):
    """Return the rows of ``df`` whose ``col_name`` string is exactly two characters long.

    The returned frame carries an extra '음절수' column holding the string
    length of ``col_name`` (for text made of precomposed Hangul syllables this
    equals the syllable count) and is re-indexed from 0.

    The input frame is left untouched: the length column is attached to a new
    frame instead of being written into the caller's ``df``.

    Args:
        df: Table containing the word column.
        col_name: Name of the column holding the words.

    Returns:
        A new DataFrame with only the two-character rows.

    Raises:
        ValueError: If ``col_name`` is not a column of ``df``.
    """
    if col_name not in df.columns:
        raise ValueError(f"col_name: {col_name} not found in df")

    # assign() builds a new frame, so the caller's df does not gain a column.
    counted = df.assign(**{'음절수': df[col_name].str.len()})
    return counted[counted['음절수'] == 2].reset_index(drop=True)

In [18]:
def decompose_hangul_phoneme(text):
    """Split every precomposed Hangul syllable in ``text`` into its jamo.

    Returns a tuple of three lists: initial consonants, medial vowels and
    final consonants. The first two lists hold one entry per Hangul syllable.
    The third holds an entry only for syllables that have a final consonant,
    so it is not position-aligned with the other two. Characters outside
    가-힣 are skipped.
    """
    initials = ('ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ',
                'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ')
    medials = ('ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ',
               'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ')
    # Index 0 is the empty string: "no final consonant".
    finals = ('', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ',
              'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ',
              'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ')
    per_initial = len(medials) * len(finals)  # 21 * 28 syllables per initial

    onsets, nuclei, codas = [], [], []
    for syllable in text:
        if not ('가' <= syllable <= '힣'):
            continue

        # Offset of the syllable inside the precomposed block starting at U+AC00.
        offset = ord(syllable) - 0xAC00
        onset_idx, within_initial = divmod(offset, per_initial)
        nucleus_idx, coda_idx = divmod(within_initial, len(finals))

        onsets.append(initials[onset_idx])
        nuclei.append(medials[nucleus_idx])
        if coda_idx > 0:
            codas.append(finals[coda_idx])

    return onsets, nuclei, codas

In [19]:
def set_2_syllable(df: pd.DataFrame):
    """Return the rows of ``df`` whose '음절수' value equals 2, re-indexed from 0."""
    is_two_syllable = df['음절수'].eq(2)
    return df.loc[is_two_syllable].reset_index(drop=True)

In [20]:
# Usage example: decompose a sample word and print the three jamo lists.
word_sample = "안녕하세요"
chosung_list, jungsung_list, jongsung_list = decompose_hangul_phoneme(word_sample)
print(f"원본 단어: {word_sample}")
print(f"분해 결과: {chosung_list, jungsung_list, jongsung_list}")

원본 단어: 안녕하세요
분해 결과: (['ㅇ', 'ㄴ', 'ㅎ', 'ㅅ', 'ㅇ'], ['ㅏ', 'ㅕ', 'ㅏ', 'ㅔ', 'ㅛ'], ['ㄴ', 'ㅇ'])


## Preprocessing

### 일반어휘 통계 중복어 합치기

In [21]:
# Drop the columns that are not needed. The guard makes the cell safe to
# re-run: the drop only happens while both columns are still present.
if '순위' in corpus2.columns and '풀이' in corpus2.columns:
    corpus2.drop(columns=['순위', '풀이'], inplace=True)
corpus2

Unnamed: 0,빈도,어휘,품사
0,97499,이다,지
1,50558,것01,의
2,42900,하다01,동
3,39290,있다01,보
4,37028,있다01,형
...,...,...,...
82496,1,전사되다01,동
82497,1,부책임자,명
82498,1,부처,명
82499,1,전사14,명


In [22]:
# Merge duplicate vocabulary entries: strip everything but Hangul from each
# entry (e.g. the trailing numbers in '것01'), then sum the frequencies of
# entries that became identical. Only the '어휘' and '빈도' columns survive
# the aggregation.
corpus2['어휘'] = corpus2['어휘'].apply(extract_korean)
len1 = len(corpus2)  # row count before merging
corpus2 = corpus2.groupby('어휘')['빈도'].sum().reset_index()
len2 = len(corpus2)  # row count after merging
# Re-sort from the least to the most frequent entry.
corpus2 = corpus2.sort_values(by='빈도', ascending=True, ignore_index=True)
print(f"중복 제거 전: {len1}, 중복 제거 후: {len2}")

중복 제거 전: 82501, 중복 제거 후: 73770


### 음절수 추출

In [26]:
# Keep only the two-syllable words of each corpus.
# syllable_count adds the '음절수' column and already filters on length 2,
# so the set_2_syllable calls below leave the row set unchanged.
corpus1 = syllable_count(corpus1, '단어')
corpus2 = syllable_count(corpus2, '어휘')
corpus1 = set_2_syllable(corpus1)
corpus2 = set_2_syllable(corpus2)
corpus1

Unnamed: 0,단어,빈도,음절수
0,찔기,1,2
1,낭망,1,2
2,용궁,1,2
3,속죄,1,2
4,익금,1,2
...,...,...,...
21945,사람,18997,2
21946,하는,23080,2
21947,가지,25581,2
21948,생각,33607,2


In [27]:
# Display corpus2 after the two-syllable filter.
corpus2

Unnamed: 0,어휘,빈도,음절수
0,시성,1,2
1,시식,1,2
2,시산,1,2
3,시삽,1,2
4,시암,1,2
...,...,...,...
20812,보다,22093,2
20813,되다,24589,2
20814,하다,63825,2
20815,있다,76984,2


In [28]:
def decompose_hangul(df: pd.DataFrame, col_name: str):
    """Add '초성', '중성' and '종성' columns with the jamo of each word in ``col_name``.

    Every word is decomposed with ``decompose_hangul_phoneme`` (with a tqdm
    progress bar) and the three resulting lists are stored per row. The
    columns are written into ``df`` itself, which is also returned.
    """
    per_word = [decompose_hangul_phoneme(word) for word in tqdm(df[col_name])]

    # Each decomposition is (initials, medials, finals); fan the tuple
    # positions out into their own columns.
    for position, jamo_col in enumerate(('초성', '중성', '종성')):
        df[jamo_col] = [parts[position] for parts in per_word]
    return df

# Attach the per-word jamo columns to both corpora.
corpus1 = decompose_hangul(corpus1, col_name='단어')
corpus2 = decompose_hangul(corpus2, col_name='어휘')

100%|██████████| 21950/21950 [00:00<00:00, 769201.62it/s]
100%|██████████| 20817/20817 [00:00<00:00, 300732.35it/s]


In [29]:
# Keep only the rows whose medial vowels are in target_vowel.
def filter_target_vowel(df: pd.DataFrame):
    """Return rows whose first and second '중성' entries are both in ``target_vowel``.

    The result is re-indexed from 0.
    """
    medial_vowels = df['중성']
    first_is_target = medial_vowels.str[0].isin(target_vowel)
    second_is_target = medial_vowels.str[1].isin(target_vowel)
    return df[first_is_target & second_is_target].reset_index(drop=True)

# Apply the target-vowel filter to both corpora.
corpus1 = filter_target_vowel(corpus1)
corpus2 = filter_target_vowel(corpus2)

In [30]:
# Display corpus1 after the target-vowel filter.
corpus1

Unnamed: 0,단어,빈도,음절수,초성,중성,종성
0,찔기,1,2,"[ㅉ, ㄱ]","[ㅣ, ㅣ]",[ㄹ]
1,낭망,1,2,"[ㄴ, ㅁ]","[ㅏ, ㅏ]","[ㅇ, ㅇ]"
2,지집,1,2,"[ㅈ, ㅈ]","[ㅣ, ㅣ]",[ㅂ]
3,함지,1,2,"[ㅎ, ㅈ]","[ㅏ, ㅣ]",[ㅁ]
4,꼬보,1,2,"[ㄲ, ㅂ]","[ㅗ, ㅗ]",[]
...,...,...,...,...,...,...
8080,우리,14485,2,"[ㅇ, ㄹ]","[ㅜ, ㅣ]",[]
8081,하고,18843,2,"[ㅎ, ㄱ]","[ㅏ, ㅗ]",[]
8082,사람,18997,2,"[ㅅ, ㄹ]","[ㅏ, ㅏ]",[ㅁ]
8083,가지,25581,2,"[ㄱ, ㅈ]","[ㅏ, ㅣ]",[]


In [31]:
# Display corpus2 after the target-vowel filter.
corpus2

Unnamed: 0,어휘,빈도,음절수,초성,중성,종성
0,시식,1,2,"[ㅅ, ㅅ]","[ㅣ, ㅣ]",[ㄱ]
1,시산,1,2,"[ㅅ, ㅅ]","[ㅣ, ㅏ]",[ㄴ]
2,시삽,1,2,"[ㅅ, ㅅ]","[ㅣ, ㅏ]",[ㅂ]
3,시암,1,2,"[ㅅ, ㅇ]","[ㅣ, ㅏ]",[ㅁ]
4,시이,1,2,"[ㅅ, ㅇ]","[ㅣ, ㅣ]",[]
...,...,...,...,...,...,...
7355,않다,18416,2,"[ㅇ, ㄷ]","[ㅏ, ㅏ]",[ㄶ]
7356,보다,22093,2,"[ㅂ, ㄷ]","[ㅗ, ㅏ]",[]
7357,하다,63825,2,"[ㅎ, ㄷ]","[ㅏ, ㅏ]",[]
7358,있다,76984,2,"[ㅇ, ㄷ]","[ㅣ, ㅏ]",[ㅆ]


In [32]:
from konlpy.tag import Okt

# Shared Okt tagger, created lazily on the first get_pos call. Building a new
# Okt instance per word is slow (it loads the JVM-backed analyzer), and
# get_pos is called once per row when filtering the corpora.
_okt_tagger = None


def get_pos(word):
    """Return the part-of-speech tagging of ``word`` produced by Okt.

    Args:
        word: Text to analyse.

    Returns:
        The result of ``Okt.pos(word)``: a list of (token, tag) tuples,
        e.g. ``[('쩔기', 'Verb')]``.
    """
    global _okt_tagger
    if _okt_tagger is None:
        _okt_tagger = Okt()

    return _okt_tagger.pos(word)

In [38]:
# Usage example: tag a single word and print the result.
word = "쩔기"
result = get_pos(word)
print(f"단어: {word}")
print(f"품사 분석 결과: {result}")

단어: 쩔기
품사 분석 결과: [('쩔기', 'Verb')]


In [43]:
# Keep nouns only.
def get_noun(df: pd.DataFrame, col_name: str):
    """Return the rows of ``df`` whose ``col_name`` word is tagged as a single noun.

    Each word is passed to ``get_pos``. A row is kept only when the tagger
    returns exactly one token and that token's tag is 'Noun'. Kept rows get a
    '품사' column set to 'Noun' and the result is re-indexed from 0.

    Rows are selected by position, so the function does not rely on ``df``
    having a 0..n-1 index, and the input frame is not modified.

    Args:
        df: Table containing the word column.
        col_name: Name of the column holding the words.

    Returns:
        A new DataFrame holding only the single-noun rows.
    """
    keep_positions = []
    for position, word in enumerate(df[col_name]):
        tagged = get_pos(word)
        if len(tagged) == 1 and tagged[0][1] == 'Noun':
            keep_positions.append(position)

    # iloc is positional, matching the enumerate counter above.
    nouns = df.iloc[keep_positions].copy()
    nouns['품사'] = 'Noun'
    return nouns.reset_index(drop=True)

In [44]:
# Restrict both corpora to words tagged as a single noun.
corpus1 = get_noun(corpus1, '단어')
corpus2 = get_noun(corpus2, '어휘')

In [45]:
# Display corpus1 after the noun filter.
corpus1

Unnamed: 0,단어,빈도,음절수,초성,중성,종성,품사
0,낭망,1,2,"[ㄴ, ㅁ]","[ㅏ, ㅏ]","[ㅇ, ㅇ]",Noun
1,함지,1,2,"[ㅎ, ㅈ]","[ㅏ, ㅣ]",[ㅁ],Noun
2,꼬보,1,2,"[ㄲ, ㅂ]","[ㅗ, ㅗ]",[],Noun
3,부암,1,2,"[ㅂ, ㅇ]","[ㅜ, ㅏ]",[ㅁ],Noun
4,집아,1,2,"[ㅈ, ㅇ]","[ㅣ, ㅏ]",[ㅂ],Noun
...,...,...,...,...,...,...,...
6453,진짜,9467,2,"[ㅈ, ㅉ]","[ㅣ, ㅏ]",[ㄴ],Noun
6454,친구,11288,2,"[ㅊ, ㄱ]","[ㅣ, ㅜ]",[ㄴ],Noun
6455,인제,14093,2,"[ㅇ, ㅈ]","[ㅣ, ㅔ]",[ㄴ],Noun
6456,우리,14485,2,"[ㅇ, ㄹ]","[ㅜ, ㅣ]",[],Noun


In [46]:
# Display corpus2 after the noun filter.
corpus2

Unnamed: 0,어휘,빈도,음절수,초성,중성,종성,품사
0,시식,1,2,"[ㅅ, ㅅ]","[ㅣ, ㅣ]",[ㄱ],Noun
1,시산,1,2,"[ㅅ, ㅅ]","[ㅣ, ㅏ]",[ㄴ],Noun
2,시삽,1,2,"[ㅅ, ㅅ]","[ㅣ, ㅏ]",[ㅂ],Noun
3,시암,1,2,"[ㅅ, ㅇ]","[ㅣ, ㅏ]",[ㅁ],Noun
4,시놉,1,2,"[ㅅ, ㄴ]","[ㅣ, ㅗ]",[ㅂ],Noun
...,...,...,...,...,...,...,...
6184,문제,5368,2,"[ㅁ, ㅈ]","[ㅜ, ㅔ]",[ㄴ],Noun
6185,알다,6433,2,"[ㅇ, ㄷ]","[ㅏ, ㅏ]",[ㄹ],Noun
6186,우리,11226,2,"[ㅇ, ㄹ]","[ㅜ, ㅣ]",[],Noun
6187,지다,12289,2,"[ㅈ, ㄷ]","[ㅣ, ㅏ]",[],Noun


In [None]:
# NOTE(review): the corpus2 table displayed above has seven columns
# (어휘, 빈도, 음절수, 초성, 중성, 종성, 품사), so assigning a two-name list
# here would presumably raise a length-mismatch ValueError. This cell has no
# execution count — confirm whether a column selection or a rename was intended.
corpus2.columns = ['어휘', '빈도']
corpus1.columns


In [49]:
# Stack both corpora into one frame.
# NOTE(review): corpus1 keeps its words in '단어' and corpus2 in '어휘', so
# after the concat each row is presumably NaN in one of the two columns and
# each groupby below only sums one corpus — confirm the intended key column.
# Only the value of the last expression is displayed; the results of the
# other bare expressions are discarded.
main_df = pd.concat([corpus1, corpus2], ignore_index=True)
main_df
main_df.groupby('어휘')['빈도'].sum()
main_df.groupby('단어')['빈도'].sum()


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x16aaab050>

# 일반어휘통계

In [101]:
# Look up every corpus1 word in the general-vocabulary statistics (corpus2)
# and store the frequency found there as '빈도2'; words missing from corpus2
# get 0.
# One vectorised map replaces the per-word scans of corpus2 and does not rely
# on corpus1 having a 0..n-1 index. keep='first' mirrors taking the first
# matching row when a word occurs more than once in corpus2.
corpus2_freq = corpus2.drop_duplicates(subset='어휘', keep='first').set_index('어휘')['빈도']
corpus1['빈도2'] = corpus1['단어'].map(corpus2_freq).fillna(0).astype(float)
corpus1

Unnamed: 0,단어,빈도,음절수,초성,중성,종성,품사,빈도2
0,낭망,1,2,"[ㄴ, ㅁ]","[ㅏ, ㅏ]","[ㅇ, ㅇ]",Noun,0.0
1,함지,1,2,"[ㅎ, ㅈ]","[ㅏ, ㅣ]",[ㅁ],Noun,0.0
2,꼬보,1,2,"[ㄲ, ㅂ]","[ㅗ, ㅗ]",[],Noun,0.0
3,부암,1,2,"[ㅂ, ㅇ]","[ㅜ, ㅏ]",[ㅁ],Noun,0.0
4,집아,1,2,"[ㅈ, ㅇ]","[ㅣ, ㅏ]",[ㅂ],Noun,0.0
...,...,...,...,...,...,...,...,...
6455,인제,14093,2,"[ㅇ, ㅈ]","[ㅣ, ㅔ]",[ㄴ],Noun,0.0
6456,우리,14485,2,"[ㅇ, ㄹ]","[ㅜ, ㅣ]",[],Noun,0.0
6457,사람,18997,2,"[ㅅ, ㄹ]","[ㅏ, ㅏ]",[ㅁ],Noun,13594.0
6458,가지,25581,2,"[ㄱ, ㅈ]","[ㅏ, ㅣ]",[],Noun,0.0


In [102]:
# Combine the frequencies of both sources into '총빈도'.
# NOTE(review): word_df is not defined anywhere in the visible part of this
# notebook; presumably it is corpus1 with the '빈도2' column — confirm.
# Both frequency columns are cast to int before they are added.
word_df['빈도'] = word_df['빈도'].astype(int)
word_df['빈도2'] = word_df['빈도2'].astype(int)
word_df['총빈도'] = word_df['빈도'] + word_df['빈도2']

In [103]:
# Sort in place by ascending total frequency and renumber the rows from 0.
word_df.sort_values(by='총빈도', ascending=True, inplace=True, ignore_index=True)
# ignore_index=True above already renumbered the rows; this second reset
# leaves the frame unchanged.
word_df.reset_index(drop=True, inplace=True)
word_df

Unnamed: 0,단어,빈도,음절수,초성,중성,종성,품사,빈도2,총빈도
0,낭망,1,2,"[ㄴ, ㅁ]","[ㅏ, ㅏ]","[ㅇ, ㅇ]",Noun,0,1
1,종지,1,2,"[ㅈ, ㅈ]","[ㅗ, ㅣ]",[ㅇ],Noun,0,1
2,핑키,1,2,"[ㅍ, ㅋ]","[ㅣ, ㅣ]",[ㅇ],Noun,0,1
3,파충,1,2,"[ㅍ, ㅊ]","[ㅏ, ㅜ]",[ㅇ],Noun,0,1
4,토마,1,2,"[ㅌ, ㅁ]","[ㅗ, ㅏ]",[],Noun,0,1
...,...,...,...,...,...,...,...,...,...
6455,인제,14093,2,"[ㅇ, ㅈ]","[ㅣ, ㅔ]",[ㄴ],Noun,0,14093
6456,우리,14485,2,"[ㅇ, ㄹ]","[ㅜ, ㅣ]",[],Noun,0,14485
6457,가지,25581,2,"[ㄱ, ㅈ]","[ㅏ, ㅣ]",[],Noun,0,25581
6458,사람,18997,2,"[ㅅ, ㄹ]","[ㅏ, ㅏ]",[ㅁ],Noun,13594,32591


In [112]:
# Rows whose frequency is exactly 1 in both sources.
word_df[(word_df['빈도'] == 1) & (word_df['빈도2'] == 1)]

Unnamed: 0,단어,빈도,음절수,초성,중성,종성,품사,빈도2,총빈도
1914,모포,1,2,"[ㅁ, ㅍ]","[ㅗ, ㅗ]",[],Noun,1,2
1915,고항,1,2,"[ㄱ, ㅎ]","[ㅗ, ㅏ]",[ㅇ],Noun,1,2
1916,세코,1,2,"[ㅅ, ㅋ]","[ㅔ, ㅗ]",[],Noun,1,2
1917,단심,1,2,"[ㄷ, ㅅ]","[ㅏ, ㅣ]","[ㄴ, ㅁ]",Noun,1,2
1918,술국,1,2,"[ㅅ, ㄱ]","[ㅜ, ㅜ]","[ㄹ, ㄱ]",Noun,1,2
1919,짐꾼,1,2,"[ㅈ, ㄲ]","[ㅣ, ㅜ]","[ㅁ, ㄴ]",Noun,1,2
1920,밤비,1,2,"[ㅂ, ㅂ]","[ㅏ, ㅣ]",[ㅁ],Noun,1,2
1921,밀밭,1,2,"[ㅁ, ㅂ]","[ㅣ, ㅏ]","[ㄹ, ㅌ]",Noun,1,2
1922,풍족,1,2,"[ㅍ, ㅈ]","[ㅜ, ㅗ]","[ㅇ, ㄱ]",Noun,1,2
1923,십장,1,2,"[ㅅ, ㅈ]","[ㅣ, ㅏ]","[ㅂ, ㅇ]",Noun,1,2


In [113]:
# Number of rows whose frequency is exactly 1 in both sources.
len(word_df[(word_df['빈도'] == 1) & (word_df['빈도2'] == 1)])

51

In [122]:
# Count how often each target vowel appears in the first two medial vowels of
# the words whose frequency is 1 in both sources. counter[i] is the number of
# occurrences of target_vowel[i].
# The subset is filtered once instead of on every loop iteration, and the
# vowel-to-slot table is derived from target_vowel so the two cannot drift
# apart.
rare_in_both = word_df[(word_df['빈도'] == 1) & (word_df['빈도2'] == 1)]
vowel_slot = {vowel: slot for slot, vowel in enumerate(target_vowel)}
counter = [0] * len(target_vowel)
for medial_vowels in rare_in_both['중성']:
    # Only the first two syllables are inspected, as before.
    for vowel in medial_vowels[:2]:
        if vowel in vowel_slot:
            counter[vowel_slot[vowel]] += 1
for idx, item in enumerate(counter):
    print(f"{target_vowel[idx]}: {item}")


ㅏ: 35
ㅔ: 6
ㅣ: 26
ㅗ: 19
ㅜ: 16


In [123]:
# Total number of counted vowels. The builtin sum() does the addition; the
# result is bound to its own name so that the builtin is not shadowed by a
# variable called `sum` (which would break any later sum(...) call).
total_vowel_count = sum(counter)
total_vowel_count


102