# 일상대화 말뭉치 Corpus

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import seaborn as sns

## 분석 대상 정의

In [2]:
# Consonants of interest; later cells match them against the initial consonants (초성) of each word.
consonant = ['ㅂ', 'ㅍ', 'ㅃ', 'ㅁ', 'ㄴ']
# Target vowels; filter_target_vowel keeps a word only when both of its vowels (중성) are in this list.
target_vowel = ['ㅏ', 'ㅐ', 'ㅣ', 'ㅗ', 'ㅜ']

## 파일 로드

In [3]:
# Corpus 1: word-frequency table stored as CSV.
corpus1Path = "./word_frequency.csv"
# Corpus 2: general-vocabulary statistics stored as an Excel workbook.
corpus2Path = "./Korean Language Usage Frequency Study/일반어휘통계.xlsx"
corpus1 = pd.read_csv(corpus1Path)
corpus2 = pd.read_excel(corpus2Path)
# Show the (rows, columns) shape of both tables as the cell output.
corpus1.shape, corpus2.shape

((71698, 2), (82501, 5))

In [4]:
# Order corpus 1 from the least frequent to the most frequent word, renumbering the rows.
corpus1 = corpus1.sort_values(by='빈도', ascending=True, ignore_index=True)
corpus1

Unnamed: 0,단어,빈도
0,나대진가,1
1,담갔어요,1
2,들어가겠네,1
3,들어오겠지,1
4,보러도,1
...,...,...
71693,사람,18997
71694,하는,23080
71695,가지,25581
71696,생각,33607


In [5]:
# Order corpus 2 from the most frequent to the least frequent entry, renumbering the rows.
corpus2 = corpus2.sort_values(by='빈도', ascending=False, ignore_index=True)
corpus2

Unnamed: 0,순위,빈도,어휘,풀이,품사
0,1,97499,이다,,지
1,2,50558,것01,,의
2,3,42900,하다01,,동
3,4,39290,있다01,,보
4,5,37028,있다01,,형
...,...,...,...,...,...
82496,1171,1,전사되다01,傳寫-,동
82497,1171,1,부책임자,副責任者,명
82498,1171,1,부처,,명
82499,1171,1,전사14,傳寫,명


## Utils

In [6]:
def extract_korean(text):
    """Return only the Hangul characters of ``text``, joined in their original order.

    A character is kept when it is a standalone consonant jamo (ㄱ-ㅎ), a standalone
    vowel jamo (ㅏ-ㅣ) or a precomposed syllable (가-힣). Everything else (digits,
    Latin letters, punctuation, whitespace, ...) is removed.

    Args:
        text: The string to clean.

    Returns:
        A string made of the Hangul runs found in ``text``; empty when there are none.
    """
    hangul_run = re.compile('[ㄱ-ㅎㅏ-ㅣ가-힣]+')
    return ''.join(match.group() for match in hangul_run.finditer(text))


def syllable_count(df: pd.DataFrame, col_name: str):
    """Add a syllable-count column and keep only the two-syllable rows.

    The count is the string length of ``df[col_name]``, stored in a column named
    ``'음절수'``. Despite the name, this function also filters: only rows whose
    count equals 2 are returned, with the index renumbered from 0.

    The input frame is left untouched; the work is done on a copy.

    Args:
        df: Frame holding the words.
        col_name: Name of the column that contains the word strings.

    Returns:
        A new DataFrame with the ``'음절수'`` column, restricted to two-syllable rows.

    Raises:
        ValueError: If ``col_name`` is not a column of ``df``.
    """
    if col_name not in df.columns:
        raise ValueError(f"col_name: {col_name} not found in df")

    # Work on a copy so the caller's frame does not silently gain a column.
    counted = df.copy()
    counted['음절수'] = counted[col_name].str.len()
    return counted[counted['음절수'] == 2].reset_index(drop=True)

In [7]:
def decompose_hangul_phoneme(text):
    """Split every precomposed Hangul syllable of ``text`` into its jamo.

    Characters outside the range 가-힣 are ignored. For each syllable the offset
    from 0xAC00 is decomposed arithmetically: there are 21 vowels and 28 final
    slots (slot 0 meaning "no final consonant") per initial consonant.

    Args:
        text: The string to decompose.

    Returns:
        A tuple ``(initials, vowels, finals)`` of lists. ``initials`` and
        ``vowels`` have one entry per syllable. ``finals`` only receives an
        entry for syllables that actually have a final consonant, so it can be
        shorter and is not position-aligned with the other two lists.
    """
    # Jamo tables; the list position is the index produced by the arithmetic below.
    initials = ['ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
    vowels = ['ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ']
    # The empty string at index 0 stands for "no final consonant".
    finals = ['', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']

    found_initials, found_vowels, found_finals = [], [], []
    for char in text:
        if not ('가' <= char <= '힣'):
            continue  # not a precomposed syllable: nothing to decompose

        offset = ord(char) - 0xAC00
        initial_idx, within_initial = divmod(offset, 21 * 28)
        vowel_idx, final_idx = divmod(within_initial, 28)

        found_initials.append(initials[initial_idx])
        found_vowels.append(vowels[vowel_idx])
        if final_idx:  # index 0 means the syllable has no final consonant
            found_finals.append(finals[final_idx])

    return found_initials, found_vowels, found_finals

In [8]:
def set_2_syllable(df: pd.DataFrame):
    """Return the rows of ``df`` whose ``'음절수'`` value is 2, re-indexed from 0.

    Args:
        df: Frame that already contains the ``'음절수'`` (syllable count) column.

    Returns:
        The filtered DataFrame with a fresh 0..n-1 index.
    """
    is_two_syllable = df['음절수'] == 2
    return df[is_two_syllable].reset_index(drop=True)

In [9]:
# Usage example: decompose a sample word and print the three jamo lists.
word_sample = "안녕하세요"
chosung_list, jungsung_list, jongsung_list = decompose_hangul_phoneme(word_sample)
print(f"원본 단어: {word_sample}")
print(f"분해 결과: {chosung_list, jungsung_list, jongsung_list}")

원본 단어: 안녕하세요
분해 결과: (['ㅇ', 'ㄴ', 'ㅎ', 'ㅅ', 'ㅇ'], ['ㅏ', 'ㅕ', 'ㅏ', 'ㅔ', 'ㅛ'], ['ㄴ', 'ㅇ'])


## Preprocessing

### 일반어휘 통계 중복어 합치기

In [10]:
# Remove the columns that are not needed for the analysis.
# errors='ignore' makes the cell safe to re-run once the columns are gone, and
# drops whichever of the two is present instead of requiring both.
corpus2.drop(columns=['순위', '풀이'], errors='ignore', inplace=True)
corpus2

Unnamed: 0,빈도,어휘,품사
0,97499,이다,지
1,50558,것01,의
2,42900,하다01,동
3,39290,있다01,보
4,37028,있다01,형
...,...,...,...
82496,1,전사되다01,동
82497,1,부책임자,명
82498,1,부처,명
82499,1,전사14,명


In [11]:
# Reduce every entry to its Hangul characters (e.g. '것01' becomes '것'), then add up
# the frequencies of entries that now share the same spelling. Only the '어휘' and
# '빈도' columns survive the aggregation.
corpus2['어휘'] = corpus2['어휘'].map(extract_korean)
len1 = len(corpus2)
frequency_by_word = corpus2.groupby('어휘')['빈도'].sum()
corpus2 = frequency_by_word.reset_index()
len2 = len(corpus2)
# Least frequent entries first.
corpus2 = corpus2.sort_values(by='빈도', ascending=True, ignore_index=True)
print(f"중복 제거 전: {len1}, 중복 제거 후: {len2}")

중복 제거 전: 82501, 중복 제거 후: 73770


### 음절수 추출

In [12]:
# Keep only the two-syllable words of both corpora.
# NOTE(review): syllable_count already returns only the rows whose length is 2,
# so the set_2_syllable calls below do not remove anything further.
corpus1 = syllable_count(corpus1, '단어')
corpus2 = syllable_count(corpus2, '어휘')
corpus1 = set_2_syllable(corpus1)
corpus2 = set_2_syllable(corpus2)
corpus1

Unnamed: 0,단어,빈도,음절수
0,찔기,1,2
1,낭망,1,2
2,용궁,1,2
3,속죄,1,2
4,익금,1,2
...,...,...,...
21945,사람,18997,2
21946,하는,23080,2
21947,가지,25581,2
21948,생각,33607,2


In [13]:
corpus2

Unnamed: 0,어휘,빈도,음절수
0,시성,1,2
1,시식,1,2
2,시산,1,2
3,시삽,1,2
4,시암,1,2
...,...,...,...
20812,보다,22093,2
20813,되다,24589,2
20814,하다,63825,2
20815,있다,76984,2


In [14]:
def decompose_hangul(df: pd.DataFrame, col_name: str):
    """Attach the jamo decomposition of every word in ``df[col_name]`` to ``df``.

    Each word is passed to ``decompose_hangul_phoneme`` (with a tqdm progress bar)
    and the three resulting lists are stored per row in the columns ``'초성'``
    (initial consonants), ``'중성'`` (vowels) and ``'종성'`` (final consonants).

    Args:
        df: Frame to annotate; the columns are added to this frame itself.
        col_name: Name of the column that contains the word strings.

    Returns:
        The same ``df`` object, now carrying the three extra columns.
    """
    decomposed = [decompose_hangul_phoneme(word) for word in tqdm(df[col_name])]

    df['초성'] = [parts[0] for parts in decomposed]
    df['중성'] = [parts[1] for parts in decomposed]
    df['종성'] = [parts[2] for parts in decomposed]
    return df

# Add the '초성', '중성' and '종성' columns to both corpora.
corpus1 = decompose_hangul(corpus1, col_name='단어')
corpus2 = decompose_hangul(corpus2, col_name='어휘')

100%|██████████| 21950/21950 [00:00<00:00, 214215.18it/s]
100%|██████████| 20817/20817 [00:00<00:00, 985750.23it/s]


In [15]:
def filter_target_vowel(df: pd.DataFrame):
    """Keep the rows whose first and second vowel both belong to ``target_vowel``.

    The vowels are read from the ``'중성'`` column, which holds one list of vowels
    per word. ``target_vowel`` is the module-level list of vowels under study.

    Args:
        df: Frame that already contains the ``'중성'`` column.

    Returns:
        The filtered DataFrame with a fresh 0..n-1 index.
    """
    vowels = df['중성']
    first_is_target = vowels.str[0].isin(target_vowel)
    second_is_target = vowels.str[1].isin(target_vowel)
    return df[first_is_target & second_is_target].reset_index(drop=True)

# Restrict both corpora to words whose two vowels are target vowels.
corpus1 = filter_target_vowel(corpus1)
corpus2 = filter_target_vowel(corpus2)

In [16]:
corpus1

Unnamed: 0,단어,빈도,음절수,초성,중성,종성
0,찔기,1,2,"[ㅉ, ㄱ]","[ㅣ, ㅣ]",[ㄹ]
1,낭망,1,2,"[ㄴ, ㅁ]","[ㅏ, ㅏ]","[ㅇ, ㅇ]"
2,지집,1,2,"[ㅈ, ㅈ]","[ㅣ, ㅣ]",[ㅂ]
3,함지,1,2,"[ㅎ, ㅈ]","[ㅏ, ㅣ]",[ㅁ]
4,꼬보,1,2,"[ㄲ, ㅂ]","[ㅗ, ㅗ]",[]
...,...,...,...,...,...,...
8697,우리,14485,2,"[ㅇ, ㄹ]","[ㅜ, ㅣ]",[]
8698,하고,18843,2,"[ㅎ, ㄱ]","[ㅏ, ㅗ]",[]
8699,사람,18997,2,"[ㅅ, ㄹ]","[ㅏ, ㅏ]",[ㅁ]
8700,가지,25581,2,"[ㄱ, ㅈ]","[ㅏ, ㅣ]",[]


In [17]:
corpus2

Unnamed: 0,어휘,빈도,음절수,초성,중성,종성
0,시식,1,2,"[ㅅ, ㅅ]","[ㅣ, ㅣ]",[ㄱ]
1,시산,1,2,"[ㅅ, ㅅ]","[ㅣ, ㅏ]",[ㄴ]
2,시삽,1,2,"[ㅅ, ㅅ]","[ㅣ, ㅏ]",[ㅂ]
3,시암,1,2,"[ㅅ, ㅇ]","[ㅣ, ㅏ]",[ㅁ]
4,시재,1,2,"[ㅅ, ㅈ]","[ㅣ, ㅐ]",[]
...,...,...,...,...,...,...
8478,않다,18416,2,"[ㅇ, ㄷ]","[ㅏ, ㅏ]",[ㄶ]
8479,보다,22093,2,"[ㅂ, ㄷ]","[ㅗ, ㅏ]",[]
8480,하다,63825,2,"[ㅎ, ㄷ]","[ㅏ, ㅏ]",[]
8481,있다,76984,2,"[ㅇ, ㄷ]","[ㅣ, ㅏ]",[ㅆ]


In [18]:
from konlpy.tag import Okt

# Shared Okt tagger, created on first use so it is not rebuilt for every word.
_OKT_TAGGER = None


def get_pos(word):
    """Return the part-of-speech tagging of ``word`` produced by KoNLPy's Okt.

    The Okt analyzer is instantiated once, on the first call, and reused
    afterwards; building a new analyzer per word is unnecessary repeated work
    when this function is called in a loop over a whole corpus.

    Args:
        word: The text to analyze.

    Returns:
        The result of ``Okt.pos(word)``: a list of ``(token, tag)`` pairs,
        e.g. ``[('쩔기', 'Verb')]``.
    """
    global _OKT_TAGGER
    if _OKT_TAGGER is None:
        _OKT_TAGGER = Okt()

    return _OKT_TAGGER.pos(word)

In [19]:
# Usage example: tag a single word and print the result.
word = "쩔기"
result = get_pos(word)
print(f"단어: {word}")
print(f"품사 분석 결과: {result}")

단어: 쩔기
품사 분석 결과: [('쩔기', 'Verb')]


In [20]:
def get_noun(df: pd.DataFrame, col_name: str):
    """Keep only the rows whose word is tagged as exactly one noun token.

    Every word in ``df[col_name]`` is tagged with ``get_pos``. A row survives
    only when the tagger returns a single token and that token's tag is
    ``'Noun'``; surviving rows get ``'Noun'`` in the ``'품사'`` column.

    ``df`` is modified in place (rows dropped, column added, index renumbered)
    and the same object is returned.

    Args:
        df: Frame holding the words.
        col_name: Name of the column that contains the word strings.

    Returns:
        ``df`` itself, reduced to the single-noun rows with a 0..n-1 index.
    """
    # Renumber first so the positions produced by enumerate() are valid row
    # labels even when the caller passes a frame with a non-default index.
    df.reset_index(drop=True, inplace=True)

    drop_index = []
    for position, word in enumerate(df[col_name]):
        tagged = get_pos(word)
        is_single_noun = len(tagged) == 1 and tagged[0][1] == 'Noun'
        if not is_single_noun:
            drop_index.append(position)

    df.drop(index=drop_index, inplace=True)
    # Every remaining row is a noun by construction.
    df['품사'] = 'Noun'
    df.reset_index(drop=True, inplace=True)
    return df

In [21]:
# Keep only the words tagged as a single noun in both corpora.
corpus1 = get_noun(corpus1, '단어')
corpus2 = get_noun(corpus2, '어휘')

In [22]:
# Copy corpus 1's frequency into its own column so it stays identifiable after the corpora are merged.
corpus1['빈도1'] = corpus1['빈도']
corpus1

Unnamed: 0,단어,빈도,음절수,초성,중성,종성,품사,빈도1
0,낭망,1,2,"[ㄴ, ㅁ]","[ㅏ, ㅏ]","[ㅇ, ㅇ]",Noun,1
1,함지,1,2,"[ㅎ, ㅈ]","[ㅏ, ㅣ]",[ㅁ],Noun,1
2,꼬보,1,2,"[ㄲ, ㅂ]","[ㅗ, ㅗ]",[],Noun,1
3,부암,1,2,"[ㅂ, ㅇ]","[ㅜ, ㅏ]",[ㅁ],Noun,1
4,생동,1,2,"[ㅅ, ㄷ]","[ㅐ, ㅗ]","[ㅇ, ㅇ]",Noun,1
...,...,...,...,...,...,...,...,...
7036,친구,11288,2,"[ㅊ, ㄱ]","[ㅣ, ㅜ]",[ㄴ],Noun,11288
7037,우리,14485,2,"[ㅇ, ㄹ]","[ㅜ, ㅣ]",[],Noun,14485
7038,사람,18997,2,"[ㅅ, ㄹ]","[ㅏ, ㅏ]",[ㅁ],Noun,18997
7039,가지,25581,2,"[ㄱ, ㅈ]","[ㅏ, ㅣ]",[],Noun,25581


In [23]:
# Copy corpus 2's frequency into its own column so it stays identifiable after the corpora are merged.
corpus2['빈도2'] = corpus2['빈도']
corpus2

Unnamed: 0,어휘,빈도,음절수,초성,중성,종성,품사,빈도2
0,시식,1,2,"[ㅅ, ㅅ]","[ㅣ, ㅣ]",[ㄱ],Noun,1
1,시산,1,2,"[ㅅ, ㅅ]","[ㅣ, ㅏ]",[ㄴ],Noun,1
2,시삽,1,2,"[ㅅ, ㅅ]","[ㅣ, ㅏ]",[ㅂ],Noun,1
3,시암,1,2,"[ㅅ, ㅇ]","[ㅣ, ㅏ]",[ㅁ],Noun,1
4,시재,1,2,"[ㅅ, ㅈ]","[ㅣ, ㅐ]",[],Noun,1
...,...,...,...,...,...,...,...,...
7191,알다,6433,2,"[ㅇ, ㄷ]","[ㅏ, ㅏ]",[ㄹ],Noun,6433
7192,때문,7554,2,"[ㄸ, ㅁ]","[ㅐ, ㅜ]",[ㄴ],Noun,7554
7193,우리,11226,2,"[ㅇ, ㄹ]","[ㅜ, ㅣ]",[],Noun,11226
7194,지다,12289,2,"[ㅈ, ㄷ]","[ㅣ, ㅏ]",[],Noun,12289


In [24]:
# Use the same word-column name ('단어') as corpus 1.
corpus2 = corpus2.rename(columns={'어휘': '단어'})

In [25]:
# Stack both corpora and collapse rows that share a word. The total frequency is
# summed, while '빈도1' / '빈도2' keep the frequency contributed by each source.
stacked = pd.concat([corpus1, corpus2], ignore_index=True)
aggregations = {
    '빈도': 'sum',
    '음절수': 'first',
    '초성': 'first',
    '중성': 'first',
    '종성': 'first',
    '품사': 'first',
    '빈도1': 'first',
    '빈도2': 'last',
}
main_df = stacked.groupby('단어').agg(aggregations).reset_index()

# Least frequent words first.
main_df = main_df.sort_values(by='빈도', ascending=True, ignore_index=True)

# A word missing from one source has no value there: treat it as frequency 0.
for source_col in ('빈도1', '빈도2'):
    main_df[source_col] = main_df[source_col].fillna(0).astype(int)
main_df

Unnamed: 0,단어,빈도,음절수,초성,중성,종성,품사,빈도1,빈도2
0,힝힝,1,2,"[ㅎ, ㅎ]","[ㅣ, ㅣ]","[ㅇ, ㅇ]",Noun,0,1
1,로똥,1,2,"[ㄹ, ㄸ]","[ㅗ, ㅗ]",[ㅇ],Noun,1,0
2,종놈,1,2,"[ㅈ, ㄴ]","[ㅗ, ㅗ]","[ㅇ, ㅁ]",Noun,0,1
3,로리,1,2,"[ㄹ, ㄹ]","[ㅗ, ㅣ]",[],Noun,1,0
4,로미,1,2,"[ㄹ, ㅁ]","[ㅗ, ㅣ]",[],Noun,1,0
...,...,...,...,...,...,...,...,...,...
9739,때문,15147,2,"[ㄸ, ㅁ]","[ㅐ, ㅜ]",[ㄴ],Noun,7593,7554
9740,우리,25711,2,"[ㅇ, ㄹ]","[ㅜ, ㅣ]",[],Noun,14485,11226
9741,가지,28383,2,"[ㄱ, ㅈ]","[ㅏ, ㅣ]",[],Noun,25581,2802
9742,사람,32591,2,"[ㅅ, ㄹ]","[ㅏ, ㅏ]",[ㅁ],Noun,18997,13594


In [26]:
lower_bound = -1
upper_bound = 6
# Keep the words whose frequency lies strictly between the two bounds in BOTH sources.
source1_in_range = main_df['빈도1'].gt(lower_bound) & main_df['빈도1'].lt(upper_bound)
source2_in_range = main_df['빈도2'].gt(lower_bound) & main_df['빈도2'].lt(upper_bound)
sample = main_df[source2_in_range & source1_in_range]

In [27]:
def count_vowel(df: pd.DataFrame, vowels=('ㅏ', 'ㅐ', 'ㅣ', 'ㅗ', 'ㅜ')):
    """Count how often each vowel occurs in the first and second syllable.

    The vowels are read from the ``'중성'`` column, which holds one sequence of
    vowels per word; only its first two entries are inspected.

    Args:
        df: Frame that contains the ``'중성'`` column.
        vowels: Distinct vowels to count, in output order. Defaults to the
            five vowels this function originally hard-coded.

    Returns:
        A list with one ``[first_syllable_count, second_syllable_count]`` pair
        per entry of ``vowels``, in the same order.

    Raises:
        IndexError: If a row holds fewer than two vowels.
    """
    row_of = {vowel: row for row, vowel in enumerate(vowels)}
    counter = [[0, 0] for _ in vowels]

    for item in df['중성']:
        for position in (0, 1):
            row = row_of.get(item[position])
            if row is not None:  # vowels outside the requested set are ignored
                counter[row][position] += 1
    return counter

def count_consonant(df: pd.DataFrame, consonants=('ㅂ', 'ㅍ', 'ㅃ', 'ㅁ', 'ㄴ')):
    """Count how often each consonant starts the first and the second syllable.

    The consonants are read from the ``'초성'`` column, which holds one sequence
    of initial consonants per word; only its first two entries are inspected.

    Args:
        df: Frame that contains the ``'초성'`` column.
        consonants: Consonants to count, in output order. Defaults to the five
            consonants this function originally hard-coded.

    Returns:
        A list with one ``[first_syllable_count, second_syllable_count]`` pair
        per entry of ``consonants``, in the same order.

    Raises:
        IndexError: If a row holds fewer than two initial consonants.
    """
    counter = [[0, 0] for _ in consonants]

    for item in df['초성']:
        first, second = item[0], item[1]
        # A distinct name for the inner index avoids shadowing any outer loop variable.
        for row, target in enumerate(consonants):
            if first == target:
                counter[row][0] += 1
            if second == target:
                counter[row][1] += 1
    return counter

# Print the [first syllable, second syllable] counts for each target vowel, then
# for each consonant of interest. Labels and count rows both have five entries.
counter = count_vowel(sample)
for vowel_label, counts in zip(target_vowel, counter):
    print(f"{vowel_label}: {counts}")
counter = count_consonant(sample)
for consonant_label, counts in zip(consonant, counter):
    print(f"{consonant_label}: {counts}")

ㅏ: [1458, 1414]
ㅐ: [561, 453]
ㅣ: [818, 1070]
ㅗ: [1058, 995]
ㅜ: [755, 718]
ㅂ: [509, 462]
ㅍ: [217, 227]
ㅃ: [48, 34]
ㅁ: [391, 311]
ㄴ: [240, 163]


In [28]:
# Uncomment to display every row instead of pandas' truncated preview.
#pd.set_option('display.max_rows', None)
# Restore pandas' default row limit for displayed frames.
pd.reset_option('display.max_rows')
# Sample rows whose first vowel is 'ㅐ' and whose first initial consonant is in `consonant`.
sample[(sample['중성'].str[0] == 'ㅐ') & (sample['초성'].str[0].isin(consonant))]

Unnamed: 0,단어,빈도,음절수,초성,중성,종성,품사,빈도1,빈도2
173,맹맹,1,2,"[ㅁ, ㅁ]","[ㅐ, ㅐ]","[ㅇ, ㅇ]",Noun,1,0
175,맹지,1,2,"[ㅁ, ㅈ]","[ㅐ, ㅣ]",[ㅇ],Noun,0,1
176,맹추,1,2,"[ㅁ, ㅊ]","[ㅐ, ㅜ]",[ㅇ],Noun,0,1
177,맹호,1,2,"[ㅁ, ㅎ]","[ㅐ, ㅗ]",[ㅇ],Noun,0,1
190,맹독,1,2,"[ㅁ, ㄷ]","[ㅐ, ㅗ]","[ㅇ, ㄱ]",Noun,0,1
...,...,...,...,...,...,...,...,...,...
4620,매호,6,2,"[ㅁ, ㅎ]","[ㅐ, ㅗ]",[],Noun,3,3
4763,백삼,7,2,"[ㅂ, ㅅ]","[ㅐ, ㅏ]","[ㄱ, ㅁ]",Noun,4,3
4795,매부,7,2,"[ㅁ, ㅂ]","[ㅐ, ㅜ]",[],Noun,2,5
4803,매상,7,2,"[ㅁ, ㅅ]","[ㅐ, ㅏ]",[ㅇ],Noun,2,5


In [42]:
# First 20 sample rows whose first syllable has the initial consonant 'ㅂ' and the vowel 'ㅐ'.
sample[(sample['초성'].str[0] == 'ㅂ') & (sample['중성'].str[0] == 'ㅐ')].head(20)

Unnamed: 0,단어,빈도,음절수,초성,중성,종성,품사,빈도1,빈도2
798,뱀눈,1,2,"[ㅂ, ㄴ]","[ㅐ, ㅜ]","[ㅁ, ㄴ]",Noun,0,1
800,뱃놈,1,2,"[ㅂ, ㄴ]","[ㅐ, ㅗ]","[ㅅ, ㅁ]",Noun,1,0
802,뱃소,1,2,"[ㅂ, ㅅ]","[ㅐ, ㅗ]",[ㅅ],Noun,1,0
804,뱃지,1,2,"[ㅂ, ㅈ]","[ㅐ, ㅣ]",[ㅅ],Noun,1,0
1033,백학,1,2,"[ㅂ, ㅎ]","[ㅐ, ㅏ]","[ㄱ, ㄱ]",Noun,0,1
1034,백파,1,2,"[ㅂ, ㅍ]","[ㅐ, ㅏ]",[ㄱ],Noun,0,1
1215,배각,1,2,"[ㅂ, ㄱ]","[ㅐ, ㅏ]",[ㄱ],Noun,1,0
1217,배값,1,2,"[ㅂ, ㄱ]","[ㅐ, ㅏ]",[ㅄ],Noun,1,0
1218,배곳,1,2,"[ㅂ, ㄱ]","[ㅐ, ㅗ]",[ㅅ],Noun,1,0
1222,배랑,1,2,"[ㅂ, ㄹ]","[ㅐ, ㅏ]",[ㅇ],Noun,1,0


In [37]:
# Broader pool of hand-picked candidates, kept for reference only. It used to be
# assigned to `word_list` and was overwritten straight away, so it never took
# part in the filtering below.
candidate_words = ['백학', '배랑', '빽판', '뺴뺴', '패주', '맹호', '내훈',
                   '삐침', '필생', '비재', '니나', '',
                   '풍상', '분때',  #뿌끼
                   '빨빨', '파쟁',
                   ]
# Words actually selected from the sample.
word_list = ['빽판', '패주', '백학', '맹호', '내훈'
             ]
filtered_df = sample[sample['단어'].isin(word_list)]
filtered_df

Unnamed: 0,단어,빈도,음절수,초성,중성,종성,품사,빈도1,빈도2
177,맹호,1,2,"[ㅁ, ㅎ]","[ㅐ, ㅗ]",[ㅇ],Noun,0,1
455,내훈,1,2,"[ㄴ, ㅎ]","[ㅐ, ㅜ]",[ㄴ],Noun,0,1
892,빽판,1,2,"[ㅃ, ㅍ]","[ㅐ, ㅏ]","[ㄱ, ㄴ]",Noun,0,1
1033,백학,1,2,"[ㅂ, ㅎ]","[ㅐ, ㅏ]","[ㄱ, ㄱ]",Noun,0,1
1735,패주,1,2,"[ㅍ, ㅈ]","[ㅐ, ㅜ]",[],Noun,0,1


In [38]:
# Sample rows whose first vowel is 'ㅐ' and whose first initial consonant is in
# `consonant`; the subset is built once and used for both reports.
ae_subset = sample[(sample['중성'].str[0] == 'ㅐ') & (sample['초성'].str[0].isin(consonant))]
counter = count_vowel(ae_subset)
for vowel_label, counts in zip(target_vowel, counter):
    print(f"{vowel_label}: {counts}")
counter = count_consonant(ae_subset)
for consonant_label, counts in zip(consonant, counter):
    print(f"{consonant_label}: {counts}")

ㅏ: [0, 51]
ㅐ: [169, 20]
ㅣ: [0, 39]
ㅗ: [0, 39]
ㅜ: [0, 20]
ㅂ: [77, 14]
ㅍ: [22, 9]
ㅃ: [10, 2]
ㅁ: [32, 7]
ㄴ: [28, 7]


In [39]:
# Same two reports, restricted to the hand-selected words in `filtered_df`.
counter = count_vowel(filtered_df)
for vowel_label, counts in zip(target_vowel, counter):
    print(f"{vowel_label}: {counts}")

counter = count_consonant(filtered_df)
for consonant_label, counts in zip(consonant, counter):
    print(f"{consonant_label}: {counts}")

ㅏ: [0, 2]
ㅐ: [5, 0]
ㅣ: [0, 0]
ㅗ: [0, 1]
ㅜ: [0, 2]
ㅂ: [1, 0]
ㅍ: [1, 1]
ㅃ: [1, 0]
ㅁ: [1, 0]
ㄴ: [1, 0]
