# Kkma 형태소 분석기

In [None]:
import numpy as np
import pandas as pd
from konlpy.tag import Kkma
import collections
# Create one Kkma tagger instance; the cells below reuse it through `kkma`.
kkma = Kkma()

In [None]:
# Load the raw song table (TITLE / ARTIST / LYRICS).
# The path is a raw string so the Windows backslashes are taken literally
# instead of being parsed as the invalid escape sequences "\P" and "\d".
songs = pd.read_csv(r'C:\Python\doodle.csv', engine='python', encoding='euc-kr')
songs.head()

Unnamed: 0,TITLE,ARTIST,LYRICS
0,가거라 삼팔선,남인수,아 산이 막혀\r\n 못오시나요아 물이 막혀\r\n ...
1,가는 봄 오는 봄,최숙자,하늘마저 울던 그날에\r\n 어머님을 이별을 하고원한의 십년 세월\r\n ...
2,가는 세월,서유석,가는 세월 그 누구가\r\n 막을 수가 있나요흘러가는 시냇물을\r\n ...
3,가버린 당신,최진희,잊는다고 생각하면\r\n 또 다시 당신 생각미웁다고 생각하면\r\n ...
4,가을비 우산속,최 헌,그리움이 눈처럼\r\n 쌓인 거리를나 혼자서 걸었네\r\n ...


## 1. kkma.nouns()로 BoW(Bag of Words) 생성

In [None]:
# Demo: kkma.nouns() returns the list of nouns it finds in the sentence.
print(kkma.nouns("국가수리과학연구소는 수학적 접근을 통해 기업이 직면한 문제를 함께 해결하고 있습니다."))

['국가', '국가수리과학연구소', '수리', '과학', '연구소', '수학적', '접근', '기업', '직면', '문제', '해결']


In [None]:
# Reload the song table that already carries the noun-based BoW column.
# Raw string: keeps the backslashes literal rather than relying on the
# invalid escape sequences "\P" and "\d".
songs = pd.read_csv(r'C:\Python\doodle_bow.csv', engine='python', encoding='utf-8')
songs.head()

Unnamed: 0,TITLE,ARTIST,LYRICS,BoW
0,가거라 삼팔선,남인수,아 산이 막혀\r\n 못오시나요아 물이 막혀\r\n ...,"['산', '물', '고향', '고향땅', '땅', '남북', '원한', '천리길'..."
1,가는 봄 오는 봄,최숙자,하늘마저 울던 그날에\r\n 어머님을 이별을 하고원한의 십년 세월\r\n ...,"['하늘', '그날', '어머님', '이별', '원한', '십', '십년', '년'..."
2,가는 세월,서유석,가는 세월 그 누구가\r\n 막을 수가 있나요흘러가는 시냇물을\r\n ...,"['세월', '누구', '수', '시냇물', '아가', '어른', '슬픔', '행복..."
3,가버린 당신,최진희,잊는다고 생각하면\r\n 또 다시 당신 생각미웁다고 생각하면\r\n ...,"['생각', '당신', '얼굴', '사람', '모두', '부', '사랑', '순간'..."
4,가을비 우산속,최 헌,그리움이 눈처럼\r\n 쌓인 거리를나 혼자서 걸었네\r\n ...,"['그리움', '눈', '거리', '나', '혼자', '미련', '미련때문', '때..."


## 2. kkma.pos()로 new_BoW(Bag of Words) 생성

In [None]:
# Demo: kkma.pos() returns (morpheme, POS tag) pairs for the sentence.
print(kkma.pos("국가수리과학연구소는 수학적 접근을 통해 기업이 직면한 문제를 함께 해결하고 있습니다."))

[('국가', 'NNG'), ('수리', 'NNG'), ('과학', 'NNG'), ('연구소', 'NNG'), ('는', 'JX'), ('수학적', 'NNG'), ('접근', 'NNG'), ('을', 'JKO'), ('통하', 'VV'), ('어', 'ECS'), ('기업', 'NNG'), ('이', 'JKS'), ('직면', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('문제', 'NNG'), ('를', 'JKO'), ('함께', 'MAG'), ('해결', 'NNG'), ('하', 'XSV'), ('고', 'ECE'), ('있', 'VXV'), ('습니다', 'EFN'), ('.', 'SF')]


In [None]:
# Test: tag the first song's lyrics and view the result as a two-column
# DataFrame (column 0 = morpheme, column 1 = POS tag).
text = songs.LYRICS[0]
df = pd.DataFrame(kkma.pos(text))
df

Unnamed: 0,0,1
0,아,VV
1,아,ECS
2,산,NNG
3,이,JKS
4,막히,VV
5,어,ECS
6,못,MAG
7,오,VV
8,시,EPH
9,나요,ECE


### 2-1. 의미 있는 어간의 품사 태그만 골라 리스트로 생성
IC 감탄사  
MA 부사  
MAG 일반부사  
MD 관형사  
MDN 수 관형사  
MDT 일반 관형사  
NN 명사  
NNG 보통명사  
NNP 고유명사  
NR 수사  
VA 형용사  
VV 동사  
XP 접두사  
XPN 체언 접두사  
XPV 용언 접두사  
XR 어근  
UN 명사추정범주  

In [None]:
# POS tags whose morphemes are kept as meaningful stems.
ugan = [
    'IC',                # interjection
    'MA', 'MAG',         # adverb, general adverb
    'MD', 'MDN', 'MDT',  # determiner, numeral determiner, general determiner
    'NN', 'NNG', 'NNP',  # noun, common noun, proper noun
    'NR',                # numeral
    'VA', 'VV',          # adjective, verb
    'XP', 'XPN', 'XPV',  # prefix, nominal prefix, verbal prefix
    'XR',                # root
    'UN',                # presumed noun
]
len(ugan)

17

In [None]:
# Test: collect the first song's morphemes whose tag is listed in `ugan`,
# grouped in `ugan` order. Iterating over `ugan` itself instead of a
# hard-coded range(16) ensures the 17th tag, 'UN', is not skipped.
c = []
for tag in ugan:
    c.extend(df.loc[df[1] == tag, 0])
print(c)

['못', '못', '다', '함께', '이', '산', '물', '고향', '땅', '남북', '원한', '천리길', '꿈', '꿈', '삼팔선', '때', '눈', '때', '보따리', '고개길', '새', '자유', '여', '자유', '여', '목숨', '같', '아', '막히', '오', '아', '막히', '오', '가', '오', '가로막히', '찾', '찾', '탄하', '아', '꽃피', '나오', '늘', '아', '오', '나오', '늘', '메', '넘', '사', '울', '넘', '위하', '위하', '바치']


### 2-2. 전체 곡에 대해 어간만 뽑아 BoW 리스트를 만들고 new_BoW 열로 저장

In [None]:
from tqdm import tqdm, trange
from time import sleep

In [None]:
# Build one bag of stems per song: for every tag in `ugan` (in that order),
# keep the morphemes that kkma.pos() labelled with that tag.
#
# * The tag loop follows `ugan` itself instead of a hard-coded range(17),
#   so the two cannot drift apart.
# * The (morpheme, tag) pairs are filtered directly; the intermediate
#   DataFrame raised KeyError on column 1 whenever a song produced no tokens.
# * tqdm wraps the iterable, so no manual pbar.update() bookkeeping is needed.
BoW = []
for text in tqdm(songs.LYRICS, total=len(songs)):
    tagged = kkma.pos(text)
    stems = []
    for tag in ugan:
        stems.extend(word for word, pos in tagged if pos == tag)
    BoW.append(stems)

100%|██████████████████████████████████████████████████████████████████████████████| 6997/6997 [59:24<00:00,  2.57it/s]


In [None]:
# Attach the per-song stem lists and persist the table.
# Raw string: keeps the backslashes literal rather than relying on the
# invalid escape sequences "\P" and "\d".
songs['new_BoW'] = BoW
songs.to_csv(r'C:\Python\doodle_0821.csv', encoding='euc-kr')

In [None]:
# Reload the saved table (raw-string path: backslashes are literal, not the
# invalid escape sequences "\P" and "\d").
songs = pd.read_csv(r'C:\Python\doodle_0821.csv', engine='python', encoding='euc-kr')
songs.head()

Unnamed: 0.1,Unnamed: 0,TITLE,ARTIST,LYRICS,BoW,new_BoW
0,0,가거라 삼팔선,남인수,아 산이 막혀\r\r\n 못오시나요아 물이 막혀\r\r\n ...,"['산', '물', '고향', '고향땅', '땅', '남북', '원한', '천리길'...","['못', '못', '다', '함께', '이', '산', '물', '고향', '땅'..."
1,1,가는 봄 오는 봄,최숙자,하늘마저 울던 그날에\r\r\n 어머님을 이별을 하고원한의 십년 세월\r\r\...,"['하늘', '그날', '어머님', '이별', '원한', '십', '십년', '년'...","['하늘', '그날', '어머님', '이별', '원한', '세월', '눈물', '속..."
2,2,가는 세월,서유석,가는 세월 그 누구가\r\r\n 막을 수가 있나요흘러가는 시냇물을\r\r...,"['세월', '누구', '수', '시냇물', '아가', '어른', '슬픔', '행복...","['다', '다', '그', '이', '이', '세월', '시냇물', '아가', '..."
3,3,가버린 당신,최진희,잊는다고 생각하면\r\r\n 또 다시 당신 생각미웁다고 생각하면\r\r\...,"['생각', '당신', '얼굴', '사람', '모두', '부', '사랑', '순간'...","['또', '다시', '오히려', '모두', '왜', '이렇게', '모두', '왜'..."
4,4,가을비 우산속,최 헌,그리움이 눈처럼\r\r\n 쌓인 거리를나 혼자서 걸었네\r\r...,"['그리움', '눈', '거리', '나', '혼자', '미련', '미련때문', '때...","['왜', '이다지', '또', '다시', '언젠가', '언젠가', '그', '그'..."


## 3. new_BoW열에 있는 모든 단어를 full_BoW라는 하나의 리스트 안에 합산

In [None]:
# After the CSV round-trip each new_BoW cell is a str that looks like a list,
# not an actual list.
songs.new_BoW[0]

"['못', '못', '다', '함께', '이', '산', '물', '고향', '땅', '남북', '원한', '천리길', '꿈', '꿈', '삼팔선', '때', '눈', '때', '보따리', '고개길', '새', '자유', '여', '자유', '여', '목숨', '같', '아', '막히', '오', '아', '막히', '오', '가', '오', '가로막히', '찾', '찾', '탄하', '아', '꽃피', '나오', '늘', '아', '오', '나오', '늘', '메', '넘', '사', '울', '넘', '위하', '위하', '바치']"

In [None]:
# Helper that turns stringified word lists back into real lists
def string_to_list (column):
    """Parse each stringified word list in *column* into a list of words.

    Each item is expected to look like the ``str()`` of a Python list, e.g.
    ``"['못', '함께']"``. Items are parsed with ``ast.literal_eval`` so that
    words containing a comma or an apostrophe survive intact and ``"[]"``
    yields an empty list rather than ``['']``. Items that are not valid
    literals (for instance a truncated string) fall back to the legacy
    approach: drop ``[``, ``]`` and ``'``, split on commas, and discard
    empty pieces. Items that are already lists or tuples are accepted as-is.

    Args:
        column: Iterable of stringified lists (e.g. a DataFrame column).

    Returns:
        A list with one list of whitespace-stripped words per input item,
        in input order.
    """
    import ast  # local import: only this helper needs it

    strip_brackets_and_quotes = str.maketrans('', '', "[]'")
    lyrics = []
    for string in column:
        if isinstance(string, (list, tuple)):
            parsed = string
        else:
            try:
                parsed = ast.literal_eval(string)
            except (ValueError, SyntaxError):
                parsed = None

        if isinstance(parsed, (list, tuple)):
            words = [str(word).strip() for word in parsed]
        else:
            # Legacy character-stripping parse for malformed input.
            cleaned = string.translate(strip_brackets_and_quotes)
            words = [word.strip() for word in cleaned.split(',')]
            words = [word for word in words if word]
        lyrics.append(words)
    return lyrics

In [None]:
# Test: parsed word list of the first song
string_to_list(songs.new_BoW)[0]

['못',
 '못',
 '다',
 '함께',
 '이',
 '산',
 '물',
 '고향',
 '땅',
 '남북',
 '원한',
 '천리길',
 '꿈',
 '꿈',
 '삼팔선',
 '때',
 '눈',
 '때',
 '보따리',
 '고개길',
 '새',
 '자유',
 '여',
 '자유',
 '여',
 '목숨',
 '같',
 '아',
 '막히',
 '오',
 '아',
 '막히',
 '오',
 '가',
 '오',
 '가로막히',
 '찾',
 '찾',
 '탄하',
 '아',
 '꽃피',
 '나오',
 '늘',
 '아',
 '오',
 '나오',
 '늘',
 '메',
 '넘',
 '사',
 '울',
 '넘',
 '위하',
 '위하',
 '바치']

In [None]:
# Check the type of one parsed entry
type(string_to_list(songs.new_BoW)[0])

list

In [None]:
# Test: fourth word of the first song's parsed list
string_to_list(songs.new_BoW)[0][3]

'함께'

In [None]:
# Test: word-frequency table for the first song only.
count = collections.Counter(string_to_list(songs.new_BoW)[0])
c = dict(count)  # plain dict of word -> number of occurrences
ko = list(c.keys())  # distinct words
ji = list(c.values())  # their counts, in the same order as `ko`
count_df = pd.DataFrame({'WORD' : ko, 'Frequency' : ji})
count_df

Unnamed: 0,WORD,Frequency
0,못,2
1,다,1
2,함께,1
3,이,1
4,산,1
5,물,1
6,고향,1
7,땅,1
8,남북,1
9,원한,1


In [None]:
# Flatten every song's stem list into one corpus-wide list.
# The column is parsed exactly once: calling string_to_list(songs.new_BoW)
# inside the loop re-parsed all songs on every iteration, which made the
# cell quadratic in the number of songs.
full_BoW = []
for words in tqdm(string_to_list(songs.new_BoW)):
    full_BoW.extend(words)

 55%|████████████████████████████████████████▌                                 | 3835/6997 [2:08:49<1:42:16,  1.94s/it]

## 4. full_BoW의 Frequency 계산하여 dic 생성

In [None]:
# Tally how often each stem occurs over the whole corpus and present the
# result as a WORD / Frequency table.
count = collections.Counter(full_BoW)
c = dict(count)
ko, ji = list(c), list(c.values())
count_df = pd.DataFrame({'WORD': ko, 'Frequency': ji})
count_df

## 5. 내림차순으로 정렬해서 상위 100개로 dic100 생성

In [None]:
# Order the words from most to least frequent and keep the first 100 rows.
sorted_result = count_df.sort_values(by='Frequency', ascending=False)
dic100 = sorted_result.iloc[:100]
dic100

In [None]:
# Save the top-100 dictionary and read it back; the reload gives `dic100` a
# fresh 0..n-1 row index. Paths are raw strings so the backslashes stay
# literal instead of relying on the invalid escapes "\P" and "\d".
dic100.to_csv(r"C:\Python\dic100.csv", encoding='euc-kr', index=False)
dic100 = pd.read_csv(r'C:\Python\dic100.csv', engine='python', encoding='euc-kr')
dic100

In [None]:
# Dictionary words as a plain Python list, in dic100 row order.
dic100_word_list = dic100['WORD'].tolist()
dic100_word_list

## 6. 한 곡당 Frequency 계산해서 곡마다 벡터를 생성

In [None]:
# Build one frequency vector per song over the dic100 vocabulary.
#
# * The new_BoW column is parsed once up front; re-parsing the whole column
#   for every song made the loop quadratic in the number of songs.
# * Counter lookups replace the nested word-by-word comparison loops.
# * All song columns are collected first and attached with a single concat,
#   so `dic100` itself is left unmodified instead of being aliased and
#   mutated in place.
# * The vector length follows dic100_word_list rather than a hard-coded 100.
parsed_bows = string_to_list(songs.new_BoW)
song_columns = {}
for title, artist, words in tqdm(
        zip(songs.TITLE, songs.ARTIST, parsed_bows), total=len(songs)):
    count = collections.Counter(words)
    # Column label is "TITLE - ARTIST"; songs sharing a label share one
    # column and the last one wins.
    song_columns[str(title) + ' - ' + str(artist)] = [
        float(count[word]) for word in dic100_word_list
    ]
dic100_vec = pd.concat(
    [dic100, pd.DataFrame(song_columns, index=dic100.index)], axis=1)

In [None]:
# Inspect the result: one row per dictionary word, one column per song.
dic100_vec

Unnamed: 0,WORD,Frequency,가거라 삼팔선 - 남인수,가는 봄 오는 봄 - 최숙자,가는 세월 - 서유석,가버린 당신 - 최진희,가을비 우산속 - 최 헌,가을을 - 패티김,가장 무도회 - 김완선,가지 마오 - 나훈아,...,How Do I Say - 신화,She's Mine - 휘성&SEVEN;,4월의 눈물 - 정인호,고백 - 3BOYS,기억해 - 정인호,남자는 속으로 운다 - 전미경,내가 - 이수영,Sweety - 클래지콰이,Ring My Bell - 다이나믹 듀오,기억을 흘리다 - 심현보
0,너,18011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,사랑,17731,0.0,0.0,0.0,2.0,0.0,4.0,4.0,6.0,...,5.0,4.0,2.0,5.0,4.0,4.0,6.0,0.0,2.0,2.0
2,내,12819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,3.0,0.0,1.0,2.0,0.0,2.0,3.0,0.0
3,나,12289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,그대,8550,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,수,7886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,날,7689,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,3.0,8.0,1.0,0.0,0.0,3.0,0.0,1.0
7,그,7480,0.0,0.0,1.0,3.0,3.0,2.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0
8,말,6815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,1.0,0.0,1.0,0.0,2.0,2.0,0.0,0.0
9,난,5208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Save the per-song vectors to CSV.
# NOTE(review): the file name says "utf-8" but the file is written with
# encoding='euc-kr' — confirm which one is intended.
dic100_vec.to_csv("C:/Python/kkma_total_vec_utf-8.csv", encoding='euc-kr')