In [1]:
import copy
import random

import numpy as np
import pandas as pd
import matplotlib as plt
import re

plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams["figure.figsize"] = (20, 10)


# 추가한 라이브러리 ======================== #

import sentencepiece as spm
from khaiii import KhaiiiApi
api = KhaiiiApi()

### [입력] 불러올 파일명을 입력해주세요

In [2]:
data_path = './data/json/song_meta.json'

### 데이터 다시 불러오기

In [3]:
# 이 코드를 불러오면 깨끗한 데이터를 다시 가져올 수 있습니다
data = pd.read_json(data_path, typ = 'frame')

In [4]:
df = pd.DataFrame(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707989 entries, 0 to 707988
Data columns (total 9 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   song_gn_dtl_gnr_basket  707989 non-null  object
 1   issue_date              707989 non-null  int64 
 2   album_name              707985 non-null  object
 3   album_id                707989 non-null  int64 
 4   artist_id_basket        707989 non-null  object
 5   song_name               707989 non-null  object
 6   song_gn_gnr_basket      707989 non-null  object
 7   artist_name_basket      707989 non-null  object
 8   id                      707989 non-null  int64 
dtypes: int64(3), object(6)
memory usage: 48.6+ MB


In [5]:
# 자동으로 csv파일을 생성해 줍니다(엑셀에서 텍스트를 확인해 보세요)
data.to_csv('data_test/data_0000.csv')

---

### [입력] 가져올 컬럼명을 my_column에 입력해 주세요

In [6]:
# 가져올 컬럼명을 입력해 주세요
# 샘플파일은 텍스트 전처리를 위해 하나씩만 가져옵니다
my_column = 'song_name'
df = df.loc[:, [my_column]]
df = df.rename(columns={my_column: "col"})

In [7]:
df.head()

Unnamed: 0,col
0,Feelings
1,"Bach : Partita No. 4 In D Major, BWV 828 - II...."
2,Solsbury Hill (Remastered 2002)
3,Feeling Right (Everything Is Nice) (Feat. Popc...
4,그남자 그여자


In [11]:
# 임시로 추출하기 위해 1000개만 가지고 작업, 나중에는 전체 가져오면 됨
data = df[:1000]

In [12]:
data['lan'] = " "

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
data.head()

Unnamed: 0,col,lan
0,Feelings,
1,"Bach : Partita No. 4 In D Major, BWV 828 - II....",
2,Solsbury Hill (Remastered 2002),
3,Feeling Right (Everything Is Nice) (Feat. Popc...,
4,그남자 그여자,


# 한글처리 (khaiii)

---
### [01] 정규화
- 한글, 영문 텍스트를 제외하고 삭제(특수문자등)

In [15]:
# 판다스를 이용한 정규표현식
data['col'] = data['col'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 a-zA-Z ]","") # 정규 표현식 수행
data['col'].replace(' ', np.nan, inplace=True) # 공백은 Null 값으로 변경
data = data.dropna(how='any') # Null 값 제거

In [16]:
data.head()

Unnamed: 0,col,lan
0,Feelings,
1,Bach Partita No In D Major BWV II Allemande,
2,Solsbury Hill Remastered,
3,Feeling Right Everything Is Nice Feat Popcaan ...,
4,그남자 그여자,


In [17]:
# 한글인것만  True로 표시해라
f = lambda row: row.apply(str).str.contains("[ㄱ-ㅎㅏ-ㅣ가-힣]" ,na=False, flags=re.IGNORECASE)
data['lan'] = data.apply(f, axis=1)

In [18]:
# 한국어 인지 영어인지 표시하라
data['lan'] = data['lan'].apply(lambda x: 'ko' if x == True else 'en')
data.head()

Unnamed: 0,col,lan
0,Feelings,en
1,Bach Partita No In D Major BWV II Allemande,en
2,Solsbury Hill Remastered,en
3,Feeling Right Everything Is Nice Feat Popcaan ...,en
4,그남자 그여자,ko


---
### [02] 토큰화(한국어, khaiii)
- 문자열에서 단어로 분리시키는 단계

In [19]:
tag_list = ['NNG', 'VV', 'VA']
def khaiiiFn(sentence):
    temp = []
    for word in api.analyze(sentence):
        for m in word.morphs:
            if m.tag in tag_list:
                temp.append([m.lex, m.tag])
        
    return temp

In [None]:
data.head()

In [20]:
data['vocabs'] = ''
data['vocabs'][data['lan']=='ko'] = data['col'][data['lan']=='ko'].apply(khaiiiFn)

In [21]:
data

Unnamed: 0,col,lan,vocabs
0,Feelings,en,
1,Bach Partita No In D Major BWV II Allemande,en,
2,Solsbury Hill Remastered,en,
3,Feeling Right Everything Is Nice Feat Popcaan ...,en,
4,그남자 그여자,ko,"[[남자, NNG], [긋, VV], [여, NNG]]"
...,...,...,...
995,I Learned To Dance In Mississippi,en,
996,Overtime Feat Sebastian Reynoso KYLE,en,
997,온 땅의 주인 Who Am I,ko,"[[땅, NNG], [주인, NNG]]"
998,Cimarosa Oboe Concerto in C Allegro giusto,en,


---
### [03] 불용어 제거 & 어간추출

- 전치사, 관사 등 너무 많이 등장하는 단어 등 문장이나 문서의 특징을 표현하는데 불필요한 단어 제거

In [22]:
# 개별적으로 제거하고자 하는 텍스트 추가 사용

stopwordsKo = ['긋','의','가','이','은','들','는','좀','잘',
             '걍','과','도','를','으로','자','에','와','한','하다']


In [23]:
# morphs에 불용어가 들어가 있으면 제거 하는 함수(한국어)

def stopwordsFnKo(text):
    word_tokens = text
    word_list = []

    for i in range(len(word_tokens)):
        i_text = word_tokens[i]

        if i_text[0] not in stopwordsKo:
            word_list.append(i_text)

    return word_list


In [24]:
# 한국어인 경우 에만 stopwordsFnKo 함수를 실행한다
ko_sentence = data[data['lan'] == 'ko']['vocabs']
ko_sentence = ko_sentence.apply(stopwordsFnKo)
ko_sentence

4                                  [[남자, NNG], [여, NNG]]
6      [[시, NNG], [슬프, VA], [왈츠, NNG], [작품, NNG], [번호...
9                                [[사랑, NNG], [멜로디, NNG]]
18     [[숙면, NNG], [휴식, NNG], [좋, VA], [편안, NNG], [빗소...
19                                  [[기, VV], [다리, NNG]]
                             ...                        
985                                [[학도, NNG], [병, NNG]]
987      [[모음곡, NNG], [작품번호, NNG], [아침, NNG], [기분, NNG]]
991                        [[위, NNG], [받, VV], [날, NNG]]
992    [[좋, VA], [말, NNG], [창작, NNG], [동요제, NNG], [대상...
997                                [[땅, NNG], [주인, NNG]]
Name: vocabs, Length: 313, dtype: object

In [25]:
data.head()

Unnamed: 0,col,lan,vocabs
0,Feelings,en,
1,Bach Partita No In D Major BWV II Allemande,en,
2,Solsbury Hill Remastered,en,
3,Feeling Right Everything Is Nice Feat Popcaan ...,en,
4,그남자 그여자,ko,"[[남자, NNG], [긋, VV], [여, NNG]]"


# 영문처리 (sentencepiece)

In [26]:
data[data['lan'] == 'en'].tail()

Unnamed: 0,col,lan,vocabs
994,This Life Feat Roxanne Emery,en,
995,I Learned To Dance In Mississippi,en,
996,Overtime Feat Sebastian Reynoso KYLE,en,
998,Cimarosa Oboe Concerto in C Allegro giusto,en,
999,Autumn Leaves,en,


In [27]:
# ./data/botchan.txt 를 사용하여 벡터화 하는것을 확인한다

spm.SentencePieceTrainer.train('--input=./data/pre_text/botchan.txt --model_prefix=m --vocab_size=2000 --normalization_rule_name=nfkc_cf')
# makes segmenter instance and loads the model file (m.model)
sp = spm.SentencePieceProcessor()
sp.load('m.model')

# encode: text => id
print(sp.encode_as_pieces('This is a test'))
print(sp.encode_as_ids('This is a test'))

# decode: id => text
print(sp.decode_pieces(['▁This', '▁is', '▁a', '▁t', 'est']))
print(sp.decode_ids([209, 31, 9, 375, 586]))

['▁this', '▁is', '▁a', '▁t', 'est']
[39, 35, 9, 258, 406]
▁This is a test
now you a countrybe


In [29]:
# 참고용
# 토큰화
en_encode_piece = sp.encode_as_pieces(data['col'].loc[1])
print(en_encode_piece)

# 숫자로 변환
en_encode = sp.encode_as_ids(data['col'].loc[1])
print(en_encode)

# 숫자를 영어로 변환
en_decode = sp.decode_ids(en_encode)
print(en_decode)

['▁b', 'a', 'ch', '▁part', 'it', 'a', '▁no', '▁in', '▁d', '▁ma', 'j', 'or', '▁b', 'w', 'v', '▁i', 'i', '▁all', 'em', 'and', 'e']
[180, 50, 184, 272, 178, 50, 64, 12, 383, 608, 336, 122, 180, 165, 164, 6, 36, 57, 682, 222, 38]
bach partita no in d major bwv ii allemande


In [31]:
# 영어 불용어를 추가한다
stopwordsEn = ['II']

In [32]:
# 영어 문장을 토큰화 하는 함수로 만들어 준다
# if i_text not in stopwordsEn 을 통해 영어 불용어를 제거한다

def sentencePieceFn(text):

    word_tokens = text
    en_list = []

    word_tokens = word_tokens.split(' ')

    for i in range(len(word_tokens)):
        i_text = word_tokens[i]
        
        if i_text not in stopwordsEn:
            en_list.append(i_text)
        
    en_list = ' '.join(en_list).split()
    return en_list

In [33]:
data['vocabs'][data['lan']=='en'] = data['col'][data['lan']=='en'].apply(sentencePieceFn)

In [34]:
data.loc[1]['vocabs']

['Bach', 'Partita', 'No', 'In', 'D', 'Major', 'BWV', 'Allemande']

In [35]:
data.head()

Unnamed: 0,col,lan,vocabs
0,Feelings,en,[Feelings]
1,Bach Partita No In D Major BWV II Allemande,en,"[Bach, Partita, No, In, D, Major, BWV, Allemande]"
2,Solsbury Hill Remastered,en,"[Solsbury, Hill, Remastered]"
3,Feeling Right Everything Is Nice Feat Popcaan ...,en,"[Feeling, Right, Everything, Is, Nice, Feat, P..."
4,그남자 그여자,ko,"[[남자, NNG], [긋, VV], [여, NNG]]"


### word2vec 을 이용한 한국어 벡터화

In [36]:
data_vector = data

In [37]:
# 데이터 프레임의 로우를 돌면서 한국어인 경우의 값을 가져온다
ko_index = data_vector['col'][data_vector['lan']=='ko'].index
ko_index

Int64Index([  4,   6,   9,  18,  19,  20,  25,  29,  32,  35,
            ...
            958, 968, 975, 977, 981, 985, 987, 991, 992, 997],
           dtype='int64', length=313)

In [38]:
# vocabs 한국어 데이터를 하나의 리스트로 변환한다

def vocabsOneLineFn(text):
    test_data = text
    vocabs_one_line = []
    
    lengh = len(text)
    
    for i in range(lengh):
        vocabs_one_line.append(test_data[i][0])
        # 형태소도 사용하고 싶을때는 아래의 주석을 풀어 사용한다
        # vocabs_one_line.append(test_data[i][1])
    
    return vocabs_one_line


In [39]:
data_vector['vocabs'][data_vector['lan']=='ko'] = data_vector['vocabs'][data_vector['lan']=='ko'].apply(vocabsOneLineFn)


In [40]:
data_vector.head()

Unnamed: 0,col,lan,vocabs
0,Feelings,en,[Feelings]
1,Bach Partita No In D Major BWV II Allemande,en,"[Bach, Partita, No, In, D, Major, BWV, Allemande]"
2,Solsbury Hill Remastered,en,"[Solsbury, Hill, Remastered]"
3,Feeling Right Everything Is Nice Feat Popcaan ...,en,"[Feeling, Right, Everything, Is, Nice, Feat, P..."
4,그남자 그여자,ko,"[남자, 긋, 여]"


In [41]:
data_vector['encode'] = 0
data_vector.to_csv('data_vector.csv', index=0)

In [42]:
data_vector.head()

Unnamed: 0,col,lan,vocabs,encode
0,Feelings,en,[Feelings],0
1,Bach Partita No In D Major BWV II Allemande,en,"[Bach, Partita, No, In, D, Major, BWV, Allemande]",0
2,Solsbury Hill Remastered,en,"[Solsbury, Hill, Remastered]",0
3,Feeling Right Everything Is Nice Feat Popcaan ...,en,"[Feeling, Right, Everything, Is, Nice, Feat, P...",0
4,그남자 그여자,ko,"[남자, 긋, 여]",0


In [43]:
def vocabsToEncodeFn(text):
    
    test_data = text
    en_encode_list = []
    
    lengh = len(text)
    
    for i in range(lengh):
        en_encode = sp.encode_as_ids(text[i])
        en_encode_list.append(en_encode)

    return en_encode_list

In [44]:
data_vector['encode'] = data_vector['vocabs'].apply(vocabsToEncodeFn)


In [45]:
data_vector

Unnamed: 0,col,lan,vocabs,encode
0,Feelings,en,[Feelings],"[[522, 13, 8]]"
1,Bach Partita No In D Major BWV II Allemande,en,"[Bach, Partita, No, In, D, Major, BWV, Allemande]","[[180, 50, 184], [272, 178, 50], [64], [12], [..."
2,Solsbury Hill Remastered,en,"[Solsbury, Hill, Remastered]","[[46, 70, 8, 96, 215, 51], [397, 121], [151, 9..."
3,Feeling Right Everything Is Nice Feat Popcaan ...,en,"[Feeling, Right, Everything, Is, Nice, Feat, P...","[[522, 13], [131], [598], [35], [1723], [792, ..."
4,그남자 그여자,ko,"[남자, 긋, 여]","[[16, 0], [16, 0], [16, 0]]"
...,...,...,...,...
995,I Learned To Dance In Mississippi,en,"[I, Learned, To, Dance, In, Mississippi]","[[6], [1498, 14], [7], [517], [12], [925, 210,..."
996,Overtime Feat Sebastian Reynoso KYLE,en,"[Overtime, Feat, Sebastian, Reynoso, KYLE]","[[179, 1097], [792, 149], [110, 38, 96, 50, 14..."
997,온 땅의 주인 Who Am I,ko,"[땅, 주인]","[[16, 0], [16, 0]]"
998,Cimarosa Oboe Concerto in C Allegro giusto,en,"[Cimarosa, Oboe, Concerto, in, C, Allegro, giu...","[[156, 646, 115, 65, 8, 50], [387, 96, 65, 38]..."
