# 외부 데이터를 가져와 Word2Vec 학습시키기

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive so the files under MyDrive used below are readable.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, KeyedVectors
import pandas as pd
import string
import nltk

In [19]:
# Download the NLTK "punkt" tokenizer data used by word_tokenize later in this notebook.
# NOTE(review): recent NLTK releases may look for the 'punkt_tab' resource instead —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

><b>외부 데이터 불러오기</b>

In [5]:
# Load the Yelp review CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/insight/word2vec/yelp_academic_dataset_review.csv")

In [6]:
# Preview the first rows to see which columns the file provides.
df.head()

Unnamed: 0,user_id,review_id,text,votes.cool,business_id,votes.funny,stars,date,type,votes.useful
0,Xqd0DzHaiyRqVH3WRG7hzg,15SdjuK7DmYqUAj6rjGowg,dr. goldberg offers everything i look for in a...,1,vcNAWiLM4dR7D2nwwJ7nCA,0,5,2007-05-17,review,2
1,H1kH6QZV7Le4zqTRNxoZow,RF6UnRTtG7tWMcrO2GEoAg,"Unfortunately, the frustration of being Dr. Go...",0,vcNAWiLM4dR7D2nwwJ7nCA,0,2,2010-03-22,review,2
2,zvJCcrpm2yOZrxKffwGQLA,-TsVN230RCkLYKBeLsuz7A,Dr. Goldberg has been my doctor for years and ...,1,vcNAWiLM4dR7D2nwwJ7nCA,0,4,2012-02-14,review,1
3,KBLW4wJA_fwoWmMhiHRVOA,dNocEAyUucjT371NNND41Q,Been going to Dr. Goldberg for over 10 years. ...,0,vcNAWiLM4dR7D2nwwJ7nCA,0,4,2012-03-02,review,0
4,zvJCcrpm2yOZrxKffwGQLA,ebcN2aqmNUuYNoyvQErgnA,Got a letter in the mail last week that said D...,1,vcNAWiLM4dR7D2nwwJ7nCA,0,4,2012-05-15,review,2


><b>"text" 항목만 가져오기</b>

In [7]:
# Keep only the review text column; the other columns are not used for training.
# Note: this rebinds `df`, so the full frame is no longer available after this cell.
df = df[["text"]]

In [8]:
# Confirm that only the "text" column remains.
df.head()

Unnamed: 0,text
0,dr. goldberg offers everything i look for in a...
1,"Unfortunately, the frustration of being Dr. Go..."
2,Dr. Goldberg has been my doctor for years and ...
3,Been going to Dr. Goldberg for over 10 years. ...
4,Got a letter in the mail last week that said D...


><b>DataFrame 안의 내용을 string 자료형으로 변경한 후 모두 소문자로 변경</b>

In [9]:
def to_string(text):
    """Return the ``str`` form of ``text`` converted to lower case.

    Non-string values are passed through ``str()`` first, so any input
    yields a lower-cased string.
    """
    return str(text).lower()

In [10]:
# Lower-case every review; to_string is passed directly as the per-element callback.
df['text'] = df['text'].apply(to_string)
df.head()

Unnamed: 0,text
0,dr. goldberg offers everything i look for in a...
1,"unfortunately, the frustration of being dr. go..."
2,dr. goldberg has been my doctor for years and ...
3,been going to dr. goldberg for over 10 years. ...
4,got a letter in the mail last week that said d...


><b>문장 부호 지우기</b>

In [11]:
# Translation table that deletes every character in string.punctuation. It is built once
# here instead of on every call, because remove_punct is applied to each review in turn.
# str.maketrans arguments: (characters to replace, their replacements, characters to delete)
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_punct(text):
    """Return ``text`` with every character in ``string.punctuation`` removed.

    Args:
        text: The string to clean.

    Returns:
        A new string containing the characters of ``text`` minus ASCII punctuation.
    """
    return text.translate(_PUNCT_TABLE)

# Quick sanity check of remove_punct on a sample sentence.
example = "I am a #king"
print(remove_punct(example))

I am a king


In [12]:
# Strip punctuation from every review; remove_punct is passed directly as the callback.
df['text'] = df['text'].apply(remove_punct)
df.head()

Unnamed: 0,text
0,dr goldberg offers everything i look for in a ...
1,unfortunately the frustration of being dr gold...
2,dr goldberg has been my doctor for years and i...
3,been going to dr goldberg for over 10 years i ...
4,got a letter in the mail last week that said d...


><b>문장 내 숫자 제거하기</b>

In [13]:
# Remove every run of digits from the reviews.
# regex=True is passed explicitly: in pandas >= 2.0 Series.str.replace treats the pattern
# as a literal string by default, which would leave all digits in place. The raw string
# keeps "\d" from being interpreted as an (invalid) string escape sequence.
df["text"] = df["text"].str.replace(r"\d+", "", regex=True)
df.head()

Unnamed: 0,text
0,dr goldberg offers everything i look for in a ...
1,unfortunately the frustration of being dr gold...
2,dr goldberg has been my doctor for years and i...
3,been going to dr goldberg for over years i th...
4,got a letter in the mail last week that said d...


><b>DataFrame 자료형을 numpy 자료형으로 변경</b>

In [14]:
# Convert the single-column DataFrame to a NumPy array; the result has shape (n_rows, 1).
data = df.to_numpy()
print(data.shape)

(1125458, 1)


><b>차원 변경</b>

In [15]:
# Flatten to one dimension so that each element is a single review string.
data = data.reshape(-1)
print(data.shape)

(1125458,)


><b>nltk에서 제공하는 tokenizer를 이용하여 각 문장 토큰화 진행</b>

In [20]:
# Replace each review string with its list of NLTK word tokens, in place.
for idx, sentence in enumerate(data):
    data[idx] = word_tokenize(sentence)

In [21]:
# Inspect the tokens produced for the first review.
print(data[0])

['dr', 'goldberg', 'offers', 'everything', 'i', 'look', 'for', 'in', 'a', 'general', 'practitioner', 'hes', 'nice', 'and', 'easy', 'to', 'talk', 'to', 'without', 'being', 'patronizing', 'hes', 'always', 'on', 'time', 'in', 'seeing', 'his', 'patients', 'hes', 'affiliated', 'with', 'a', 'topnotch', 'hospital', 'nyu', 'which', 'my', 'parents', 'have', 'explained', 'to', 'me', 'is', 'very', 'important', 'in', 'case', 'something', 'happens', 'and', 'you', 'need', 'surgery', 'and', 'you', 'can', 'get', 'referrals', 'to', 'see', 'specialists', 'without', 'having', 'to', 'see', 'him', 'first', 'really', 'what', 'more', 'do', 'you', 'need', 'im', 'sitting', 'here', 'trying', 'to', 'think', 'of', 'any', 'complaints', 'i', 'have', 'about', 'him', 'but', 'im', 'really', 'drawing', 'a', 'blank']


><b>gensim에서 제공하는 word2vec 모델을 불러온 후 학습 진행</b>

In [22]:
# Train a skip-gram Word2Vec model on the tokenized reviews.
# NOTE(review): gensim 4 renamed the `size` keyword to `vector_size` — confirm the
# installed gensim version before re-running this cell.
model = Word2Vec(sentences=data, size=100, window=5, min_count=5, workers=4, sg=1)
# size : dimensionality of the embedding vectors
# window : context window size
# min_count : minimum word frequency; words occurring less often are ignored
# workers : number of parallel workers used for training
# sg : 0 = CBOW, 1 = Skip-gram

><b>단어 "man"과 가장 유사한 단어 출력</b>

In [23]:
# Query the trained vectors for the nearest neighbours of "man";
# the result is a list of (word, similarity score) pairs.
model_result = model.wv.most_similar("man")
print(model_result)

[('dude', 0.7682689428329468), ('guy', 0.7519841194152832), ('gentleman', 0.6840146780014038), ('woman', 0.6766877174377441), ('lady', 0.670669674873352), ('dere', 0.6697652339935303), ('gent', 0.6596511602401733), ('hea', 0.6498531103134155), ('manim', 0.6431424021720886), ('chester', 0.6399444937705994)]


><b>단어 "man"의 임베딩 벡터 출력</b>

In [24]:
# Look up the embedding through the model's KeyedVectors (`model.wv`), the same accessor
# used for most_similar. Indexing the Word2Vec object directly is deprecated in gensim 3.x
# (it emits a DeprecationWarning) and is no longer supported in gensim 4.
print(model.wv["man"])

[-0.19431031  0.01632636  0.05583194  0.03764286 -0.5630345   0.16811843
  0.13258813 -0.35727978 -0.27572015 -0.18093751  0.42320022 -0.05997192
 -0.35762906  0.33326688 -0.21404114  0.29588243 -0.05228198  0.48110172
 -0.48055074 -0.10449351  0.05236864  0.30614766 -0.20200706 -0.25490975
  0.08842546 -0.24004993  0.00990041 -0.01341069  0.41216195 -0.19742137
  0.00620749  0.08040325  0.24733718  0.3746871   0.0067558   0.02067324
  0.36745062 -0.13324805  0.13904268  0.2817895  -0.00501569  0.00890389
  0.0045009  -0.1443863   0.28005528 -0.23329371  0.16115162 -0.05397427
 -0.21670343  0.07319455 -0.0865364  -0.23154731 -0.25174126  0.14103705
  0.27787274 -0.07705692 -0.20867406 -0.16168568  0.22484241  0.7609484
 -0.24356417 -0.0211877   0.02337233  0.25681308 -0.17514552  0.14511162
  0.01646813  0.07323065 -0.41366157 -0.19799973 -0.00184639 -0.26021746
  0.25707227  0.05160777  0.16891734  0.00866131  0.02622517  0.24636625
  0.06391644  0.30543333  0.33006364 -0.2662923   0.

  """Entry point for launching an IPython kernel.


# 미리 학습된 GloVe 가져오기

In [25]:
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim

><b>glove 벡터를 word2vec 모델이 읽을 수 있도록 파일 변환</b>

In [26]:
# Convert the GloVe text file to word2vec text format, writing the result next to the
# source file, so that KeyedVectors.load_word2vec_format can read it.
input_file = '/content/drive/MyDrive/Colab Notebooks/insight/word2vec/glove.6B.300d.txt'
output_file = '/content/drive/MyDrive/Colab Notebooks/insight/word2vec/gensim_glove.6B.300d.txt'
glove2word2vec(input_file, output_file)

(400000, 300)

><b>"gensim_glove.6B.300d.txt" 파일 읽어오기</b>

In [None]:
# Load the converted GloVe vectors as KeyedVectors (text format, hence binary=False).
# Note: this rebinds `model`, replacing the Word2Vec model trained earlier in the notebook.
model = KeyedVectors.load_word2vec_format(output_file, binary=False)

><b>"cat"과 가장 유사한 단어 출력</b>

In [None]:
# `model` holds the KeyedVectors loaded from the GloVe file here, so most_similar is
# called on it directly rather than through a `.wv` attribute.
print(model.most_similar("cat"))