# (1) Load library

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import re
import time
import nltk
from nltk.tokenize import word_tokenize

In [2]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# (2) Load & Check Data

In [3]:
#Pandas read_csv활용
data_path=r'C:\Users\user\Desktop\딥러닝프레임워크_박성호\spam.csv'
data=pd.read_csv(data_path,encoding='latin1')

In [4]:
#Pandas Dataframe 메소드 활용
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# (3) 데이터 기본 전처리 

In [5]:
data=data[['v1','v2']]
data['v1'].replace(['ham','spam'],[0,1],inplace=True)
data.columns=['Y','des']

In [6]:
#기본 전처리된 데이터 확인
#head는 인덱스 0~4, tail은 끝에서부터 불러오는 것
data.tail()

Unnamed: 0,Y,des
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...
5571,0,Rofl. Its true to its name


In [7]:
#Missing Value 확인
data.isnull().any()

Y      False
des    False
dtype: bool

In [8]:
#False면 missing value값이 없다는 것
data.isna().any() 

Y      False
des    False
dtype: bool

In [9]:
#전체 데이터 수 확인
data.shape

(5572, 2)

In [10]:
#중복 데이터 확인 및 제거
#중복되는 것이 있으면 drop하는 것
#des중에 중복되는 것이 있으면 첫 번째것만 남기고 나머지 제거
data.drop_duplicates(subset=['des'],inplace=True,keep='first')

In [11]:
data.shape

(5169, 2)

# (4) Text 전처리

In [12]:
#(1)
#대문자->소문자
#특수기호 구두점 등 제거

normalized_text=[] #전처리된 텍스트

for string in data['des']:
    tokens=re.sub(r"[^a-z0-9]+"," ",string.lower()) #대문자를 소문자로 바꾼후, 스팸이메일에서 소문자영어나 숫자가 아닌경우 빈 공간으로
    normalized_text.append(tokens)

In [13]:
normalized_text[0]

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat '

In [14]:
#(2)
#단어 토큰화

#normalized_text에서 각각의 이메일을 sentence로 받아서 work_tokenize시키겠다는 것
result=[word_tokenize(sentence) for sentence in normalized_text]

In [15]:
result

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'u', 'oni'],
 ['free',
  'entry',
  'in',
  '2',
  'a',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  'to',
  '87121',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  't',
  'c',
  's',
  'apply',
  '08452810075over18',
  's'],
 ['u', 'dun', 'say', 'so', 'early', 'hor', 'u', 'c', 'already', 'then', 'say'],
 ['nah',
  'i',
  'don',
  't',
  'think',
  'he',
  'goes',
  'to',
  'usf',
  'he',
  'lives',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  's',
  'been',
  '3',
  'week',
  's',
  'now',
  'and',
  'no',
  'word',
  'back',
  'i',
  'd',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',


# (5) Word2Vec 알고리즘 학습 및 실행 by gensim

In [44]:
#주어진 데이터
sentences=["I am a boy","You are a girl"]
sentences=[s.lower().strip().split(" ") for s in sentences]

In [45]:
sentences

[['i', 'am', 'a', 'boy'], ['you', 'are', 'a', 'girl']]

In [46]:
#size=embedding 차원
#window=문맥 크기
#min_count=단어 최소 빈도 수 제한 (빈도가 적은 단어들은)
#workers=학습을 위한 프로세스 수
#sg=0은 CBOW, 1은 Skip-gram

model=Word2Vec(sg=0,size=100,window=5,min_count=1,workers=4)
model.build_vocab(sentences=sentences)
#size:100개 짜리의 벡터 크기로 요약
#window:양 옆에 다섯개씩을 보겠다는 것

In [47]:
model.wv.vocab.keys()

dict_keys(['i', 'am', 'a', 'boy', 'you', 'are', 'girl'])

In [48]:
#2개의 문장을 학습시킨다는 것
#epochs:몇 번 전체 데이터 셋을 반복할 것인가
model.train(sentences=sentences, total_examples=len(sentences), epochs=10) 

(7, 80)

In [49]:
model.wv['boy'] #글자에 해당하는 임베딩된 결과를 볼 수 있음

array([ 4.1163201e-03, -2.5454143e-03,  4.0650582e-03,  3.1058700e-04,
       -2.9573604e-03, -1.9445620e-03,  1.7807125e-03,  3.4711182e-03,
        7.5885846e-04,  4.1712122e-03, -4.2141965e-03, -1.9799126e-04,
       -2.1138289e-03, -3.8273905e-03, -3.5639347e-03,  2.8618686e-03,
        1.0937967e-03,  2.3768747e-03, -3.5778922e-03,  9.8234590e-04,
        1.9473078e-03, -3.3522910e-04, -1.2794581e-03, -9.4119977e-04,
        2.3064488e-03, -3.8004841e-03, -9.3703222e-04,  3.3687993e-03,
       -4.4413647e-03,  3.1681680e-03,  1.0031001e-03,  4.0884260e-03,
       -2.5288280e-04, -1.4293592e-03,  2.5358496e-03, -3.7075556e-03,
        1.2399830e-03,  4.8626191e-03,  2.1426964e-03,  4.5331391e-03,
       -3.5664134e-03,  4.9998551e-03, -1.2216222e-03, -3.2873359e-03,
        1.6219895e-03,  1.3823189e-03,  9.9545543e-04,  1.6406415e-03,
        4.9572322e-04,  1.9672856e-04,  4.1872817e-03, -1.0751034e-03,
       -4.0216856e-03, -4.4933244e-04, -7.2104938e-04,  4.0223305e-03,
      

In [50]:
model.wv['i'].shape

(100,)

In [51]:
#주어진 단어와 가장 유사한 단어 (사용된 단어 중에서)
#This method computes cosine similarity

model_result=model.wv.most_similar("boy",topn=4) #topn:가장 유사한 애들 3개
print(model_result) #값이 클수록 유사도가 큰 것

[('am', 0.14145970344543457), ('i', 0.1253436654806137), ('are', 0.08957497775554657), ('a', 0.07668192684650421)]


In [52]:
model_result=model.wv.most_similar("girl",topn=4) 
print(model_result) 

[('i', 0.0003511738032102585), ('are', -0.02285034954547882), ('am', -0.04037884622812271), ('boy', -0.0786169022321701)]


In [53]:
model_result=model.wv.most_similar("guy",topn=4) 
print(model_result) 

KeyError: "word 'guy' not in vocabulary"

In [54]:
#두 단어간 유사도
model.wv.similarity('boy','girl') #cosine similarity

-0.07861692

In [55]:
#두 단어간 거리(비 유사도)
model.wv.distance('boy','girl') #1-cosine similarity

1.0786169171333313

In [56]:
1-model.wv.similarity('boy','girl')

1.0786169171333313

# (5-2) Word2Vec 알고리즘 학습 및 실행 by gensim

In [57]:
result[:3]

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'u', 'oni'],
 ['free',
  'entry',
  'in',
  '2',
  'a',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  'to',
  '87121',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  't',
  'c',
  's',
  'apply',
  '08452810075over18',
  's']]

In [58]:
model2=Word2Vec(sg=0,size=100,window=5,min_count=1,workers=4)
model2.build_vocab(sentences=result)

In [59]:
model2.train(sentences=result, total_examples=len(result), epochs=10) 

(661419, 823190)

In [60]:
model.wv.most_similar('boy',topn=5)

[('am', 0.14145970344543457),
 ('i', 0.1253436654806137),
 ('are', 0.08957497775554657),
 ('a', 0.07668192684650421),
 ('you', -0.0001475512981414795)]

In [61]:
model2.wv.most_similar('boy',topn=5)

[('mr', 0.9995406866073608),
 ('big', 0.9995222091674805),
 ('fun', 0.9994968175888062),
 ('wine', 0.9994663000106812),
 ('god', 0.9994497299194336)]

In [62]:
model2.wv.most_similar('girl',topn=5)

[('smile', 0.9994848370552063),
 ('loving', 0.999363124370575),
 ('those', 0.9992561340332031),
 ('dey', 0.9992291927337646),
 ('amp', 0.9991774559020996)]

In [63]:
model2.wv.most_similar('call')

[('claim', 0.9941309690475464),
 ('09066612661', 0.9939141273498535),
 ('txt', 0.9927957057952881),
 ('cash', 0.9926409721374512),
 ('000', 0.9924333095550537),
 ('1000', 0.9924222230911255),
 ('reply', 0.9922590255737305),
 ('land', 0.9919509291648865),
 ('cashto', 0.9913820028305054),
 ('09061213237', 0.9906802773475647)]

## pre-trained Model

## Fine tuning -> pre-trained 모델을 새로운 데이터에 맞게끔 업데이트하는 개념

# Pre-trained Model 사용-(1)

In [64]:
import gensim.downloader

In [65]:
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [66]:
pretrained_google=gensim.downloader.load('word2vec-google-news-300')

[--------------------------------------------------] 1.4% 23.4/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[=-------------------------------------------------] 3.5% 58.0/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[==------------------------------------------------] 5.7% 94.6/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[===-----------------------------------------------] 7.9% 130.8/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[=====---------------------------------------------] 10.0% 167.1/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [67]:
pretrained_google.wv.most_similar("call")

  pretrained_google.wv.most_similar("call")


[('Call', 0.6494125723838806),
 ('calls', 0.6452654600143433),
 ('calling', 0.6199396848678589),
 ('dial_#-###-###', 0.582241415977478),
 ('visit_www.clairmail.com', 0.5597918033599854),
 ('visit_www.oscars.org', 0.5302615761756897),
 ('See_www.thewerehouse.com', 0.5248599648475647),
 ('Charlene_Pellin_answered', 0.524530827999115),
 ('information_visit_http://www.tempurpedic.com', 0.5230315923690796),
 ('calll', 0.521233081817627)]

### 단어 사이에 의미를 반영해서 벡터화 (기하학적 공간에 매핑)
### 벡터화 : 거리와 방향

In [69]:
pretrained_google.most_similar(positive=['car','minivan'], topn=5) #positive:2개의 벡터를 더하는 것, 더해진 단어들 중 top5를 뽑겠다

[('SUV', 0.8532191514968872),
 ('vehicle', 0.8175784349441528),
 ('pickup_truck', 0.7763689160346985),
 ('Jeep', 0.7567334175109863),
 ('Ford_Explorer', 0.7565719485282898)]

In [71]:
for i, (word, similarity) in enumerate(pretrained_google.most_similar(positive=['car','minivan'], topn=5)):
    print(f"Top {i+1} : {word},{similarity}")

Top 1 : SUV,0.8532191514968872
Top 2 : vehicle,0.8175784349441528
Top 3 : pickup_truck,0.7763689160346985
Top 4 : Jeep,0.7567334175109863
Top 5 : Ford_Explorer,0.7565719485282898


In [75]:
#positive의 단어 벡터르 더하고 negative 단어 벡터를 뺀 값과 가장 유사한 단어
#king=남자+왕
#king+women=남자+왕+여자-남자=왕+여자
print(pretrained_google.wv.most_similar(positive=['king','women'], negative=['men'], topn=1))

#walking=걷다+현재형, swam=수영+과거, walked=걷다+과거
#walking+swam-walked=수영+현재형
print(pretrained_google.wv.most_similar(positive=['walking','swam'], negative=['walked'], topn=1))

  print(pretrained_google.wv.most_similar(positive=['king','women'], negative=['men'], topn=1))


[('queen', 0.6525817513465881)]


  print(pretrained_google.wv.most_similar(positive=['walking','swam'], negative=['walked'], topn=1))


[('swimming', 0.7448816895484924)]


In [76]:
#관계없는 단어 추출
pretrained_google.doesnt_match(['fire','water','land','sea','air','car'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'car'

# Pre-trained Model 사용-(2)

In [77]:
import urllib.request
import time

In [78]:
start_time=time.time()
urllib.request.urlretrieve("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz",filename="GoogleNews-vectors-negative300.bin.gz")

print(time.time()-start_time)

('GoogleNews-vectors-negative300.bin.gz',
 <http.client.HTTPMessage at 0x216b45946d0>)

In [82]:
googleNews_filepath="GoogleNews-vectors-negative300.bin.gz"

In [83]:
#LOAD pre-trained key vector
#model을 load한 것이 아니고 Embedding vector만 load
#limit=단어수 조정(빅데이터의 경우)

start_time=time.time()
GoogleSlimModel=KeyedVectors.load_word2vec_format(
    googleNews_filepath, binary=True, limit=1000000 #100만개의 단어만 불러올 것
)

print(f"== {googleNews_filepath} load as word2vec model complete, {time.time()-start_time}")

== GoogleNews-vectors-negative300.bin.gz load as word2vec model complete, 33.78724551200867


In [84]:
GoogleSlimModel.most_similar('hair',topn=5)

[('curly_hair', 0.7542630434036255),
 ('tresses', 0.7536026835441589),
 ('mane', 0.6898636817932129),
 ('hairdo', 0.6756651997566223),
 ('blonde_hair', 0.6589916944503784)]

# Transfer Learning

## (1) pre-trained model:사전에 방대한 데이터로부터 학습된 모델
## (1) Fine-tuning:현재 풀고자하는 데이터로부터 pre-trained model된 모델 업데이트

In [85]:
#LOAD pre-trained key vector
#model을 load한 것이 아니고 Embedding vector만 load
#limit=단어수 조정(빅데이터의 경우)

start_time=time.time()
PreTrainedKeyvector=KeyedVectors.load_word2vec_format(
    googleNews_filepath, binary=True, limit=1000000 #100만개의 단어만 불러올 것
)

print(f"== {googleNews_filepath} load as word2vec model complete, {time.time()-start_time}")

== GoogleNews-vectors-negative300.bin.gz load as word2vec model complete, 32.21029567718506


In [86]:
PreTrainedKeyvector.most_similar('hair',topn=5)

[('curly_hair', 0.7542630434036255),
 ('tresses', 0.7536026835441589),
 ('mane', 0.6898636817932129),
 ('hairdo', 0.6756651997566223),
 ('blonde_hair', 0.6589916944503784)]

In [88]:
#Fine tuning 할 새로운 Word2Vec 모델 생성
#PreTrainedKeyvector와 'vector_size'가 같은 word2vec model을 생성

TransferedModel=Word2Vec(size=PreTrainedKeyvector.vector_size,min_count=1)

In [89]:
#단어 생성(build_vocab) by PreTrainedKeyvector word Vocabulary
#TransferedModel.build_vocab input:
#[[]] #list of list
TransferedModel.build_vocab([PreTrainedKeyvector.vocab.keys()])

In [90]:
#단어 수 확인
len(TransferedModel.wv.vocab.keys())

1000000

In [91]:
#주어진 데이터
sentences=["I am a boy","You are a girl"]
sentences=[s.lower().strip().split(" ") for s in sentences]

In [92]:
#주어진 데이터로 새로운 모델의 단어 추가
#update parameter를 True로 설정

TransferedModel.build_vocab(sentences,update=True)

In [93]:
#단어 수 확인
len(TransferedModel.wv.vocab.keys())

1000001

In [94]:
#Pretrained 모델의 학습 파라미터를 기반으로 새로운 모델의 학습 파라미터 초기화
#학습파라미터를 'googleNews_filepath'에 있는 값으로 모두 업데이트해줌
#lockf=0.0 : 보통은 학습 파라미터를 update하지 못하도록 lock이 걸려있음
#lockf=1 : 학습 파라미터를 update하도록 lock 해제

TransferedModel.intersect_word2vec_format(
    googleNews_filepath, binary=True, lockf=1.0
)

In [96]:
#새로운 데이터 기반의 학습
TransferedModel.train(sentences, total_examples=len(sentences), epochs=100)

(69, 800)

In [97]:
#학습 후 결과
TransferedModel.most_similar('boy',topn=5)

  TransferedModel.most_similar('boy',topn=5)


[('girl', 0.8543282747268677),
 ('teenager', 0.7606685757637024),
 ('toddler', 0.7043962478637695),
 ('teenage_girl', 0.6851469278335571),
 ('man', 0.6824932098388672)]

In [98]:
#학습 전 결과
PreTrainedKeyvector.most_similar('boy',topn=5)

[('girl', 0.8543272018432617),
 ('teenager', 0.7606690526008606),
 ('toddler', 0.7043969631195068),
 ('teenage_girl', 0.6851483583450317),
 ('man', 0.6824870109558105)]

# Fine Tuning by using Spam data

In [99]:
#Fine Tuning할 새로운 Word2Vec 모델 생성
#PreTrainedKeyvector와 vector_size'가 같은 word2vec model을 생성

TransferedModel2=Word2Vec(size=PreTrainedKeyvector.vector_size,min_count=1)

In [100]:
#단어 생성(build_vocab) by PreTrainedKeyvector word Vocabulary
#TransferedModel.build_vocab input:
#[[]] #list of list
TransferedModel2.build_vocab([PreTrainedKeyvector.vocab.keys()])

In [101]:
#주어진 데이터로 새로운 모델의 단어 추가
#update parameter를 True로 설정

TransferedModel2.build_vocab(result,update=True)

In [102]:
len(TransferedModel2.wv.vocab.keys())

1002738

### Tuning 전/후 'call'단어 비교

In [103]:
PreTrainedKeyvector.most_similar('call',topn=10)

[('Call', 0.6494125723838806),
 ('calls', 0.6452654600143433),
 ('calling', 0.6199396848678589),
 ('Visit_www.theorangepeel.net', 0.5139467716217041),
 ('dialing', 0.5134264230728149),
 ('Visit_www.ticketmaster.com', 0.5067222714424133),
 ('visit_www.sba.gov_advo', 0.5047945976257324),
 ('dial', 0.5043871998786926),
 ('bbigelow@xconomy.com', 0.5041642785072327),
 ('#-###-CRIME-TV', 0.5023549795150757)]

### 새로운 데이터들의 단어(토큰) 기반 fine tuning->Training

In [104]:
TransferedModel2.train(result,total_examples=len(result),epochs=1)

(66012, 82319)

In [105]:
TransferedModel2.most_similar('call',topn=10)

  TransferedModel2.most_similar('call',topn=10)


[('Yaggi', 0.26878854632377625),
 ('John_Sleezer', 0.2631785273551941),
 ('Topton', 0.2595468759536743),
 ('Mark_Dobmeier', 0.25190621614456177),
 ('Guevin', 0.24977199733257294),
 ('THEIRS', 0.24521076679229736),
 ('octet', 0.2450137436389923),
 ('Slaven', 0.24477775394916534),
 ('Rigoberto', 0.24464727938175201),
 ('goats_graze', 0.24460285902023315)]