# 데이터 준비

In [21]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 Newsgroups corpus with headers, footers and quoted replies
# removed; shuffling uses a fixed random_state so the order is repeatable.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data

# Report the corpus size, then preview one raw document.
print(len(documents))
documents[3]

11314


'Notwithstanding all the legitimate fuss about this proposal, how much\nof a change is it?  ATT\'s last product in this area (a) was priced over\n$1000, as I suspect \'clipper\' phones will be; (b) came to the customer \nwith the key automatically preregistered with government authorities. Thus,\naside from attempting to further legitimize and solidify the fed\'s posture,\nClipper seems to be "more of the same", rather than a new direction.\n   Yes, technology will eventually drive the cost down and thereby promote\nmore widespread use- but at present, the man on the street is not going\nto purchase a $1000 crypto telephone, especially when the guy on the other\nend probably doesn\'t have one anyway.  Am I missing something?\n   The real question is what the gov will do in a year or two when air-\ntight voice privacy on a phone line is as close as your nearest pc.  That\nhas got to a problematic scenario for them, even if the extent of usage\nnever surpasses the \'underground\' stature

In [22]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used by the helpers below: the stop-word
# corpus and the 'punkt' tokenizer data.
for resource in ('stopwords', 'punkt'):
  nltk.download(resource)

def clean_text(d):
  """Return `d` with every character removed that is not an ASCII letter or whitespace."""
  return re.sub(r'[^a-zA-Z\s]', '', d)

def clean_stopword(d, stop_words=None, min_length=4):
  """Lowercase `d`, drop stop words and short tokens, and re-join with spaces.

  Args:
    d: Text to filter; it is split on whitespace.
    stop_words: Optional iterable of lowercase words to remove. When omitted,
      NLTK's English stop-word list is loaded once and cached on the function
      so that row-by-row use does not re-read it for every document.
    min_length: Minimum number of characters a token needs to be kept
      (default 4, equivalent to the previous `len(w) > 3` rule).

  Returns:
    The surviving tokens, lowercased and joined by single spaces.
  """
  if stop_words is None:
    cached = getattr(clean_stopword, '_english_stop_words', None)
    if cached is None:
      # A frozenset gives constant-time membership tests instead of a list scan.
      cached = frozenset(stopwords.words('english'))
      clean_stopword._english_stop_words = cached
    stop_words = cached
  elif not isinstance(stop_words, (set, frozenset)):
    stop_words = frozenset(stop_words)

  kept = []
  for word in d.split():
    lowered = word.lower()
    # Length is measured on the original token, as before.
    if len(word) >= min_length and lowered not in stop_words:
      kept.append(lowered)
  return ' '.join(kept)

def tokenize(d):
  """Run NLTK's `word_tokenize` on `d` and return its result."""
  tokens = word_tokenize(d)
  return tokens

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [23]:
import pandas as pd

# Wrap the raw documents in a single-column frame and show the row count.
news_df = pd.DataFrame(documents, columns=['article'])
len(news_df)

11314

In [24]:
# Turn empty-string articles into NaN so dropna() removes them, then report
# how many rows remain.
news_df = news_df.replace("", float("NaN")).dropna()
print(len(news_df))

11096


In [25]:
# Run clean_text over every article, store the result back, and preview it.
cleaned_articles = news_df['article'].apply(clean_text)
news_df['article'] = cleaned_articles
news_df['article']

0        Well im not sure about the story nad it did se...
1        \n\n\n\n\n\n\nYeah do you expect people to rea...
2        Although I realize that principle is not one o...
3        Notwithstanding all the legitimate fuss about ...
4        Well I will have to change the scoring on my p...
                               ...                        
11309    Danny Rubenstein an Israeli journalist will be...
11310                                                   \n
11311    \nI agree  Home runs off Clemens are always me...
11312    I used HP DeskJet with Orange Micros Grappler ...
11313                                          \nNo arg...
Name: article, Length: 11096, dtype: object

In [26]:
# Run clean_stopword over every article, store the result back, and preview it.
filtered_articles = news_df['article'].apply(clean_stopword)
news_df['article'] = filtered_articles
news_df['article']

0        well sure story seem biased disagree statement...
1        yeah expect people read actually accept hard a...
2        although realize principle strongest points wo...
3        notwithstanding legitimate fuss proposal much ...
4        well change scoring playoff pool unfortunately...
                               ...                        
11309    danny rubenstein israeli journalist speaking t...
11310                                                     
11311    agree home runs clemens always memorable kinda...
11312    used deskjet orange micros grappler system upd...
11313    argument murphy scared hell came last year han...
Name: article, Length: 11096, dtype: object

In [27]:
# Tokenize each cleaned article and collect the results as a plain list.
tokenized_news = news_df['article'].apply(tokenize).to_list()

In [28]:
import numpy as np

# Positions of documents that have at most one token after cleaning.
drop_news = [index for index, sentence in enumerate(tokenized_news) if len(sentence) <= 1]
# Filter with a list comprehension instead of np.delete: np.delete first
# coerces the ragged list of token lists into an ndarray, which is deprecated
# and raises ValueError on recent NumPy releases.
news_texts = [sentence for sentence in tokenized_news if len(sentence) > 1]
print(len(news_texts))

10939


# Gensim을 이용한 Word2Vec 모델 생성 및 시각화

## CBOW 모델 생성

In [30]:
from gensim.models import Word2Vec

# Train the CBOW variant (sg=0) on the tokenized articles.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm the installed version before upgrading.
model = Word2Vec(
    sentences=news_texts,
    window=4,
    size=100,
    min_count=5,
    workers=4,
    sg=0,
)

In [31]:
# Similarity score between the learned vectors for 'woman' and 'daughter'.
model.wv.similarity('woman', 'daughter')

  if np.issubdtype(vec.dtype, np.int):


0.9519115

In [32]:
# Query the word vectors (`model.wv`) directly, consistent with the other
# similarity cells; `Word2Vec.most_similar` is deprecated in gensim 3.x and
# removed in gensim 4.
model.wv.most_similar(positive=['soldiers'])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('henrik', 0.9869531393051147),
 ('azerbaijani', 0.9859133362770081),
 ('azeris', 0.9851698875427246),
 ('genocide', 0.9824388027191162),
 ('turkey', 0.9823763966560364),
 ('land', 0.9798774123191833),
 ('muslim', 0.9786380529403687),
 ('turks', 0.9781256318092346),
 ('arms', 0.977458655834198),
 ('armenia', 0.9757216572761536)]

In [33]:
# Analogy-style query: words close to 'woman' + 'soldiers' and away from 'daughter'.
model.wv.most_similar(positive=['woman', 'soldiers'], negative=['daughter'])

  if np.issubdtype(vec.dtype, np.int):


[('killed', 0.9814844131469727),
 ('armenians', 0.978642463684082),
 ('women', 0.9767076969146729),
 ('children', 0.976512610912323),
 ('lived', 0.9666338562965393),
 ('dead', 0.9665491580963135),
 ('death', 0.9617465734481812),
 ('azerbaijani', 0.9577610492706299),
 ('arms', 0.9577434659004211),
 ('died', 0.9563472270965576)]

### CBOW 임베딩 벡터 시각화

In [34]:
from gensim.models import KeyedVectors

# Write the CBOW word vectors to the file 'news_w2v' in word2vec format.
model.wv.save_word2vec_format('news_w2v')

In [36]:
# Convert the saved vectors into the tensor/metadata TSV pair (see the log output below).
!python -m gensim.scripts.word2vec2tensor -i news_w2v -o news_w2v

2020-10-29 14:45:24,997 - word2vec2tensor - INFO - running /usr/local/lib/python3.6/dist-packages/gensim/scripts/word2vec2tensor.py -i news_w2v -o news_w2v
2020-10-29 14:45:24,998 - utils_any2vec - INFO - loading projection weights from news_w2v
2020-10-29 14:45:26,658 - utils_any2vec - INFO - loaded (16996, 100) matrix from news_w2v
2020-10-29 14:45:28,121 - word2vec2tensor - INFO - 2D tensor file saved to news_w2v_tensor.tsv
2020-10-29 14:45:28,121 - word2vec2tensor - INFO - Tensor metadata file saved to news_w2v_metadata.tsv
2020-10-29 14:45:28,124 - word2vec2tensor - INFO - finished running word2vec2tensor.py


In [37]:
# List the working directory to confirm the generated files exist.
!ls

news_w2v  news_w2v_metadata.tsv  news_w2v_tensor.tsv  sample_data


In [38]:
from google.colab import files

# Download the CBOW metadata and tensor TSV files from the Colab runtime.
for artifact in ('news_w2v_metadata.tsv', 'news_w2v_tensor.tsv'):
  files.download(artifact)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

* Embedding Projector: https://projector.tensorflow.org/

## Skip-gram 모델 생성

In [39]:
from gensim.models import Word2Vec

# Train the Skip-gram variant (sg=1) on the tokenized articles; this rebinds
# `model`, replacing the model trained earlier in the notebook.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm the installed version before upgrading.
model = Word2Vec(
    sentences=news_texts,
    window=4,
    size=100,
    min_count=5,
    workers=4,
    sg=1,
)

In [40]:
# Similarity score between the learned vectors for 'woman' and 'daughter'.
model.wv.similarity('woman', 'daughter')

  if np.issubdtype(vec.dtype, np.int):


0.91456634

In [41]:
# Query the word vectors (`model.wv`) directly, consistent with the other
# similarity cells; `Word2Vec.most_similar` is deprecated in gensim 3.x and
# removed in gensim 4.
model.wv.most_similar(positive=['soldiers'])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('wounded', 0.935700535774231),
 ('burned', 0.9340214729309082),
 ('azeri', 0.9335383176803589),
 ('massacre', 0.9316290616989136),
 ('troops', 0.9310280084609985),
 ('civilians', 0.9288100004196167),
 ('girls', 0.9248819351196289),
 ('fleeing', 0.9226990938186646),
 ('azerbaijanis', 0.9198404550552368),
 ('burnt', 0.9152743220329285)]

In [42]:
# Analogy-style query: words close to 'woman' + 'soldiers' and away from 'daughter'.
model.wv.most_similar(positive=['woman', 'soldiers'], negative=['daughter'])

  if np.issubdtype(vec.dtype, np.int):


[('civilians', 0.8842564821243286),
 ('burned', 0.8790743947029114),
 ('burning', 0.8712849617004395),
 ('troops', 0.8709535002708435),
 ('corpse', 0.8704695701599121),
 ('shouted', 0.8702771663665771),
 ('babies', 0.8678631782531738),
 ('bedroom', 0.8665541410446167),
 ('azerbaijanis', 0.8622031211853027),
 ('azeri', 0.8581645488739014)]

### Skip-gram 임베딩 벡터 시각화

In [43]:
from gensim.models import KeyedVectors

# Write the Skip-gram word vectors to the file 'news_w2v_2' in word2vec format.
model.wv.save_word2vec_format('news_w2v_2')

In [44]:
# Convert the saved vectors into the tensor/metadata TSV pair (see the log output below).
!python -m gensim.scripts.word2vec2tensor -i news_w2v_2 -o news_w2v_2

2020-10-29 14:46:57,624 - word2vec2tensor - INFO - running /usr/local/lib/python3.6/dist-packages/gensim/scripts/word2vec2tensor.py -i news_w2v_2 -o news_w2v_2
2020-10-29 14:46:57,625 - utils_any2vec - INFO - loading projection weights from news_w2v_2
2020-10-29 14:46:59,315 - utils_any2vec - INFO - loaded (16996, 100) matrix from news_w2v_2
2020-10-29 14:47:00,704 - word2vec2tensor - INFO - 2D tensor file saved to news_w2v_2_tensor.tsv
2020-10-29 14:47:00,705 - word2vec2tensor - INFO - Tensor metadata file saved to news_w2v_2_metadata.tsv
2020-10-29 14:47:00,707 - word2vec2tensor - INFO - finished running word2vec2tensor.py


In [45]:
# List the working directory to confirm the generated files exist.
!ls

news_w2v    news_w2v_2_metadata.tsv  news_w2v_metadata.tsv  sample_data
news_w2v_2  news_w2v_2_tensor.tsv    news_w2v_tensor.tsv


In [46]:
from google.colab import files

# Download the Skip-gram metadata and tensor TSV files from the Colab runtime.
for artifact in ('news_w2v_2_metadata.tsv', 'news_w2v_2_tensor.tsv'):
  files.download(artifact)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

* Embedding Projector: https://projector.tensorflow.org/