Kết nối Colab với Google Drive để lấy dữ liệu

In [84]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Cài đặt thư viện Gensim cho việc train model Word2Vec

In [85]:
!pip install gensim



Import các thư viện cần thiết

In [86]:
import pandas as pd
import os
import pandas as pd
import string
from gensim.models import Word2Vec
import spacy 
import multiprocessing
from time import time
import re



Load tập dữ liệu từ thư mục trong Google Drive

In [87]:

df = pd.read_csv('/content/drive/MyDrive/Learning/Natural Language Processing/Exercises/word2vec/simpsons_dataset.csv', header= 0,
                        encoding= 'unicode_escape')
print("Shape: %s", str(df.shape))

print("Sample data\n")
print(df.head())
print("Summary\n")
df.isnull().sum()


Shape: %s (158314, 2)
Sample data

        raw_character_text                                       spoken_words
0              Miss Hoover  No, actually, it was a little of both. Sometim...
1             Lisa Simpson                             Where's Mr. Bergstrom?
2              Miss Hoover  I don't know. Although I'd sure like to talk t...
3             Lisa Simpson                         That life is worth living.
4  Edna Krabappel-Flanders  The polls will be open from now until the end ...
Summary



raw_character_text    17814
spoken_words          26459
dtype: int64

Bỏ value bị thiếu

In [88]:
df = df.dropna().reset_index(drop=True)
print("Summary\n")
df.isnull().sum()


Summary



raw_character_text    0
spoken_words          0
dtype: int64

Xử lý loại bỏ số, ký tự đặc biệt và stop words

In [89]:
nlp = spacy.load('en', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))



df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

 

Time to clean up everything: 1.24 mins


(85964, 1)

Xử dụng Gensim Phrases package để phát hiện những cụm từ thông dụng

In [90]:
from gensim.models.phrases import Phrases, Phraser
sent = [row.split() for row in df_clean['clean']]
phrases = Phrases(sent, min_count=30, progress_per=10000)
sentences = phrases[sent]




Những từ có tuần suất nhiều nhất

In [91]:
from collections import defaultdict
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)
sorted(word_freq, key=word_freq.get, reverse=True)[:10]





['oh', 'like', 'know', 'get', 'hey', 'think', 'right', 'look', 'want', 'come']

Train model Word2Vec. Trong quá trình train, có bạn có thể chỉnh sử dụng GPU của Colab để đẩy nhanh tiến độ bằng cách vào Rumtime => Change Runtime Type và chọn GPU

In [92]:

# Skipgram model
model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

t = time()

#model = Word2Vec(dataset, min_count = 1, size =300, window = 5, sg = 1)
model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))


t = time()
model.train(sentences, total_examples=model.corpus_count, epochs=30, report_delay=1)
model.init_sims(replace=True)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

print
model.wv.save('/content/drive/MyDrive/Learning/Natural Language Processing/Exercises/word2vec/word2vec_skipgram.bin')






Time to build vocab: 0.07 mins
Time to train the model: 2.08 mins


Sau khi có model. Ta tiến hành load mô hình

> Indented block



In [93]:
import gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
path = '/content/drive/MyDrive/Learning/Natural Language Processing/Exercises/word2vec/word2vec_skipgram.bin'
model = gensim.models.KeyedVectors.load(path)

Test kết quả

In [94]:
model.wv.most_similar(positive=["daddy"])

  """Entry point for launching an IPython kernel.


[('mommy', 0.8529200553894043),
 ('honey', 0.7579607963562012),
 ('sweetie', 0.7279981374740601),
 ('diaper', 0.7048475742340088),
 ('maggie', 0.7002843618392944),
 ('asleep', 0.688968300819397),
 ('run_away', 0.665802001953125),
 ('aww', 0.6601063013076782),
 ('feelin', 0.659238874912262),
 ('shoulda', 0.6586604714393616)]

In [95]:
model.similar_by_vector(model['daddy'], topn=10)

[('daddy', 1.0),
 ('mommy', 0.8529200553894043),
 ('honey', 0.7579607963562012),
 ('sweetie', 0.7279981374740601),
 ('diaper', 0.7048475742340088),
 ('maggie', 0.7002843618392944),
 ('asleep', 0.688968300819397),
 ('run_away', 0.665802001953125),
 ('aww', 0.6601063013076782),
 ('feelin', 0.659238874912262)]

In [96]:
model['daddy']

array([ 5.94953261e-02,  6.75222203e-02,  7.54637122e-02, -1.24046095e-02,
       -6.98539764e-02,  8.21525380e-02,  9.65439156e-02,  1.00639991e-01,
       -1.10133295e-03, -6.08249903e-02,  2.84046195e-02, -1.52703384e-02,
        6.84111118e-02, -6.47288328e-03, -8.25564843e-03,  7.53169730e-02,
       -5.63473813e-02,  6.61148950e-02,  7.06678480e-02,  2.67095659e-02,
        6.14882745e-02,  2.92577595e-03, -1.29941404e-02, -3.47870849e-02,
        4.44369279e-02, -7.31154233e-02, -1.04398886e-02,  4.29420024e-02,
       -1.99107155e-02, -3.15210633e-02,  2.21162289e-02,  8.68822336e-02,
        4.51426990e-02,  2.10201629e-02,  1.06752753e-01, -4.71733101e-02,
        3.88367007e-05,  6.02439940e-02,  6.04483262e-02,  3.72089371e-02,
        4.30855714e-02, -1.32246576e-02, -2.58083344e-02,  2.47485805e-02,
       -2.41913460e-03,  1.96083938e-03,  2.58436128e-02,  5.21746390e-02,
       -8.08328167e-02, -1.02004007e-01, -4.03867997e-02, -2.79409979e-02,
       -8.26525763e-02, -