In [1]:
import pandas as pd
import numpy as np
import pickle
import operator
from collections import defaultdict
import re
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText

In [8]:
#--- SKIP-GRAM WORD2VEC (NUMPY IMPLEMENTATION) ---------------------------------+

class word2vec():
    """Minimal NumPy skip-gram word2vec trained with a full softmax.

    Every training sample pairs the one-hot vector of a centre word with the
    one-hot vectors of the words found inside its context window.  ``w1``
    (vocab x n) holds the word embeddings and ``w2`` (n x vocab) the output
    weights.

    Parameters
    ----------
    settings : dict, optional
        Must provide ``'n'`` (embedding size), ``'learning_rate'``,
        ``'epochs'`` and ``'window_size'``.  When omitted, the module-level
        ``settings`` dict is used, so the historical ``word2vec()`` call keeps
        working.
    """

    def __init__(self, settings=None):
        if settings is None:
            # Backward compatibility: the class used to read a global dict.
            settings = globals().get('settings')
            if settings is None:
                raise NameError("name 'settings' is not defined")
        self.n = settings['n']
        self.eta = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']

    # GENERATE TRAINING DATA
    def generate_training_data(self, settings, corpus):
        """Build the vocabulary and the (target, context) training pairs.

        Parameters
        ----------
        settings : dict
            Unused; kept so existing callers do not break.
        corpus : iterable of sequences of hashable tokens
            One inner sequence per sentence.

        Returns
        -------
        numpy.ndarray
            Object array of shape ``(n_samples, 2)``.  Column 0 is the one-hot
            list of the centre word, column 1 the list of one-hot lists of its
            context words (possibly empty for one-word sentences).

        Side effects: sets ``v_count``, ``words_list``, ``word_index`` and
        ``index_word`` on the instance.
        """
        # GENERATE WORD COUNTS
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1

        self.v_count = len(word_counts)

        # GENERATE LOOKUP DICTIONARIES
        self.words_list = sorted(word_counts)
        self.word_index = {word: i for i, word in enumerate(self.words_list)}
        self.index_word = {i: word for i, word in enumerate(self.words_list)}

        pairs = []
        for sentence in corpus:
            sent_len = len(sentence)
            for i, word in enumerate(sentence):
                w_target = self.word2onehot(word)
                # Clamp the window to the sentence boundaries.
                first = max(0, i - self.window)
                last = min(sent_len, i + self.window + 1)
                w_context = [self.word2onehot(sentence[j])
                             for j in range(first, last) if j != i]
                pairs.append((w_target, w_context))

        # The context lists have different lengths, so the data is ragged.
        # np.array() on ragged input raises on recent NumPy versions; fill an
        # explicit object array element by element instead.
        training_data = np.empty((len(pairs), 2), dtype=object)
        for row_idx, (w_target, w_context) in enumerate(pairs):
            training_data[row_idx, 0] = w_target
            training_data[row_idx, 1] = w_context
        return training_data

    # SOFTMAX ACTIVATION FUNCTION
    def softmax(self, x):
        """Return the softmax of ``x`` along axis 0 (max-shifted for stability)."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    # CONVERT WORD TO ONE HOT ENCODING
    def word2onehot(self, word):
        """Return ``word`` as a one-hot Python list of length ``v_count``.

        Raises ``KeyError`` when ``word`` is not in the vocabulary.
        """
        word_vec = [0] * self.v_count
        word_vec[self.word_index[word]] = 1
        return word_vec

    # FORWARD PASS
    def forward_pass(self, x):
        """Propagate one-hot ``x``; return ``(softmax output, hidden, scores)``."""
        h = np.dot(self.w1.T, x)
        u = np.dot(self.w2.T, h)
        y_c = self.softmax(u)
        return y_c, h, u

    # BACKPROPAGATION
    def backprop(self, e, h, x):
        """Apply one gradient-descent step to ``w1`` and ``w2``.

        ``e`` is the summed prediction error over the context words, ``h`` the
        hidden layer and ``x`` the one-hot input.
        """
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))

        # UPDATE WEIGHTS
        self.w1 = self.w1 - (self.eta * dl_dw1)
        self.w2 = self.w2 - (self.eta * dl_dw2)

    # TRAIN W2V model
    def train(self, training_data):
        """Train on the pairs from ``generate_training_data``.

        Weights are re-initialised on every call.  The accumulated loss of
        each epoch is printed and the last one is kept in ``self.loss``.
        """
        # INITIALIZE WEIGHT MATRICES
        self.w1 = np.random.uniform(-0.8, 0.8, (self.v_count, self.n))     # embedding matrix
        self.w2 = np.random.uniform(-0.8, 0.8, (self.n, self.v_count))     # context matrix

        # CYCLE THROUGH EACH EPOCH
        for i in range(0, self.epochs):

            self.loss = 0

            # CYCLE THROUGH EACH TRAINING SAMPLE
            for w_t, w_c in training_data:
                # A one-word sentence has no context: there is nothing to
                # predict and the error term would have the wrong shape.
                if len(w_c) == 0:
                    continue

                # FORWARD PASS
                y_pred, h, u = self.forward_pass(w_t)

                # CALCULATE ERROR
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                # BACKPROPAGATION
                self.backprop(EI, h, w_t)

                # CALCULATE LOSS (log-sum-exp shifted by max(u) to avoid overflow)
                u_max = np.max(u)
                log_sum_exp = u_max + np.log(np.sum(np.exp(u - u_max)))
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * log_sum_exp

            print('EPOCH:',i, 'LOSS:', self.loss)

    # input a word, returns a vector (if available)
    def word_vec(self, word):
        """Return the embedding row of ``word`` (``KeyError`` if unknown)."""
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        return v_w

    def _cosine_ranking(self, vec):
        """Return ``(word, cosine)`` pairs for the whole vocabulary, best first."""
        vec_norm = np.linalg.norm(vec)
        scores = {}
        for i in range(self.v_count):
            v_w2 = self.w1[i]
            scores[self.index_word[i]] = np.dot(vec, v_w2) / (vec_norm * np.linalg.norm(v_w2))
        # The key receives a single (word, sim) tuple, hence the index access.
        return sorted(scores.items(), key=lambda item: item[1], reverse=True)

    # input a vector, returns nearest word(s)
    def vec_sim(self, vec, top_n):
        """Print the ``top_n`` vocabulary words closest to ``vec`` (cosine)."""
        for word, sim in self._cosine_ranking(vec)[:top_n]:
            print(word,sim)

    # input word, returns top [n] most similar words
    def word_sim(self, word, top_n):
        """Print the ``top_n`` words most similar to ``word`` (cosine).

        The query word itself is part of the ranking.  Raises ``KeyError``
        when ``word`` is not in the vocabulary.
        """
        v_w1 = self.w1[self.word_index[word]]
        for other, sim in self._cosine_ranking(v_w1)[:top_n]:
            print(other,sim)

## 準備corpus

In [16]:
# Load the English and Chinese corpora from local pickle files.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('airiti_english_corpus.pickle', 'rb') as en_file:
    en_data = pickle.load(en_file)

with open('airiti_chinese_corpus.pickle', 'rb') as zh_file:
    zh_data = pickle.load(zh_file)

# Number of top-level entries in the Chinese corpus.
len(zh_data)

25695

In [15]:
# Write the in-memory Chinese corpus to its pickle file.
# NOTE(review): this is the same path the loading cell reads from, so running
# this cell overwrites the stored corpus with whatever `zh_data` holds now.
with open('airiti_chinese_corpus.pickle', 'wb') as out_file:
    pickle.dump(zh_data, out_file)

## 中文或英文，選一個訓練（以下兩個 cell 都會覆寫 `source`，只需執行其中一個）

In [3]:
# Count English tokens and collect every sentence into `source`.
# Counts are keyed on the token as stored (before lower-casing), while the
# sentences appended to `source` are lower-cased in place inside `en_data`.
# Presumably en_data is a list of documents, each a list of token lists.
en_data_dict = dict()
source = []
for doc in en_data:
    for sentence in doc:
        for pos, token in enumerate(sentence):
            en_data_dict[token] = en_data_dict.get(token, 0) + 1
            sentence[pos] = token.lower()
        source.append(sentence)

In [20]:
# Count Chinese tokens and collect every sentence into `source`.
# Presumably zh_data is a list of documents, each a list of token lists.
zh_data_dict = dict()
source = []
for doc in zh_data:
    for sentence in doc:
        for token in sentence:
            zh_data_dict[token] = zh_data_dict.get(token, 0) + 1
        source.append(sentence)

## 開始跑模型 

In [48]:
#--- EXAMPLE RUN --------------------------------------------------------------+

# Hyper-parameters for the NumPy word2vec model.
settings = {
    'n': 32,                 # dimension of word embeddings
    'window_size': 5,        # context window +/- center word
    'min_count': 0,          # minimum word count
    'epochs': 5,             # number of training epochs
    'neg_samp': 10,          # number of negative words to use during training
    'learning_rate': 0.01,   # learning rate
}

# Seed NumPy so the random weight initialisation is reproducible.
np.random.seed(0)

# Train on the first 100 sentences only.
# Toy alternative: [['the','quick','brown','fox','jumped','over','the','lazy','dog']]
corpus = source[:100]

# Build the model, derive its training pairs, then fit it.
w2v = word2vec()
training_data = w2v.generate_training_data(settings, corpus)
w2v.train(training_data)

#--- END ----------------------------------------------------------------------+

EPOCH: 0 LOSS: 110875.07081022441
EPOCH: 1 LOSS: 104323.88601961604
EPOCH: 2 LOSS: 99568.40993533852
EPOCH: 3 LOSS: 95863.33319794212
EPOCH: 4 LOSS: 92913.88367548105


In [8]:
# Train a gensim CBOW model (sg=0) on `source`.
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed
# them to `vector_size` and `epochs`.
CBow = Word2Vec(
    source,
    sg=0,
    size=200,
    window=5,
    min_count=5,
    iter=50,
    workers=4,
)

In [None]:
# Train a gensim CBOW model (sg=0) on `source`, stored under the Chinese name.
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed
# them to `vector_size` and `epochs`.
zhCBow = Word2Vec(
    source,
    sg=0,
    size=200,
    window=5,
    min_count=5,
    iter=50,
    workers=5,
)

In [6]:
def most_similar(w2v_model, words, topn=5):
    """Tabulate the nearest neighbours of several query words side by side.

    Parameters
    ----------
    w2v_model : object exposing ``wv.most_similar(word, topn=...)``
        Typically a trained gensim model.
    words : iterable of str
        Query words.  Words missing from the vocabulary are reported on
        stdout and skipped.
    topn : int, default 5
        Number of neighbours requested per word.

    Returns
    -------
    pandas.DataFrame
        Two columns per found word: the word itself (neighbours) and
        ``'cos'`` (similarity).  Empty when no query word was found.
    """
    frames = []
    for word in words:
        try:
            neighbours = w2v_model.wv.most_similar(word, topn=topn)
        except KeyError:
            # Only an out-of-vocabulary lookup is expected here; any other
            # error (including KeyboardInterrupt) must propagate.
            print(word, "not found in Word2Vec model!")
            continue
        frames.append(pd.DataFrame(neighbours, columns=[word, 'cos']))

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, axis=1)

In [7]:
# Only 5 training iterations were run for this result.
# NOTE(review): the Word2Vec cell in this notebook uses iter=50 — confirm which
# training run produced the output shown below.
most_similar(CBow, ['computer','method','learn','paper','machine','algorithm','medical'])

Unnamed: 0,computer,cos,method,cos.1,learn,cos.2,paper,cos.3,machine,cos.4,algorithm,cos.5,medical,cos.6
0,vision,0.832695,technique,0.844628,start,0.788737,thesis,0.940712,learning,0.792323,scheme,0.892851,nursing,0.814213
1,bim,0.796203,approach,0.844047,recognize,0.757144,dissertation,0.932019,svm,0.742843,approach,0.845073,emergency,0.808847
2,reality,0.774201,algorithm,0.83403,select,0.756999,article,0.885475,vector,0.728682,method,0.83403,military,0.808737
3,science,0.766601,procedure,0.833115,protect,0.756079,work,0.846151,software,0.709588,procedure,0.819175,staff,0.799391
4,large-scale,0.76539,scheme,0.795746,leverage,0.744813,essay,0.803591,stereo,0.682782,framework,0.78597,care,0.79619


In [9]:
# Show the nearest neighbours of a few English query words in the CBow model.
# NOTE(review): same query as the earlier cell; presumably re-run after
# retraining CBow — confirm.
most_similar(CBow, ['computer','method','learn','paper','machine','algorithm','medical'])

Unnamed: 0,computer,cos,method,cos.1,learn,cos.2,paper,cos.3,machine,cos.4,algorithm,cos.5,medical,cos.6
0,computers,0.495371,approach,0.80763,extract,0.483715,thesis,0.90173,machines,0.438179,method,0.734917,nursing,0.517037
1,kinesthetic,0.408801,technique,0.749219,see,0.470761,dissertation,0.775195,supervised,0.426178,scheme,0.701819,healthcare,0.499971
2,graphics,0.404766,algorithm,0.734917,know,0.470506,study,0.668645,software,0.361836,approach,0.699059,clinic,0.469472
3,software,0.39005,scheme,0.697332,create,0.466998,article,0.649461,cnc,0.36041,algorithms,0.665026,hospital,0.457505
4,nvidia,0.386236,methods,0.654628,construct,0.464892,research,0.642569,aoi,0.357666,technique,0.597952,health,0.452118


In [None]:
# Show the nearest neighbours of Chinese query words in the zhCBow model
# (computer, method, learning, paper, machine, algorithm, medicine).
most_similar(zhCBow, ['電腦','方法','學習','論文','機器','演算法','醫學'])

In [10]:
# Persist the trained model to disk.
# NOTE(review): the file name contains "zh" but the object saved is `CBow`,
# not `zhCBow` — confirm which model is meant to be stored here.
CBow.save('CBzh0505.model')