# Word Embedding from Wikipedia

In [1]:
# Data preprocessing

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re
# import string

# Default NLTK resources, built on first use and then shared by every
# preprocessing() call so the stopword list is not rebuilt for each article.
_PREPROCESSING_DEFAULTS = {}


def _default_preprocessing_resources():
    """Return the cached default (lemmatizer, stopword set), building them once."""
    if not _PREPROCESSING_DEFAULTS:
        _PREPROCESSING_DEFAULTS['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESSING_DEFAULTS['stopwords_set'] = frozenset(stopwords.words('english'))
    return (_PREPROCESSING_DEFAULTS['lemmatizer'],
            _PREPROCESSING_DEFAULTS['stopwords_set'])


def preprocessing(article, lemmatizer=None, stopwords_set=None):
    """Clean one article and return it as a single space-separated string.

    Steps: lowercase the text, keep only runs of the letters a-z that are
    delimited by word boundaries, drop stopwords, then lemmatize every
    remaining word as a noun.

    Parameters
    ----------
    article : str
        Raw text of one article.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word, pos)``. Defaults to a shared
        ``WordNetLemmatizer`` instance.
    stopwords_set : collection of str, optional
        Words to remove before lemmatization. Defaults to NLTK's English
        stopword list.

    Returns
    -------
    str
        The cleaned words joined by single spaces ('' if nothing survives).
    """
    if lemmatizer is None or stopwords_set is None:
        default_lemmatizer, default_stopwords = _default_preprocessing_resources()
        if lemmatizer is None:
            lemmatizer = default_lemmatizer
        if stopwords_set is None:
            stopwords_set = default_stopwords

    # Lowercase first: the pattern only matches a-z, so without this any word
    # containing an uppercase letter would be silently discarded.
    words = re.findall(r'\b[a-z]+\b', article.lower())

    # Remove stopwords, then lemmatize what is left (as nouns).
    filtered_words = [
        lemmatizer.lemmatize(word, 'n')
        for word in words
        if word not in stopwords_set
    ]
    return ' '.join(filtered_words)

In [2]:
# clean original wiki_corpus and put cleaned content into a new file

# each line in this text file corresponds to one article
from tqdm import tqdm

# Open both files once. The output is opened in write mode so that re-running
# this cell regenerates the processed corpus instead of appending a second
# copy, and the file is no longer reopened for every single article.
with open("./Dataset/wiki_corpus_en.txt", "r", encoding='utf-8') as f, \
        open('./Dataset/wiki_processed_corpus_en.txt', 'w', encoding='utf-8') as w:
    for line in tqdm(f):
        # drop the trailing newline; each input line is one article
        article = line.strip('\n')
        # one cleaned article per output line
        w.write(preprocessing(article) + '\n')

85102it [06:17, 225.38it/s]


In [4]:
# use gensim model to train word2vec embeddings

import gensim
from gensim.models import Word2Vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.models.word2vec import LineSentence

In [2]:
# Corpus reader over the cleaned text file
sentences = LineSentence('./Dataset/wiki_processed_corpus_en.txt')

# Word2Vec hyperparameters, gathered in one place
w2v_params = dict(
    sg=1,         # skip-gram model
    size=300,     # vector size
    window=5,     # context window size
    min_count=5,  # minimal word frequency
    iter=15,      # number of training iterations
    workers=8,
)

# Build the vocabulary and train in one call
model = Word2Vec(sentences, **w2v_params)

print('Finished Word2Vec training.....')

# Persist the full model, plus a plain-text export of the word vectors
model.save('./Dataset/word2vec.model')
model.wv.save_word2vec_format("./Dataset/word2vec.txt", binary=False)

Finished Word2Vec training.....


In [5]:
# test word vector
# Reload the exported text-format vectors and show the embedding of one word
wv_from_text = KeyedVectors.load_word2vec_format("./Dataset/word2vec.txt", binary=False)
print(wv_from_text['germany'])
# Reload the saved model and query nearest neighbours through its `.wv`
# KeyedVectors: calling most_similar directly on the Word2Vec model is
# deprecated (and no longer available in gensim 4).
model_trained = Word2Vec.load('./Dataset/word2vec.model')
model_trained.wv.most_similar("china")

[-0.01883121  0.20953107 -0.06348649  0.12446711 -0.30815777 -0.06482673
 -0.24392161 -0.3617024   0.03132427  0.12887119  0.28699267 -0.30248684
  0.14022109 -0.01792672  0.06673583  0.08856788 -0.01744785  0.03064011
  0.19901419 -0.21329336 -0.13122673 -0.1792982  -0.1827066  -0.17658928
 -0.07008389  0.18022831  0.31833333 -0.1922571   0.25560144 -0.15702173
  0.1104394   0.07718533 -0.30509222  0.11992095  0.18463781 -0.06511118
 -0.0698548   0.38524476 -0.05967232 -0.19643001 -0.05561528  0.03574706
  0.01241283  0.12788253  0.30667087  0.3243983   0.10205039 -0.42644563
  0.21918465  0.13985078  0.02472307 -0.03273129  0.01207019 -0.09052381
  0.21873304  0.03180779  0.20899987 -0.15861568 -0.14080344 -0.07981589
 -0.06710944  0.20989531  0.05883698 -0.09836747  0.07262119 -0.09066577
  0.15727943  0.13546565 -0.27404085  0.23382223 -0.06612139 -0.02006742
  0.10213558 -0.07222913 -0.33423388  0.03667687 -0.22206816  0.37079248
  0.09546407 -0.00987136 -0.13475263 -0.03398062  0

  """


[('taiwan', 0.6887751221656799),
 ('chinese', 0.6543804407119751),
 ('zhangzong', 0.6423479914665222),
 ('putian', 0.6366764307022095),
 ('qing', 0.6274092197418213),
 ('guangxi', 0.6232026815414429),
 ('yongji', 0.6203204393386841),
 ('beijing', 0.6113041639328003),
 ('korea', 0.6085971593856812),
 ('aizong', 0.6058491468429565)]

In [14]:
# Query through `.wv`: most_similar on the Word2Vec model itself is deprecated
model_trained.wv.most_similar("afghanistan")

  """Entry point for launching an IPython kernel.


[('iraq', 0.6985960006713867),
 ('afghan', 0.6625270247459412),
 ('kabul', 0.647105872631073),
 ('taliban', 0.6283484697341919),
 ('wardak', 0.6166841387748718),
 ('helmand', 0.6157212257385254),
 ('iran', 0.602006733417511),
 ('gardez', 0.5908187627792358),
 ('iraqi', 0.5878793001174927),
 ('paktika', 0.584496021270752)]

In [27]:
# look up the word vector of each country and store the results
import pandas as pd

df = pd.read_csv('./Dataset/countries/countries of the world.csv')

# Maps the lookup token derived from each country name to its word vector
country2vector = {}

print('missed countries: ')
for raw_name in df['Country']:
    # normalised full name, used only for the "missed" report below
    c = str(raw_name.strip()).lower()
    # The lookup key is the first space-separated word, cut at any comma.
    # NOTE(review): multi-word names collapse to their first word (e.g.
    # "american"), so different countries sharing a first word overwrite each
    # other in the dict - confirm this is intended.
    country = c.split(' ')[0]
    country = country.split(',')[0]
    try:
        country2vector[country] = wv_from_text[country]
    except KeyError:
        # Only a missing vocabulary entry is expected here; any other error
        # (or a keyboard interrupt) should surface instead of being swallowed.
        print(c, ' is not in our trained vocabulary model')

# NOTE(review): each vector is presumably a numpy array, so the CSV stores its
# text representation rather than numeric columns - verify downstream parsing.
df_cou2vec = pd.DataFrame({'country': list(country2vector.keys()),'vector': list(country2vector.values()) })
df_cou2vec.to_csv('./Dataset/countries/country2vector.csv')

missed countries: 
guinea-bissau  is not in our trained vocabulary model
laos  is not in our trained vocabulary model
n. mariana islands  is not in our trained vocabulary model
philippines  is not in our trained vocabulary model
turks & caicos is  is not in our trained vocabulary model


In [32]:
# Reload the saved table and preview the first few country keys
df_c = pd.read_csv('./Dataset/countries/country2vector.csv')
print(df_c['country'].head())

0    afghanistan
1        albania
2        algeria
3       american
4        andorra
Name: country, dtype: object
