# Pre-trained Word2Vec Model From Google

In [9]:
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('Users/karina/LighthouseLabs/GoogleNews-vectors-negative300.bin', binary=True)

FileNotFoundError: [Errno 2] No such file or directory: 'Users/karina/LighthouseLabs/GoogleNews-vectors-negative300.bin'

In [4]:
vector = model['easy']
# see the shape of the vector (300,)
vector.shape

(300,)

### Similarities

In [5]:
# Find the most similar words to any word
model.most_similar("nice")

[('good', 0.6836091876029968),
 ('lovely', 0.6676310896873474),
 ('neat', 0.6616737246513367),
 ('fantastic', 0.6569241881370544),
 ('wonderful', 0.6561347246170044),
 ('terrific', 0.6552367806434631),
 ('great', 0.6454657912254333),
 ('awesome', 0.6404187083244324),
 ('nicer', 0.6302445530891418),
 ('decent', 0.5993332266807556)]

In [6]:
# Find the similarity score of any two words
model.similarity("nice","good")

0.68360907

In [7]:
# Antonyms will have highly similar scores
# Opposite words can be replaced with each other in text
model.similarity("bad","good")

0.7190052

In [8]:
# Relationship between words
# king - queen = man - woman
model.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674735069275),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411403656006)]

In [9]:
# mom - girl = dad - boy
model.most_similar(positive=['girl', 'dad'], negative=['mom'])

[('boy', 0.808031439781189),
 ('teenager', 0.6755870580673218),
 ('teenage_girl', 0.6386616826057434),
 ('man', 0.6255338191986084),
 ('lad', 0.616614043712616),
 ('schoolgirl', 0.6113480925559998),
 ('schoolboy', 0.6011566519737244),
 ('son', 0.5938458442687988),
 ('father', 0.5887871384620667),
 ('uncle', 0.5734449028968811)]

In [10]:
# france - paris, spain - madrid
model.most_similar(positive=['paris', 'spain'], negative=['france'])

[('madrid', 0.5295541286468506),
 ('dubai', 0.509259819984436),
 ('heidi', 0.48901548981666565),
 ('portugal', 0.48763689398765564),
 ('paula', 0.48557141423225403),
 ('alex', 0.480734646320343),
 ('lohan', 0.4801103472709656),
 ('diego', 0.48010095953941345),
 ('florence', 0.47695302963256836),
 ('costa', 0.4761490225791931)]

In [11]:
# mother - daughter, table - chair
model.most_similar(positive=['chair', 'mother'], negative=['table'])

[('daughter', 0.6066097021102905),
 ('niece', 0.5490824580192566),
 ('granddaughter', 0.540050745010376),
 ('aunt', 0.5397382974624634),
 ('husband', 0.5387389659881592),
 ('sister', 0.5360148549079895),
 ('son', 0.5356959104537964),
 ('wife', 0.5313628911972046),
 ('father', 0.5261732339859009),
 ('grandmother', 0.5253341197967529)]

# Implemeting Word2Vec with Gensim

In [12]:
import bs4 as bs
import urllib.request
import re
import nltk

scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scrapped_data .read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

### Preprocessing

In [13]:
# Cleaing the text
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article ) # convert to lowercase
processed_article = re.sub(r'\s+', ' ', processed_article) # remove all digits, special characters and extra spaces

# Preparing the dataset
all_sentences = nltk.sent_tokenize(processed_article) # convert to sentences

all_words = [nltk.word_tokenize(sent) for sent in all_sentences] # convert to words

# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

### Creating Word2Vec Model

In [14]:
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, min_count=2)

In [16]:
vocabulary = word2vec.wv.key_to_index
print(vocabulary)

{'ai': 0, 'intelligence': 1, 'artificial': 2, 'learning': 3, 'human': 4, 'used': 5, 'research': 6, 'machine': 7, 'use': 8, 'many': 9, 'problems': 10, 'networks': 11, 'also': 12, 'data': 13, 'knowledge': 14, 'search': 15, 'intelligent': 16, 'researchers': 17, 'agent': 18, 'world': 19, 'neural': 20, 'algorithms': 21, 'general': 22, 'field': 23, 'symbolic': 24, 'logic': 25, 'may': 26, 'information': 27, 'machines': 28, 'states': 29, 'systems': 30, 'would': 31, 'system': 32, 'mind': 33, 'problem': 34, 'computer': 35, 'one': 36, 'goal': 37, 'example': 38, 'cyber': 39, 'applications': 40, 'reasoning': 41, 'could': 42, 'however': 43, 'goals': 44, 'risk': 45, 'humans': 46, 'technology': 47, 'diplomacy': 48, 'cybersecurity': 49, 'security': 50, 'approaches': 51, 'program': 52, 'specific': 53, 'russia': 54, 'scientific': 55, 'using': 56, 'computing': 57, 'since': 58, 'developed': 59, 'optimization': 60, 'including': 61, 'language': 62, 'mathematical': 63, 'solve': 64, 'recognition': 65, 'known':

### Model Analysis

In [19]:
# View vector representation of word
v1 = word2vec.wv['artificial']

In [21]:
# Comparing Similar Words
sim_words = word2vec.wv.most_similar('intelligence')
sim_words

[('ai', 0.5954935550689697),
 ('information', 0.5101127028465271),
 ('used', 0.5012791752815247),
 ('networks', 0.4910092353820801),
 ('agent', 0.4829583168029785),
 ('researchers', 0.4747096598148346),
 ('artificial', 0.4654604494571686),
 ('use', 0.4515644609928131),
 ('may', 0.4363589286804199),
 ('data', 0.43425726890563965)]