<a href="https://colab.research.google.com/github/hvarS/NLPRefer/blob/main/word2vec_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Imports and Data

In [19]:
import nltk
import numpy as np
#Tokenizer Download
nltk.download('punkt')
#Stopwords Download
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
paragraph =  "Fans, for the past two weeks you have been reading about the bad break I got. Yet today I consider myself the luckiest man on the face of this earth. I have been in ballparks for seventeen years and have never received anything but kindness and encouragement from you fans.Look at these grand men. Which of you wouldn't consider it the highlight of his career just to associate with them for even one day? Sure, I'm lucky. Who wouldn't consider it an honor to have known Jacob Ruppert? Also, the builder of baseball's greatest empire, Ed Barrow? To have spent six years with that wonderful little fellow, Miller Huggins? Then to have spent the next nine years with that outstanding leader, that smart student of psychology, the best manager in baseball today, Joe McCarthy? Sure, I'm lucky.When the New York Giants, a team you would give your right arm to beat, and vice versa, sends you a gift - that's something. When everybody down to the groundskeepers and those boys in white coats remember you with trophies - that's something. When you have a wonderful mother-in-law who takes sides with you in squabbles with her own daughter - that's something. When you have a father and a mother who work all their lives so you can have an education and build your body - it's a blessing. When you have a wife who has been a tower of strength and shown more courage than you dreamed existed - that's the finest I know."

###Preprocessing

In [21]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [22]:
# Cleaning the texts

#Using Stemming 
ps = PorterStemmer()

sentences = nltk.sent_tokenize(paragraph)
corpus_stemmed = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus_stemmed.append(review)

In [23]:
sentences

['Fans, for the past two weeks you have been reading about the bad break I got.',
 'Yet today I consider myself the luckiest man on the face of this earth.',
 'I have been in ballparks for seventeen years and have never received anything but kindness and encouragement from you fans.Look at these grand men.',
 "Which of you wouldn't consider it the highlight of his career just to associate with them for even one day?",
 "Sure, I'm lucky.",
 "Who wouldn't consider it an honor to have known Jacob Ruppert?",
 "Also, the builder of baseball's greatest empire, Ed Barrow?",
 'To have spent six years with that wonderful little fellow, Miller Huggins?',
 'Then to have spent the next nine years with that outstanding leader, that smart student of psychology, the best manager in baseball today, Joe McCarthy?',
 "Sure, I'm lucky.When the New York Giants, a team you would give your right arm to beat, and vice versa, sends you a gift - that's something.",
 "When everybody down to the groundskeepers a

In [24]:
corpus_stemmed

['fan past two week read bad break got',
 'yet today consid luckiest man face earth',
 'ballpark seventeen year never receiv anyth kind encourag fan look grand men',
 'consid highlight career associ even one day',
 'sure lucki',
 'consid honor known jacob ruppert',
 'also builder basebal greatest empir ed barrow',
 'spent six year wonder littl fellow miller huggin',
 'spent next nine year outstand leader smart student psycholog best manag basebal today joe mccarthi',
 'sure lucki new york giant team would give right arm beat vice versa send gift someth',
 'everybodi groundskeep boy white coat rememb trophi someth',
 'wonder mother law take side squabbl daughter someth',
 'father mother work live educ build bodi bless',
 'wife tower strength shown courag dream exist finest know']

In [25]:
wordnet=WordNetLemmatizer()
corpus_lemmatized = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus_lemmatized.append(review)

In [26]:
sentences

['Fans, for the past two weeks you have been reading about the bad break I got.',
 'Yet today I consider myself the luckiest man on the face of this earth.',
 'I have been in ballparks for seventeen years and have never received anything but kindness and encouragement from you fans.Look at these grand men.',
 "Which of you wouldn't consider it the highlight of his career just to associate with them for even one day?",
 "Sure, I'm lucky.",
 "Who wouldn't consider it an honor to have known Jacob Ruppert?",
 "Also, the builder of baseball's greatest empire, Ed Barrow?",
 'To have spent six years with that wonderful little fellow, Miller Huggins?',
 'Then to have spent the next nine years with that outstanding leader, that smart student of psychology, the best manager in baseball today, Joe McCarthy?',
 "Sure, I'm lucky.When the New York Giants, a team you would give your right arm to beat, and vice versa, sends you a gift - that's something.",
 "When everybody down to the groundskeepers a

In [27]:
corpus_lemmatized

['fan past two week reading bad break got',
 'yet today consider luckiest man face earth',
 'ballpark seventeen year never received anything kindness encouragement fan look grand men',
 'consider highlight career associate even one day',
 'sure lucky',
 'consider honor known jacob ruppert',
 'also builder baseball greatest empire ed barrow',
 'spent six year wonderful little fellow miller huggins',
 'spent next nine year outstanding leader smart student psychology best manager baseball today joe mccarthy',
 'sure lucky new york giant team would give right arm beat vice versa sends gift something',
 'everybody groundskeeper boy white coat remember trophy something',
 'wonderful mother law take side squabble daughter something',
 'father mother work life education build body blessing',
 'wife tower strength shown courage dreamed existed finest know']

###Word2Vec


In [56]:
##Tokenize sentences into words before Word2Vec

In [54]:
sentences = [nltk.word_tokenize(sentence) for sentence in corpus_lemmatized]

In [57]:
from gensim.models import Word2Vec

In [58]:
w2v = Word2Vec(sentences,min_count=1)

In [59]:
words = w2v.wv.vocab

In [64]:
list(words.keys())

['fan',
 'past',
 'two',
 'week',
 'reading',
 'bad',
 'break',
 'got',
 'yet',
 'today',
 'consider',
 'luckiest',
 'man',
 'face',
 'earth',
 'ballpark',
 'seventeen',
 'year',
 'never',
 'received',
 'anything',
 'kindness',
 'encouragement',
 'look',
 'grand',
 'men',
 'highlight',
 'career',
 'associate',
 'even',
 'one',
 'day',
 'sure',
 'lucky',
 'honor',
 'known',
 'jacob',
 'ruppert',
 'also',
 'builder',
 'baseball',
 'greatest',
 'empire',
 'ed',
 'barrow',
 'spent',
 'six',
 'wonderful',
 'little',
 'fellow',
 'miller',
 'huggins',
 'next',
 'nine',
 'outstanding',
 'leader',
 'smart',
 'student',
 'psychology',
 'best',
 'manager',
 'joe',
 'mccarthy',
 'new',
 'york',
 'giant',
 'team',
 'would',
 'give',
 'right',
 'arm',
 'beat',
 'vice',
 'versa',
 'sends',
 'gift',
 'something',
 'everybody',
 'groundskeeper',
 'boy',
 'white',
 'coat',
 'remember',
 'trophy',
 'mother',
 'law',
 'take',
 'side',
 'squabble',
 'daughter',
 'father',
 'work',
 'life',
 'education',
 '

In [68]:
w2v.wv["work"].shape
#100 dimension embedding

(100,)

In [69]:
vectorised_sentences = []
for sentence in sentences:
  s = []
  for word in sentence:
    vector = w2v.wv[word]
    s.append(vector)
  
  s = np.array(s)
  vectorised_sentences.append(s)

vectorised_sentences = np.array(vectorised_sentences)

  # This is added back by InteractiveShellApp.init_path()


In [74]:
vectorised_sentences[0].shape #First Sentence has 8 words and each word has 100 dimensions

(8, 100)

In [78]:
##To Use Google's pretrained Word2Vec
# Load Google's pre-trained Word2Vec model.
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')