In [28]:
# Basic experimentation with nltk
# Install nltk using pip before usage
import nltk
import numpy as np

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('treebank')
from nltk.translate import bleu

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [2]:
# Testing out nltk's tokenization

Test_Sentence = "This is a basic test sentence to see how nltk works."
tokens = nltk.word_tokenize(Test_Sentence)
print(tokens)

#testing out and printing tokens
entities = nltk.chunk.ne_chunk(nltk.pos_tag(tokens))
print(entities)

# doesn't work because mac doesn't support tkinter
# from nltk.corpus import treebank
# t = treebank.parsed_sents('wsj_0001.mrg')[0]
# t.draw()

['This', 'is', 'a', 'basic', 'test', 'sentence', 'to', 'see', 'how', 'nltk', 'works', '.']
(S
  This/DT
  is/VBZ
  a/DT
  basic/JJ
  test/NN
  sentence/NN
  to/TO
  see/VB
  how/WRB
  nltk/JJ
  works/NNS
  ./.)


In [3]:
# Testing out Bleu Scoring
sentence1 = "Two string of words which are the same but don't have a single same word"
sentence2 = "Despite their stark linguistic differences, both sentences convey the same underlying message with distinct vocabulary and structure."
bleu_score = bleu([sentence1.split()], sentence2.split(),  (1,),)
print(bleu_score)

0.11764705882352941


In [4]:
# Testing out nltk's sentimental analysis
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentence1 = "This is very good! I am really happy."
tokens = nltk.word_tokenize(sentence1)
sid = SentimentIntensityAnalyzer()
ss = sid.polarity_scores(sentence1)
print(ss)

{'neg': 0.0, 'neu': 0.401, 'pos': 0.599, 'compound': 0.8165}


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [5]:
# Converting paragraphs into sentences

paragraph = "This is a paragraph. This is because there are multiple sentences in this text. However, paragraphs are not separated with newline characters. Isn't that obvious?"
sentence_list = nltk.sent_tokenize(paragraph)
print(sentence_list)
for i in range(len(sentence_list)):
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence_list[i])
    print("Sentence: ", sentence_list[i])
    print(ss)
    print('\n')

['This is a paragraph.', 'This is because there are multiple sentences in this text.', 'However, paragraphs are not separated with newline characters.', "Isn't that obvious?"]
Sentence:  This is a paragraph.
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


Sentence:  This is because there are multiple sentences in this text.
{'neg': 0.0, 'neu': 0.882, 'pos': 0.118, 'compound': 0.0516}


Sentence:  However, paragraphs are not separated with newline characters.
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


Sentence:  Isn't that obvious?
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}




In [7]:
#accesing nltk's corpora
nltk.download("gutenberg")

from nltk.corpus import gutenberg
print("hello")
print(gutenberg.words())
print()


hello
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]



[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [21]:
# Using gensim's word2vec 
import gensim

#training using nltk's corpus 
training_data = gutenberg.sents()[:100000]
models = gensim.models.Word2Vec(training_data)
print(models)
print(models.sorted_vocab)

#saving this model as gutenberg_word2vec
models.save('Gutenberg_Model')
#loading the model 
gutenberg_model = gensim.models.Word2Vec.load('Gutenberg_Model')

Word2Vec<vocab=17011, vector_size=100, alpha=0.025>
1


In [42]:
# Using this gutenberg model 
word1_vector  = gutenberg_model.wv['print']
# print(word1_vector)
word2_vector = gutenberg_model.wv['print']
dot_prod = np.dot(word1_vector, word2_vector)
magnitude_product = np.dot(word1_vector, word1_vector)**0.5* np.dot(word2_vector, word2_vector)**0.5
print("Dot product of the two vectors are: ", dot_prod)
print("Cosine of the two vectors are: ", dot_prod/magnitude_product)

def convert_word2vec(word1):
    wordvector = gutenberg_model.wv[word1]
    return wordvector

def word_similarity(word1vector, word2vector):
    dot_prod = np.dot(word1vector, word2vector)
    magnitude_product=  np.dot(word2vector, word2vector)**0.5 * np.dot(word1vector, word1vector)**0.5
    similarity = dot_prod/magnitude_product
    return similarity

input1 = input("Enter the first word: ")
input2 = input("Enter the second word: ")
print("There is a ", word_similarity(convert_word2vec(input1), convert_word2vec(input2))*100," percentage similarity between the two words." )


#This is equivalent to the same thing:
print("The similarity between two words = ", gutenberg_model.wv.similarity(input1, input2))

Dot product of the two vectors are:  1.2413924
Cosine of the two vectors are:  1.0
There is a  52.34339108294756  percentage similarity between the two words.
The similarity between two words =  0.52343386


In [44]:
from nltk.data import find
nltk.download("word2vec_sample")
word2vec_samplee = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_samplee, binary=False)

[nltk_data] Downloading package word2vec_sample to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Unzipping models/word2vec_sample.zip.


In [52]:
len(model)
print(len(model['university']))
#finding similar words
print(model.most_similar(positive=['university'], topn=10 ))
#finding the most dis-similar words
print(model.most_similar(negative=['university'], topn=10 ))

300
[('universities', 0.7003918290138245), ('faculty', 0.6780906915664673), ('undergraduate', 0.6587095856666565), ('campus', 0.6434988379478455), ('college', 0.638526976108551), ('academic', 0.6317198872566223), ('professors', 0.6298646926879883), ('undergraduates', 0.6149812936782837), ('University', 0.6139305233955383), ('student', 0.600540041923523)]
[('Flashed', 0.19817940890789032), ('9N', 0.16438086330890656), ('Neversink', 0.1642790585756302), ('Blanched', 0.15624172985553741), ('Grapefruit', 0.15487904846668243), ('domi', 0.1502796858549118), ('pulverizing', 0.14868904650211334), ('Nosebleed', 0.14782992005348206), ('Tempter', 0.14712963998317719), ('thickeners', 0.14493298530578613)]
