In [193]:
from __future__ import unicode_literals
from spacy.lang.en.stop_words import STOP_WORDS
from gensim.models import Word2Vec
import tensorflow_hub as hub
import gensim
import spacy
import textacy
import re
import numpy as np
from scipy import spatial
import pandas as pd

nlp = spacy.load('en')

# We can choose either of the two once embedding
<ul>
<li>Loading the googlenews word2vec
<li>Create word embedding using MSR Corposes(or Any corposes which you intend to use)
</ul>

# Loading googlenews word2vec with a limit of 5000000

In [227]:
model = gensim.models.KeyedVectors.load_word2vec_format('./googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin',binary=True,limit=5000000) 

# Creating word embedding using the MSR Corpuses

In [214]:
def tokenize(text):
    return text.lower().split(' ')

# Reading the MSR corpus 
df = pd.read_csv('msr_paraphrase_data.txt', sep='\t', error_bad_lines=False)

# Creating a list of the tokenized sentences
df['String'] = df['String'].apply(tokenize)
sentences = df['String'].to_list()

# train model
model = Word2Vec(sentences, min_count=1)

# Cleaning the sentences
if you wish you can go ahead and clean the sentences and remove the stopword using the below para however in my case I have not done that

In [182]:
# Cleaning the sentence
def clean_sentence(sentence):
    sentence = sentence.strip()
    sentence = re.sub(r'[^a-zA-Z0-9\s]','', sentence)
    return re.sub(r'\s{2,}', ' ', sentence)
                      
# Removing stopwords
def Remove_stopwords(sentence):
    return ' '.join([token for token in sentence.split() if token not in STOP_WORDS])

# Extracting the phrase

In [183]:
# Extracting prepositinal phrase
def get_pps(doc):
    "Function to get PPs from a parsed document."
    pps = []
    for token in doc:
        # Try this with other parts of speech for different subtrees.
        if token.pos_ == 'ADP':
            pp = ' '.join([tok.orth_ for tok in token.subtree])
            pps.append(pp)
    return pps

In [184]:
# Extract the phrases from the sentences here I have used space to extract the verb, noun ,prepositional phrase 
def extract_phrases(doc):
    phrase_list = []
    # Extracting the noun phrase from the sentence
    for chunk in doc.noun_chunks:
         phrase_list.append(str(chunk).lower())
      
    #Extracting the verb phrase from the sentence
    pattern =  r'<VERB>?<ADV>*<VERB>+'
    lists = textacy.extract.pos_regex_matches(doc, pattern)
    for list in lists:
         phrase_list.append(list.text.lower())
            
    # Extracting prepositinal phrase
    phrase_list.extend(get_pps(doc))
    
    return phrase_list

# Averaging the vector 

In [185]:
# Averaging the vector for each word in a phrase
def avg_feature_vector(sentence, model, num_features, index2word_set):
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

# Finding similarity
Finding the phase with the highest similary however I have kept a minimum threshold of i.e the similarity > 0

In [207]:
# Finding the phrases with the highest similarity
def similarity(phrase1, phrase2):  
    for ph1 in phrase1:
        max_sim = 0
        ph_sim =''
        for ph2 in phrase2:
            if ph1 != ph2:
                try:                    
                    if max_sim == 0:
                        # Cosine similarity between two words.
                        max_sim = model.wv.n_similarity(ph1.lower().split(), ph2.lower().split())
                        ph_sim = ph2
                    elif max_sim <= model.wv.n_similarity(ph1.lower().split(), ph2.lower().split()):
                        max_sim = model.wv.n_similarity(ph1.lower().split(), ph2.lower().split())
                        ph_sim = ph2
                except Exception as e:
                    print(e)
                    break

        if max_sim >=0 and ph_sim and max_sim:
            print(ph1,',',ph_sim,',',max_sim)
    

# Provide the input sentences 

In [232]:
sent1 = "Feelings about current business conditions improved substantially from the first quarter, jumping from 40 to 55."
sent2 = "Assessment of current business conditions improved substantially, the Conference Board said, jumping to 55 from 40 in the first quarter."

In [233]:
sent1 = clean_sentence(sent1)
sent1 = Remove_stopwords(sent1)

sent2 = clean_sentence(sent2)
sent2 = Remove_stopwords(sent2)

In [234]:
sent1_phr_lst = extract_phrases(nlp(sent1))
sent2_phr_lst = extract_phrases(nlp(sent2))

  action="once",
  action="once",


# Output using by creating word embedding using MSR corpus
No cleaning

In [223]:
print(sent1_phr_lst)
print(sent2_phr_lst)

['feelings', 'current business conditions', 'the first quarter', 'improved', 'jumping', 'about current business conditions', 'from the first quarter', 'from 40 to 55']
['assessment', 'current business conditions', 'the conference board', 'the first quarter', 'improved', 'said', 'jumping', 'of current business conditions', 'to 55', 'from 40 in the first quarter', 'in the first quarter']


In [224]:
similarity(sent1_phr_lst, sent2_phr_lst)

feelings , assessment , 0.9543698
current business conditions , improved , 0.99947375
the first quarter , the conference board , 0.9999101
improved , current business conditions , 0.99947375
jumping , the conference board , 0.9972994
about current business conditions , current business conditions , 0.99991834
from the first quarter , from 40 in the first quarter , 0.9999393
from 40 to 55 , to 55 , 0.99947983


After cleaning

In [219]:
print(sent1_phr_lst)
print(sent2_phr_lst)

['feelings current business conditions', 'improved', 'jumping']
['assessment current business conditions', 'conference board', '55 40 quarter', 'improved', 'said jumping']


In [220]:
similarity(sent1_phr_lst, sent2_phr_lst)

feelings current business conditions , assessment current business conditions , 0.9999904
improved , assessment current business conditions , 0.9994738
jumping , assessment current business conditions , 0.99721813


# Output using Word2vec google new
No cleaning

In [230]:
print(sent1_phr_lst)
print(sent2_phr_lst)

['feelings', 'current business conditions', 'the first quarter', 'improved', 'jumping', 'about current business conditions', 'from the first quarter', 'from 40 to 55']
['assessment', 'current business conditions', 'the conference board', 'the first quarter', 'improved', 'said', 'jumping', 'of current business conditions', 'to 55', 'from 40 in the first quarter', 'in the first quarter']


In [231]:
similarity(sent1_phr_lst, sent2_phr_lst)

"word 'of' not in vocabulary"
feelings , current business conditions , 0.1891333
"word 'of' not in vocabulary"
current business conditions , the first quarter , 0.30059728
"word 'of' not in vocabulary"
the first quarter , the conference board , 0.35872227
"word 'of' not in vocabulary"
improved , the first quarter , 0.27405083
"word 'of' not in vocabulary"
jumping , the first quarter , 0.1505425
"word 'of' not in vocabulary"
about current business conditions , current business conditions , 0.92669064
"word 'of' not in vocabulary"
from the first quarter , the first quarter , 0.9443298
"word '40' not in vocabulary"


  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]
  


After Cleaning

In [235]:
print(sent1_phr_lst)
print(sent2_phr_lst)

['feelings current business conditions', 'improved', 'jumping']
['assessment current business conditions', 'conference board', '55 40 quarter', 'improved', 'said jumping']


In [236]:
similarity(sent1_phr_lst, sent2_phr_lst)

"word '55' not in vocabulary"
feelings current business conditions , assessment current business conditions , 0.7705159
"word '55' not in vocabulary"
improved , assessment current business conditions , 0.26115558
"word '55' not in vocabulary"
jumping , conference board , 0.082201235


  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]
  
