# Data Collection

In [45]:
import glob
file_names = glob.glob("/Users/ibrahim/Desktop/Preprocess/query/*.txt")
file_names

['/Users/ibrahim/Desktop/Preprocess/query/PNight5.txt',
 '/Users/ibrahim/Desktop/Preprocess/query/PNight4.txt',
 '/Users/ibrahim/Desktop/Preprocess/query/PNight1.txt',
 '/Users/ibrahim/Desktop/Preprocess/query/PNight3.txt',
 '/Users/ibrahim/Desktop/Preprocess/query/PNight2.txt']

In [46]:
raw_documents=[]
for file in file_names:
    try:
        with open(file, "r", encoding="utf-8") as f: raw_documents.append(f.read())
    except:
        pass

In [47]:
print("Number of documents: ",len(raw_documents))

Number of documents:  5


In [48]:
querydoc=glob.glob("/Users/ibrahim/Desktop/Preprocess/AllStories/*.txt")
querydoc

['/Users/ibrahim/Desktop/Preprocess/AllStories/BNight1.txt',
 '/Users/ibrahim/Desktop/Preprocess/AllStories/BNight3.txt',
 '/Users/ibrahim/Desktop/Preprocess/AllStories/BNight2.txt',
 '/Users/ibrahim/Desktop/Preprocess/AllStories/BNight5.txt',
 '/Users/ibrahim/Desktop/Preprocess/AllStories/BNight4.txt',
 '/Users/ibrahim/Desktop/Preprocess/AllStories/BNight3 copy.txt']

In [49]:
query_documents=[]
for file in querydoc:
    try:
        with open(file, "r", encoding="utf-8") as f: query_documents.append(f.read())
    except:
        pass

In [50]:
print("Number of documents: ",len(query_documents))

Number of documents:  6


# 1.1 Locality Sensitive Hashing (LSH) Algorithm (First Layer)
https://nbviewer.org/github/bassimeledath/quora_profile/blob/master/questions_analysis.ipynb

https://www.pinecone.io/learn/locality-sensitive-hashing/

In [51]:
import numpy as np
import pandas as pd
import re
import time
from datasketch import MinHash, MinHashLSHForest

In [52]:
#Preprocess will split a string of text into individual tokens/shingles based on whitespace.
def preprocess(text):
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    tokens = text
    # Removing punctuations in string
    # Using loop + punctuation string
    for ele in tokens:
        if ele in punc:
            tokens = tokens.replace(ele, "")
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

In [53]:
#Number of Permutations
permutations = 800

#Number of Recommendations to return
num_recommendations = 5

In [54]:
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for text in data:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    #print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [55]:
def predict(text, database, perms, num_results, forest):
    start_time = time.time()
    
    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    result={}
    result = database.iloc[idx_array]
    
    #print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result

In [56]:
db = pd.DataFrame(raw_documents)#We have 67 Sentences
db=db[0]
print(db)
forest = get_forest(db, permutations)

0    \n\n\n\nWhen it was the fifth night, Dunyazad ...
1    When it was the fourth night, Dunyazad asked h...
2    SHAHRAZAD SAID:\n\tI have heard, O fortunate k...
3    When it was the third night, Dunyazad asked he...
4    When it was the second night, Dunyazad said to...
Name: 0, dtype: object


In [57]:
num_recommendations = 10
query = query_documents[5].lower()
print('query', query)
result1 = predict(query, db, permutations, num_recommendations, forest)
print('\n Top similar sentences \n', result1)

query when it was the third night,
and the king had had his will of the wazir's daughter, dunyazad,
her sister, said to her, "finish for us that tale of thine;" and
she replied, "with joy and goodly gree! it hath reached me, o
auspicious king, that when the third old man told a tale to the
jinni more wondrous than the two preceding, the jinni marvelled
with exceeding marvel, and, shaking with delight, cried, lo! i
have given thee the remainder of the merchant's punishment and
for thy sake have i released him." thereupon the merchant
embraced the old men and thanked them, and these shaykhs wished
him joy on being saved and fared forth each one for his own city.
yet this tale is not more wondrous than the fisherman's story."
asked the king, "what is the fisherman's story?" and she answered
by relating the tale of

the fisherman and the jinni.
it hath reached me, o auspicious king, that there was a fisher
man well stricken in years who had a wife and three children, and
withal was of poor

In [58]:
#Create a list of candiates to be taken to the next layer which is Cosine Simialrity
candidates1=[]
candidates1=result1.values
print(candidates1)

['SHAHRAZAD SAID:\n\tI have heard, O fortunate king, that a wealthy merchant, who had many dealings throughout the lands, rode out one day to settle a matter of business in one of them. When it became hot, he sat down under a tree and put his hand in his saddlebag, from which he took out a piece of bread and a date. He ate and when he had finished with the date he threw away its stone, at which a huge ‘ifrit appeared, with a drawn sword in his hand. This ‘ifrit came up to the merchant and said: ‘Get up so that I can kill you as you killed my son.’ ‘How did I kill your son?’ asked the merchant, and the ‘ifrit told him: ‘When you ate that date and threw away the stone, it struck my son in the chest as he was walking, and he died instantly.’ ‘We belong to God and to Him do we return,’ recited the merchant, adding: ‘There is no might and no power except with God, the Exalted, the Omnipotent. If I killed him, this was by accident, so please forgive me.’ ‘I must kill you,’ insisted the ‘ifri

# 1.2 Cosine Similarity (The second layer)

https://towardsdatascience.com/how-to-rank-text-content-by-semantic-similarity-4d2419a84c32

In [59]:
from re import sub
from gensim.utils import simple_preprocess

#query_string = 'fruit and vegetables'
#documents = ['cars drive on the road', 'tomatoes are actually fruit']

stopwords = []

# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
def preprocess(doc):
    # Tokenize, clean up input document string
    doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
    doc = sub(r'<[^<>]+(>|$)', " ", doc)
    doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
    doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
    return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in stopwords]

# Preprocess the documents, including the query string
corpus = [preprocess(document) for document in candidates1]
#print(corpus)
query = preprocess(query)
#print (query)

In [60]:
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

# Load the model: this is a big file, can take a while to download and open
glove = api.load("glove-wiki-gigaword-50")    
similarity_index = WordEmbeddingSimilarityIndex(glove)

# Build the term dictionary, TF-idf model
dictionary = Dictionary(corpus+[query])
tfidf = TfidfModel(dictionary=dictionary)

# Create the term similarity matrix.  
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

100%|██████████████████████████████████████| 1045/1045 [00:08<00:00, 125.10it/s]


In [61]:
# Compute Soft Cosine Measure between the query and the documents.
# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
query_tf = tfidf[dictionary.doc2bow(query)]

index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in corpus]],
            similarity_matrix)

doc_similarity_scores = index[query_tf] 


# Output the sorted similarity scores and documents
NW_Candiadates1=[]
sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
for idx in sorted_indexes:
    if doc_similarity_scores[idx]>0.3:
        print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {candidates1[idx]}')
        NW_Candiadates1.append(candidates1[idx])

1 	 0.622 	 When it was the third night, Dunyazad asked her sister to finish the story. ‘With pleasure,’ said Shahrazad and went on: ‘I have heard, O fortunate king, that the third old man told the ‘ifrit a more remarkable story than the other two, and that in his astonishment and delight the ‘ifrit granted him the remaining share of the blood debt and allowed the merchant to go free. For his part, the merchant went and thanked the old men, who congratulated him on his safety, after which each of them went home. This, however, is not more surprising than the tale of the fisherman.’ When the king asked what that was, she went on:

I have heard, O fortunate king, that there once was a poor, elderly fisherman with a wife and three children, who was in the habit of casting his net exactly four times each day. He went out to the shore at noon one day, put down his basket, tucked up his shirt, waded into the sea and cast his net. He waited until it had sunk down before pulling its cords toge

In [62]:
NW_Candiadates1[0]

'When it was the third night, Dunyazad asked her sister to finish the story. ‘With pleasure,’ said Shahrazad and went on: ‘I have heard, O fortunate king, that the third old man told the ‘ifrit a more remarkable story than the other two, and that in his astonishment and delight the ‘ifrit granted him the remaining share of the blood debt and allowed the merchant to go free. For his part, the merchant went and thanked the old men, who congratulated him on his safety, after which each of them went home. This, however, is not more surprising than the tale of the fisherman.’ When the king asked what that was, she went on:\n\nI have heard, O fortunate king, that there once was a poor, elderly fisherman with a wife and three children, who was in the habit of casting his net exactly four times each day. He went out to the shore at noon one day, put down his basket, tucked up his shirt, waded into the sea and cast his net. He waited until it had sunk down before pulling its cords together and 

# LSH --> NEW COSINE ---> NW ON PARAGRAPHS

In [63]:
Target_paragraphs = list(filter(lambda x : x != '', NW_Candiadates1[0].split('.')))#https://stackoverflow.com/questions/53240763/python-how-to-separate-paragraphs-from-text
len(Target_paragraphs)

81

In [64]:
import nltk 
#lower Sentences
Target_paragraphs2=[]
for i in Target_paragraphs:
    i = i.lower()
    
    Target_paragraphs2.append(i)

In [65]:
import re
 
# Removing punctuations in string
# Using loop + punctuation string
final_target_paragraphs_list=[]
for ele in Target_paragraphs2:
    x = re.sub(r'[^\w\s]', '', ele)
    x = x.replace('\n'," ")
    final_target_paragraphs_list.append(x)

In [66]:
#https://stackoverflow.com/questions/50685343/how-to-lemmatize-a-list-of-sentences
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

lmtzr = WordNetLemmatizer()
lemmatized = [[lmtzr.lemmatize(word) for word in word_tokenize(s)]
              for s in final_target_paragraphs_list]
#print(lemmatized[2])
#[['i', 'like', 'car'], ['cat', 'are', 'the', 'best']]

In [67]:
Final_Target_Lemmatized=[]
for i in range(len(final_target_paragraphs_list)):
    Final_Target_Lemmatized.append(" ".join(word for word in lemmatized[i]))

In [68]:
query = query_documents[5].lower()
query_paragraps = list(filter(lambda x : x != '', query.split('.')))#https://stackoverflow.com/questions/53240763/python-how-to-separate-paragraphs-from-text
print(len(query_paragraps))
#print(query_paragraps[7])

17


In [69]:
import nltk 
#lower Sentences
preprocess_query_paragraphs=[]
for i in query_paragraps:
    i = i.lower()
    preprocess_query_paragraphs.append(i)

In [70]:
import re
 
# Removing punctuations in string
# Using loop + punctuation string
final_query_paragraphs_list=[]
for ele in preprocess_query_paragraphs:
    x = re.sub(r'[^\w\s]', '', ele)
    x = x.replace('\n'," ")
    final_query_paragraphs_list.append(x)

In [71]:
#https://stackoverflow.com/questions/50685343/how-to-lemmatize-a-list-of-sentences
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

lmtzr = WordNetLemmatizer()
lemmatized = [[lmtzr.lemmatize(word) for word in word_tokenize(s)]
              for s in final_query_paragraphs_list]
#print(lemmatized[2])
#[['i', 'like', 'car'], ['cat', 'are', 'the', 'best']]

In [72]:
Final_Query_Lemmatized=[]
for i in range(len(final_query_paragraphs_list)):
    Final_Query_Lemmatized.append(" ".join(word for word in lemmatized[i]))

In [73]:
len(Final_Query_Lemmatized)

17

# Final Shit /3 1:10 Am

# Divide actual Tokens of Query

In [74]:
Final_Query_Lemmatized
Final_Query=[]
i=0
if len(Final_Query_Lemmatized) % 3==0:
    while(i < len(Final_Query_Lemmatized)):
        #if len(x) < len(test):
        if Final_Query_Lemmatized[i]==Final_Query_Lemmatized[-2]:
            Final_Query.append(Final_Query_Lemmatized[-2]+" "+Final_Query_Lemmatized[-1])
            break
        else:
            Final_Query.append(Final_Query_Lemmatized[i]+" "+Final_Query_Lemmatized[i+1]+" "+Final_Query_Lemmatized[i+2])
        i += 3
else:
    while(i < len(Final_Query_Lemmatized)):
        #if len(x) < len(test):
        if Final_Query_Lemmatized[i]==Final_Query_Lemmatized[-1]:
            Final_Query.append(Final_Query_Lemmatized[i])
            Final_Query[-1]=Final_Query[-2]+" "+Final_Query[-1]
            Final_Query.remove(Final_Query[-2])
            break
        elif Final_Query_Lemmatized[i]==Final_Query_Lemmatized[-2]:
            Final_Query.append(Final_Query_Lemmatized[i]+" "+Final_Query_Lemmatized[i+1])
            #x.remove(x[-2])
            break
        else:
            Final_Query.append(Final_Query_Lemmatized[i]+" "+Final_Query_Lemmatized[i+1]+" "+Final_Query_Lemmatized[i+2])
        i += 3
#print(Final_Query)

In [75]:
len(Final_Query_Lemmatized)

17

In [76]:
len(Final_Query)

6

In [77]:
Final_Target_Lemmatized
Final_Target=[]
i=0
if len(Final_Target_Lemmatized) % 3==0:
    while(i < len(Final_Target_Lemmatized)):
        #if len(x) < len(test):
        if Final_Target_Lemmatized[i]==Final_Target_Lemmatized[-2]:
            Final_Target.append(Final_Target_Lemmatized[-2]+" "+Final_Target_Lemmatized[-1])
            break
        else:
            Final_Target.append(Final_Target_Lemmatized[i]+" "+Final_Target_Lemmatized[i+1]+" "+Final_Target_Lemmatized[i+2])
        i += 3
else:
    while(i < len(Final_Target_Lemmatized)):
        #if len(x) < len(test):
        if Final_Target_Lemmatized[i]==Final_Target_Lemmatized[-1]:
            Final_Target.append(Final_Target_Lemmatized[i])
            Final_Target[-1]=Final_Target[-2]+" "+Final_Target[-1]
            Final_Target.remove(Final_Target[-2])
            break
        elif Final_Target_Lemmatized[i]==Final_Target_Lemmatized[-2]:
            Final_Target.append(Final_Target_Lemmatized[i]+" "+Final_Target_Lemmatized[i+1])
            #x.remove(x[-2])
            break
        else:
            Final_Target.append(Final_Target_Lemmatized[i]+" "+Final_Target_Lemmatized[i+1]+" "+Final_Target_Lemmatized[i+2])
        i += 3
#print(Final_Query)

In [78]:
len(Final_Target)

27

# NW Spot Checking

In [106]:
from minineedle import needle, core


R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
IR=0 #IndexRange

query=Final_Query[IR].split()
if IR==0:
    for i in Final_Target[IR:IR+1]:
    #print("t--------------",i)
        alignment = needle.NeedlemanWunsch(query, i.split())
        x = alignment.get_identity()
        y = alignment 
        R[i]=[x , query, i, y]
    max_value = max(R.values())
    print('Query: ',Final_Query[IR],'\n')
    print('Target Sentence: \n',max_value[2],'\n')
    print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3]) 
else:
    for i in Final_Target[IR-1:IR+1]:
    #print("t--------------",i)
        alignment = needle.NeedlemanWunsch(query, i.split())
        x = alignment.get_identity()
        y = alignment 
        R[i]=[x , query, i, y]
    max_value = max(R.values())
    print('Query: ',Final_Query[IR],'\n')
    print('Target Sentence: \n',max_value[2],'\n')
    print('Perecent Identity using NW:',max_value[0]) 

Query:  when it wa the third night and the king had had his will of the wazirs daughter dunyazad her sister said to her finish for u that tale of thine and she replied with joy and goodly gree it hath reached me o auspicious king that when the third old man told a tale to the jinni more wondrous than the two preceding the jinni marvelled with exceeding marvel and shaking with delight cried lo i have given thee the remainder of the merchant punishment and for thy sake have i released him thereupon the merchant embraced the old men and thanked them and these shaykhs wished him joy on being saved and fared forth each one for his own city yet this tale is not more wondrous than the fisherman story 

Target Sentence: 
 when it wa the third night dunyazad asked her sister to finish the story with pleasure said shahrazad and went on i have heard o fortunate king that the third old man told the ifrit a more remarkable story than the other two and that in his astonishment and delight the ifrit 

Query:  anon he plungeth in despite the buffet of the wave the while to sight the bellying net his eager glance strain till joying at the night success a fish he bringeth home whose gullet by the hook of fate wa caught and cut in twain 

Target Sentence: 
 the fisherman rise to earn his keep there is the sea with star woven in the sky 

Perecent Identity using NW: 11.11


In [74]:
test=['Ibrainm','Hasan','Ali','Alyami','test1','test2','test3','test4','test5','test6','test7','test8','test9','test10','test11','test12']
x=[]
i=0
if len(test) % 3==0:
    while(i < len(test)):
        #if len(x) < len(test):
        if test[i]==test[-2]:
            x.append(test[-2]+" "+test[-1])
            break
        else:
            x.append(test[i]+" "+test[i+1]+" "+test[i+2])
        i += 3
else:
    while(i < len(test)):
        #if len(x) < len(test):
        if test[i]==test[-1]:
            x.append(test[i])
            x[-1]=x[-2]+" "+x[-1]
            x.remove(x[-2])
            break
        elif test[i]==test[-2]:
            x.append(test[i]+" "+test[i+1])
            #x.remove(x[-2])
            break
        else:
            x.append(test[i]+" "+test[i+1]+" "+test[i+2])
        i += 3
print(x)

['Ibrainm Hasan Ali', 'Alyami test1 test2', 'test3 test4 test5', 'test6 test7 test8', 'test9 test10 test11 test12']


In [52]:
len(test)


14

In [291]:
test=['Ibrainm','Hasan','Ali','Alyami','test1','test2','test3','test4','test5','test6']
x=[]
i=0
if len(test) % 2==0:
    while(i < len(test)):
        #if len(x) < len(test):
        if test[i]==test[-2]:
            x.append(test[-2]+" "+test[-1])
            break
        else:
            x.append(test[i]+" "+test[i+1]+" "+test[i+2])
        i += 3
else:
    while(i < len(test)):
        #if len(x) < len(test):
        if test[i]==test[-1]:
            x.append(test[i])
            x[-1]=x[-2]+" "+x[-1]
            x.remove(x[-2])
            break
        else:
            x.append(test[i]+" "+test[i+1]+" "+test[i+2])
        i += 3

IndexError: list index out of range

In [290]:
x

['Ibrainm Hasan Ali', 'Alyami test1 test2', 'test3 test4 test5']

# Divide actual Tokens of Target to 4 sentences per token

In [89]:
test=['Ibrainm','Hasan','Ali','Alyami','test1','test2','test3','test4','test5']
x=[]
i=0
if len(test) % 2==0:
    while(i < len(test)):
        #if len(x) < len(test):
        if test[i]==test[-2]:
            x.append(test[-2]+" "+test[-1])
            break
        else:
            x.append(test[i]+" "+test[i+1]+" "+test[i+2])
        i += 3
else:
    while(i < len(test)):
        #if len(x) < len(test):
        if test[i]==test[-1]:
            x.append(test[i])
            x[-1]=x[-2]+" "+x[-1]
            x.remove(x[-2])
            break
        else:
            x.append(test[i]+" "+test[i+1]+" "+test[i+2])
        i += 3
x

['Ibrainm Hasan Ali', 'Alyami test1 test2', 'test3 test4 test5']

101

In [84]:
Final_Target_Lemmatized
Final_Target=[]

i=0
if len(Final_Target_Lemmatized) % 2==0:
    while(i < len(Final_Target_Lemmatized)):
        #if len(x) < len(test):
        if Final_Target_Lemmatized[i]==Final_Target_Lemmatized[-2]:
            Final_Target.append(Final_Target_Lemmatized[-2]+" "+Final_Target_Lemmatized[-1])
            break
        else:
            Final_Target.append(Final_Target_Lemmatized[i]+" "+Final_Target_Lemmatized[i+1]+" "+Final_Target_Lemmatized[i+2])
        i += 3
else:
    while(i < len(Final_Target_Lemmatized)):
        #if len(x) < len(test):
        if Final_Target_Lemmatized[i]==Final_Target_Lemmatized[-1]:
            Final_Target.append(Final_Target_Lemmatized[i])
            Final_Target[-1]=Final_Target[-2]+" "+Final_Target[-1]
            Final_Target.remove(Final_Target[-2])
            break
        else:
            Final_Target.append(Final_Target_Lemmatized[i]+" "+Final_Target_Lemmatized[i+1]+" "+Final_Target_Lemmatized[i+2])
        i += 3
x

IndexError: list index out of range

In [80]:
Final_Target=[]
i=0
if len(Final_Target_Lemmatized) % 2==0:
    while(i < len(Final_Target_Lemmatized)):
        #if len(x) < len(test):
        if Final_Target_Lemmatized[i]==Final_Target_Lemmatized[-2]:
            Final_Target.append(Final_Target_Lemmatized[-2]+" "+Final_Target_Lemmatized[-1])
            break
        else:
            Final_Target.append(Final_Target_Lemmatized[i]+" "+Final_Target_Lemmatized[i+1]+" "+Final_Target_Lemmatized[i+2])
        i += 3
else:
    while(i < len(Final_Target_Lemmatized)):
        #if len(x) < len(test):
        if Final_Target_Lemmatized[i]==Final_Target_Lemmatized[-1]:
            Final_Target.append(Final_Target_Lemmatized[i])
            Final_Target[-1]=Final_Target[-2]+" "+Final_Target[-1]
            Final_Target.remove(Final_Target[-2])
            break
        else:
            Final_Target.append(Final_Target_Lemmatized[i]+" "+Final_Target_Lemmatized[i+1]+" "+Final_Target_Lemmatized[i+2])
        i += 3

IndexError: list index out of range

In [66]:
len(Final_Target_Lemmatized)

101

In [None]:
Final_Target[5]

In [31]:
from minineedle import needle, core


R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
IR=0 #IndexRange
query=Final_Query[IR].split()
if IR==0:
    for i in Final_Target[IR:IR+1]:
    #print("t--------------",i)
        alignment = needle.NeedlemanWunsch(query, i.split())
        x = alignment.get_identity()
        y = alignment 
        R[i]=[x , query, i, y]
    max_value = max(R.values())
    print('Query: ',Final_Query[IR],'\n')
    print('Target Sentence: \n',max_value[2],'\n')
    print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3]) 
else:
    for i in Final_Target[IR-1:IR+1]:
    #print("t--------------",i)
        alignment = needle.NeedlemanWunsch(query, i.split())
        x = alignment.get_identity()
        y = alignment 
        R[i]=[x , query, i, y]
    max_value = max(R.values())
    print('Query: ',Final_Query[IR],'\n')
    print('Target Sentence: \n',max_value[2],'\n')
    print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3]) 

NameError: name 'Final_Query' is not defined

# Sentence by sentece

In [472]:
from minineedle import needle, core


R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
IR=11 #IndexRange
query=Final_Query_Lemmatized[IR].split()
if IR==0:
    for i in Final_Target_Lemmatized[IR:IR+3]:
    #print("t--------------",i)
        alignment = needle.NeedlemanWunsch(query, i.split())
        x = alignment.get_identity()
        y = alignment 
        R[i]=[x , query, i, y]
    max_value = max(R.values())
    print('Query: ',Final_Query_Lemmatized[IR],'\n')
    print('Target Sentence: \n',max_value[2],'\n')
    print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3]) 
else:
    for i in Final_Target_Lemmatized[IR-1:IR+2]:
    #print("t--------------",i)
        alignment = needle.NeedlemanWunsch(query, i.split())
        x = alignment.get_identity()
        y = alignment 
        R[i]=[x , query, i, y]
    max_value = max(R.values())
    print('Query: ',Final_Query_Lemmatized[IR],'\n')
    print('Target Sentence: \n',max_value[2],'\n')
    print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3]) 

Query:  quoth i to the herdsman daughter is this true that thou sayest of this calf quoth she yea o my master he is thy son the very core of thy heart 

Target Sentence: 
 when i got to his house his daughter welcomed me kissing my hand while the calf came and rolled on the ground in front of me 

Perecent Identity using NW: 17.65


In [462]:
Final_Target_Lemmatized[]

'the next day a i wa sitting there he came back to me and said i have something to tell you that will please you and you owe me a reward for my good news'

In [261]:
Final_Target[1:limit]

['the ifrit wa listening with astonishment to what the old man with the gazelle wa saying and the man went on lord of the king of the jinni while all this wa going on my wife now this gazelle wa looking on and telling me to kill the calf because it wa fat but i could not bring myself to do this and so i told the herdsman to take it away which he did the next day a i wa sitting there he came back to me and said i have something to tell you that will please you and you owe me a reward for my good news',
 'i agreed to this and he went on master i have a daughter who a a young girl wa taught magic by an old woman we had staying with u yesterday when you gave me the calf i went to the girl and when she saw it she covered her face shed tear but then burst into laughter']

In [85]:
test=['Ibrainm','Hasan','Ali']
x=[]
i=0
if len(test) % 2==0:
    while(i < len(test)):
        if test[i]==test[-1]:
            x.append(test[i])
            x[-1]=x[-2]+" "+x[-1]
            x.remove(x[-2])
            break
        else:
            x.append(test[i]+" "+test[i+1]+" "+test[i+2])
        i += 3
else:
    while(i <= len(test)):
        #if len(x) < len(test):
        if test[i]==test[-1]:
            x.append(test[i])
            x[-1]=x[-2]+" "+x[-1]
            x.remove(x[-2])
            break
        else:
            x.append(test[i]+" "+test[i+1]+" "+test[i+2])
        i += 3

IndexError: list index out of range

In [233]:
x

['Ibrainm Hasan Ali Alyami', 'test1 test2 test3 test4 test5']

In [55]:
nltk.sent_tokenize(test[0])

['This is me.']

# 2 senteces in one token and recheck

In [531]:
Final_Query_Lemmatized
Final_Query=[]
i=0
if len(Final_Query_Lemmatized) % 2==0:
    while(i < len(Final_Query_Lemmatized)):
        #if len(x) < len(test):
        if Final_Query_Lemmatized[i]==Final_Query_Lemmatized[-2]:
            Final_Query.append(Final_Query_Lemmatized[-2]+" "+Final_Query_Lemmatized[-1])
            break
        else:
            Final_Query.append(Final_Query_Lemmatized[i]+" "+Final_Query_Lemmatized[i+1])
        i += 2
else:
    while(i < len(Final_Query_Lemmatized)):
        #if len(x) < len(test):
        if Final_Query_Lemmatized[i]==Final_Query_Lemmatized[-1]:
            Final_Query.append(Final_Query_Lemmatized[i])
            Final_Query[-1]=Final_Query[-2]+" "+Final_Query[-1]
            Final_Query.remove(Final_Query[-2])
            break
        else:
            Final_Query.append(Final_Query_Lemmatized[i]+" "+Final_Query_Lemmatized[i+1])
        i += 2

In [544]:
Final_Query[0]

'when it wa the second night said dunyazad to her sister shahrazad o my sister finish for u that story of the merchant and the jinni and she answered with joy and goodly gree if the king permit me then quoth the king tell thy tale and shahrazad began in these word it hath reached me o auspicious king and heaven directed ruler that when the merchant purposed the sacrifice of the calf but saw it weeping his heart relented and he said to the herdsman keep the calf among my cattle'

In [534]:
len(Final_Query_Lemmatized)

21

In [535]:
Final_Target_Lemmatized
Final_Target=[]

i=0
if len(Final_Target_Lemmatized) % 2==0:
    while(i < len(Final_Target_Lemmatized)):
        #if len(x) < len(test):
        if Final_Target_Lemmatized[i]==Final_Target_Lemmatized[-2]:
            Final_Target.append(Final_Target_Lemmatized[-2]+" "+Final_Target_Lemmatized[-1])
            break
        else:
            Final_Query.append(Final_Target_Lemmatized[i]+" "+Final_Target_Lemmatized[i+1])
        i += 2
else:
    while(i < len(Final_Target_Lemmatized)):
        #if len(x) < len(test):
        if Final_Target_Lemmatized[i]==Final_Target_Lemmatized[-1]:
            Final_Target.append(Final_Target_Lemmatized[i])
            Final_Target[-1]=Final_Target[-2]+" "+Final_Target[-1]
            Final_Target.remove(Final_Target[-2])
            break
        else:
            Final_Target.append(Final_Target_Lemmatized[i]+" "+Final_Target_Lemmatized[i+1])
        i += 2


In [538]:
len(Final_Target)

50

In [541]:
Final_Target[]

'what a good pleasant delightful and sweet story this is exclaimed dunyazad at which shahrazad told her how can this compare with what i shall tell you this coming night if i am still alive and the king spare me by god the king said to himself i am not going to kill her until i hear the rest of this remarkable story and so they spent the rest of the time embracing one another until the sun had fully risen the king then went to his court the troop arrived together with the vizier and when everyone wa there he gave his judgement appointing some official dismissing others and issuing order and prohibition until evening the court wa then dismissed and the king returned to his palace where when night came he lay again with shahrazad'

In [589]:
from minineedle import needle, core


R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
IR=8 #IndexRange
query=Final_Query[IR].split()
if IR==0:
    for i in Final_Target[IR:IR+2]:
    #print("t--------------",i)
        alignment = needle.NeedlemanWunsch(query, i.split())
        x = alignment.get_identity()
        y = alignment 
        R[i]=[x , query, i, y]
    max_value = max(R.values())
    print('Query: ',Final_Query[IR],'\n')
    print('Target Sentence: \n',max_value[2],'\n')
    print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3]) 
else:
    for i in Final_Target[IR-5:IR+5]:
    #print("t--------------",i)
        alignment = needle.NeedlemanWunsch(query, i.split())
        x = alignment.get_identity()
        y = alignment 
        R[i]=[x , query, i, y]
    max_value = max(R.values())
    print('Query: ',Final_Query[IR],'\n')
    print('Target Sentence: \n',max_value[2],'\n')
    print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3]) 

Query:  then i fell on his neck and said allah upon thee tell me all that the daughter of my uncle did by thee and by thy mother and when he told me what had come to pas between them i said o my son allah favoured thee with one to restore thee and thy right hath returned to thee 

Target Sentence: 
 i wa astonished by this and a soon a i found that it wa morning i came to tell you when i heard what the man had to say i went out with him drunk although not on wine with the joy and delight that i wa feeling 

Perecent Identity using NW: 14.75


In [599]:
Final_Query[8]

'then i fell on his neck and said allah upon thee tell me all that the daughter of my uncle did by thee and by thy mother and when he told me what had come to pas between them i said o my son allah favoured thee with one to restore thee and thy right hath returned to thee'

In [600]:
Final_Target[8]

'at that she took a bowl filled it with water and recited a spell over it after which she sprinkled the water over the calf saying if you are a calf and this is how almighty god created you stay in this shape and dont change but if you are under a spell then return to your original shape with the permission of almighty god the calf shuddered and became a man at which i fell on him and said for god sake tell me what my wife did to you and your mother'

In [None]:
alignment = needle.NeedlemanWunsch(query, i.split())
x = alignment.get_identity()
y = alignment 

# LSH

In [1773]:
db = pd.DataFrame(Final_Target_Lemmatized)#We have 67 Sentences
db=db[0]
print(db)
forest = get_forest(db, permutations)

0      when it wa the second night dunyazad said to s...
1      with pleasure replied shahrazad if the king gi...
2      the ifrit wa listening with astonishment to wh...
3      the next day a i wa sitting there he came back...
4      i agreed to this and he went on master i have ...
                             ...                        
96     when the old man had finished his tale the ifr...
97     morning now dawned and shahrazad broke off fro...
98     what a good pleasant delightful and sweet stor...
99     the king then went to his court the troop arri...
100    the court wa then dismissed and the king retur...
Name: 0, Length: 101, dtype: object


In [1875]:
num_recommendations = 100
query = Final_Query_Lemmatized[20]
print('query:  ', query)
result = predict(query, db, permutations, num_recommendations, forest)
#print('\n Top similar sentences \n', result)

query:   when she deceased my son fared forth to the city of hind even to the city of this man who hath done to thee what hath been donefn51 and i also took this gazelle my cousin and wandered with her from town to town seeking tidings of my


In [1872]:
#Create a list of candiates to be taken to the next layer which is Cosine Simialrity
candidates=[]
candidates=result.values
#print(candidates)

In [1873]:
len(candidates)

53

In [1882]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

model = SentenceTransformer('stsb-roberta-large')
query = Final_Query_Lemmatized[0]
# encode corpus to get corpus embeddings
candidates_embeddings = model.encode(Final_Target_Lemmatized, convert_to_tensor=True)
# encode sentence to get sentence embeddings
sentence_embedding = model.encode(query, convert_to_tensor=True)
# top_k results to return
top_k=5
# compute similarity scores of the sentence with the corpus
cos_scores = util.pytorch_cos_sim(sentence_embedding, candidates_embeddings)[0]
# Sort the results in decreasing order and get the first top_k
top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]
print("Sentence:", query, "\n")
print("Top", top_k, "most similar sentences in corpus:")
NW2_Candidates=[]
for idx in range(len(top_results)):
    #print(corpus[idx], "(Score: %.4f)" % (cos_scores[idx]))
    NW2_Candidates.append(corpus[idx])

Sentence: when she deceased my son fared forth to the city of hind even to the city of this man who hath done to thee what hath been donefn51 and i also took this gazelle my cousin and wandered with her from town to town seeking tidings of my 

Top 5 most similar sentences in corpus:


In [1883]:
NW3=[]
for i in NW2_Candidates:
    NW3.append(" ".join(i))

In [1885]:
NW3

['when it was the fifth night dunyazad asked her sister to finish the story if she was not too sleepy and shahrazad said i have heard o fortunate king that king yunan accused his vizier of being jealous of duban and wanting to have him killed then after that i would regret it yunan added as king sindbad regretted killing his falcon excuse me your majesty said the vizier but how was that yunan went on you must know that there was a persian king with a passion for enjoyment and amusement who had a fondness for hunting he had reared a falcon which was his constant companion by night and by day and which would spend the night perched on his wrist he would take it hunting with him and he had a golden bowl made for it which he hung round its neck and from which it could drink one day the chief falconer came to where he was sitting and told him that it was time to go out hunting the king gave the orders and went off with the falcon on his wrist until he and his party reached a wadi where they

In [1993]:
Final_Query_Lemmatized[21]

IndexError: list index out of range

In [1934]:
len(Final_Target_Lemmatized)

101

In [2146]:
test=['This','is','my','car']
y=[]
for i in range(len(test)):
    print(i)
    y.append(test[i]+test[i+1])
    i+=2
    if len(y) == (len(test)):
        print("True")
        break

0
1
2
3


IndexError: list index out of range

In [2147]:
y

['ab', 'bc', 'cd']

In [2127]:
test=['a','b','c','d']
y=[]
for i in range(len(test)):
    print(i)
    y.append(test[i]+test[i+1])
    i+=2
    if len(y) == (len(test)-3):
        print("True")
        break

0
True


In [2125]:
y

['ab']

In [30]:
#Combine Final_Query_Lemmatized using the previous approach
Final_Query=[]
for i in range(len(Final_Query_Lemmatized)):
    #print(i)
    Final_Query.append(Final_Query_Lemmatized[i]+" "+Final_Query_Lemmatized[i+1])
    i+=2
    if len(Final_Query) == (len(Final_Query_Lemmatized)-2):
        #print("True")
        break

In [50]:
Final_Query.append(Final_Query_Lemmatized[20])

'then o jinni i married the herdsman daughter to him and she transformed my wife into this gazelle sayingher shape is a comely and by no mean loathsome after this she abode with u night and day day and night till the almighty took her to himself'

'when she deceased my son fared forth to the city of hind even to the city of this man who hath done to thee what hath been donefn51 and i also took this gazelle my cousin and wandered with her from town to town seeking tidings of my'

In [39]:
#Combine Final_Query_Lemmatized using the previous approach
Final_Target=[]
for i in range(len(Final_Target_Lemmatized)):
    #print(i)
    Final_Target.append(Final_Target_Lemmatized[i]+" "+Final_Target_Lemmatized[i+1])
    i+=1
    if len(Final_Target) == (len(Final_Target_Lemmatized)-1):
        #print("True")
        break

In [56]:
Final_Target.append(Final_Target_Lemmatized[100])

In [30]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

model = SentenceTransformer('stsb-roberta-large')
query = Final_Query[9]
# encode corpus to get corpus embeddings
candidates_embeddings = model.encode(Final_Target, convert_to_tensor=True)
# encode sentence to get sentence embeddings
sentence_embedding = model.encode(query, convert_to_tensor=True)
# top_k results to return
top_k=30
# compute similarity scores of the sentence with the corpus
cos_scores = util.pytorch_cos_sim(sentence_embedding, candidates_embeddings)[0]
# Sort the results in decreasing order and get the first top_k
top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]
print("Sentence:", query, "\n")
print("Top", top_k, "most similar sentences in corpus:")
NW_Candiadates1=[]
for idx in range(len(top_results)):
    #print(corpus[idx], "(Score: %.4f)" % (cos_scores[idx]))
    NW_Candiadates1.append(corpus[idx])

NameError: name 'Final_Query' is not defined

In [164]:
Final_NW=[]
for i in NW_Candiadates1:
    Final_NW.append(" ".join(i))

In [165]:
from minineedle import needle, core


R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
query=Final_Query[19].split()
for i in Final_NW:
    #print("t--------------",i)
    alignment = needle.NeedlemanWunsch(query, i.split())
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query: ',Final_Query[19],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3]) 

Query:  when she deceased my son fared forth to the city of hind even to the city of this man who hath done to thee what hath been donefn51 and i also took this gazelle my cousin and wandered with her from town to town seeking tidings of my 

Target Sentence: 
 the girl stayed with u for some time until god chose to take her to himself and my son went off to india the country of the man with whom you have had this experience i myself took my wife this gazelle and have travelled from place to place looking for news of him until fate brought me here and i saw this merchant sitting weeping 

Perecent Identity using NW: 19.31


In [None]:
Correct: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19

In [232]:
test=['a','b','c','d','e']
list1=[]
list2=[]
for i in range(len(test)):
    if (i % 2) == 0:    
        list1.append(test[i])
    else:
        list2.append(test[i]) 

In [233]:
list1

['a', 'c', 'e']

In [234]:
list2

['b', 'd']

In [237]:
import itertools


somelists = [list1] + [list2]
for element in itertools.product(*somelists):
    print(element) 

('a', 'b')
('a', 'd')
('c', 'b')
('c', 'd')
('e', 'b')
('e', 'd')


In [239]:
somelists[0]

['a', 'c', 'e']

In [220]:
prefinal=list(zip(list1, list2))

In [221]:
prefinal

[('a', 'b'), ('c', 'd')]

In [197]:
X

[['a', 'b'], ['c', 'd']]

In [2037]:
len(Final_Target_Lemmatized)

26

In [1920]:
from minineedle import needle, core


R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
query=Final_Query_Lemmatized[14].split()
for i in Final_Target_Lemmatized:
    #print("t--------------",i)
    alignment = needle.NeedlemanWunsch(query, i.split())
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query: ',Final_Query_Lemmatized[14],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3]) 

Query:  now when i heard o jinni these the word of the herdsman daughter i replied beside what thou askest all the cattle and the house hold stuff in thy father charge are thine and a for the daughter of my uncle her blood is lawful to thee 

Target Sentence: 
 when i heard what she had to say i promised to give her what she wanted a well a everything that wa in her father charge adding that i would even give her permission to kill my wife 

Perecent Identity using NW: 17.02


In [1581]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

model = SentenceTransformer('stsb-roberta-large')
query = Final_Query_Lemmatized[0]
# encode corpus to get corpus embeddings
candidates_embeddings = model.encode(candidates, convert_to_tensor=True)
# encode sentence to get sentence embeddings
sentence_embedding = model.encode(query, convert_to_tensor=True)
# top_k results to return
top_k=len(candidates)
# compute similarity scores of the sentence with the corpus
cos_scores = util.pytorch_cos_sim(sentence_embedding, candidates_embeddings)[0]
# Sort the results in decreasing order and get the first top_k
top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]
print("Sentence:", query, "\n")
print("Top", top_k, "most similar sentences in corpus:")
NW2_Candidates=[]
for idx in range(len(top_results)):
    print(corpus[idx], "(Score: %.4f)" % (cos_scores[idx]))
    NW2_Candidates.append(corpus[idx])

Sentence: when it wa the second night said dunyazad to her sister shahrazad o my sister finish for u that story of the merchant and the jinni and she answered with joy and goodly gree if the king permit me then quoth the king tell thy tale and shahrazad began in these word it hath reached me o auspicious king and heaven directed ruler that when the merchant purposed the sacrifice of the calf but saw it weeping his heart relented and he said to the herdsman keep the calf among my cattle all this the old shaykh told the jinni who marvelled much at these strange word 

Top 11 most similar sentences in corpus:
['the', 'ifrit', 'was', 'listening', 'with', 'astonishment', 'to', 'what', 'the', 'old', 'man', 'with', 'the', 'gazelle', 'was', 'saying', 'and', 'the', 'man', 'went', 'on', 'lord', 'of', 'the', 'kings', 'of', 'the', 'jinn', 'while', 'all', 'this', 'was', 'going', 'on', 'my', 'wife', 'now', 'this', 'gazelle', 'was', 'looking', 'on', 'and', 'telling', 'me', 'to', 'kill', 'the', 'calf'

In [1582]:
NW3=[]
for i in NW2_Candidates:
    NW3.append(" ".join(i))

In [1583]:
from minineedle import needle, core


R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
query=Final_Query_Lemmatized[0].split()
for i in NW3:
    alignment = needle.NeedlemanWunsch(query, i.split())
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query: ',Final_Query_Lemmatized[0],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3])

Query:  when it wa the second night said dunyazad to her sister shahrazad o my sister finish for u that story of the merchant and the jinni and she answered with joy and goodly gree if the king permit me then quoth the king tell thy tale and shahrazad began in these word it hath reached me o auspicious king and heaven directed ruler that when the merchant purposed the sacrifice of the calf but saw it weeping his heart relented and he said to the herdsman keep the calf among my cattle all this the old shaykh told the jinni who marvelled much at these strange word 

Target Sentence: 
 the ifrit was listening with astonishment to what the old man with the gazelle was saying and the man went on lord of the kings of the jinn while all this was going on my wife now this gazelle was looking on and telling me to kill the calf because it was fat but i could not bring myself to do this and so i told the herdsman to take it away which he did 

Perecent Identity using NW: 14.32


In [1435]:
Final_Target_Lemmatized[1]

'the ifrit wa listening with astonishment to what the old man with the gazelle wa saying and the man went on lord of the king of the jinni while all this wa going on my wife now this gazelle wa looking on and telling me to kill the calf because it wa fat but i could not bring myself to do this and so i told the herdsman to take it away which he did the next day a i wa sitting there he came back to me and said i have something to tell you that will please you and you owe me a reward for my good news i agreed to this and he went on master i have a daughter who a a young girl wa taught magic by an old woman we had staying with u yesterday when you gave me the calf i went to the girl and when she saw it she covered her face shed tear but then burst into laughter then she said father do you hold me so cheap that you bring strange men in to me where are these strange men i asked and why are you laughing and cry she said this calf you have with you is our master son who is under a spell laid 

In [1425]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

model = SentenceTransformer('stsb-roberta-large')
query = Final_Query_Lemmatized[0]
# encode corpus to get corpus embeddings
candidates_embeddings = model.encode(candidates, convert_to_tensor=True)
# encode sentence to get sentence embeddings
sentence_embedding = model.encode(query, convert_to_tensor=True)
# top_k results to return
top_k=len(candidates)
# compute similarity scores of the sentence with the corpus
cos_scores = util.pytorch_cos_sim(sentence_embedding, candidates_embeddings)[0]
# Sort the results in decreasing order and get the first top_k
top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]
print("Sentence:", query, "\n")
print("Top", top_k, "most similar sentences in corpus:")
NW2_Candidates=[]
for idx in range(len(top_results)):
    print(corpus[idx], "(Score: %.4f)" % (cos_scores[idx]))
    NW2_Candidates.append(corpus[idx])

Sentence: when it wa the second night said dunyazad to her sister shahrazad o my sister finish for u that story of the merchant and the jinni and she answered with joy and goodly gree if the king permit me then quoth the king tell thy tale and shahrazad began in these word it hath reached me o auspicious king and heaven directed ruler that when the merchant purposed the sacrifice of the calf but saw it weeping his heart relented and he said to the herdsman keep the calf among my cattle all this the old shaykh told the jinni who marvelled much at these strange word 

Top 9 most similar sentences in corpus:
['the', 'ifrit', 'was', 'listening', 'with', 'astonishment', 'to', 'what', 'the', 'old', 'man', 'with', 'the', 'gazelle', 'was', 'saying', 'and', 'the', 'man', 'went', 'on', 'lord', 'of', 'the', 'kings', 'of', 'the', 'jinn', 'while', 'all', 'this', 'was', 'going', 'on', 'my', 'wife', 'now', 'this', 'gazelle', 'was', 'looking', 'on', 'and', 'telling', 'me', 'to', 'kill', 'the', 'calf',

In [1426]:
NW3=[]
for i in NW2_Candidates:
    NW3.append(" ".join(i))

In [1427]:
from minineedle import needle, core


R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
query=Final_Query_Lemmatized[0].split()
for i in NW3:
    alignment = needle.NeedlemanWunsch(query, i.split())
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query: ',Final_Query_Lemmatized[0],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3])

Query:  when it wa the second night said dunyazad to her sister shahrazad o my sister finish for u that story of the merchant and the jinni and she answered with joy and goodly gree if the king permit me then quoth the king tell thy tale and shahrazad began in these word it hath reached me o auspicious king and heaven directed ruler that when the merchant purposed the sacrifice of the calf but saw it weeping his heart relented and he said to the herdsman keep the calf among my cattle all this the old shaykh told the jinni who marvelled much at these strange word 

Target Sentence: 
 the ifrit was listening with astonishment to what the old man with the gazelle was saying and the man went on lord of the kings of the jinn while all this was going on my wife now this gazelle was looking on and telling me to kill the calf because it was fat but i could not bring myself to do this and so i told the herdsman to take it away which he did 

Perecent Identity using NW: 14.32


In [784]:
x = NW_Candiadates1[0].split(".")

len(x)

102

In [785]:
z=query_documents[5].split(".")

In [857]:
x[2]

'\n\nThe ‘ifrit was listening with astonishment to what the old man with the gazelle was saying, AND THE MAN WENT ON:\n\nLord of the kings of the jinn, while all this was going on, my wife, now this gazelle, was looking on and telling me to kill the calf, because it was fat, but I could not bring myself to do this and so I told the herdsman to take it away, which he did'

In [858]:
z[2]

'" All this the old Shaykh told the Jinni\nwho marvelled much at these strange words'

In [875]:
from minineedle import needle, core

alignment = needle.NeedlemanWunsch(z[2].split(), x[2].split())
print(alignment.get_identity())
print(alignment)

5.33
Alignment of Query and Target Sentence is:
	-----------------------------------"Allthis---------the---------------oldShaykhtoldtheJinniwhomarvelledmuchatthesestrangewords
	The‘ifritwaslisteningwithastonishmenttowhattheoldmanwiththegazellewassaying,ANDTHEMANWENTON:Lordofthekingsofthejinn,whileallthiswasgoingon,mywife,nowthisgazelle,waslookingonandtellingmetokillthecalf,becauseitwasfat,butIcouldnotbringmyselftodothisandsoItoldtheherdsmantotakeitaway,whichhedid



In [754]:
from minineedle import needle, core


R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
for i in range(len(x)):
    alignment = needle.NeedlemanWunsch(kk, x[i])
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , y, i, y]
max_value = max(R.values())
print('Query: ',z[0],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3])

TypeError: 'float' object is not subscriptable

# Split the candidate to set of pages

In [539]:
Splitted = NW_Candiadates1[0].split()
# if you give no arguments, it will separate by whitespaces by default
# ["A", "B", "C", "D"]

Splitted = NW_Candiadates1[0].split()
# you can specify the maximum amount of elements the split() function will output
# ["E", "F", "G"]

'When'

In [574]:
x[2]

[]

# Token Paragraphs of candiadates

In [909]:
Target_sentences = list(filter(lambda x : x != '', NW_Candiadates1[0].split('.')))#https://stackoverflow.com/questions/53240763/python-how-to-separate-paragraphs-from-text
len(Target_sentences)

101

In [910]:
Target_sentences[2]

'\n\nThe ‘ifrit was listening with astonishment to what the old man with the gazelle was saying, AND THE MAN WENT ON:\n\nLord of the kings of the jinn, while all this was going on, my wife, now this gazelle, was looking on and telling me to kill the calf, because it was fat, but I could not bring myself to do this and so I told the herdsman to take it away, which he did'

# Preprocess Paragraphs

In [976]:
import nltk 
#lower Sentences
Target_sentences2=[]
for i in Target_sentences:
    i = i.lower()
    
    Target_sentences2.append(i)
    

In [977]:
import re
 
# Removing punctuations in string
# Using loop + punctuation string
final_target_paragraphs_list=[]
for ele in Target_sentences2:
    x = re.sub(r'[^\w\s]', '', ele)
    x = x.replace('\n'," ")
    final_target_paragraphs_list.append(x)

In [978]:
len(final_target_paragraphs_list)

101

In [981]:
#https://stackoverflow.com/questions/50685343/how-to-lemmatize-a-list-of-sentences
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

lmtzr = WordNetLemmatizer()
lemmatized = [[lmtzr.lemmatize(word) for word in word_tokenize(s)]
              for s in final_target_paragraphs_list]
#print(lemmatized[2])
#[['i', 'like', 'car'], ['cat', 'are', 'the', 'best']]

In [982]:
Final_Target_Lemmatized=[]
for i in range(len(final_target_paragraphs_list)):
    Final_Target_Lemmatized.append(" ".join(word for word in lemmatized[i]))

In [983]:
len(Final_Target_Lemmatized)

101

In [984]:
Final_Target_Lemmatized[13]

'girl i told her if you free him you can have all the beast and everything else that your father look after'

# Token Query text into paragraphs

In [915]:
query = query_documents[5].lower()
query_paragraps = list(filter(lambda x : x != '', query.split('.')))#https://stackoverflow.com/questions/53240763/python-how-to-separate-paragraphs-from-text
print(len(query_paragraps))
#print(query_paragraps[7])

21


# Preprocess Paragraphs of Query

In [916]:
import nltk 
#lower Sentences
preprocess_query_paragraphs=[]
for i in query_paragraps:
    i = i.lower()
    preprocess_query_paragraphs.append(i)
    

In [917]:
import re
 
# Removing punctuations in string
# Using loop + punctuation string
final_query_paragraphs_list=[]
for ele in preprocess_query_paragraphs:
    x = re.sub(r'[^\w\s]', '', ele)
    x = x.replace('\n'," ")
    final_query_paragraphs_list.append(x)

In [918]:
final_query_paragraphs_list[0]

'when it was the second night said dunyazad to her sister shahrazad o my sister finish for us that story of the merchant and the jinni and she answered with joy and goodly gree if the king permit me'

# Lemmatize a list of Query sentences

In [972]:
#https://stackoverflow.com/questions/50685343/how-to-lemmatize-a-list-of-sentences
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

lmtzr = WordNetLemmatizer()
lemmatized = [[lmtzr.lemmatize(word) for word in word_tokenize(s)]
              for s in final_query_paragraphs_list]
#print(lemmatized[2])
#[['i', 'like', 'car'], ['cat', 'are', 'the', 'best']]

In [973]:
Final_Query_Lemmatized=[]
for i in range(len(final_query_paragraphs_list)):
    Final_Query_Lemmatized.append(" ".join(word for word in lemmatized[i]))

In [974]:
Final_Target_Lemmatized

['when it wa the second night said dunyazad to her sister shahrazad o my sister finish for u that story of the merchant and the jinni and she answered with joy and goodly gree if the king permit me',
 'then quoth the king tell thy tale and shahrazad began in these word it hath reached me o auspicious king and heaven directed ruler that when the merchant purposed the sacrifice of the calf but saw it weeping his heart relented and he said to the herdsman keep the calf among my cattle',
 'all this the old shaykh told the jinni who marvelled much at these strange word',
 'then the owner of the gazelle continuedo lord of the king of the jann this much took place and my uncle daughter this gazelle looked on and saw it and said butcher me this calf for surely it is a fat one but i bade the herdsman take it away and he took it and turned his face homewards',
 'on the next day a i wa sitting in my own house lo the herdsman came and standing before me said o my master i will tell thee a thing wh

In [975]:
Final_Query_Lemmatized

['when it wa the second night said dunyazad to her sister shahrazad o my sister finish for u that story of the merchant and the jinni and she answered with joy and goodly gree if the king permit me',
 'then quoth the king tell thy tale and shahrazad began in these word it hath reached me o auspicious king and heaven directed ruler that when the merchant purposed the sacrifice of the calf but saw it weeping his heart relented and he said to the herdsman keep the calf among my cattle',
 'all this the old shaykh told the jinni who marvelled much at these strange word',
 'then the owner of the gazelle continuedo lord of the king of the jann this much took place and my uncle daughter this gazelle looked on and saw it and said butcher me this calf for surely it is a fat one but i bade the herdsman take it away and he took it and turned his face homewards',
 'on the next day a i wa sitting in my own house lo the herdsman came and standing before me said o my master i will tell thee a thing wh

#  LSH ---> Cosine ---> NW

# LSH

In [1054]:
db = pd.DataFrame(final_target_paragraphs_list)#We have 67 Sentences
db=db[0]
print(db)
forest = get_forest(db, permutations)

0      when it was the second night dunyazad said to ...
1       with pleasure replied shahrazad if the king g...
2        the ifrit was listening with astonishment to...
3       the next day as i was sitting there he came b...
4       i agreed to this and he went on master i have...
                             ...                        
96      when the old man had finished his tale the if...
97       morning now dawned and shahrazad broke off f...
98      what a good pleasant delightful and sweet sto...
99      the king then went to his court the troops ar...
100     the court was then dismissed and the king ret...
Name: 0, Length: 101, dtype: object


In [1367]:
num_recommendations = 50
query = Final_Query_Lemmatized[13]
print('query:  ', query)
result = predict(query, db, permutations, num_recommendations, forest)
print('\n Top similar sentences \n', result)

query:   she smiled and answered o my master i have no greed for the good nor will i take them save on two condition the first that thou marry me to thy son and the second that i may bewitch her who bewitched him and imprison her otherwise i can not be safe from her malice and malpractice

 Top similar sentences 
 32     then i checked my accounts and the sales figu...
41     when i did my audit i found that i had two th...
74      when i went home that evening i found these ...
76     who did this to them i asked and she said i s...
12                             this is your darling son
14     she smiled and said master i only want this o...
81      the third old man with the mule now said if ...
83     this was at night and i saw a black slave lyi...
24      at this point the old man with the two saluk...
57     i shall pay you back for this and dont be mis...
61     they spent their time eyeing all this and the...
Name: 0, dtype: object


In [1368]:
#Create a list of candiates to be taken to the next layer which is Cosine Simialrity
candidates=[]
candidates=result.values
#print(candidates)

In [1369]:
len(candidates)

11

In [1370]:
from minineedle import needle, core


R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
query=Final_Query_Lemmatized[13].split()
for i in candidates:
    alignment = needle.NeedlemanWunsch(query, i.split())
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query: ',Final_Query_Lemmatized[13],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3])

Query:  she smiled and answered o my master i have no greed for the good nor will i take them save on two condition the first that thou marry me to thy son and the second that i may bewitch her who bewitched him and imprison her otherwise i can not be safe from her malice and malpractice 

Target Sentence: 
  she smiled and said master i only want this on two conditions the first being that you marry me to him and the second that i be allowed to put a spell on the one who enchanted him and keep her confined for otherwise i shall not be safe from her scheming 

Perecent Identity using NW: 45.3


# Cosine another source
https://towardsdatascience.com/semantic-similarity-using-transformers-8f3cb5bf66d6

In [1305]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

model = SentenceTransformer('stsb-roberta-large')
query = Final_Query_Lemmatized[8]
# encode corpus to get corpus embeddings
candidates_embeddings = model.encode(candidates, convert_to_tensor=True)
# encode sentence to get sentence embeddings
sentence_embedding = model.encode(query, convert_to_tensor=True)
# top_k results to return
top_k=29
# compute similarity scores of the sentence with the corpus
cos_scores = util.pytorch_cos_sim(sentence_embedding, candidates_embeddings)[0]
# Sort the results in decreasing order and get the first top_k
top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]
print("Sentence:", query, "\n")
print("Top", top_k, "most similar sentences in corpus:")
NW2_Candidates=[]
for idx in range(len(top_results)):
    #print(corpus[idx], "(Score: %.4f)" % (cos_scores[idx]))
    NW2_Candidates.append(corpus[idx])

Sentence: then i marvelled at this with exceeding marvel and hardly made sure that day had dawned before i came to tell thee 

Top 29 most similar sentences in corpus:


In [1306]:
NW3=[]
for i in NW2_Candidates:
    NW3.append(" ".join(i))

In [1307]:
from minineedle import needle, core


R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
query=Final_Query_Lemmatized[8].split()
for i in NW3:
    alignment = needle.NeedlemanWunsch(query, i.split())
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query: ',Final_Query_Lemmatized[8],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3])

Query:  then i marvelled at this with exceeding marvel and hardly made sure that day had dawned before i came to tell thee 

Target Sentence: 
 then i checked my accounts and the sales figures of my shop and i found that i had made a profit of a thousand dinars on a capital of two thousand 

Perecent Identity using NW: 15.62


In [1324]:
Final_Query_Lemmatized[2]

'all this the old shaykh told the jinni who marvelled much at these strange word'

In [1325]:
Final_Target_Lemmatized[2]

'the ifrit wa listening with astonishment to what the old man with the gazelle wa saying and the man went on lord of the king of the jinni while all this wa going on my wife now this gazelle wa looking on and telling me to kill the calf because it wa fat but i could not bring myself to do this and so i told the herdsman to take it away which he did'

In [1281]:
from minineedle import needle, core


query=Final_Query_Lemmatized[8].split()
i=Final_Target_Lemmatized[8].split()
alignment = needle.NeedlemanWunsch(query, i)
print(alignment.get_identity())

30.43


In [1045]:
from re import sub
from gensim.utils import simple_preprocess

#query_string = 'fruit and vegetables'
#documents = ['cars drive on the road', 'tomatoes are actually fruit']

stopwords = []

# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
def preprocess(doc):
    # Tokenize, clean up input document string
    doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
    doc = sub(r'<[^<>]+(>|$)', " ", doc)
    doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
    doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
    return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in stopwords]

# Preprocess the documents, including the query string
corpus = [preprocess(document) for document in candidates]
#print(corpus)
query = preprocess(query)
#print (query)

In [994]:
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

# Load the model: this is a big file, can take a while to download and open
glove = api.load("glove-wiki-gigaword-50")    
similarity_index = WordEmbeddingSimilarityIndex(glove)

# Build the term dictionary, TF-idf model
dictionary = Dictionary(corpus+[query])
tfidf = TfidfModel(dictionary=dictionary)

# Create the term similarity matrix.  
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

100%|████████████████████████████████████████| 255/255 [00:01<00:00, 130.26it/s]


In [1052]:
# Compute Soft Cosine Measure between the query and the documents.
# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
query_tf = tfidf[dictionary.doc2bow(query)]

index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in corpus]],
            similarity_matrix)

doc_similarity_scores = index[query_tf] 


# Output the sorted similarity scores and documents
NW_Candiadates=[]
sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
for idx in sorted_indexes:
    if doc_similarity_scores[idx]>0.6:
        print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {candidates[idx]}')
        NW_Candiadates.append(candidates[idx])

0 	 0.681 	   the ifrit was listening with astonishment to what the old man with the gazelle was saying and the man went on  lord of the kings of the jinn while all this was going on my wife now this gazelle was looking on and telling me to kill the calf because it was fat but i could not bring myself to do this and so i told the herdsman to take it away which he did
1 	 0.653 	  the next day as i was sitting there he came back to me and said i have something to tell you that will please you and you owe me a reward for my good news
40 	 0.638 	  i took the water and went to my wife whom i found sleeping
29 	 0.637 	  she then carried me to an island where she left me for a time before coming back at dawn and saying i am your servant and it was i who saved your life by carrying you off with the permission of almighty god
39 	 0.632 	  she gave me some water and told me when you find her asleep sprinkle this water over her and say what you like for she will become whatever you want
14 	 

# NW

In [1053]:
from minineedle import needle, core


R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
query=Final_Query_Lemmatized[2].split()
for i in NW_Candiadates:
    alignment = needle.NeedlemanWunsch(query, i.split())
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query: ',Final_Query_Lemmatized[2],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3])

Query:  all this the old shaykh told the jinni who marvelled much at these strange word 

Target Sentence: 
   when i heard what the man had to say i went out with him drunk although not on wine with the joy and delight that i was feeling 

Perecent Identity using NW: 6.9


In [1042]:
Final_Target_Lemmatized[2]

'the ifrit wa listening with astonishment to what the old man with the gazelle wa saying and the man went on lord of the king of the jinni while all this wa going on my wife now this gazelle wa looking on and telling me to kill the calf because it wa fat but i could not bring myself to do this and so i told the herdsman to take it away which he did'

In [1050]:
from minineedle import needle, core


query='the ifrit was listening with astonishment to what the old man with the gazelle was saying and the man went on  lord of the kings of the jinn while all this was going on my wife now this gazelle was looking on and telling me to kill the calf because it was fat but i could not bring myself to do this and so i told the herdsman to take it away which he did'

alignment = needle.NeedlemanWunsch(query, i.split())
print(alignment.get_identity())

0.28


# LSH --> LSH ---> Cosine ---> NW

In [565]:
#Second LSH
# Token Query to Sentences 
import nltk 
query = query_documents[5].lower()
query_sentences2 = nltk.tokenize.sent_tokenize(query)
#lower Sentences
query_sentences =[]
for i in query_sentences2:
    i = i.lower()
    query_sentences.append(i)


In [1366]:
import re
 
# Removing punctuations in string
# Using loop + punctuation string
cleaned_query=[]
for ele in query_sentences:
    x = re.sub(r'[^\w\s]', '', ele)
    cleaned_query.append(x)

In [1387]:
cleaned_query[1]

'then quoth the\nking tell thy tale and shahrazad began in these words it\nhath reached me o auspicious king and heaven directed ruler'

# Combine each two tokens into one token (Query Sentences)


In [1376]:
x=[i for i in range(len(cleaned_query)) if i % 2 == 0]#Even Index  #https://discuss.codecademy.com/t/how-can-i-obtain-just-the-odd-indices-of-a-list/376640
y=[i for i in range(len(cleaned_query)) if i % 2 == 1]#Odd Index  #https://discuss.codecademy.com/t/how-can-i-obtain-just-the-odd-indices-of-a-list/376640


newx=[]
newy=[]
for i in x:
    newx.append(cleaned_query[i])
for j in y:
    newy.append(cleaned_query[j])

In [1391]:
final_query_list = [i +" "+ j for i, j in zip(newx, newy)]
final_query_list[14]

'such is my tale quoth the jinni this story\nis indeed strange and therefore i grant thee the third part of\nhis blood'

IndexError: list index out of range

In [1399]:
x=['a','b','c','d','e']
m=[i for i in range(len(x)) if i % 2 == 0]#Even Index  #https://discuss.codecademy.com/t/how-can-i-obtain-just-the-odd-indices-of-a-list/376640
n=[i for i in range(len(x)) if i % 2 == 1]#Even Index  #https://discuss.codecademy.com/t/how-can-i-obtain-just-the-odd-indices-of-a-list/376640
print("m",n)
newx=[]
newy=[]
for i in m:
    newx.append(x[i])
for j in n:
    newy.append(x[j])


m [1, 3]


In [1400]:
res=[x + y for x in newy for y in [""] + newx]
res

['b', 'ba', 'bc', 'be', 'd', 'da', 'dc', 'de']

In [634]:
# Token Target to Sentences 
targetStory_sentences = nltk.tokenize.sent_tokenize(NW_Candiadates[0])
targetStory_sentences

['when it was the second night dunyazad said to shahrazad sister finish your story of the merchant and the ifrit for us with pleasure replied shahrazad if the king gives me permission and when the king gave it she went on\n\ni have heard o fortunate king and rightly guided ruler that when the merchant was about to cut the throat of the calf he was moved by pity and told the herdsman to keep the calf among the other beasts']

In [635]:
target_sentences =[]
for i in targetStory_sentences:
    i = i.lower()
    target_sentences.append(i)

In [737]:
cleaned_target[1]
i for i in range(len(lst)) if i % 2 == 1

'the ifrit was listening with astonishment to what the old man with the gazelle was saying and the man went on\n\nlord of the kings of the jinn while all this was going on my wife now this gazelle was looking on and telling me to kill the calf because it was fat but i could not bring myself to do this and so i told the herdsman to take it away which he did'

# Combine each two tokens into one token (Target Sentences)

In [919]:
x=[i for i in range(len(cleaned_target)) if i % 2 == 0]#Even Index  #https://discuss.codecademy.com/t/how-can-i-obtain-just-the-odd-indices-of-a-list/376640
y=[i for i in range(len(cleaned_target)) if i % 2 == 1]#Odd Index  #https://discuss.codecademy.com/t/how-can-i-obtain-just-the-odd-indices-of-a-list/376640
print(len(y))

36


In [964]:
new_cleaned_target=[]
for i in x:
    new_cleaned_target.append(cleaned_target[i]+ cleaned_target[j])

In [925]:
new_cleaned_target[0]

'when it was the second night dunyazad said to shahrazad sister finish your story of the merchant and the ifrit for us with pleasure replied shahrazad if the king gives me permission and when the king gave it she went on\n\ni have heard o fortunate king and rightly guided ruler that when the merchant was about to cut the throat of the calf he was moved by pity and told the herdsman to keep the calf among the other beaststhe king then went to his court the troops arrived together with the vizier and when everyone was there he gave his judgements appointing some officials dismissing others and issuing orders and prohibitions until evening'

In [926]:
len(new_cleaned_target)

37

# NOW the second LSH

In [927]:
db = pd.DataFrame(new_cleaned_target)#We have 67 Sentences
db=db[0]
print(db)
forest = get_forest(db, permutations)

0                                                                                                                                                              when it was the second night dunyazad said to shahrazad sister finish your story of the merchant and the ifrit for us with pleasure replied shahrazad if the king gives me permission and when the king gave it she went on\n\ni have heard o fortunate king and rightly guided ruler that when the merchant was about to cut the throat of the calf he was moved by pity and told the herdsman to keep the calf among the other beaststhe king then went to his court the troops arrived together with the vizier and when everyone was there he gave his judgements appointing some officials dismissing others and issuing orders and prohibitions until evening
1                                                                                                                                                                                                    

In [956]:
num_recommendations = 10
query = cleaned_query[3]
print('query:  ', query)
result = predict(query, db, permutations, num_recommendations, forest)
#print('\n Top similar sentences \n', result)

query:   all this the old shaykh told the jinni
who marvelled much at these strange words


In [957]:
#Create a list of candiates to be taken to the next layer which is Cosine Simialrity
candidates=[]
candidates=result.values
#print(candidates)

AttributeError: 'NoneType' object has no attribute 'values'

# Cosing Again

In [953]:
from re import sub
from gensim.utils import simple_preprocess

#query_string = 'fruit and vegetables'
#documents = ['cars drive on the road', 'tomatoes are actually fruit']

stopwords = []

# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
def preprocess(doc):
    # Tokenize, clean up input document string
    doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
    doc = sub(r'<[^<>]+(>|$)', " ", doc)
    doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
    doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
    return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in stopwords]

# Preprocess the documents, including the query string
corpus = [preprocess(document) for document in candidates]
#print(corpus)
query = preprocess(query)
#print (query)

In [937]:
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

# Load the model: this is a big file, can take a while to download and open
glove = api.load("glove-wiki-gigaword-50")    
similarity_index = WordEmbeddingSimilarityIndex(glove)

# Build the term dictionary, TF-idf model
dictionary = Dictionary(corpus+[query])
tfidf = TfidfModel(dictionary=dictionary)

# Create the term similarity matrix.  
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

INFO:keyedvectors.py:1954: loading projection weights from /Users/ibrahim/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz
INFO:utils.py:447: KeyedVectors lifecycle event {'msg': 'loaded (400000, 50) matrix of type float32 from /Users/ibrahim/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-03-27T01:15:31.794785', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 2021, 08:50:36) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'load_word2vec_format'}
INFO:dictionary.py:199: adding document #0 to Dictionary(0 unique tokens: [])
INFO:dictionary.py:204: built Dictionary(260 unique tokens: ['about', 'among', 'and', 'appointing', 'arrived']...) from 11 documents (total 962 corpus positions)
INFO:utils.py:447: Dictionary lifecycle event {'msg': "built Dictionary(260 unique tokens: ['about', 'among', 'and', 'appointing', 'arrived']...) from 11 documents (total 962 corpus positions)", '

100%|████████████████████████████████████████| 260/260 [00:02<00:00, 129.30it/s]

INFO:termsim.py:306: constructed a sparse term similarity matrix with 9.310651% density





In [954]:
# Compute Soft Cosine Measure between the query and the documents.
# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
query_tf = tfidf[dictionary.doc2bow(query)]

index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in corpus]],
            similarity_matrix)

doc_similarity_scores = index[query_tf] 


# Output the sorted similarity scores and documents
NW_Candiadates=[]
sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
for idx in sorted_indexes:
    if doc_similarity_scores[idx]>0.4:
        print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {candidates[idx]}')
        NW_Candiadates.append(candidates[idx])

0 	 0.865 	 when it was the second night dunyazad said to shahrazad sister finish your story of the merchant and the ifrit for us with pleasure replied shahrazad if the king gives me permission and when the king gave it she went on

i have heard o fortunate king and rightly guided ruler that when the merchant was about to cut the throat of the calf he was moved by pity and told the herdsman to keep the calf among the other beaststhe king then went to his court the troops arrived together with the vizier and when everyone was there he gave his judgements appointing some officials dismissing others and issuing orders and prohibitions until evening
2 	 0.784 	 then she said father do you hold me so cheap that you bring strange men in to me where are these strange men i asked and why are you laughing and crying she said this calf you have with you is our masters son who is under a spell laid upon him and his mother by his fathers wifethe king then went to his court the troops arrived toget

# Now, NW

In [955]:
from minineedle import needle, core

#query= 'it hath reached me, o auspicious king, that there was a fisher man well stricken in years who had a wife and three children, and withal was of poor condition.'

R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
for i in NW_Candiadates:
    alignment = needle.NeedlemanWunsch(query, i.split())
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query: ',cleaned_query[2],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3])

Query:  that when the merchant purposed the sacrifice of the calf but saw
it weeping his heart relented and he said to the herdsman keep
the calf among my cattle 

Target Sentence: 
 when it was the second night dunyazad said to shahrazad sister finish your story of the merchant and the ifrit for us with pleasure replied shahrazad if the king gives me permission and when the king gave it she went on

i have heard o fortunate king and rightly guided ruler that when the merchant was about to cut the throat of the calf he was moved by pity and told the herdsman to keep the calf among the other beaststhe king then went to his court the troops arrived together with the vizier and when everyone was there he gave his judgements appointing some officials dismissing others and issuing orders and prohibitions until evening 

Perecent Identity using NW: 12.74


# Mar 24

# Token Query to Sentences 

In [399]:
import nltk 
query = query_documents[5].lower()
query_sentences2 = nltk.tokenize.sent_tokenize(query)
#lower Sentences
query_sentences =[]
for i in query_sentences2:
    i = i.lower()
    query_sentences.append(i)

In [400]:
#query_sentences

In [401]:
import re
 
# Removing punctuations in string
# Using loop + punctuation string
cleaned_query=[]
for ele in query_sentences:
    x = re.sub(r'[^\w\s]', '', ele)
    cleaned_query.append(x)

In [404]:
cleaned_query[4]

'then the owner of the\n\n\x0cgazelle continuedo lord of the kings of the jann this much\ntook place and my uncles daughter this gazelle looked on and\nsaw it and said butcher me this calf for surely it is a fat\none but i bade the herdsman take it away and he took it and\nturned his face homewards'

# Token Target to Sentences 

In [410]:
targetStory_sentences = nltk.tokenize.sent_tokenize(NW_Candiadates[0])
targetStory_sentences[3]

'Yesterday when you gave me the calf, I went to the girl and, when she saw it, she covered her face, shed tears but then burst into laughter.'

In [411]:
target_sentences =[]
for i in targetStory_sentences:
    i = i.lower()
    target_sentences.append(i)

In [412]:
import re
 
# Removing punctuations in string
# Using loop + punctuation string
cleaned_target=[]
for ele in target_sentences:
    x = re.sub(r'[^\w\s]', '', ele)
    cleaned_target.append(x)

In [415]:
cleaned_target[3]

'yesterday when you gave me the calf i went to the girl and when she saw it she covered her face shed tears but then burst into laughter'

### Compare the first sentece from Query against all sentences in Target sentences

In [416]:
db = pd.DataFrame(cleaned_target)#We have 67 Sentences
db=db[0]
print(db)
forest = get_forest(db, permutations)

0     when it was the second night dunyazad said to ...
1     the ifrit was listening with astonishment to w...
2     the next day as i was sitting there he came ba...
3     yesterday when you gave me the calf i went to ...
4     then she said father do you hold me so cheap t...
                            ...                        
68    \n\n\n\nis that true the man asked the mule at...
69    morning now dawned and shahrazad broke off fro...
70    what a good pleasant delightful and sweet stor...
71    the king then went to his court the troops arr...
72    the court was then dismissed and the king retu...
Name: 0, Length: 73, dtype: object


In [440]:
num_recommendations = 20
query= cleaned_query[8]
print('Query: ',query)
result = predict(query, db, permutations, num_recommendations, forest)
print('\n Top similar sentences \n', result)

Query:  then said he o merchant i have a daughter and she
learned magic in her childhood from an old woman who lived with
us

 Top similar sentences 
 0     when it was the second night dunyazad said to ...
1     the ifrit was listening with astonishment to w...
2     the next day as i was sitting there he came ba...
4     then she said father do you hold me so cheap t...
17          i asked him how he was and he said dont ask
20    i divided this with my brother telling him to ...
21     he took the money gladly and opened another shop
22    some time later my second brother now this oth...
24    he too spent a whole year away before coming b...
27    my brother opened another shop but after a tim...
37    we were about to sail off again when on the sh...
38    she kissed my hand and asked if i was a charit...
42    i treated her with respect and as our journey ...
48    for i believe in him and in his apostle may go...
56    he told me his story and i decided not to leav...
58    it 

### Save this list to candidates list to be sent to Cosine Simialrity

In [441]:
#Create a list of candiates to be taken to the next layer which is Cosine Simialrity
candidates=[]
candidates=result.values[:]
#print(candidates)

# 2.2 Cosine Similarity (The second layer)


In [442]:
from re import sub
from gensim.utils import simple_preprocess

#query_string = 'fruit and vegetables'
#documents = ['cars drive on the road', 'tomatoes are actually fruit']

stopwords = []

# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
def preprocess(doc):
    # Tokenize, clean up input document string
    doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
    doc = sub(r'<[^<>]+(>|$)', " ", doc)
    doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
    doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
    return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in stopwords]

# Preprocess the documents, including the query string
corpus = [preprocess(document) for document in candidates]
query = preprocess(query)

In [443]:
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

# Load the model: this is a big file, can take a while to download and open
glove = api.load("glove-wiki-gigaword-50")    
similarity_index = WordEmbeddingSimilarityIndex(glove)

# Build the term dictionary, TF-idf model
dictionary = Dictionary(corpus+[query])
tfidf = TfidfModel(dictionary=dictionary)

# Create the term similarity matrix.  
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

100%|████████████████████████████████████████| 270/270 [00:02<00:00, 125.35it/s]


In [444]:
query_tf = tfidf[dictionary.doc2bow(query)]

index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in corpus]],
            similarity_matrix)

doc_similarity_scores = index[query_tf] 


# Output the sorted similarity scores and documents
NW_Candiadates=[]
sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
for idx in sorted_indexes:
    if doc_similarity_scores[idx]>0.01:
        print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {candidates[idx]}')
        NW_Candiadates.append(candidates[idx])

17 	 0.769 	 the butcher saw me and took me into his house where his daughter covered her face from me and said are you bringing a man in to me where is there a man asked her father and she said this dog is a man over whom his wife has cast a spell but i can free him from it do that for gods sake said her father and she took a jug of water spoke some words over it and sprinkled some of it on me
2 	 0.714 	 the next day as i was sitting there he came back to me and said i have something to tell you that will please you and you owe me a reward for my good news i agreed to this and he went on master i have a daughter who as a young girl was taught magic by an old woman we had staying with us
3 	 0.698 	 then she said father do you hold me so cheap that you bring strange men in to me where are these strange men i asked and why are you laughing and crying she said this calf you have with you is our masters son who is under a spell laid upon him and his mother by his fathers wife
12 	 0.614 

In [445]:
NW_Candiadates

['the butcher saw me and took me into his house where his daughter covered her face from me and said are you bringing a man in to me where is there a man asked her father and she said this dog is a man over whom his wife has cast a spell but i can free him from it do that for gods sake said her father and she took a jug of water spoke some words over it and sprinkled some of it on me',
 'the next day as i was sitting there he came back to me and said i have something to tell you that will please you and you owe me a reward for my good news i agreed to this and he went on master i have a daughter who as a young girl was taught magic by an old woman we had staying with us',
 'then she said father do you hold me so cheap that you bring strange men in to me where are these strange men i asked and why are you laughing and crying she said this calf you have with you is our masters son who is under a spell laid upon him and his mother by his fathers wife',
 'i treated her with respect and as 

# 2.3 Needleman Wunch (The Last Layer)

In [448]:
from minineedle import needle, core

#query= 'it hath reached me, o auspicious king, that there was a fisher man well stricken in years who had a wife and three children, and withal was of poor condition.'

R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet

for i in NW_Candiadates:
    alignment = needle.NeedlemanWunsch(query, i.split())
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query: ',cleaned_query[8],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3])

Query:  then said he o merchant i have a daughter and she
learned magic in her childhood from an old woman who lived with
us 

Target Sentence: 
 the next day as i was sitting there he came back to me and said i have something to tell you that will please you and you owe me a reward for my good news i agreed to this and he went on master i have a daughter who as a young girl was taught magic by an old woman we had staying with us 

Perecent Identity using NW: 16.92


In [168]:
from minineedle import needle, core

#query= 'it hath reached me, o auspicious king, that there was a fisher man well stricken in years who had a wife and three children, and withal was of poor condition.'
print(query)
threshold = 10.0
R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet

for i in NW_Candiadates:
    alignment = needle.NeedlemanWunsch(query, i)
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query:\n ',max_value[1],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3])

['when', 'it', 'was', 'the', 'second', 'night', 'said', 'dunyazad', 'to', 'her', 'sister', 'shahrazad', 'o', 'my', 'sister', 'finish', 'for', 'us', 'that', 'story', 'of', 'the', 'merchant', 'and', 'the', 'jinni', 'and', 'she', 'answered', 'with', 'joy', 'and', 'goodly', 'gree', 'if', 'the', 'king', 'permit', 'me']
Query:
  ['when', 'it', 'was', 'the', 'second', 'night', 'said', 'dunyazad', 'to', 'her', 'sister', 'shahrazad', 'o', 'my', 'sister', 'finish', 'for', 'us', 'that', 'story', 'of', 'the', 'merchant', 'and', 'the', 'jinni', 'and', 'she', 'answered', 'with', 'joy', 'and', 'goodly', 'gree', 'if', 'the', 'king', 'permit', 'me'] 

Target Sentence: 
 the butcher saw me and took me into his house where his daughter covered her face from me and said are you bringing a man in to me where is there a man asked her father and she said this dog is a man over whom his wife has cast a spell but i can free him from it do that for gods sake said her father and she took a jug of water spoke som

# My Proposed Algorithm

In [18]:
#Using LSH
# After getting the right story now we will do #1.Divide the query into sentences.
import nltk 
query = query_documents[5].lower()
query_sentences2 = nltk.tokenize.sent_tokenize(query)
#lower Sentences
query_sentences =[]
for i in query_sentences2:
    i = i.lower()
    query_sentences.append(i)

In [19]:
query_sentences

['when it was the second night,\nsaid dunyazad to her sister shahrazad, "o my sister, finish for\nus that story of the merchant and the jinni;" and she answered\n"with joy and goodly gree, if the king permit me."',
 'then quoth the\nking, "tell thy tale;" and shahrazad began in these words: it\nhath reached me, o auspicious king and heaven directed ruler!',
 'that when the merchant purposed the sacrifice of the calf but saw\nit weeping, his heart relented and he said to the herdsman, "keep\nthe calf among my cattle."',
 'all this the old shaykh told the jinni\nwho marvelled much at these strange words.',
 'then the owner of the\n\n\x0cgazelle continued:--o lord of the kings of the jann, this much\ntook place and my uncle\'s daughter, this gazelle, looked on and\nsaw it, and said, "butcher me this calf, for surely it is a fat\none;" but i bade the herdsman take it away and he took it and\nturned his face homewards.',
 'on the next day as i was sitting in my\nown house, lo!',
 'the herds

In [20]:
targetStory_sentences = nltk.tokenize.sent_tokenize(NW_Candiadates[0])

In [21]:
db2 = pd.DataFrame(targetStory_sentences)#We have 67 Sentences
db2=db2[0]
print(db2)
forest = get_forest(db2, permutations)

0     When it was the second night, Dunyazad said to...
1     The ‘ifrit was listening with astonishment to ...
2     The next day, as I was sitting there, he came ...
3     Yesterday when you gave me the calf, I went to...
4     Then she said: “Father, do you hold me so chea...
                            ...                        
68    *\n\n\n\n‘Is that true?’ the man asked the mul...
69    Morning now dawned and Shahrazad broke off fro...
70    ‘What a good, pleasant, delightful and sweet s...
71    The king then went to his court; the troops ar...
72    The court was then dismissed and the king retu...
Name: 0, Length: 73, dtype: object


In [22]:
num_recommendations = 1
LSHcandidates=[]
for query2 in query_sentences:
    #print(query2)
    result = predict(query2, db2, permutations, num_recommendations, forest)
    result
    LSHcandidates.append(result)

In [23]:
LSHcandidates

[70    ‘What a good, pleasant, delightful and sweet s...
 Name: 0, dtype: object,
 0    When it was the second night, Dunyazad said to...
 Name: 0, dtype: object,
 67    I sprinkled her with the water and said: ‘Leav...
 Name: 0, dtype: object,
 62    She sprinkled the water over me and said: ‘Lea...
 Name: 0, dtype: object,
 52    She continued to insist, despite my pleading w...
 Name: 0, dtype: object,
 60    This was at night and I saw a black slave lyin...
 Name: 0, dtype: object,
 67    I sprinkled her with the water and said: ‘Leav...
 Name: 0, dtype: object,
 8    ‘This is your darling son.’ ‘Girl,’ I told her...
 Name: 0, dtype: object,
 27    My brother opened another shop, but after a ti...
 Name: 0, dtype: object,
 2    The next day, as I was sitting there, he came ...
 Name: 0, dtype: object,
 4    Then she said: “Father, do you hold me so chea...
 Name: 0, dtype: object,
 29    Every year they would make the same proposal t...
 Name: 0, dtype: object,
 61    My wife caugh

In [24]:
FinalLSH=[]
for i in range(len(LSHcandidates)):
    FinalLSH.append(LSHcandidates[i].values[:])

In [25]:

FinalLSH[0].tolist()
FinalList=[]
for i in range(len(LSHcandidates)):
    FinalList.append(FinalLSH[i].tolist())


In [26]:
FinalList

[['‘What a good, pleasant, delightful and sweet story this is!’ exclaimed Dunyazad, at which Shahrazad told her: ‘How can this compare with what I shall tell you this coming night, if I am still alive and the king spares me?’ ‘By God,’ the king said to himself, ‘I am not going to kill her until I hear the rest of this remarkable story,’ and so they spent the rest of the time embracing one another until the sun had fully risen.'],
 ['When it was the second night, Dunyazad said to Shahrazad: ‘Sister, finish your story of the merchant and the ‘ifrit for us.’ ‘With pleasure,’ replied Shahrazad, ‘if the king gives me permission,’ and when the king gave it, SHE WENT ON:\n\nI have heard, O fortunate king and rightly guided ruler, that when the merchant was about to cut the throat of the calf, he was moved by pity and told the herdsman to keep the calf among the other beasts.'],
 ['I sprinkled her with the water and said: ‘Leave this shape and become a mule,’ which she did there and then, and 

# Transfer the text to doc

In [27]:
from docx import Document

document = Document()
document.add_heading('Similar Document', level=1)
document.add_paragraph(NW_Candiadates[0])

document.save('Target.docx')

In [28]:
FinalList

[['‘What a good, pleasant, delightful and sweet story this is!’ exclaimed Dunyazad, at which Shahrazad told her: ‘How can this compare with what I shall tell you this coming night, if I am still alive and the king spares me?’ ‘By God,’ the king said to himself, ‘I am not going to kill her until I hear the rest of this remarkable story,’ and so they spent the rest of the time embracing one another until the sun had fully risen.'],
 ['When it was the second night, Dunyazad said to Shahrazad: ‘Sister, finish your story of the merchant and the ‘ifrit for us.’ ‘With pleasure,’ replied Shahrazad, ‘if the king gives me permission,’ and when the king gave it, SHE WENT ON:\n\nI have heard, O fortunate king and rightly guided ruler, that when the merchant was about to cut the throat of the calf, he was moved by pity and told the herdsman to keep the calf among the other beasts.'],
 ['I sprinkled her with the water and said: ‘Leave this shape and become a mule,’ which she did there and then, and 

In [54]:
FinalList[1]

['When it was the second night, Dunyazad said to Shahrazad: ‘Sister, finish your story of the merchant and the ‘ifrit for us.’ ‘With pleasure,’ replied Shahrazad, ‘if the king gives me permission,’ and when the king gave it, SHE WENT ON:\n\nI have heard, O fortunate king and rightly guided ruler, that when the merchant was about to cut the throat of the calf, he was moved by pity and told the herdsman to keep the calf among the other beasts.']

In [60]:
Final=' '.join(str(x) for x in FinalList)

In [76]:
FinalList[0]

['‘What a good, pleasant, delightful and sweet story this is!’ exclaimed Dunyazad, at which Shahrazad told her: ‘How can this compare with what I shall tell you this coming night, if I am still alive and the king spares me?’ ‘By God,’ the king said to himself, ‘I am not going to kill her until I hear the rest of this remarkable story,’ and so they spent the rest of the time embracing one another until the sun had fully risen.']

In [138]:
# Import docx NOT python-docx
import docx
from docx.enum.text import WD_COLOR_INDEX
  
# Create an instance of a word document
doc = docx.Document()
# Add a Title to the document 
doc.add_heading('Target Story', 0)
  
# Creating paragraph with some content and Highlighting it.
highlight_para = doc.add_paragraph(
    ).add_run(
           NW_Candiadates
                 ).font.highlight_color = WD_COLOR_INDEX.WHITE
  
# Now save the document to a location 
doc.save('Target-Story.docx')

In [149]:
import docx
from docx.enum.text import WD_COLOR_INDEX
  
# create an instance of a 
# word document we want to open
doc1 = Document('/Users/ibrahim/Desktop/Grad-Research-Spring22/Preprocessing/Target-Story.docx')
for i in FinalList:
     print(i)
        for i in doc1.paragraphs:
            print("True")

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 9)

In [196]:
import docx
from docx.enum.text import WD_COLOR_INDEX
  
# Create an instance of a word document
doc = docx.Document('/Users/ibrahim/Desktop/Grad-Research-Spring22/Preprocessing/Target-Story.docx')
  
# Add a Title to the document 
doc.add_heading('Paragraphs', 0)
inserted=[]
for i in FinalList:
    print(i)
    inserted.append(i)
    if i in doc.paragraphs:
        print()

for i in range(len(inserted)):
    #print(inserted[i])# paragraph with some content and Highlighting it.
    highlight_para = doc.add_paragraph(
        ).add_run(
                 ).font.highlight_color = WD_COLOR_INDEX.YELLOW
  
# Now save the document to a location 
doc.save('gfg.docx')

['‘What a good, pleasant, delightful and sweet story this is!’ exclaimed Dunyazad, at which Shahrazad told her: ‘How can this compare with what I shall tell you this coming night, if I am still alive and the king spares me?’ ‘By God,’ the king said to himself, ‘I am not going to kill her until I hear the rest of this remarkable story,’ and so they spent the rest of the time embracing one another until the sun had fully risen.']
['When it was the second night, Dunyazad said to Shahrazad: ‘Sister, finish your story of the merchant and the ‘ifrit for us.’ ‘With pleasure,’ replied Shahrazad, ‘if the king gives me permission,’ and when the king gave it, SHE WENT ON:\n\nI have heard, O fortunate king and rightly guided ruler, that when the merchant was about to cut the throat of the calf, he was moved by pity and told the herdsman to keep the calf among the other beasts.']
['I sprinkled her with the water and said: ‘Leave this shape and become a mule,’ which she did there and then, and it is

In [137]:
# Import docx NOT python-docx
import docx
from docx.enum.text import WD_COLOR_INDEX
from docx import Document

  
# Create an instance of a word document
doc1 = Document('/Users/ibrahim/Desktop/Grad-Research-Spring22/Preprocessing/Target-Story.docx')
doc1=Document()
  
# Add a Title to the document 
#doc1.add_heading('Target Story', 0)
  
# Creating paragraph with some content
for i in FinalList:
    #print(i)
    for i in doc1.paragraphs:
        print("True")
        doc1.add_run(i
                  ).font.highlight_color = WD_COLOR_INDEX.RED

# Now save the document to a location 
doc.save('gfg.docx')

In [110]:



for para in doc1.paragraphs:
    for run in para.runs:
        run.font.highlight_color = WD_COLOR_INDEX.RED
document.save(source_folder+'new.docx')

/Users/ibrahim/Desktop/Grad-Research-Spring22/Preprocessing/Target-Story.docxTarget-Story.docx


PackageNotFoundError: Package not found at '/Users/ibrahim/Desktop/Grad-Research-Spring22/Preprocessing/Target-Story.docxTarget-Story.docx'

In [440]:
#Compare query_sentences using cosine with the candiadates we got from LSH
from re import sub
from gensim.utils import simple_preprocess

#query_string = 'fruit and vegetables'
#documents = ['cars drive on the road', 'tomatoes are actually fruit']

stopwords = []

# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
def preprocess(doc):
    # Tokenize, clean up input document string
    doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
    doc = sub(r'<[^<>]+(>|$)', " ", doc)
    doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
    doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
    return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in stopwords]

# Preprocess the documents, including the query string
corpus2 = [preprocess(document) for document in FinalLSH]
print(corpus2)

TypeError: cannot use a string pattern on a bytes-like object

In [35]:
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

# Load the model: this is a big file, can take a while to download and open
glove = api.load("glove-wiki-gigaword-50")    
similarity_index = WordEmbeddingSimilarityIndex(glove)

# Build the term dictionary, TF-idf model
dictionary = Dictionary(corpus2+[query2])
tfidf = TfidfModel(dictionary=dictionary)

# Create the term similarity matrix.  
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

100%|██████████████████████████████████████████| 75/75 [00:00<00:00, 117.44it/s]


In [39]:
# Compute Soft Cosine Measure between the query and the documents.
# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
query_tf = tfidf[dictionary.doc2bow(query2)]

index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in corpus2]],
            similarity_matrix)

doc_similarity_scores = index[query_tf] 


# Output the sorted similarity scores and documents
NW_Candiadates2=[]
sorted_indexes2 = np.argsort(doc_similarity_scores)[::-1]
for idx in sorted_indexes2:
    if doc_similarity_scores[idx]>0.100:
        print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {candidates2[idx]}')
        NW_Candiadates2.append(candidates2[idx])

1 	 0.305 	 Four hundred years later, I promised that I would grant three wishes, but when I still remained imprisoned, I became furiously angry and said to myself that I would kill whoever saved me, giving him a choice of how he wanted to die.


In [40]:
# Compare Compare query_sentences using NW with the candiadates we got from cosine
NW_Candiadates2

['Four hundred years later, I promised that I would grant three wishes, but when I still remained imprisoned, I became furiously angry and said to myself that I would kill whoever saved me, giving him a choice of how he wanted to die.']

In [43]:
from minineedle import needle, core

#query2 = "t is related  O auspicious King that there was a merchant of the merchants who had much wealth and business in various cities Now on a day he mounted horse and went forth to re cover monies in certain towns and the heat sore oppressed him so he sat beneath a tree and putting his hand into his saddle bags took thence some broken bread and dry dates and began to break his fast When he had ended eating the dates he threw away the stones with force and lo an Ifrit appeared huge of stature and brandishing a drawn sword wherewith he approached the mer chant and said Stand up that I may slay thee even as thou slewest my son Asked the merchant How have I slain thy son and he answered When thou atest dates and threwest away the stones they struck my son full in the breast as he was walking by so that he died forthwith F40 Quoth the merchant Verily from Allah we proceeded and unto Allah are we re turning There is no Majesty and there is no Might save in Allah the Glorious the Great If I slew thy son I slew him by chance medley".lower()

threshold = 10.0
R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet

for i in NW_Candiadates2:
    alignment = needle.NeedlemanWunsch(query2, i)
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query:\n ',max_value[1],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3])

Query:
  i fulfil three wishes." yet no one set me free. thereupon i waxed
wroth with exceeding wrath and said to myself, "whoso shall
release me from this time forth, him will i slay and i will give
him choice of what death he will die; and now, as thou hast
released me, i give thee full choice of deaths." the fisherman,
hearing the words of the ifrit, said, "o allah! the wonder of it
that i have not come to free thee save in these days!" adding,
"spare my life, so allah spare thine; and slay me not, lest allah
set one to slay thee." replied the contumacious one, "there is no
help for it; die thou must; so ask me by way of boon what manner
of death thou wilt die." albeit thus certified the fisherman
again addressed the ifrit saying, "forgive me this my death as a
generous reward for having freed thee;" and the ifrit, "surely i
would not slay thee save on account of that same release." "o
chief of the ifrits," said the fisherman, "i do thee good and
thou requitest me with evil! in very

In [44]:
# Save targetStory_sentences to the Final List
FinalList=[]
FinalList.append(max_value[2])

In [490]:
FinalList[20]

['I shall pay you back for this and don’t be misled by the state I am in now.’\n\nWhen I heard this, I felt a yearning for her, as God, the Great and Glorious, had decreed, and so I took her, gave her clothes and provided her with elegantly furnished accommodation on the ship.']

# 1.3 Needleman Wunch (The Last Layer)

In [203]:
from minineedle import needle, core

#query2 = "t is related  O auspicious King that there was a merchant of the merchants who had much wealth and business in various cities Now on a day he mounted horse and went forth to re cover monies in certain towns and the heat sore oppressed him so he sat beneath a tree and putting his hand into his saddle bags took thence some broken bread and dry dates and began to break his fast When he had ended eating the dates he threw away the stones with force and lo an Ifrit appeared huge of stature and brandishing a drawn sword wherewith he approached the mer chant and said Stand up that I may slay thee even as thou slewest my son Asked the merchant How have I slain thy son and he answered When thou atest dates and threwest away the stones they struck my son full in the breast as he was walking by so that he died forthwith F40 Quoth the merchant Verily from Allah we proceeded and unto Allah are we re turning There is no Majesty and there is no Might save in Allah the Glorious the Great If I slew thy son I slew him by chance medley".lower()

threshold = 10.0
R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet

for i in NW_Candiadates:
    alignment = needle.NeedlemanWunsch(query, i.split('.'))
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query:\n ',max_value[1],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3])

Query:
  ['i', 'fulfil', 'three', 'wishes', 'yet', 'no', 'one', 'set', 'me', 'free', 'thereupon', 'i', 'waxed', 'wroth', 'with', 'exceeding', 'wrath', 'and', 'said', 'to', 'myself', 'whoso', 'shall', 'release', 'me', 'from', 'this', 'time', 'forth', 'him', 'will', 'i', 'slay', 'and', 'i', 'will', 'give', 'him', 'choice', 'of', 'what', 'death', 'he', 'will', 'die', 'and', 'now', 'as', 'thou', 'hast', 'released', 'me', 'i', 'give', 'thee', 'full', 'choice', 'of', 'deaths', 'the', 'fisherman', 'hearing', 'the', 'words', 'of', 'the', 'ifrit', 'said', 'o', 'allah', 'the', 'wonder', 'of', 'it', 'that', 'i', 'have', 'not', 'come', 'to', 'free', 'thee', 'save', 'in', 'these', 'days', 'adding', 'spare', 'my', 'life', 'so', 'allah', 'spare', 'thine', 'and', 'slay', 'me', 'not', 'lest', 'allah', 'set', 'one', 'to', 'slay', 'thee', 'replied', 'the', 'contumacious', 'one', 'there', 'is', 'no', 'help', 'for', 'it', 'die', 'thou', 'must', 'so', 'ask', 'me', 'by', 'way', 'of', 'boon', 'what', 'manner'

# Get the title of the story (i.e 1st story, 2nd story ...etc)

In [78]:
with open("Target-1.txt") as f:
    lines = f.read() ##Assume the sample file has 3 lines
    first = lines.split(',', 1)[0]

print(first)

When it was the third night


In [79]:
# Function which returns last word
def lastWord(string):
   
    # split by space and converting
    # string to list and
    lis = list(string.split(" "))
     
    # length of list
    length = len(lis)
     
    # returning last element in list
    return lis[length-2]
 
 
# Driver code
x=lastWord(first).upper()
print(lastWord(first).upper(),'night stroty'.upper())

THIRD NIGHT STROTY


# Print the title into new text file 

# Print the whole story again under the title

# First Filter (Cosine Similarity)
https://www.machinelearningplus.com/nlp/cosine-similarity/

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create the Document Term Matrix
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(candidates)

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=[candidates])
df

In [None]:
# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

print(cosine_similarity(df, df))

In [None]:
query_term_matrix = sparse_matrix.todense()
df_query = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=[query])

df_query

# NW to test our candidates

In [None]:
from minineedle import needle, core

query= 'it hath reached me, o auspicious king, that there was a fisher man well stricken in years who had a wife and three children, and withal was of poor condition.'

threshold = 10.0
R = {} # Dicitonary to Save query, target sentence, precent identity and the actula aligmnet

for i in candidates:
    alignment = needle.NeedlemanWunsch(query.lower(), i.lower())
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query:\n ',max_value[1],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
print('\n',max_value[3])

# LSH Algorithm 4

# LSH itself

# Another Resource


# Test NW with Lemmatization+ POS+ Synonyms

In [106]:
from minineedle import needle, core

seq1='Here we are!'.lower().split()
seq2='We are here!'.lower().split()

alignment = needle.NeedlemanWunsch(seq1, seq2)

alignment.align()

print('Score',alignment.get_score())

print('Identity',alignment.get_identity())

print(alignment)

Score -2
Identity 25.0
Alignment of Query and Target Sentence is:
	herewe-are!
	-wearehere!



In [75]:
from minineedle import needle, core

seq1='This is a good night'.lower()
seq2='This is a good day'.lower()

alignment = needle.NeedlemanWunsch(seq1, seq2)

alignment.align()

print('Score',alignment.get_score())

print('Identity',alignment.get_identity())

print(alignment)

Score 10
Identity 75.0
Alignment of SEQUENCE 1 and SEQUENCE 2:
	this is a good night
	this is a good --day

