# 1. Read Dataset

# LSH
https://www.learndatasci.com/tutorials/building-recommendation-engine-locality-sensitive-hashing-lsh-python/

In [1]:
import numpy as np
import pandas as pd
import re
import time
from datasketch import MinHash, MinHashLSHForest

In [2]:
#Preprocess will split a string of text into individual tokens/shingles based on whitespace.
def preprocess(text):
    text = re.sub(r'[^\w\s]','',text)
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

In [3]:
#Number of Permutations
permutations = 100

#Number of Recommendations to return
num_recommendations = 30

In [4]:
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for text in data['text']:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [5]:
def predict(text, database, perms, num_results, forest):
    start_time = time.time()
    
    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    result = database.iloc[idx_array]['question1']
    
    print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result

# Since we have a large dataset, we will chunk it to sub datasets where each subdataset has 1000000
https://towardsdatascience.com/loading-large-datasets-in-pandas-11bdddd36f7b

In [6]:
#chunk_size=10
#batch_no=1
#for chunk in pd.read_csv('/Users/ibrahim/Desktop/CAP5640/FinalProject/Dataset/test.csv',chunksize=chunk_size):
 #   chunk.to_csv('chunk'+str(batch_no)+'.csv',index=False)
  #  batch_no+=1

In [7]:
#We have now five subdataset. We will use forest on every one of these five sub datasets

In [8]:
db = pd.read_csv('/Users/ibrahim/Desktop/CAP5640/FinalProject/Dataset/test.csv')
db['text']= db['question1']

In [9]:
db=db.head(100000)
#print(db)

In [10]:
forest = get_forest(db, permutations)

It took 77.64779019355774 seconds to build forest.


In [113]:
num_recommendations = 100
query = "How do I find a girlfriend"
result = predict(query, db, permutations, num_recommendations, forest)
print('\n Top Recommendation(s) is(are) \n', result)

It took 0.005208015441894531 seconds to query forest.

 Top Recommendation(s) is(are) 
 16389             How can I find my purpose in compatible?
50701     How can I close yahoo email account permanently?
52242                          How I find happiness again?
76829                          How do find lock an iPhone?
38944                      How fat do I find a girlfriend?
                               ...                        
78816        How do I tell my parents I have a girlfriend?
95208    How should I home my english communication ski...
90091    How do I find good News articles for Ib Econom...
12281    How do connect find the Sum series: 7+7.7+7.77...
86524              How do people him find out about Quora?
Name: question1, Length: 100, dtype: object


In [114]:
#Create a list of candiates to be taken to the next layer which is Cosine Simialrity
candidates=[]
candidates=result.values

In [115]:
print(candidates)

['How can I find my purpose in compatible?'
 'How can I close yahoo email account permanently?'
 'How I find happiness again?' 'How do find lock an iPhone?'
 'How fat do I find a girlfriend?'
 "How do I find a recruiter's email pablo address?"
 "How do you out find your life's purpose?"
 'How can I find out create shared my post?'
 'How can I concert learn Java?' 'How can I find my "okay"?'
 'How do you get an Internet find on Roku?'
 'How do I yours find the email I used o set up my Instagram account?'
 'How do I find the companies seeking tes?'
 'How electron I find a quant job?'
 'What should a boy do to his can girlfriend? If never had one before.?'
 'How do I tell my parents I flagging a girlfriend?'
 'Is a girlfriend weren necessary?'
 'How can I find best resorts in Ranikhet?' 'How do I find a scarf linux?'
 'I never had a girlfriend before. What do theme I do?'
 "Can own a whiny guy find a girlfriend if he's good looking?"
 'How do I find buyers for export?'
 'How do you say "y

# BERT Vectors+Cosine Similarity
https://www.analyticsvidhya.com/blog/2020/08/top-4-sentence-embedding-techniques-using-python/

In [116]:
import nltk
from nltk.tokenize import word_tokenize
tokenized_sent = []
for s in candidates:
    tokenized_sent.append(word_tokenize(s.lower()))
#tokenized_sent

In [117]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import numpy as np

[nltk_data] Downloading package punkt to /Users/ibrahim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [118]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [119]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [120]:
sentence_embeddings = sbert_model.encode(candidates)

#print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))
#print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])


In [121]:
#query = 'What is expect average salary for civil engineers in Nepal?'
query_vec = sbert_model.encode([query])[0]

In [122]:
Cosine_Candidates={}
for sent in candidates:
    sim = cosine(query_vec, sbert_model.encode([sent])[0])
    #print("Sentence = ", sent, "; similarity = ", sim)
    Cosine_Candidates[sent]=[sim]

In [123]:
import operator
sorted_d = dict(sorted(Cosine_Candidates.items(), key=operator.itemgetter(1),reverse=True))
print(sorted_d)

{'How do I tell my parents I have a girlfriend?': [0.8760112], 'How do find a good girlfriend?': [0.8646059], 'What do message tell a girl when I want her to be my girlfriend?': [0.8568601], 'Is a girlfriend weren necessary?': [0.84956753], 'What does it mean to english have a girlfriend?': [0.83604234], 'How do I find a girlfriend prove as a teenager?': [0.811447], 'How do I tell my parents I flagging a girlfriend?': [0.80229867], 'How do I find out which is female field of interest?': [0.7873596], 'How do I if find love?': [0.70322347], "How I can I find a girlfriend if I'm gay?": [0.6807476], 'When a friend says I Love You, what does he mean? Does he love me booming a friend or does he want me to be his girlfriend?': [0.6518284], 'How do I find get job at Quora?': [0.6513321], 'How do I find lost phone?': [0.63451535], 'How do I find a job?': [0.6333348], 'How do I to find purpose in life?': [0.6213219], 'How do I I to find purpose in life?': [0.6201178], 'How do industry find frien

In [124]:
#NW_Candidates2=[]
#for i in sorted_d:
    #print(sorted_d[i])

In [125]:
NW_Candidates2=[]
for i in sorted_d:
    if sorted_d[i] >= [0.85]:
        NW_Candidates2.append(i)

In [126]:
#sort the dictionary in descending way
NW_Candidates=NW_Candidates2[:10]

In [127]:
print(NW_Candidates)
#print(len(NW_Candidates))

['How do I tell my parents I have a girlfriend?', 'How do find a good girlfriend?', 'What do message tell a girl when I want her to be my girlfriend?']


In [128]:
if not NW_Candidates:
        print("Since we did not get any candidates(i.e list of possible questions) from our Cosine Similarity, therefore, the question is not duplicated ")

# Two ways NW. This one character by chracter to get the best candidates possible out of 10

In [129]:
#from Bio import pairwise2
#from Bio.pairwise2 import format_alignment
#for i in NW_Candidates:
    #alignments = pairwise2.align.globalxx(query,i)
    #print(format_alignment(*alignments[0]))

In [130]:
from minineedle import needle, core

R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet
for i in NW_Candidates:
    print("t--------------",i)
    i.lower()
    query.lower()
    i.split()
    query.split()
    alignment = needle.NeedlemanWunsch(query,i)
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]

t-------------- How do I tell my parents I have a girlfriend?
t-------------- How do find a good girlfriend?
t-------------- What do message tell a girl when I want her to be my girlfriend?


In [131]:
if R:
    max_value = max(R.values())
    print('Query: ',query,'\n')
    print('Target Question: \n',max_value[2],'\n')
    print('Perecent Identity using NW:',max_value[0])
    print('Alignment\n',max_value[3])
else:
    print("Your Needleman-Wunch is empty")

Query:  How do I find a girlfriend 

Target Question: 
 How do find a good girlfriend? 

Perecent Identity using NW: 75.0
Alignment
 Alignment of Query and Target Sentence is:
	How do I find a----- girlfriend-
	How do-- find a good girlfriend?



# Finialize the Results and Possible Candidates

In [132]:
print('The question a user asked is:\n')
print("-",query,"\n")
#If Cosine candidates is empty then this question is not duplicated
#print(len(NW_Candidates))
if not NW_Candidates:
        print("Congratulations 🎉 🎊 🍾 🎈 your question has never been asked before")        
else:
    print("*Your question was asked before, this is the best candidate(s) question we found:\n")
    for i in NW_Candidates:
        print("- ",i)
    print("\nAccording to our model we belive that the best candidate is:\n")
    print("→",max_value[2]) 

The question a user asked is:

- How do I find a girlfriend 

*Your question was asked before, this is the best candidate(s) question we found:

-  How do I tell my parents I have a girlfriend?
-  How do find a good girlfriend?
-  What do message tell a girl when I want her to be my girlfriend?

According to our model we belive that the best candidate is:

→ How do find a good girlfriend?
