In [1]:
import nltk
import textblob as txb
import pandas as pd
import numpy as np
import re
from time import time
import pickle

In [2]:
#workflow
'''
1. The user enters a question
2. Preprocess it and extract the features of the question
3. extract the tokenized features of it and save it for optimized comparision
4. construct the reduced set of questions similar to the features of the input questions
5. perform jaccard sim among the union of the reduced set 
6. threshold to give related questions
'''

'\n1. The user enters a question\n2. Preprocess it and extract the features of the question\n3. extract the tokenized features of it and save it for optimized comparision\n4. construct the reduced set of questions similar to the features of the input questions\n5. perform jaccard sim among the union of the reduced set \n6. threshold to give related questions\n'

In [3]:
#global elements that need to be booted up when the software starts
config = {
    'ps' : nltk.PorterStemmer(),
    'inp_q_features' : None,
    'inp_q_tokens' : None,
    'inp_q' : None, 
    'helping_word' : ['am', 'are', 'is', 'was', 'were', 'be', 'being', 'been','have', 'has', 'had', 'shall', 'will','do', 'does', 'did', 'may', 'must', 'might', 'can', 'could', 'would', 'should', 'i'],
    'clusters' : None,
    'len_nulls' : None,
    'largest_cluster' : None,
    'smallest_cluster' : None
}

def preprocessing(sentence):
    review_text = sentence.lower()
    #review_text = re.sub(r"[A-Za-z0-9]", " ", review_text)
    review_text = re.sub(r"[^A-Za-z0-9(),!.?\'\`\"]", " ", review_text)
    review_text = re.sub(r"\'s", " ", review_text)
    review_text = re.sub(r"\'ve", " ", review_text)
    review_text = re.sub(r"n\'t", " ", review_text)
    review_text = re.sub(r"\'re", " ", review_text)
    review_text = re.sub(r"\'d", " ", review_text)
    review_text = re.sub(r"\'ll", " ", review_text)
    review_text = re.sub(r",", " ", review_text)
    review_text = re.sub(r"\.", " ", review_text)
    review_text = re.sub(r"!", " ", review_text)
    review_text = re.sub(r"\(", " ( ", review_text)
    review_text = re.sub(r"\)", " ) ", review_text)
    review_text = re.sub(r"\?", " ", review_text)
    review_text = re.sub(r"\'", "", review_text)
    review_text = re.sub(r"\s{2,}", " ", review_text)
    return review_text

def get_nouns(text):
    ps = config['ps']
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    
    #getting all the nouns
    nouns = [i for i in tagged if i[1][0]=='N']
    f_nouns = []
    for i in nouns:
        '''if(i[1]=='NNS'):#if it is common noun in plural form then get its stemmed word
            f_nouns.append(ps.stem(i[0]))
        else:
            f_nouns.append(i[0].lower())'''
        f_nouns.append(ps.stem(i[0]))
    return set(f_nouns)

def get_noun_phrases(sentence):
    wiki = txb.TextBlob(sentence)
    noun_phrases = [str(i) for i in wiki.noun_phrases]
    return set(noun_phrases)

def get_verbs_with_addons(sentence):
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    return set([i[0] for i in tagged if i[1][0] in ('R','V') and i[0] not in config['helping_word']])

def get_features(sentence):
    #removing redundancy by eliminating all the nouns which are already present in the noun phrase
    text = preprocessing(sentence)
    
    nouns = get_nouns(text)
    noun_phrases = get_noun_phrases(text)
    verbs_adverbs = get_verbs_with_addons(text)
    
    #if both the sets are empty then return a null set else if one is empty then return other
    if((len(nouns)==0) and (len(noun_phrases)==0)):
        return verbs_adverbs
    if(len(nouns)==0):
        return noun_phrases.union(verbs_adverbs)
    if(len(noun_phrases)==0):
        return nouns.union(verbs_adverbs)
    #if both non empty then select just the uniques
    nn = [[i,0] for i in nouns]
    for np in noun_phrases:
        for n in nn:
            if(n[0] in np):
                n[1]=1
    x=[]
    for i in nn:
        if(i[1]==0):
            x.append(i[0])

    x = [i[0] for i in nn if i[1]==0]
    return set(x+list(noun_phrases)+list(verbs_adverbs))

def construct_questions_pool(dataset):
    questions_pool = []
    for i in dataset.id:
        if(dataset.loc[i,'is_duplicate']==1):
            questions_pool.append(preprocessing(dataset.loc[i,'question1']))
        else:
            questions_pool.append(preprocessing(dataset.loc[i,'question1']))
            questions_pool.append(preprocessing(dataset.loc[i,'question2']))
    return set(questions_pool)

#this function saves just the indexes of the questions in the clusters for us retrieve later
def build_indexed_clusters(dataset):
    clusters = dict()
    nulls = list()
    
    questions = dataset.question
    indexes = dataset.index
    for i in indexes:
        features = get_features(questions[i])    
        if(len(features)!=0):
            for j in features:
                if(j in clusters.keys()):
                    clusters[j].append(i)
                else:
                    clusters[j]=list()
                    clusters[j].append(i)
        else:
            nulls.append(i)
    
    clusters['-1']=nulls
    return clusters

def build_dataset_with_features(questions):
    ind = []
    features = []
    questions = list(questions)
    for i in range(len(questions)):
        ind.append(i)
        features.append(tokenized_features(questions[i]))
    
    return pd.DataFrame({'index' : ind,
                        'question' : questions,
                        'features' : features})

def save_dataset(dataset):
    dataset.to_csv('featured_questions.csv', sep=',', encoding='utf-8', index=False)
    
def save_clusters(dataset):
    obj = build_indexed_clusters(dataset)
    with open('clusters.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def save_word_index(m_dict):
    with open('word_index.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
        
def load_clusters(name='clusters.pkl'):
    with open(name, 'rb') as f:
        clusters = pickle.load(f)
        config['clusters'] = sum([1 for i in clusters.keys()])
        config['len_nulls'] = len(clusters['-1'])
        tmp = [len(clusters[i]) for i in clusters.keys()]
        config['largest_cluster'] = max(tmp)
        config['smallest_cluster'] = min(tmp)
        return clusters

#returns True if there are any features of the input question
#returns False if there are no features of the input question
def init_question(question):
    question = preprocessing(question)
    config['inp_q'] = question
    config['inp_q_features'] = get_features(question)
    config['inp_q_tokens'] = tokenized_features(question)
    if(config['inp_q_features'] is None):
        return False
    else:
        return True
    
#construct a reduced set of questions by taking the union of the clusters of the keywords present in the input questions
def construct_reduced_question_pool(clusters, state):
    if(state==True):
        q_tokens = list(config['inp_q_features'])
        f_list = []
        for i in q_tokens:
            try:
                f_list = f_list + clusters[i]
            except:
                print('Error : No clusters found having for the keyword ',i)
        f_list = list(set(f_list))
        return f_list
    else:
        #if the input question did not give us features then check for it in the common null pool
        return list(clusters['-1'])
    
#this function returns the best 5 matching questions from among the dataset
def rank(indexed_features):
    q_tokens = config['inp_q_tokens']
    results = list()
    for i in indexed_features.index:
        sim_ind = jaccard(q_tokens, indexed_features.loc[i,'features'])
        results.append({'sim_index':sim_ind,'question':indexed_features.loc[i,'question']})
    results.sort(key = lambda x:x['sim_index'], reverse=True)
    #return results[0:5]
    return [i['question'] for i in results][0:20]
    
def tokenized_features(inp_ques):
    t_features = []
    features = get_features(inp_ques)
    for i in features:
        t_features = t_features + [j for j in i.split()]
    return set(t_features)

def jaccard(tokens1, tokens2):
    #receives the set of tokens1 and token2
    inter = tokens1.intersection(tokens2)
    uni = tokens1.union(tokens2)
    return float(len(inter)/len(uni))

In [None]:
#reading the dataset train.csv to get the questions
file_path = 'C:\\Users\\Administrator\\Related questions\\Dataset\\train.csv'

#doing all the initializing stuff
start_time = time()

df = pd.read_csv(file_path, encoding = 'utf8')
df = df.dropna()
#remove this sampling when doing for the final time
#df = df.sample(n=540)

questions = construct_questions_pool(df)
featured_dataset = build_dataset_with_features(questions)

save_dataset(featured_dataset)
save_clusters(featured_dataset)

end_time = time()
print('Time taken for creating the clusters and saving things : ', (end_time - start_time), 'secs')

In [4]:
#loading up the clusters and featured dataset and performing the operations

start_time = time()
clusters = load_clusters()
df = pd.read_csv('featured_questions.csv')
end_time = time()
print('Time taken for initialization : ', (end_time - start_time), 'secs')

Time taken for initialization :  2.2512102127075195 secs


In [5]:
#final execution script for the execution flow
question = input('Enter the question : ')
val = init_question(question)
questions_list = construct_reduced_question_pool(clusters, val)
reduced_with_features = df.iloc[[i for i in questions_list],:]
result = rank(reduced_with_features)

Enter the question : What is something that never fails to make you happy?


In [6]:
%store question

Stored 'question' (str)


In [7]:
result

['how can i make money utilising polyvore ',
 'why do some quora questions never get answers even when they get views ',
 'how can i send a text message and make it look like it came from someone else phone number ',
 'how do i urge a girl to kiss me what are some body gestures required to seduce her or something like that ',
 'what can i do add to make oatmeal less bland tasting ',
 'what is one band you have never gotten bored of ',
 'are my parents abusive i grew up on imaginary friends because i never knew there was an outside world i try to impress them they were never there ',
 'how can i make money on whatsapp or by whatsapp ',
 'using a demand and supply diagram analyse why a fall in incomes may reduce the market price of houses how would i make this graph ',
 'relationship advice how do you know if you in love or obsessed with a guy if you never met him in person but feel like you have known him for ages ',
 'what are the best strategies to make money trading binary options on

In [8]:
%store result

Stored 'result' (list)
