In [1]:
#import libraries
import numpy as np
import pandas as pd
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
#define a function to train the tdf vectorizer and store the model in a pickle file
def tfidf_features(X_train, X_test, vectorizer_path):
    """Performs TF-IDF transformation and dumps the model."""
    
    # Train a vectorizer on X_train data.
    # Transform X_train and X_test data.
    # Pickle the trained vectorizer to 'vectorizer_path'
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1),min_df=5,max_df=0.95,token_pattern='(\S+)')
    X_train=  tfidf_vectorizer.fit_transform(X_train)
    X_test=tfidf_vectorizer.transform(X_test)
    fileObject = open(vectorizer_path,'wb') 
    pickle.dump(tfidf_vectorizer,fileObject)
    fileObject.close()
    return X_train, X_test


In [3]:
#load corpus data
corpus_df=pd.read_csv('data.csv',sep=',',header=0)

In [60]:
messages = list(corpus_df.iloc[:,2].values)

In [59]:
messages_id = list(corpus_df.iloc[:,1])

In [7]:
corpus_df

Unnamed: 0.1,Unnamed: 0,path,content
0,0,/ContentService/Books/Towards Infinity (Anant ...,Now I may throw some light on the state of con...
1,1,/ContentService/Books/Towards Infinity (Anant ...,Briefly here I may add that the feeling of ah...
2,2,/ContentService/Books/Towards Infinity (Anant ...,This state is attainable after thousands of ye...
3,3,/ContentService/Books/Towards Infinity (Anant ...,The same exactly is true about this nucleus to...
4,4,/ContentService/Books/Towards Infinity (Anant ...,We have to arrive at the real thing which is u...
5,5,/ContentService/Books/Towards Infinity (Anant ...,Shri Ram Chandraji Maharaj is one of the forem...
6,6,/ContentService/Books/Towards Infinity (Anant ...,Thus whatever existed between the thought and ...
7,7,/ContentService/Books/Towards Infinity (Anant ...,How to awaken the various states of the real w...
8,8,/ContentService/Books/Towards Infinity (Anant ...,First Knot The real state of enlightenment com...
9,9,/ContentService/Books/Towards Infinity (Anant ...,Our jnana-hinata too has now come to an end. T...


In [8]:
#split into train and test set (though we won't be using the test set)
x_train,x_test=train_test_split(corpus_df['content'].values.astype('U'), test_size=0.1, random_state=0)

In [6]:
#use train data to train the vectorizer and store in pickle file
x_train_tfidf, x_test_tfidf = tfidf_features(x_train, x_test, 'tfidf_vectorizer.pkl')

In [None]:
#load the search string
search_str=pd.read_csv('/Users/jnarayanan/Downloads/search.csv',sep=',',header=None)

In [10]:
#load the vectorizer from the pickle file
v = pickle.load(open('tfidf_vectorizer.pkl', 'rb'))

In [11]:
v

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=5,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(\\S+)', tokenizer=None, use_idf=True,
        vocabulary=None)

In [None]:
#convert search string to tfidf
search_tfidf=v.transform(search_str[0].values)

In [12]:
#convert corpus to tfidf
corpus_tfidf=v.transform(corpus_df['content'].values.astype('U'))

In [13]:
corpus_tfidf

<12014x13253 sparse matrix of type '<class 'numpy.float64'>'
	with 1026295 stored elements in Compressed Sparse Row format>

In [14]:
#compute cosine similarity
y=cosine_similarity(corpus_tfidf,corpus_tfidf)

In [49]:
#sort the similarity matrix and get top five thrteads
t_vec=[-x[1] for x in y]
best_thread = np.argsort(t_vec)[1:6]

In [56]:
top_ind=y[0].argsort()[::-1][:6]


In [57]:
top_ind

array([    0,   460,   109, 10286,  5224,  7531], dtype=int64)

In [50]:
content = corpus_df['content']

In [51]:
content[best_thread]

11908           Dear one, May you live long.  Blessings...
7531     Reply to My Letter from the Revered Mahatmaji ...
5292     Happy was the time when you saw me and happier...
7197     Letter to the Master with Diary for November, ...
8125     Deaths and births are happening everywhere and...
Name: content, dtype: object

In [31]:
content[0]

'Now I may throw some light on the state of consciousness which blooms after this first sub- point. When we cross this point to breathe in the next, everything seems to be changed. When we get merged into it a godly state begins to reign within and when identicality is developed we begin to feel that a peculiar state reflecting a tinge of divine touch prevails over all objects, animate and inanimate. There is such an indescribable outburst of emotional feelings at this point as often drives people, who lack the support of a worthy guru of calibre, to a state of mad ecstasy —  avadhuta — which keeps him entangled in it for ever, putting a permanent full stop to his onward advancement. Now let us ask those who boastfully profess to be the enlightened ones, or gyaani, whether they have ever come across such a state, which in fact is but the beginning of the first chapter of the book of knowledge. If we try to attain this state by applying our force of thought, it will only be artificial a

In [32]:
content[460]

'The real state of enlightenment comes when we get into full consciousness of the condition of enlightenment, and after imbibing its effect secure our merging in it. When we develop this state and merge into its consciousness we come to know all about it and thus become gyaani, i.e., enlightened — up to that extent. If we try to gain the knowledge by applying our force of thought, it will only be artificial and not true and genuine. The real knowledge of a state means complete identicality with the state we have merged in. The help that knowledge at different states offers is that it infuses us with a longing for the search for the Ultimate. '

In [33]:
content[109]

'The real knowledge of a state means complete identicality with the state we have merged in. It brings us to full consciousness of the condition and for this very reason it is interpreted as the real life which can be experienced only by an animate being. Being thus encouraged we now begin to aspire impatiently for further knowledge. '

In [34]:
content[10286]

"But something is essential for the abhyasi also. In the first place he must have full trust in the Master and must fully co-operate with him in every respect. If it is so he will positively go on developing day by day, and begin to feel himself changed and transformed. The state of waking consciousness of the lower type will get transformed, and his journey through higher and higher types of consciousness will be commenced. Usually consciousness is spoken of as of three levels: conscious, sub- conscious and super-conscious. They are however the broader divisions, and there are still innumerable levels of it in each one. The effect of the activities of the lower consciousness settles down upon the sub-conscious mind forming fate. The first thing to be undertaken is therefore the correction of the lower consciousness by right thinking and practice, so that it may itself be converted into force to bring the sub-conscious mind into a state of splendour. This brings us to the state of supe

In [66]:
dfObj = pd.DataFrame(columns = ['Main_Para_Id','Similar_Para_Id', 'Main_Para','Similar_Para', 'Similiarity_Index'])
#dfObj3 = pd.DataFrame(columns = ['Sent', 'Sent_Sim', 'Index_Sim'])
#df1 = pd.DataFrame([[1,2],], columns=['a','b'])

for i in tqdm(range(len(messages))):
#     x=corr[i,:];
#     print(x)
    top_ind=y[i].argsort()[::-1][:6]
#     print(top_ind)
    for j in range(1,6):
        a= messages_id[i]
        b= messages_id[top_ind[j]]
        c=messages[i]
#         print(a)
        d=messages[top_ind[j]]
#         print(b)
        e=y[i,top_ind[j]]
        #print(c)
#         print(dfObj)
        #print (a,b,c)
        #df1 = pd.DataFrame([[1,2],], columns=['a','b'])
        #print(a,b,c,d,e)
        dfObj2 = pd.DataFrame( [[a, b, c,d,e],], columns =['Main_Para_Id','Similar_Para_Id', 'Main_Para','Similar_Para', 'Similiarity_Index'])
        #print(dfObj2)
        dfObj=dfObj.append(dfObj2)

100%|████████████████████████████████████████████████████████████████████████████| 12014/12014 [04:28<00:00, 44.80it/s]


In [65]:
from tqdm import tqdm

In [67]:
dfObj

Unnamed: 0,Main_Para_Id,Similar_Para_Id,Main_Para,Similar_Para,Similiarity_Index
0,/ContentService/Books/Towards Infinity (Anant ...,/ContentService/Books/Sahaj Marg Philosophy/EN...,Now I may throw some light on the state of con...,The real state of enlightenment comes when we ...,0.394180
0,/ContentService/Books/Towards Infinity (Anant ...,/ContentService/Books/Towards Infinity (Anant ...,Now I may throw some light on the state of con...,The real knowledge of a state means complete i...,0.312557
0,/ContentService/Books/Towards Infinity (Anant ...,/ContentService/Books/Voice Real - The Second ...,Now I may throw some light on the state of con...,But something is essential for the abhyasi als...,0.308624
0,/ContentService/Books/Towards Infinity (Anant ...,/ContentService/Books/Voice Real - The First S...,Now I may throw some light on the state of con...,Comparing the spiritual state of an advanced m...,0.295749
0,/ContentService/Books/Towards Infinity (Anant ...,/ContentService/Books/Autobiography of Ram Cha...,Now I may throw some light on the state of con...,Reply to My Letter from the Revered Mahatmaji ...,0.292491
0,/ContentService/Books/Towards Infinity (Anant ...,/ContentService/Books/Complete Works of Ram Ch...,Briefly here I may add that the feeling of ah...,"Dear one, May you live long. Blessings...",0.313344
0,/ContentService/Books/Towards Infinity (Anant ...,/ContentService/Books/Autobiography of Ram Cha...,Briefly here I may add that the feeling of ah...,Reply to My Letter from the Revered Mahatmaji ...,0.312326
0,/ContentService/Books/Towards Infinity (Anant ...,/ContentService/Books/Voice Real - The First S...,Briefly here I may add that the feeling of ah...,Happy was the time when you saw me and happier...,0.296013
0,/ContentService/Books/Towards Infinity (Anant ...,/ContentService/Books/Autobiography of Ram Cha...,Briefly here I may add that the feeling of ah...,"Letter to the Master with Diary for November, ...",0.282834
0,/ContentService/Books/Towards Infinity (Anant ...,/ContentService/Books/Complete Works of Ram Ch...,Briefly here I may add that the feeling of ah...,Deaths and births are happening everywhere and...,0.269182


In [68]:
dfObj.to_csv('TFIDF.csv')