In [1]:
import nltk
# Fetch the 'punkt' resource through NLTK's downloader (network/disk I/O).
# NOTE(review): presumably this is the data needed by word_tokenize and
# sent_tokenize in the following cells - confirm against the NLTK version used.
nltk.download('punkt')

True

In [2]:
from nltk.tokenize import word_tokenize

# Demo: split a single sentence into word and punctuation tokens.
data = "Mars is approximately half the diameter of Earth."
word_tokens = word_tokenize(data)
print(word_tokens)

['Mars', 'is', 'approximately', 'half', 'the', 'diameter', 'of', 'Earth', '.']


In [3]:
from nltk.tokenize import sent_tokenize

# Demo: split a short paragraph into its individual sentences.
data = "Mars is a cold desert world. It is half the size of Earth."
sentences = sent_tokenize(data)
print(sentences)

['Mars is a cold desert world.', 'It is half the size of Earth.']


In [4]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Reference corpus: every sentence of the file is treated as one "document".
file_docs = []

with open('demo_multiple_docs2.txt') as reference_file:
    # sent_tokenize yields the sentences in reading order; keep them all.
    file_docs.extend(sent_tokenize(reference_file.read()))

print("Number of documents:", len(file_docs))

Number of documents: 4


In [5]:
# One lower-cased token list per reference document.
gen_docs = [
    list(map(str.lower, word_tokenize(text)))
    for text in file_docs
]

print(gen_docs)

[['hello', ',', 'my', 'name', 'is', 'irina', '.'], ['i', 'study', 'computer', 'science', '.'], ['ham', 'sandwiches', 'from', 'lidl', 'are', 'really', 'tasty', '.'], ['malls', 'are', 'very', 'convenient', 'and', 'a', 'really', 'nice', 'place', 'to', 'shop', '.']]


In [6]:
import gensim

# Build the token -> integer id mapping from the tokenized reference documents.
dictionary = gensim.corpora.Dictionary(documents=gen_docs)

print(dictionary.token2id)

{',': 0, '.': 1, 'hello': 2, 'irina': 3, 'is': 4, 'my': 5, 'name': 6, 'computer': 7, 'i': 8, 'science': 9, 'study': 10, 'are': 11, 'from': 12, 'ham': 13, 'lidl': 14, 'really': 15, 'sandwiches': 16, 'tasty': 17, 'a': 18, 'and': 19, 'convenient': 20, 'malls': 21, 'nice': 22, 'place': 23, 'shop': 24, 'to': 25, 'very': 26}


In [7]:
# Bag-of-words form of every reference document: lists of (token_id, count).
corpus = list(map(dictionary.doc2bow, gen_docs))

print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(1, 1), (7, 1), (8, 1), (9, 1), (10, 1)], [(1, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)], [(1, 1), (11, 1), (15, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)]]


In [8]:
import numpy as np

# Fit a TF-IDF model on the bag-of-words corpus.
tf_idf = gensim.models.TfidfModel(corpus)

# Show each document as [token, weight] pairs, weights rounded to 2 decimals.
for doc in tf_idf[corpus]:
    weighted_tokens = [
        [dictionary[token_id], np.around(weight, decimals=2)]
        for token_id, weight in doc
    ]
    print(weighted_tokens)

[[',', 0.41], ['hello', 0.41], ['irina', 0.41], ['is', 0.41], ['my', 0.41], ['name', 0.41]]
[['computer', 0.5], ['i', 0.5], ['science', 0.5], ['study', 0.5]]
[['are', 0.21], ['from', 0.43], ['ham', 0.43], ['lidl', 0.43], ['really', 0.21], ['sandwiches', 0.43], ['tasty', 0.43]]
[['are', 0.16], ['really', 0.16], ['a', 0.32], ['and', 0.32], ['convenient', 0.32], ['malls', 0.32], ['nice', 0.32], ['place', 0.32], ['shop', 0.32], ['to', 0.32], ['very', 0.32]]


In [9]:
 # Build the similarity index over the TF-IDF weighted reference corpus.
 # The first argument ('./') is the output prefix handed to gensim for the index.
 sims = gensim.similarities.Similarity(
     output_prefix='./',
     corpus=tf_idf[corpus],
     num_features=len(dictionary),
 )

In [10]:
file2_docs = []

# Query text, one entry per sentence.
# ('demo_sentences2.txt' was previously used here as an alternative input.)
with open('demo_multiple_docs.txt') as query_file:
    file2_docs.extend(sent_tokenize(query_file.read()))

print("Number of documents:", len(file2_docs))

# Each iteration overwrites query_doc / query_doc_bow, so after the loop they
# describe only the LAST query sentence; later cells rely on those two names.
for line in file2_docs:
    query_doc = list(map(str.lower, word_tokenize(line)))
    # Bag-of-words against the existing dictionary.
    # NOTE(review): doc2bow is called with default arguments; per the gensim
    # docs it only extends the dictionary when allow_update=True - confirm.
    query_doc_bow = dictionary.doc2bow(query_doc)

Number of documents: 1


In [11]:
# Weight the query bag-of-words with the fitted TF-IDF model ...
query_doc_tf_idf = tf_idf[query_doc_bow]
# ... and look it up in the index: one similarity score per reference document.
query_similarities = sims[query_doc_tf_idf]
print('Comparing Result:', query_similarities)

Comparing Result: [0.5477226 0.        0.        0.       ]


In [12]:
# Total similarity of the query against all reference documents (float32).
sum_of_sims = sims[query_doc_tf_idf].sum(dtype=np.float32)
print(sum_of_sims)

0.5477226


In [13]:
# Mean similarity per reference document; stays in sum_of_sims' dtype so the
# scaling below is carried out exactly as before.
avg_similarity = sum_of_sims / len(file_docs)

# Scale to a percentage first, then convert to a Python float and round.
percentage_of_similarity = round(float(avg_similarity * 100))

print(f'Average similarity float: {float(avg_similarity)}')
print(f'Average similarity percentage: {float(avg_similarity) * 100}')
print(f'Average similarity rounded percentage: {percentage_of_similarity}')

Average similarity float: 0.1369306445121765
Average similarity percentage: 13.693064451217651
Average similarity rounded percentage: 14


In [14]:
# Score every query sentence against the reference corpus and combine the
# per-sentence averages into one overall percentage.
avg_sims = []  # average similarity of each query sentence

for line in file2_docs:
    # Same preprocessing as the reference documents: tokenize + lower-case.
    query_doc = [w.lower() for w in word_tokenize(line)]
    # Bag-of-words against the existing dictionary.
    query_doc_bow = dictionary.doc2bow(query_doc)
    # TF-IDF weighting of the query.
    query_doc_tf_idf = tf_idf[query_doc_bow]
    # Query the index once and reuse the scores (one per reference document).
    doc_sims = sims[query_doc_tf_idf]
    print('Comparing Result:', doc_sims)
    # Sum of the similarities for this query sentence.
    sum_of_sims = np.sum(doc_sims, dtype=np.float32)
    # Average over the number of reference documents.
    avg = sum_of_sims / len(file_docs)
    print(f'avg: {avg}')
    avg_sims.append(avg)

# Combine the per-sentence averages. This is their SUM, not their mean, which
# is why the result can exceed 1.0 when there are several query sentences.
# np.float64 replaces the np.float alias, which newer NumPy releases removed.
total_avg = np.sum(avg_sims, dtype=np.float64)

# Express as a whole-number percentage, capped at 100: anything at or above
# 100 is treated as "documents are almost the same".
percentage_of_similarity = min(round(float(total_avg) * 100), 100)

Comparing Result: [0.5477226 0.        0.        0.       ]
avg: 0.1369306445121765


Unnamed: 0,Similarity,Sentence 1,Sentence 2
0,4.20,"In Nigeria, Chevron has been accused by the Al...","In Nigeria, the whole ijaw indigenous showed C..."
1,4.25,I know that in France they have had whole herd...,"I know that in France, the principle of slaugh..."
2,4.80,"Unfortunately, the ultimate objective of a Eur...",Unfortunately the final objective of a Europea...
3,4.80,The right of a government arbitrarily to set a...,The right for a government to draw aside its c...
4,4.80,The right of a government arbitrarily to set a...,The right for a government to dismiss arbitrar...
...,...,...,...
61,2.75,Mr Morse is charged with assault and Mr Darvis...,His partner Bijan Darvish is charged with fili...
62,3.25,"The mock explosion, the first event in the dri...","The mock explosion of a radioactive ""dirty bom..."
63,3.25,"The third appointment was to a new job, execut...","Bruce N. Hawthorne, 53, was named executive vi..."
64,2.50,The commission dropped charges that Patton imp...,Patton also appointed Conner to the Kentucky L...




Number of documents: 66
