In [1]:
!pip install kshingle
!pip install datasketch
!pip install textdistance



In [2]:
from bs4 import BeautifulSoup
import urllib.request
import re
import string
import kshingle as ks
import datasketch
import pandas as pd
import textdistance
from scipy.spatial.distance import cosine
from random import randint
import numpy as np

In [3]:
def preprocess(df_list):
  for df in df_list:
    df_temp = globals() [df]
    df_temp = df_temp.apply(lambda x:x.astype(str).str.lower())
    df_temp[0] = df_temp[0].str.replace('[^\w\s]','')
    df_temp[0] = df_temp[0].apply(lambda x: ' '.join(x.split()))
    df_temp = df_temp[0].str.strip()
    globals() [df] = df_temp

In [4]:
def createCharacterShingles(k, textfiles, df_list):
  shingles_k = []
  for filename, item in zip(textfiles, df_list):
    input = ' '.join(globals() [item].values[1:])
    shingles = ks.shingleset_k(input, k)
    # print(shingles)
    shingles_k.append(shingles)
    print("Filename : {0} \nSize of {1} Character Shingle set : {2} \n{3} Character Shingle set : {4} \n ".format(filename, k, len(shingles), k, shingles))
  return shingles_k

In [5]:
def createWordShingles(k, textfiles, df_list):
  word_shingles_k = []
  for filename, item in zip(textfiles, df_list):
    input = ' '.join(globals() [item].values[1:]).split()
    shingles = ks.shingleseqs_list(input, klist=[k])
    shingles_set = []
    # print(shingles)
    for shingle in shingles:
      string = ""
      for shingle_str in shingle:
        string = ' '.join(shingle_str)
        shingles_set.append(string)
    print("Filename:", filename)
    print("Length of {0} word shingle list before converting to set: {1}".format(k,len(shingles_set)))
    # print(set(shingles_set))
    word_shingles_k.append(set(shingles_set))
    print("Length of {0} word shingle list after converting to set: {1}".format(k,len(set(shingles_set))))
  return word_shingles_k

In [6]:
def jaccard(shingle_lists, textfiles):
  for filename,index in zip(textfiles,range(len(textfiles))):
    print("Comparing file: {0}".format(filename))
    for idx in range(len(textfiles)):
      Jaccard_similarity = textdistance.jaccard(shingle_lists[index],shingle_lists[idx])
      print("Jaccard similarity with file{0}: {1}".format(textfiles[idx],Jaccard_similarity))
    print("\n")

In [7]:
def cosine(shingle_lists, textfiles):
  for filename,index in zip(textfiles,range(len(textfiles))):
    print("Comparing file: {0}".format(filename))
    for idx in range(len(textfiles)):
      Jaccard_similarity = textdistance.cosine(shingle_lists[index],shingle_lists[idx])
      print("cosine similarity with file{0}: {1}".format(textfiles[idx],Jaccard_similarity))
    print("\n")

In [38]:
N = 128
max_val = (2**32)-1
perms = [ (randint(0,max_val), randint(0,max_val)) for i in range(N)]
vec = [float('inf') for i in range(N)]

def minhash(s, prime=4294967311):
  vec = [float('inf') for i in range(N)]
  for val in s:
    if not isinstance(val, int): 
      val = hash(val)
    for perm_idx, perm_vals in enumerate(perms):
      a, b = perm_vals
      output = (a * val + b) % prime
      if vec[perm_idx] > output:
        vec[perm_idx] = output
  return vec

In [39]:
def CompareUsingMinHash(shingle_lists, textfiles):
  for filename,index in zip(textfiles,range(len(textfiles))):
    print("Comparing file: {0}".format(filename))
    vec1 = minhash(shingle_lists[index])
    vec1 = np.array(vec1) / max(vec1)
    for filename2,idx in zip(textfiles,range(len(textfiles))):
      vec2 = minhash(shingle_lists[idx])
      vec2 = np.array(vec2) / max(vec2)
      Cosine_similarity_with_min_hash = 1 - cosine(vec1, vec2)
      print("cosine similarity with file{0}: {1}".format(filename2,Cosine_similarity_with_min_hash))
    print("\n")

In [41]:
def MinHash(shingle_lists, textfiles):
  for fname, idx in zip(textfiles, range(len(textfiles))):
    print("\nComparing file: {0}".format(fname))
    min_hash_src = datasketch.MinHash(num_perm = 128)
    for set_item in shingle_lists[idx]:
      min_hash_src.update(set_item.encode('utf8'))

    for itr_idx in range(len(textfiles)):
      min_hash_tgt = datasketch.MinHash(num_perm = 128)
      for set_item in shingle_lists[itr_idx]:
        min_hash_tgt.update(set_item.encode('utf8'))
      jac_sim_metric = min_hash_src.jaccard(min_hash_tgt)
      print("Jaccard Similarity using min hash with file : {0} is {1} after min hashing".format(textfiles[itr_idx], jac_sim_metric))

# Downloading textfiles

In [10]:
urllib.request.urlretrieve("http://www.textfiles.com/stories/13chil.txt", "13chil.txt")
urllib.request.urlretrieve("http://www.textfiles.com/stories/3wishes.txt", "3wishes.txt")
urllib.request.urlretrieve("http://www.textfiles.com/stories/3lpigs.txt", "3lpigs.txt")
urllib.request.urlretrieve("http://www.textfiles.com/stories/6ablemen.txt", "6ablemen.txt")

('6ablemen.txt', <http.client.HTTPMessage at 0x7f8fd5f092d0>)

In [11]:
textfiles_english = ["13chil.txt", "3lpigs.txt", "3wishes.txt", "6ablemen.txt"]

#File Verification 

In [12]:
for textfile in textfiles_english:
  print(textfile)
  sample_lines = !(head -4 $textfile)
  num_of_lines = !(wc -l $textfile)
  print("Filename : {0} \nSample Lines : {1} \nNumber of Lines : {2} \n ".format(textfile, sample_lines, num_of_lines) )

13chil.txt
Filename : 13chil.txt 
Sample Lines : ['                                FOR CHILDREN:', '', '                                   Sly Fox', ''] 
Number of Lines : ['170 13chil.txt'] 
 
3lpigs.txt
Filename : 3lpigs.txt 
Sample Lines : ['                  THE THREE LITTLE PIGS', '', '   Once upon a time . . . there were three little pigs, who left their mummy', 'and daddy to see the world.'] 
Number of Lines : ['84 3lpigs.txt'] 
 
3wishes.txt
Filename : 3wishes.txt 
Sample Lines : ['                     THE THREE WISHES', '', '   Once upon a time . . . a woodcutter lived happily with his wife in a pretty', 'little log cabin in the middle of a thick forest. Each morning he set off '] 
Number of Lines : ['69 3wishes.txt'] 
 
6ablemen.txt
Filename : 6ablemen.txt 
Sample Lines : ['                               SIX ABLE MEN  ', '', '   Once upon a time there lived a young soldier named Martin who had enlisted ', 'in the royal army to flght a war. The war was long but victorious and 

# Preprocessing

In [13]:
df_list_english = []
for fname in textfiles_english:
  name = "df_"+fname.split(".txt")[0]
  print(name)
  globals() [name] = pd.read_fwf(fname, header=None)
  df_list_english.append(name)
preprocess(df_list_english)

df_13chil
df_3lpigs
df_3wishes
df_6ablemen


  """


# Creating Shingles

In [14]:
list_5_character_shingles_english = createCharacterShingles(5, textfiles_english, df_list_english)

Filename : 13chil.txt 
Size of 5 Character Shingle set : 9056 
5 Character Shingle set : {'othe', 't s', 'he lo', 'ox a', 'ast ', 'ard', 'ght g', 'ry ve', 'apa ', 'ingl', 'ender', 'ally', 'e roa', 'scen', ' fox ', 'they', 'foole', 'p in ', 'what ', 'oodl', 'ude', 'usp', 'ment ', 'morr', 'ctio', 't dow', 'y one', 'ks l', 'h ho', 'o fie', 'rep', 're mr', 'rrell', 'hered', 'x c', 'nks', 'ing t', 'cles', 'beg', 'op in', 'ec', 'des', ' gon', 'll d', 'it h', 'im wi', ' wa', 's hea', 'ula', 'ring ', 'k s', 'g s', 'ch t', 'ile m', 'oked ', 'it an', 'claim', 'ily l', 'ent', 'r old', 'o go', ' all ', 'r on ', 'ped a', 'wicke', 'furio', 'd j', ' a go', 'look ', 'y li', 'ming ', ' dart', 'art t', 'r it', 'ief', 'lid', 'd ol', 'e poo', 'o mu', 'ou', ' mama', 'grin', 'et be', 'rem', 'erri', 'y pl', ' wha', 'r ton', 'e in ', 'nd c', 'g ha', ' has', 'em a ', 'ips', 'edro', 'k h', 'rels ', 'mped ', 'oods ', 'eed t', 'pere', 'dry', 'arm ', 't wi', 'iall', 'huckl', 'ut my', 'row d', ' we p', ' rain', 'bu

In [15]:
list_8_character_shingles_english = createCharacterShingles(8, textfiles_english, df_list_english)

Filename : 13chil.txt 
Size of 8 Character Shingle set : 26329 
8 Character Shingle set : {'earts t', 't s', 'spicio', 'ox a', 'fox di', 'fall he', 'ght g', 'ry ve', 'apa ', ' rubbe', 'yed on t', 'come an', 'nly he s', ' a fiel', 'ude', 'ment ', ' is mr r', 'oved a', 'r is hav', 'ctio', 't move', 'o repl', ' had s', 'e and c', 'x c', 'ed how ', 'ow long', 'at he fa', 'ly drib', 'des', 'd by old', 'er aga', 'ily upon', 'itting b', 'n her ', 'h rocki', 'pples a', 'ile m', 'tree as ', 'claim', 'ily l', 'uld not ', 'er voic', 'r on ', 'r going ', 'furio', ' a go', ' and cr', 'y li', 'ming ', 'crysta', ' to eat ', ' and ch', 'on in th', 'there ', ' want m', ' trembl', 'et be', 'ney woo', 'life b', 'walking', 'tree wh', ' sat o', 'ng mrs', 'em a ', 'grabbed', 'breath ', 'socks fo', 'mped ', 'oods ', 'eed t', ' of all', 'f of bar', 'eft mr ', 'irly d', 'feet bea', 'huckl', 'ut my', 'row d', ' shot ', 'bus', 'obedie', 'n i', 'givin', 'hot he', 'd allow', 'xcited r', 'x whis', 'u in ', 'come a'

In [16]:
list_4_word_shingles_english = createWordShingles(4, textfiles_english, df_list_english)

Filename: 13chil.txt
Length of 4 word shingle list before converting to set: 1448
Length of 4 word shingle list after converting to set: 1440
Filename: 3lpigs.txt
Length of 4 word shingle list before converting to set: 994
Length of 4 word shingle list after converting to set: 984
Filename: 3wishes.txt
Length of 4 word shingle list before converting to set: 757
Length of 4 word shingle list after converting to set: 754
Filename: 6ablemen.txt
Length of 4 word shingle list before converting to set: 1276
Length of 4 word shingle list after converting to set: 1273


# Jaccard Similarity

In [17]:
print("5 Character Shingles Jaccard Similarity")
jaccard(list_5_character_shingles_english, textfiles_english)

5 Character Shingles Jaccard Similarity
Comparing file: 13chil.txt
Jaccard similarity with file13chil.txt: 1
Jaccard similarity with file3lpigs.txt: 0.2660766037440276
Jaccard similarity with file3wishes.txt: 0.24882589734988259
Jaccard similarity with file6ablemen.txt: 0.2707483983692487


Comparing file: 3lpigs.txt
Jaccard similarity with file13chil.txt: 0.2660766037440276
Jaccard similarity with file3lpigs.txt: 1
Jaccard similarity with file3wishes.txt: 0.25696804894629505
Jaccard similarity with file6ablemen.txt: 0.2590938616433907


Comparing file: 3wishes.txt
Jaccard similarity with file13chil.txt: 0.24882589734988259
Jaccard similarity with file3lpigs.txt: 0.25696804894629505
Jaccard similarity with file3wishes.txt: 1
Jaccard similarity with file6ablemen.txt: 0.2439045704797693


Comparing file: 6ablemen.txt
Jaccard similarity with file13chil.txt: 0.2707483983692487
Jaccard similarity with file3lpigs.txt: 0.2590938616433907
Jaccard similarity with file3wishes.txt: 0.243904570479

In [18]:
print("8 Character Shingles Jaccard Similarity")
jaccard(list_8_character_shingles_english, textfiles_english)

8 Character Shingles Jaccard Similarity
Comparing file: 13chil.txt
Jaccard similarity with file13chil.txt: 1
Jaccard similarity with file3lpigs.txt: 0.12243690851735016
Jaccard similarity with file3wishes.txt: 0.11637228442401731
Jaccard similarity with file6ablemen.txt: 0.12646835499575684


Comparing file: 3lpigs.txt
Jaccard similarity with file13chil.txt: 0.12243690851735016
Jaccard similarity with file3lpigs.txt: 1
Jaccard similarity with file3wishes.txt: 0.12010857272110383
Jaccard similarity with file6ablemen.txt: 0.12228669118789826


Comparing file: 3wishes.txt
Jaccard similarity with file13chil.txt: 0.11637228442401731
Jaccard similarity with file3lpigs.txt: 0.12010857272110383
Jaccard similarity with file3wishes.txt: 1
Jaccard similarity with file6ablemen.txt: 0.11486543609976047


Comparing file: 6ablemen.txt
Jaccard similarity with file13chil.txt: 0.12646835499575684
Jaccard similarity with file3lpigs.txt: 0.12228669118789826
Jaccard similarity with file3wishes.txt: 0.11486

In [19]:
print("4 Word Shingles Jaccard Similarity")
jaccard(list_4_word_shingles_english, textfiles_english)

4 Word Shingles Jaccard Similarity
Comparing file: 13chil.txt
Jaccard similarity with file13chil.txt: 1
Jaccard similarity with file3lpigs.txt: 0.0012391573729863693
Jaccard similarity with file3wishes.txt: 0.0013692377909630307
Jaccard similarity with file6ablemen.txt: 0.0


Comparing file: 3lpigs.txt
Jaccard similarity with file13chil.txt: 0.0012391573729863693
Jaccard similarity with file3lpigs.txt: 1
Jaccard similarity with file3wishes.txt: 0.0005757052389176742
Jaccard similarity with file6ablemen.txt: 0.0008869179600886918


Comparing file: 3wishes.txt
Jaccard similarity with file13chil.txt: 0.0013692377909630307
Jaccard similarity with file3lpigs.txt: 0.0005757052389176742
Jaccard similarity with file3wishes.txt: 1
Jaccard similarity with file6ablemen.txt: 0.0014822134387351778


Comparing file: 6ablemen.txt
Jaccard similarity with file13chil.txt: 0.0
Jaccard similarity with file3lpigs.txt: 0.0008869179600886918
Jaccard similarity with file3wishes.txt: 0.0014822134387351778
Jacc

# Cosine Similarity 

In [20]:
print("5 Character Shingles Cosine Similarity")
cosine(list_5_character_shingles_english, textfiles_english)

5 Character Shingles Cosine Similarity
Comparing file: 13chil.txt
cosine similarity with file13chil.txt: 1
cosine similarity with file3lpigs.txt: 0.42340270705178207
cosine similarity with file3wishes.txt: 0.40815856013554747
cosine similarity with file6ablemen.txt: 0.426426495089597


Comparing file: 3lpigs.txt
cosine similarity with file13chil.txt: 0.42340270705178207
cosine similarity with file3lpigs.txt: 1
cosine similarity with file3wishes.txt: 0.4108617390898646
cosine similarity with file6ablemen.txt: 0.4129897794499623


Comparing file: 3wishes.txt
cosine similarity with file13chil.txt: 0.40815856013554747
cosine similarity with file3lpigs.txt: 0.4108617390898646
cosine similarity with file3wishes.txt: 1
cosine similarity with file6ablemen.txt: 0.3986810704556671


Comparing file: 6ablemen.txt
cosine similarity with file13chil.txt: 0.426426495089597
cosine similarity with file3lpigs.txt: 0.4129897794499623
cosine similarity with file3wishes.txt: 0.3986810704556671
cosine simila

In [21]:
print("8 Character Shingles Cosine Similarity")
cosine(list_8_character_shingles_english, textfiles_english)

8 Character Shingles Cosine Similarity
Comparing file: 13chil.txt
cosine similarity with file13chil.txt: 1
cosine similarity with file3lpigs.txt: 0.22087380381427824
cosine similarity with file3wishes.txt: 0.2159338810283226
cosine similarity with file6ablemen.txt: 0.22475675949656504


Comparing file: 3lpigs.txt
cosine similarity with file13chil.txt: 0.22087380381427824
cosine similarity with file3lpigs.txt: 1
cosine similarity with file3wishes.txt: 0.21573578042070682
cosine similarity with file6ablemen.txt: 0.21932957068880848


Comparing file: 3wishes.txt
cosine similarity with file13chil.txt: 0.2159338810283226
cosine similarity with file3lpigs.txt: 0.21573578042070682
cosine similarity with file3wishes.txt: 1
cosine similarity with file6ablemen.txt: 0.21118709256924872


Comparing file: 6ablemen.txt
cosine similarity with file13chil.txt: 0.22475675949656504
cosine similarity with file3lpigs.txt: 0.21932957068880848
cosine similarity with file3wishes.txt: 0.21118709256924872
cosin

In [22]:
print("4 Word Shingles Cosine Similarity")
cosine(list_4_word_shingles_english, textfiles_english)

4 Word Shingles Cosine Similarity
Comparing file: 13chil.txt
cosine similarity with file13chil.txt: 1
cosine similarity with file3lpigs.txt: 0.0025202432454547244
cosine similarity with file3wishes.txt: 0.002879083998155492
cosine similarity with file6ablemen.txt: 0.0


Comparing file: 3lpigs.txt
cosine similarity with file13chil.txt: 0.0025202432454547244
cosine similarity with file3lpigs.txt: 1
cosine similarity with file3wishes.txt: 0.0011609587199117057
cosine similarity with file6ablemen.txt: 0.0017869740450141421


Comparing file: 3wishes.txt
cosine similarity with file13chil.txt: 0.002879083998155492
cosine similarity with file3lpigs.txt: 0.0011609587199117057
cosine similarity with file3wishes.txt: 1
cosine similarity with file6ablemen.txt: 0.003062114175327031


Comparing file: 6ablemen.txt
cosine similarity with file13chil.txt: 0.0
cosine similarity with file3lpigs.txt: 0.0017869740450141421
cosine similarity with file3wishes.txt: 0.003062114175327031
cosine similarity with f

# All in Hindi

In [23]:
textfiles_hindi = ["hindi1.txt", "hindi2.txt", "hindi3.txt", "hindi4.txt"]

## File verification

In [24]:
for textfile in textfiles_hindi:
  print(textfile)
  sample_lines = !(head -4 $textfile)
  num_of_lines = !(wc -l $textfile)
  print("Filename : {0} \nSample Lines : {1} \nNumber of Lines : {2} \n ".format(textfile, sample_lines, num_of_lines) )

hindi1.txt
Filename : hindi1.txt 
Sample Lines : ['कछुआ और खरगोश', 'एक ज़माने में, एक खरगोश और एक कछुआ बहुत अच्छे दोस्त थे। लेकिन ', 'खरगोश को अपनी गति पर बहुत गर्व था क्योंकि कछुआ खरगोश से धीमा है। इसलिए', ' खरगोश हमेशा अपनी धीमी गति के लिए कछुए को छेड़ता था।'] 
Number of Lines : ['20 hindi1.txt'] 
 
hindi2.txt
Filename : hindi2.txt 
Sample Lines : ['अब तुम पत्थर गिनो', 'एक अमीर आदमी एक गाँव में रहा करता था। वह बहुत कंजूस था। उसके पास कई सोने के सिक्के थे। उसने एक सिक्के में सोने के सिक्के रखे थे और उसे पिछवाड़े में गाड़ दिया। हर दिन आधी रात को कंजूस जागता था और', ' पिछवाड़े जाकर देखता था कि सोने के सिक्के सुरक्षित हैं या ', 'नहीं। आओ। देखते हैं कि मेरे सोने के सिक्के सुरक्षित हैं या नहीं। एक। दो। तीन। चार पाच। छह। और 1000 में।'] 
Number of Lines : ['38 hindi2.txt'] 
 
hindi3.txt
Filename : hindi3.txt 
Sample Lines : ['जिसकी लाठी उसकी भैंस', 'क गाँव में एक दूधवाला रहता था। उसके पास बहुत सारी भैंसें थीं। वह अपनी भैंसों का बहुत ख्याल रखता था। वह अपना दूध अलग-अलग गाँवों में बेचता था। वह 

## Preprocessing

In [25]:
df_list_hindi = []
for fname in textfiles_hindi:
  name = "df_"+fname.split(".txt")[0]
  print(name)
  globals() [name] = pd.read_fwf(fname, header=None)
  df_list_hindi.append(name)
preprocess(df_list_hindi)

df_hindi1
df_hindi2
df_hindi3
df_hindi4


  """


## Creating Shingles

In [26]:
list_5_character_shingles_hindi = createCharacterShingles(5, textfiles_hindi, df_list_hindi)

Filename : hindi1.txt 
Size of 5 Character Shingle set : 1796 
5 Character Shingle set : {'लगय ', 'रत थ', 'ज', ' खरग', ' पहच ', 'य वह', 'क ल', 'रधर', 'क छ', 'श न ', ' हत ह', 'तज स ', 'श न', 'सस म', 'कर चक', 'हत प', 'द उस', 'छआ अ', 'त थ', 'ढत र', 'थर द', ' थ व', 'दख', 'हत ग', ' ह कभ', ' सथर ', ' गर', ' च', 'नद ', 'न च', 'त थ ', 'श अप', ' गय थ', ' पड ', ' सथ', ' जगय ', 'दड शर', 'हत अच', 'म ए', 'सथर द', 'ड आरम', ' मझ ', ' थ लक', 'खलफ', 'महस', ' जन द', 'शर', 'ह क', 'स नद', 'ए कछआ', 'भ ', 'आ ख', ' क उस', 'क ह ', 'और दड', 'ड ल', 'गय औ', 'भ नह ', 'च', 'र स', 'भल', 'कर ', 'ड क', 'म फन', 'भ न', ' थ भल', 'क लए ', 'दड और', 'त', 'श और', 'आन ', ' अचछ', 'उस थ', ' कर ', ' पहल', 'तज', 'क छडत', 'न लकष', 'र चलत', 'य और', ' खलफ', 'छआ क', 'कछआ अ', 'र च', 'शर हत', 'हए ', 'हए', ' धम ', ' क लए', 'एग', 'गश कछ', ' जल', ' महसस', 'हत शर', 'पस ', 'हए कछ', ' रस क', 'हग म', ' बढ', 'सच क', 'जगय', 'क ओ', 'छ ', 'आरम ', 'छआ धर', ' स क', 'अपन', 'सच क ', 'श कछ', 'आग क ', 'श सत ', 'गश क', 'दड ज', ' पह', ' पस ब', 'खरगश ', 

In [27]:
list_8_character_shingles_hindi = createCharacterShingles(8, textfiles_hindi, df_list_hindi)

Filename : hindi1.txt 
Size of 8 Character Shingle set : 3965 
8 Character Shingle set : {'कछआ अत ', 'रगश क अ', 'थ त एक ', 'र हत ह ', ' और वह ', 'आ अपन इ', 'रट थ वह ', 'म एक पड', 'थ त एक', ' पहच ', 'सच क कछ', 'रधर', 'कछआ जत', ' उस पड ', 'क छ', 'श न ', 'ग खरगश क', 'श न', 'रगश आग ', 'सस म', 'कर चक', 'हत प', 'द उस', ' अत म ', 'दसत थ लक', 'कछआ बहत', ' आ गय और', 'स पड क उ', 'थर द', ' थ व', 'छआ धरध', 'दख', ' ह कभ', 'बहत घस', ' और खद', 'चक गय व', 'खरगश न ', ' सथर ', ' गर', 'ह खन लग ', ' च', 'नद ', 'नद आन लग', 'खरगश और', 'थ लकन खर', 'श तज स ', ' गय थ', 'गश आग ', 'सच क क', ' पड ', 'रस कर औ', ' सथ', 'दड शर', 'भ हर म', 'रगश स धम', 'म ए', 'खरगश क', 'ड आरम', 'ज थ लकन', 'भ हर मत ', ' जन द', 'शर', 'ह क', 'भ ', 'आ ख', 'क ह ', 'य दड श', 'ड ल', 'मत मन ध', 'च', 'र स', 'लए कछए ', ' क पस ब', 'ड शर हत', 'दड लगय', 'कर ', 'ड क', ' थ भल', 'क लए ', ' थड आरम', 'दड और', ' दड जत', 'आन ', 'अचछ दसत', 'तग चल दड', ' पहल', 'क छडत', 'तज', 'र कह तम', 'य और', ' खलफ', 'कछआ अ', 'उस हसल ', 'इसलए ख', 'र च', ' थ त एक ', 'हए', 

In [28]:
list_4_word_shingles_hindi = createWordShingles(4, textfiles_hindi, df_list_hindi)

Filename: hindi1.txt
Length of 4 word shingle list before converting to set: 238
Length of 4 word shingle list after converting to set: 238
Filename: hindi2.txt
Length of 4 word shingle list before converting to set: 710
Length of 4 word shingle list after converting to set: 640
Filename: hindi3.txt
Length of 4 word shingle list before converting to set: 428
Length of 4 word shingle list after converting to set: 424
Filename: hindi4.txt
Length of 4 word shingle list before converting to set: 345
Length of 4 word shingle list after converting to set: 294


## Jaccard Similarity

In [29]:
print("5 Character Shingles Jaccard Similarity")
jaccard(list_5_character_shingles_hindi, textfiles_hindi)

5 Character Shingles Jaccard Similarity
Comparing file: hindi1.txt
Jaccard similarity with filehindi1.txt: 1
Jaccard similarity with filehindi2.txt: 0.12058080808080808
Jaccard similarity with filehindi3.txt: 0.13516834603096584
Jaccard similarity with filehindi4.txt: 0.13315850815850816


Comparing file: hindi2.txt
Jaccard similarity with filehindi1.txt: 0.12058080808080808
Jaccard similarity with filehindi2.txt: 1
Jaccard similarity with filehindi3.txt: 0.1670034907220283
Jaccard similarity with filehindi4.txt: 0.13713592233009708


Comparing file: hindi3.txt
Jaccard similarity with filehindi1.txt: 0.13516834603096584
Jaccard similarity with filehindi2.txt: 0.1670034907220283
Jaccard similarity with filehindi3.txt: 1
Jaccard similarity with filehindi4.txt: 0.1371732593106639


Comparing file: hindi4.txt
Jaccard similarity with filehindi1.txt: 0.13315850815850816
Jaccard similarity with filehindi2.txt: 0.13713592233009708
Jaccard similarity with filehindi3.txt: 0.1371732593106639
Jacc

In [30]:
print("8 Character Shingles Jaccard Similarity")
jaccard(list_8_character_shingles_hindi, textfiles_hindi)

8 Character Shingles Jaccard Similarity
Comparing file: hindi1.txt
Jaccard similarity with filehindi1.txt: 1
Jaccard similarity with filehindi2.txt: 0.05377507699991676
Jaccard similarity with filehindi3.txt: 0.06273512963904423
Jaccard similarity with filehindi4.txt: 0.061242128657858996


Comparing file: hindi2.txt
Jaccard similarity with filehindi1.txt: 0.05377507699991676
Jaccard similarity with filehindi2.txt: 1
Jaccard similarity with filehindi3.txt: 0.07628500531726339
Jaccard similarity with filehindi4.txt: 0.05897313622635511


Comparing file: hindi3.txt
Jaccard similarity with filehindi1.txt: 0.06273512963904423
Jaccard similarity with filehindi2.txt: 0.07628500531726339
Jaccard similarity with filehindi3.txt: 1
Jaccard similarity with filehindi4.txt: 0.06250597343018255


Comparing file: hindi4.txt
Jaccard similarity with filehindi1.txt: 0.061242128657858996
Jaccard similarity with filehindi2.txt: 0.05897313622635511
Jaccard similarity with filehindi3.txt: 0.0625059734301825

In [31]:
print("4 Word Shingles Jaccard Similarity")
jaccard(list_4_word_shingles_hindi, textfiles_hindi)

4 Word Shingles Jaccard Similarity
Comparing file: hindi1.txt
Jaccard similarity with filehindi1.txt: 1
Jaccard similarity with filehindi2.txt: 0.0011402508551881414
Jaccard similarity with filehindi3.txt: 0.0
Jaccard similarity with filehindi4.txt: 0.0


Comparing file: hindi2.txt
Jaccard similarity with filehindi1.txt: 0.0011402508551881414
Jaccard similarity with filehindi2.txt: 1
Jaccard similarity with filehindi3.txt: 0.0
Jaccard similarity with filehindi4.txt: 0.0


Comparing file: hindi3.txt
Jaccard similarity with filehindi1.txt: 0.0
Jaccard similarity with filehindi2.txt: 0.0
Jaccard similarity with filehindi3.txt: 1
Jaccard similarity with filehindi4.txt: 0.0


Comparing file: hindi4.txt
Jaccard similarity with filehindi1.txt: 0.0
Jaccard similarity with filehindi2.txt: 0.0
Jaccard similarity with filehindi3.txt: 0.0
Jaccard similarity with filehindi4.txt: 1




## Cosine Similarity

In [32]:
print("5 Character Shingles Cosine Similarity")
cosine(list_5_character_shingles_hindi, textfiles_hindi)

5 Character Shingles Cosine Similarity
Comparing file: hindi1.txt
cosine similarity with filehindi1.txt: 1
cosine similarity with filehindi2.txt: 0.22760176554010386
cosine similarity with filehindi3.txt: 0.2442609738440635
cosine similarity with filehindi4.txt: 0.2357102235395571


Comparing file: hindi2.txt
cosine similarity with filehindi1.txt: 0.22760176554010386
cosine similarity with filehindi2.txt: 1
cosine similarity with filehindi3.txt: 0.2879934529874923
cosine similarity with filehindi4.txt: 0.2494705304687703


Comparing file: hindi3.txt
cosine similarity with filehindi1.txt: 0.2442609738440635
cosine similarity with filehindi2.txt: 0.2879934529874923
cosine similarity with filehindi3.txt: 1
cosine similarity with filehindi4.txt: 0.24395776172651423


Comparing file: hindi4.txt
cosine similarity with filehindi1.txt: 0.2357102235395571
cosine similarity with filehindi2.txt: 0.2494705304687703
cosine similarity with filehindi3.txt: 0.24395776172651423
cosine similarity with f

In [33]:
print("8 Character Shingles Cosine Similarity")
cosine(list_8_character_shingles_hindi, textfiles_hindi)

8 Character Shingles Cosine Similarity
Comparing file: hindi1.txt
cosine similarity with filehindi1.txt: 1
cosine similarity with filehindi2.txt: 0.1100274642859895
cosine similarity with filehindi3.txt: 0.12165826518383219
cosine similarity with filehindi4.txt: 0.11576294992758161


Comparing file: hindi2.txt
cosine similarity with filehindi1.txt: 0.1100274642859895
cosine similarity with filehindi2.txt: 1
cosine similarity with filehindi3.txt: 0.1432783278706205
cosine similarity with filehindi4.txt: 0.11695090634318579


Comparing file: hindi3.txt
cosine similarity with filehindi1.txt: 0.12165826518383219
cosine similarity with filehindi2.txt: 0.1432783278706205
cosine similarity with filehindi3.txt: 1
cosine similarity with filehindi4.txt: 0.1193343038227822


Comparing file: hindi4.txt
cosine similarity with filehindi1.txt: 0.11576294992758161
cosine similarity with filehindi2.txt: 0.11695090634318579
cosine similarity with filehindi3.txt: 0.1193343038227822
cosine similarity with

In [34]:
print("4 Word Shingles Cosine Similarity")
cosine(list_4_word_shingles_hindi, textfiles_hindi)

4 Word Shingles Cosine Similarity
Comparing file: hindi1.txt
cosine similarity with filehindi1.txt: 1
cosine similarity with filehindi2.txt: 0.0025622501927837116
cosine similarity with filehindi3.txt: 0.0
cosine similarity with filehindi4.txt: 0.0


Comparing file: hindi2.txt
cosine similarity with filehindi1.txt: 0.0025622501927837116
cosine similarity with filehindi2.txt: 1
cosine similarity with filehindi3.txt: 0.0
cosine similarity with filehindi4.txt: 0.0


Comparing file: hindi3.txt
cosine similarity with filehindi1.txt: 0.0
cosine similarity with filehindi2.txt: 0.0
cosine similarity with filehindi3.txt: 1
cosine similarity with filehindi4.txt: 0.0


Comparing file: hindi4.txt
cosine similarity with filehindi1.txt: 0.0
cosine similarity with filehindi2.txt: 0.0
cosine similarity with filehindi3.txt: 0.0
cosine similarity with filehindi4.txt: 1




# Using Min hash

## For english

In [47]:
print("5 Character Shingles Cosine Similarity using a different min hash function")
MinHash(list_5_character_shingles_english, textfiles_english)

5 Character Shingles Cosine Similarity using a different min hash function

Comparing file: 13chil.txt
Jaccard Similarity using min hash with file : 13chil.txt is 1.0 after min hashing
Jaccard Similarity using min hash with file : 3lpigs.txt is 0.296875 after min hashing
Jaccard Similarity using min hash with file : 3wishes.txt is 0.296875 after min hashing
Jaccard Similarity using min hash with file : 6ablemen.txt is 0.21875 after min hashing

Comparing file: 3lpigs.txt
Jaccard Similarity using min hash with file : 13chil.txt is 0.296875 after min hashing
Jaccard Similarity using min hash with file : 3lpigs.txt is 1.0 after min hashing
Jaccard Similarity using min hash with file : 3wishes.txt is 0.3125 after min hashing
Jaccard Similarity using min hash with file : 6ablemen.txt is 0.2421875 after min hashing

Comparing file: 3wishes.txt
Jaccard Similarity using min hash with file : 13chil.txt is 0.296875 after min hashing
Jaccard Similarity using min hash with file : 3lpigs.txt is 0.3

In [45]:
print("8 Character Shingles Cosine Similarity using a different min hash function")
MinHash(list_8_character_shingles_english, textfiles_english)

8 Character Shingles Cosine Similarity using a different min hash function

Comparing file: 13chil.txt
Jaccard Similarity using min hash with file : 13chil.txt is 1.0 after min hashing
Jaccard Similarity using min hash with file : 3lpigs.txt is 0.140625 after min hashing
Jaccard Similarity using min hash with file : 3wishes.txt is 0.1171875 after min hashing
Jaccard Similarity using min hash with file : 6ablemen.txt is 0.1328125 after min hashing

Comparing file: 3lpigs.txt
Jaccard Similarity using min hash with file : 13chil.txt is 0.140625 after min hashing
Jaccard Similarity using min hash with file : 3lpigs.txt is 1.0 after min hashing
Jaccard Similarity using min hash with file : 3wishes.txt is 0.1875 after min hashing
Jaccard Similarity using min hash with file : 6ablemen.txt is 0.1015625 after min hashing

Comparing file: 3wishes.txt
Jaccard Similarity using min hash with file : 13chil.txt is 0.1171875 after min hashing
Jaccard Similarity using min hash with file : 3lpigs.txt is

In [46]:
print("4 Word Shingles Cosine Similarity using a different min hash function")
MinHash(list_4_word_shingles_english, textfiles_english)

4 Word Shingles Cosine Similarity using a different min hash function

Comparing file: 13chil.txt
Jaccard Similarity using min hash with file : 13chil.txt is 1.0 after min hashing
Jaccard Similarity using min hash with file : 3lpigs.txt is 0.0078125 after min hashing
Jaccard Similarity using min hash with file : 3wishes.txt is 0.0 after min hashing
Jaccard Similarity using min hash with file : 6ablemen.txt is 0.0 after min hashing

Comparing file: 3lpigs.txt
Jaccard Similarity using min hash with file : 13chil.txt is 0.0078125 after min hashing
Jaccard Similarity using min hash with file : 3lpigs.txt is 1.0 after min hashing
Jaccard Similarity using min hash with file : 3wishes.txt is 0.0 after min hashing
Jaccard Similarity using min hash with file : 6ablemen.txt is 0.0 after min hashing

Comparing file: 3wishes.txt
Jaccard Similarity using min hash with file : 13chil.txt is 0.0 after min hashing
Jaccard Similarity using min hash with file : 3lpigs.txt is 0.0 after min hashing
Jaccard

## For Hindi

In [42]:
print("5 Character Shingles Cosine Similarity using a different min hash function")
MinHash(list_5_character_shingles_hindi, textfiles_hindi)

5 Character Shingles Cosine Similarity using a different min hash function

Comparing file: hindi1.txt
Jaccard Similarity using min hash with file : hindi1.txt is 1.0 after min hashing
Jaccard Similarity using min hash with file : hindi2.txt is 0.1015625 after min hashing
Jaccard Similarity using min hash with file : hindi3.txt is 0.1328125 after min hashing
Jaccard Similarity using min hash with file : hindi4.txt is 0.140625 after min hashing

Comparing file: hindi2.txt
Jaccard Similarity using min hash with file : hindi1.txt is 0.1015625 after min hashing
Jaccard Similarity using min hash with file : hindi2.txt is 1.0 after min hashing
Jaccard Similarity using min hash with file : hindi3.txt is 0.171875 after min hashing
Jaccard Similarity using min hash with file : hindi4.txt is 0.1171875 after min hashing

Comparing file: hindi3.txt
Jaccard Similarity using min hash with file : hindi1.txt is 0.1328125 after min hashing
Jaccard Similarity using min hash with file : hindi2.txt is 0.1

In [43]:
print("8 Character Shingles Cosine Similarity using a different min hash function")
MinHash(list_8_character_shingles_hindi, textfiles_hindi)

8 Character Shingles Cosine Similarity using a different min hash function

Comparing file: hindi1.txt
Jaccard Similarity using min hash with file : hindi1.txt is 1.0 after min hashing
Jaccard Similarity using min hash with file : hindi2.txt is 0.0234375 after min hashing
Jaccard Similarity using min hash with file : hindi3.txt is 0.0546875 after min hashing
Jaccard Similarity using min hash with file : hindi4.txt is 0.0546875 after min hashing

Comparing file: hindi2.txt
Jaccard Similarity using min hash with file : hindi1.txt is 0.0234375 after min hashing
Jaccard Similarity using min hash with file : hindi2.txt is 1.0 after min hashing
Jaccard Similarity using min hash with file : hindi3.txt is 0.046875 after min hashing
Jaccard Similarity using min hash with file : hindi4.txt is 0.0390625 after min hashing

Comparing file: hindi3.txt
Jaccard Similarity using min hash with file : hindi1.txt is 0.0546875 after min hashing
Jaccard Similarity using min hash with file : hindi2.txt is 0.

In [44]:
print("4 Word Shingles Cosine Similarity using a different min hash function")
MinHash(list_4_word_shingles_hindi, textfiles_hindi)

4 Word Shingles Cosine Similarity using a different min hash function

Comparing file: hindi1.txt
Jaccard Similarity using min hash with file : hindi1.txt is 1.0 after min hashing
Jaccard Similarity using min hash with file : hindi2.txt is 0.0 after min hashing
Jaccard Similarity using min hash with file : hindi3.txt is 0.0 after min hashing
Jaccard Similarity using min hash with file : hindi4.txt is 0.0 after min hashing

Comparing file: hindi2.txt
Jaccard Similarity using min hash with file : hindi1.txt is 0.0 after min hashing
Jaccard Similarity using min hash with file : hindi2.txt is 1.0 after min hashing
Jaccard Similarity using min hash with file : hindi3.txt is 0.0 after min hashing
Jaccard Similarity using min hash with file : hindi4.txt is 0.0 after min hashing

Comparing file: hindi3.txt
Jaccard Similarity using min hash with file : hindi1.txt is 0.0 after min hashing
Jaccard Similarity using min hash with file : hindi2.txt is 0.0 after min hashing
Jaccard Similarity using m