# Scoring | Jaccard Coefficient

**Imports**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import json
import string
import re
from nltk.stem import PorterStemmer

In [None]:
# Download the NLTK 'punkt' package; preprocess() tokenizes with
# nltk.word_tokenize, which presumably depends on this resource.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Download the NLTK 'stopwords' corpus; preprocess() reads the English list
# through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Load data**

In [None]:
# Mount Google Drive at /content/drive so the JSON files loaded below
# (under /content/drive/MyDrive) are reachable. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the corpus and the auxiliary "special docs" file. The files are opened
# in `with` blocks so the handles are closed as soon as parsing finishes
# (the bare open(...) calls used before never closed them).
# raw_data is later indexed by key and each value is passed to preprocess(),
# so it is presumably a mapping of document name -> raw text.
with open("/content/drive/MyDrive/IR_Assignments/docs.json", "r") as docs_file:
    raw_data = json.load(docs_file)
with open("/content/drive/MyDrive/IR_Assignments/special_docs.json", "r") as special_docs_file:
    file_types = json.load(special_docs_file)

**Preprocessing**

In [None]:
def preprocess(input):
    """Normalize raw text into a list of lowercase word tokens.

    Steps, in order:
      1. replace the control/whitespace characters \\a \\b \\f \\n \\r \\t \\v
         with spaces;
      2. lowercase the text;
      3. replace every punctuation character except the apostrophe with a
         space, then delete apostrophes (so "don't" becomes "dont");
      4. tokenize with ``nltk.word_tokenize``;
      5. drop tokens containing any character outside printable ASCII
         (0x20-0x7e);
      6. drop English stopwords and purely numeric tokens.

    Args:
        input: The text to preprocess. The name shadows the ``input``
            builtin inside this function; it is kept so existing callers
            are unaffected.

    Returns:
        A list of token strings in their original order (duplicates kept).
    """
    text = input
    for control_char in '\a\b\f\n\r\t\v':
        text = text.replace(control_char, ' ')
    # convert to lower case
    output = text.lower()
    # remove punctuations (apostrophes are deleted rather than spaced out so
    # contractions stay a single token)
    punctuations = string.punctuation.replace("'", '')
    output = "".join([char if char not in punctuations else ' ' for char in output])
    output = output.replace("'", '')
    # tokenize
    output = nltk.word_tokenize(output)
    # removing words with special characters
    output = [word for word in output if re.sub(r'[^\x20-\x7e]', '', word) == word]
    # remove stopwords and numeric tokens.
    # The stopword list is loaded once per call and turned into a set;
    # fetching it inside the comprehension re-read the list and did a linear
    # scan for every single token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    output = [word.strip() for word in output if word not in stop_words and not word.isnumeric()]
    # stemming (currently disabled)
    # output = [PorterStemmer().stem(word) for word in output]
    return output

In [None]:
# preprocessing: replace each document's raw text with the token list
# returned by preprocess(). Only existing keys are reassigned, so the dict
# does not change size while it is being iterated.
for doc in raw_data:
    raw_data[doc] = preprocess(raw_data[doc])

**Creating document-token index**

In [None]:
# creating document-token set
def create_index(data):
    """Map every document name to the set of its distinct tokens.

    Args:
        data: Mapping of document name -> iterable of tokens.

    Returns:
        A new dict with the same keys as ``data``; each value is a ``set``
        built from that document's tokens.
    """
    # Read tokens from the `data` argument itself rather than the global
    # `raw_data`, so the function indexes whatever mapping it is given.
    return {doc: set(tokens) for doc, tokens in data.items()}

# Module-level index (document name -> token set) that process() scores
# queries against.
index = create_index(raw_data)

# Jaccard

**Jaccard Coefficient**

In [None]:
# calculating jaccard coefficient
def jaccard(query, doc):
    """Return the Jaccard coefficient |query ∩ doc| / |query ∪ doc|.

    Args:
        query: Set of query tokens.
        doc: Set (or any iterable) of document tokens.

    Returns:
        A float in [0, 1]. When both inputs are empty the union is empty and
        the ratio is undefined; 0.0 is returned so such a pair ranks as
        "no overlap" instead of raising ZeroDivisionError.
    """
    union_size = len(query.union(doc))
    if union_size == 0:
        return 0.0
    return len(query.intersection(doc)) / union_size

**Processing the input query and displaying results**

In [None]:
# process the input query and rank the indexed documents against it
def process(input, top_k=5):
    """Rank indexed documents by Jaccard similarity to a free-text query.

    The query is run through ``preprocess`` and reduced to a set of tokens,
    then scored with ``jaccard`` against every entry of the module-level
    ``index``.

    Args:
        input: The raw query string. The name shadows the ``input`` builtin
            inside this function; it is kept so existing callers are
            unaffected.
        top_k: Maximum number of results to return. Defaults to 5, the
            previously hard-coded value.

    Returns:
        A list of at most ``top_k`` ``[coefficient, document_name]`` pairs,
        sorted in descending order (ties on the coefficient fall back to
        descending document name, as list comparison dictates).

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        # A negative slice bound would silently drop results from the end.
        raise ValueError("top_k must be non-negative")

    query = set(preprocess(input))
    scored = [[jaccard(query, tokens), doc] for doc, tokens in index.items()]

    # keep only the best-scoring documents
    scored.sort(reverse=True)
    return scored[:top_k]

In [None]:
# input and output
def run():
    """Prompt for a query, then print the matching document names and raw scores.

    Reads one line from standard input, ranks documents with ``process``,
    prints each returned document name on its own line, and finally prints
    the full list of ``[coefficient, name]`` pairs.
    """
    ranked = process(input("Input query: "))
    print('Top 5 documents:')
    for entry in ranked:
        # entry is [coefficient, document name]; show only the name here
        print(entry[1])
    # raw scores, printed after the names
    print(ranked)

In [None]:
# Interactive demo: prompts for a query and prints the ranked results.
run()

Input query: american dream
Top 5 documents:
p-law.hum
oxymoron.txt
psalm_re.aga
psalm_nixon
psalm.reagan
[[0.016129032258064516, 'p-law.hum'], [0.014925373134328358, 'oxymoron.txt'], [0.014084507042253521, 'psalm_re.aga'], [0.014084507042253521, 'psalm_nixon'], [0.014084507042253521, 'psalm.reagan']]


In [None]:
# Second interactive demo run with a different query.
run()

Input query: lion king
Top 5 documents:
jrrt.riddle
puzzles.jok
cereal.txt
smokers.txt
yogisays.txt
[[0.012195121951219513, 'jrrt.riddle'], [0.010101010101010102, 'puzzles.jok'], [0.00909090909090909, 'cereal.txt'], [0.007936507936507936, 'smokers.txt'], [0.006289308176100629, 'yogisays.txt']]
