In [2]:
import glob
import math
from tqdm import tqdm

from model import Model

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords as sw

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
ps = PorterStemmer()

file_names = glob.glob('../Corpus/**/*.txt', recursive=True)
corpus_file_names = []

stopwords = sw.words('english')

for file_name in file_names:
    if file_name[-6:] != 'Qs.txt' and file_name[-6:] != 'As.txt':
        corpus_file_names.append(file_name)

#print(len(corpus_file_names))

tf_documents = dict()
idf_documents = dict()
num_docs = len(corpus_file_names)

def update_tf(tf_docs, token, doc_name):
    if doc_name not in tf_docs:
        tf_docs[doc_name] = {token: 1}
    else:
        if token not in tf_docs[doc_name]:
            tf_docs[doc_name][token] = 1
        else:
            tf_docs[doc_name][token] += 1   


def update_idf(idf_docs, token):
    if token not in idf_docs:
        idf_docs[token] = 1
    else:
        idf_docs[token] += 1

def calculate_tfidf(tf, idf, num_docs):
    return tf*math.log(float(num_docs)/float(idf))


for file_name in tqdm(corpus_file_names):
    with open(file_name, 'r') as file:
        contents = word_tokenize(file.read())
        for word in contents:
            if word not in stopwords:
                update_tf(tf_documents, word, file_name)
                if tf_documents[file_name][word] == 1:
                    update_idf(idf_documents, word)
            

question = 'Where can tourists go when they visit Cambridge?'
question_tokenized = word_tokenize(question)

best_score = float('-inf')
best_file = ''

for file_name in corpus_file_names:
    mag_q = 0
    mag_doc = 0
    numerator = 0
    for word in question_tokenized:
        if word not in stopwords:
            mag_q += 1
            if word in idf_documents:
                if word in tf_documents[file_name]:
                    tf_val = tf_documents[file_name][word]
                else:
                    tf_val = 0
                idf_val = idf_documents[word]
                res = calculate_tfidf(tf_val, idf_val, num_docs)
                mag_doc += (res**2)
                numerator += res

    if (math.sqrt(float(mag_q)) * math.sqrt(float(mag_doc))) == 0:
        cosine = float('-inf')
    else:
        cosine = float(numerator) / (math.sqrt(float(mag_q)) * math.sqrt(float(mag_doc)))
    print(file_name, cosine)
    if cosine > best_score:
        best_file = file_name
        best_score = cosine 

    print(best_file)

100%|███████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 171.83it/s]

../Corpus/Pharmacy/Pharmacy.txt -inf

../Corpus/PrivateSchool/PrivateSchool.txt -inf

../Corpus/Oxygen/Oxygen.txt -inf

../Corpus/JacksonvilleFL/JacksonvilleFL.txt -inf

../Corpus/PacketSwitching/PacketSwitching.txt -inf

../Corpus/SteamEngine/SteamEngine.txt -inf

../Corpus/BlackDeath/BlackDeath.txt -inf

../Corpus/Geology/Geology.txt -inf

../Corpus/PrimeNumber/PrimeNumber.txt -inf

../Corpus/ImmuneSystem/ImmuneSystem.txt -inf

../Corpus/SCali/SCali.txt -inf

../Corpus/Construction/Construction.txt -inf

../Corpus/CivilDisobedience/CivilDisobedience.txt -inf

../Corpus/Imperialism/Imperialism.txt 0.4082482904638631
../Corpus/Imperialism/Imperialism.txt
../Corpus/Force/Force.txt -inf
../Corpus/Imperialism/Imperialism.txt
../Corpus/Islamism/Islamism.txt -inf
../Corpus/Imperialism/Imperialism.txt
../Corpus/AmazonRainforest/AmazonRainforest.txt -inf
../Corpus/Imperialism/Imperialism.txt
../Corpus/YuanDynasty/YuanDynasty.txt -inf
../Corpus/Imperialism/Imperialism.txt
../Corpus/Harvard/Har




In [4]:
tf_documents['../Corpus/PrimeNumber/PrimeNumber.txt']

{'A': 3,
 'prime': 15,
 'number': 12,
 '(': 4,
 ')': 4,
 'natural': 2,
 'greater': 4,
 '1': 11,
 'positive': 2,
 'divisors': 2,
 '.': 20,
 'called': 2,
 'composite': 3,
 'For': 1,
 'example': 1,
 ',': 28,
 '5': 2,
 'integer': 4,
 'factors': 2,
 'whereas': 1,
 '6': 2,
 '2': 4,
 '3': 5,
 'addition': 1,
 'The': 4,
 'fundamental': 1,
 'theorem': 3,
 'arithmetic': 1,
 'establishes': 1,
 'central': 1,
 'role': 1,
 'primes': 7,
 'theory': 2,
 ':': 1,
 'expressed': 2,
 'product': 1,
 'unique': 1,
 'ordering': 1,
 'uniqueness': 1,
 'requires': 1,
 'excluding': 1,
 'one': 1,
 'include': 2,
 'arbitrarily': 1,
 'many': 3,
 'instances': 1,
 'factorization': 1,
 'e.g.': 1,
 '·': 3,
 'etc': 1,
 'valid': 1,
 'factorizations': 1,
 'property': 1,
 'primality': 5,
 'simple': 2,
 'slow': 2,
 'method': 1,
 'verifying': 1,
 'given': 2,
 'n': 3,
 'known': 3,
 'trial': 2,
 'division': 2,
 'It': 1,
 'consists': 1,
 'testing': 1,
 'whether': 1,
 'multiple': 1,
 'Algorithms': 1,
 'much': 1,
 'efficient': 1,
 'de

In [6]:
'prime' in stopwords

False

In [7]:
'prime' in idf_documents

True

In [8]:
idf_documents['prime']

1

In [9]:
res = calculate_tfidf(15, 1, 20)

In [10]:
res

44.93598410330986