In [None]:
import math
import os
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

class tf_idf_search_engine():
    """Rank documents against a query with TF-IDF weights and cosine similarity.

    The engine reads ``<folder_path>/data/<i>.txt`` for every ``i`` in
    ``range(file_cnt)`` and ``<folder_path>/query.txt``; only the first line
    of each file is used.

    Intended call order (each step consumes state built by earlier ones):
    ``preprocess_data`` -> ``get_overlapping_doc`` -> ``calculating_TF_lst``
    -> ``making_inverted_idx`` -> ``building_tf_idf`` -> ``get_score``.
    """

    def __init__(self, folder_path):
        """Load the documents, the stop-word list and the query.

        Args:
            folder_path: Folder holding the ``data`` directory and
                ``query.txt``.

        Raises:
            OSError: If the data directory, a document or the query file
                cannot be opened.
        """
        self.path = folder_path
        # Documents are expected to be named 0.txt .. (file_cnt - 1).txt.
        self.file_cnt = len(os.listdir(folder_path + "/data"))
        self.overlapping_doc_cnt = -1
        self.data_lst = list()
        self.data_split_lst = list()

        # Use the constructor argument (not a module-level global) so the
        # class works no matter what names the caller has defined.
        for i in range(self.file_cnt):
            data_path = folder_path + "/data/" + str(i) + ".txt"
            with open(data_path, encoding='utf8') as doc_file:
                self.data_lst.append(doc_file.readline())

        # for stemming
        self.stemmer = PorterStemmer()

        # Stop words go through the same stem + strip-special-char steps as
        # document tokens; otherwise a stemmed token (e.g. "becaus") would
        # never match its unstemmed stop word ("because").
        normalized_stops = {
            self._strip_non_alnum(self.stemmer.stem(stop_word))
            for stop_word in stopwords.words('english')
        }
        self.english_stops = sorted(normalized_stops)

        # query
        self.query_path = folder_path + "/query.txt"
        with open(self.query_path, encoding='utf8') as query_file:
            self.query = query_file.readline()

        # Query terms are stemmed so they can match the stemmed document
        # tokens (e.g. 'president' --> 'presid').
        self.query_lst = [self.stemmer.stem(term) for term in self.query.split()]

        # TF_lst , inverted index , TF-IDF
        self.TF_lst = list()
        self.inverted_idx_dict = dict()
        self.tf_idf = dict()
        self.overlapping_doc_lst = list()

    @staticmethod
    def _strip_non_alnum(word):
        """Return *word* with every non-alphanumeric character removed."""
        return "".join(c for c in word if c.isalnum())

    def preprocess_data(self):
        """Tokenize every document into ``data_split_lst``.

        Each document is lower-cased (``data_lst`` is updated in place),
        split on whitespace, stripped of a "(cnn)" marker in its first
        token, cleaned of non-alphanumeric characters, stemmed, and finally
        filtered against the stop-word list.  Calling the method again
        rebuilds ``data_split_lst`` instead of appending to it.
        """
        stop_words = set(self.english_stops)  # O(1) membership tests
        stem = self.stemmer.stem
        self.data_split_lst = []

        for i in range(self.file_cnt):
            self.data_lst[i] = self.data_lst[i].lower()
            raw_tokens = self.data_lst[i].split()

            # delete the "(cnn)" mark; guard against empty documents
            if raw_tokens and "(cnn)" in raw_tokens[0]:
                raw_tokens[0] = raw_tokens[0].replace("(cnn)", "")

            # Build a fresh list rather than removing items from the list
            # being iterated, which silently skips the following element.
            tokens = []
            for raw in raw_tokens:
                cleaned = self._strip_non_alnum(raw)
                if not cleaned:
                    continue
                stemmed = stem(cleaned)
                if stemmed not in stop_words:
                    tokens.append(stemmed)
            self.data_split_lst.append(tokens)

    # get doc with query
    def get_overlapping_doc(self):
        """Record the documents that contain every distinct query term.

        Fills ``overlapping_doc_lst`` with ``"doc<i>"`` labels and
        ``overlapping_doc_cnt`` with their number.  With an empty query a
        message is printed and both attributes are left untouched.
        """
        if not self.query_lst:
            print("There is no Query!")
            return

        # Compare sets, so a term repeated in the query does not make the
        # match impossible.
        query_set = set(self.query_lst)
        self.overlapping_doc_lst = [
            "doc" + str(idx)
            for idx, split_doc in enumerate(self.data_split_lst)
            if query_set.issubset(split_doc)
        ]
        self.overlapping_doc_cnt = len(self.overlapping_doc_lst)

    def calculating_TF_lst(self):
        """Rebuild ``TF_lst``: one ``{term: raw count}`` dict per document."""
        self.TF_lst = []
        for i in range(self.file_cnt):
            word_cnt = dict()
            for word in self.data_split_lst[i]:
                word_cnt[word] = word_cnt.get(word, 0) + 1
            self.TF_lst.append(word_cnt)

    def getting_TF(self, doc_no, term):
        """Return ``log10(1 + count)`` of *term* in document index *doc_no*.

        A term that does not occur in the document yields 0.0.
        """
        return math.log10(1 + self.TF_lst[doc_no].get(term, 0))

    def making_inverted_idx(self):
        """Fill ``inverted_idx_dict`` mapping term -> list of ``"doc<i>"``."""
        for i, term_counts in enumerate(self.TF_lst):
            doc_num = "doc" + str(i)
            for term in term_counts:
                postings = self.inverted_idx_dict.setdefault(term, [])
                # Deduplicate on the document label (not on the term), so a
                # repeated call does not add the same document twice.
                if doc_num not in postings:
                    postings.append(doc_num)

    def getting_IDF(self, term):
        """Return ``log10(file_cnt / document frequency)`` of *term*.

        A term missing from the inverted index yields 0.0.
        """
        postings = self.inverted_idx_dict.get(term)
        if not postings:
            return 0.0
        return math.log10(float(self.file_cnt) / len(postings))

    def getting_weight(self, doc_no, term):
        """Return the TF-IDF weight of *term* in document index *doc_no*."""
        return self.getting_TF(doc_no, term) * self.getting_IDF(term)

    def building_tf_idf(self):
        """Build ``tf_idf``: a weight dict per ``"doc<i>"`` plus ``'query'``.

        Every dict is keyed by the terms of the inverted index, in the same
        order.  Each query term gets a TF of ``log10(2)``; query terms that
        are not in the index are ignored.
        """
        for i in range(self.file_cnt):
            self.tf_idf["doc" + str(i)] = {
                term: self.getting_weight(i, term)
                for term in self.inverted_idx_dict
            }

        # make tf-idf for query
        query_weights = dict.fromkeys(self.inverted_idx_dict, 0.0)
        query_tf = math.log10(1 + 1)
        for term in self.query_lst:
            # Skip unknown terms: they have no document frequency and would
            # add a dimension that no document vector has.
            if term in query_weights:
                query_weights[term] = query_tf * self.getting_IDF(term)
        self.tf_idf['query'] = query_weights

    # get vector size
    def get_size(self, vector):
        """Return the Euclidean norm of *vector*."""
        return math.sqrt(sum((v * v for v in vector), 0.0))

    # get vector innerproduct
    def get_innerproduct(self, v1, v2):
        """Return the dot product of *v1* and *v2* (over the shorter length)."""
        result = 0.0
        for a, b in zip(v1, v2):
            result += a * b
        return result

    def cosine_similarity(self, v1, v2):
        """Return the cosine similarity of *v1* and *v2*.

        Returns 0.0 when either vector has zero length, instead of dividing
        by zero.
        """
        denominator = self.get_size(v1) * self.get_size(v2)
        if denominator == 0:
            return 0.0
        return self.get_innerproduct(v1, v2) / denominator

    # get doc with highest score
    def get_score(self):
        """Score every overlapping document against the query.

        Returns:
            A tuple ``(top_sum, top_no, score_tuple)``: the best score, the
            label of the best document, and a list of ``(label, score)``
            pairs sorted by descending score.  When no document overlaps,
            the result is ``(-100.0, "doc", [])``.
        """
        query_weights = self.tf_idf['query']
        query_vector = list(query_weights.values())

        score_dict = dict()
        for doc_no in self.overlapping_doc_lst:
            doc_weights = self.tf_idf[doc_no]
            # Read the document weights in query-term order so both
            # vectors are aligned component by component.
            doc_vector = [doc_weights.get(term, 0.0) for term in query_weights]
            score_dict[doc_no] = self.cosine_similarity(query_vector, doc_vector)

        top_sum = -100.0
        top_no = "doc"
        for doc_no, score in score_dict.items():
            if top_sum < score:
                top_sum = score
                top_no = doc_no

        score_tuple = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
        return top_sum, top_no, score_tuple

In [None]:
path = ""  # your path
en = tf_idf_search_engine(path)

# Run the pipeline stages in order; each stage consumes state that the
# previous stages stored on the engine.
for stage in (
    en.preprocess_data,
    en.get_overlapping_doc,
    en.calculating_TF_lst,
    en.making_inverted_idx,
    en.building_tf_idf,
):
    stage()

# Last expression of the cell: (best score, best doc label, ranked scores).
en.get_score()