In [8]:
import os
import math
import pandas as pd

In [9]:
class TFIDF():
    
    def __init__(self, documents):
        self.documents = documents
        self.sentences = [x.split() for x in documents]
        self.unique_words = set(sum(self.sentences, []))
        self.word_tf_in_sentences = {}
        
    def calculate_tf(self):
        for word in self.unique_words:
            tf_in_sentences = tuple(sentence.count(word) / len(sentence) if len(sentence) > 0 else 0 for sentence in self.sentences)
            self.word_tf_in_sentences[word] = tf_in_sentences
        return self.word_tf_in_sentences
    
    def calculate_idf(self):
        idf = {}
        for word in self.unique_words:
            idf[word] = 1 + math.log((len(self.sentences) + 1) / (sum([1.0 for i in self.sentences if word in i])+1))
        return idf
    
    def calculate_tfidf(self):
        tfidf = {}
        idf = self.calculate_idf()
        tf = self.calculate_tf()
        for word in self.unique_words:
            tfidf[word] = tuple(tf[word][i] * idf[word] for i in range(len(self.sentences)))
        return tfidf
    
    def calculate_cosine_similarity(self):
        tfidf = self.calculate_tfidf()
        cosine_similarity = {}
        for i in range(len(self.sentences)):
            for j in range(len(self.sentences)):
                if i != j:
                    cosine_similarity[(i, j)] = sum([tfidf[word][i] * tfidf[word][j] for word in self.unique_words])
        return cosine_similarity
        

In [10]:
tfidf = TFIDF(["Język programowania python jest super", "Język programowania javascript odraża mnie"])
tfidf.calculate_tfidf()

{'Język': (0.2, 0.2),
 'jest': (0.2810930216216329, 0.0),
 'javascript': (0.0, 0.2810930216216329),
 'programowania': (0.2, 0.2),
 'super': (0.2810930216216329, 0.0),
 'odraża': (0.0, 0.2810930216216329),
 'mnie': (0.0, 0.2810930216216329),
 'python': (0.2810930216216329, 0.0)}

In [11]:
tfidf.calculate_cosine_similarity()

{(0, 1): 0.08000000000000002, (1, 0): 0.08000000000000002}

In [39]:
'''
nie umiałam tego zrobić do zadanego szablonu, więc zrobiłam na podstawie poprzedniej klasy
'''
class TFIDF2:
    
    def __init__(self, path_to_pubs):
        self.path_to_pubs = path_to_pubs
        self.documents = self.get_pdf_files()
        self.sentences = [x.split() for x in self.documents]
        self.unique_words = set(sum(self.sentences, []))
        self.word_tf_in_sentences = {}
        
    def get_pdf_files(self):
        files = [os.path.splitext(f)[0] for f in os.listdir(self.path_to_pubs) if f.endswith(".pdf")]
        return files

    def calculate_tf(self):
        for word in self.unique_words:
            tf_in_sentences = tuple(sentence.count(word) / len(sentence) if len(sentence) > 0 else 0 for sentence in self.sentences)
            self.word_tf_in_sentences[word] = tf_in_sentences
        return self.word_tf_in_sentences
    
    def calculate_idf(self):
        idf = {}
        for word in self.unique_words:
            idf[word] = 1 + math.log((len(self.sentences) + 1) / (sum([1.0 for i in self.sentences if word in i])+1))
        return idf
    
    def calculate_tfidf(self):
        tfidf_matrix = []
        idf = self.calculate_idf()
        tf = self.calculate_tf()
        for i, sentence in enumerate(self.sentences):
            tfidf_vector = [tf[word][i] * idf[word] if word in sentence else 0.0 for word in self.unique_words]
            tfidf_matrix.append(tfidf_vector)
        return tfidf_matrix
    
    def calculate_cosine_similarity(self):
        tfidf = self.calculate_tfidf()
        cosine_similarity = {}
        for i in range(len(self.sentences)):
            for j in range(len(self.sentences)):
                if i != j:
                    cosine_similarity[(i, j)] = sum([tfidf[word][i] * tfidf[word][j] for word in self.unique_words])
        return cosine_similarity

In [40]:
tfidf2 = TFIDF2('cw8/test/')
tfidf_values = tfidf2.calculate_tfidf()
print(tfidf_values)

[[0.0, 0.5, 0.7027325540540822], [0.7027325540540822, 0.5, 0.0]]
