In [None]:
import PyPDF4
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF as one string.

    The file at ``file_path`` is opened in binary mode and read with
    ``PyPDF4.PdfFileReader``. Each page's ``extractText()`` result is
    concatenated in page order, with no separator inserted between pages.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts (an empty string when the reader
        reports zero pages).
    """
    with open(file_path, 'rb') as pdf_stream:
        pdf = PyPDF4.PdfFileReader(pdf_stream)
        # Pages must be extracted while the underlying stream is still open.
        page_texts = [
            pdf.getPage(page_index).extractText()
            for page_index in range(pdf.numPages)
        ]
    return "".join(page_texts)

# Reference corpus: one list of PDF paths per quality tier. The trailing
# percentages are the author's band labels for each tier. Replace the
# placeholder entries with real file paths before running this cell.
good_docs_paths = [""]  # 90-100%
great_docs_paths = [""]  # 75%
avg_docs_paths = [""]  # 50%
below_avg_docs_paths = [""]  # 25%
bad_docs_paths = [""]  # <25%

# Extract the raw text of every reference PDF, tier by tier.
good_docs = list(map(extract_text_from_pdf, good_docs_paths))
great_docs = list(map(extract_text_from_pdf, great_docs_paths))
avg_docs = list(map(extract_text_from_pdf, avg_docs_paths))
below_avg_docs = list(map(extract_text_from_pdf, below_avg_docs_paths))
bad_docs = list(map(extract_text_from_pdf, bad_docs_paths))

# The concatenation order matters: calculate_score slices the similarity
# vector by tier using these list lengths, in exactly this order.
all_docs = [*good_docs, *great_docs, *avg_docs, *below_avg_docs, *bad_docs]

# Fit the TF-IDF vocabulary on the reference corpus; one matrix row per document.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_docs)


def calculate_score(query_documents):
    """Score query documents by weighted similarity to the reference tiers.

    Each query document is vectorised with the module-level ``vectorizer``
    and compared (cosine similarity) against every row of the module-level
    ``tfidf_matrix``. The similarities are averaged per quality tier and
    combined with fixed weights: good 1.0, great 0.75, average 0.5,
    below-average 0.25, bad 0.0. The weights are applied as-is (they sum to
    2.5, not 1). The per-document scores are then averaged.

    The rows of ``tfidf_matrix`` are expected to be ordered as
    ``good_docs + great_docs + avg_docs + below_avg_docs + bad_docs``; the
    tier boundaries are derived from the lengths of those module-level lists.

    Args:
        query_documents: Sized collection of document texts (strings).

    Returns:
        The mean weighted similarity over all query documents, or ``0.0``
        when ``query_documents`` is empty. A tier with no reference
        documents contributes nothing instead of raising
        ``ZeroDivisionError``.
    """
    if len(query_documents) == 0:
        return 0.0

    # (weight, reference documents) in the same order as the matrix rows.
    weighted_tiers = (
        (1.0, good_docs),
        (0.75, great_docs),
        (0.5, avg_docs),
        (0.25, below_avg_docs),
        (0.0, bad_docs),
    )

    # Row ranges per tier do not depend on the query, so compute them once.
    # Cumulative offsets also avoid the negative-index slice for the last
    # tier, which selects the whole array when that tier is empty ([-0:]).
    tier_bounds = []
    start = 0
    for weight, tier_docs in weighted_tiers:
        stop = start + len(tier_docs)
        tier_bounds.append((weight, start, stop))
        start = stop

    total_score = 0.0
    for query_document in query_documents:
        query_vector = vectorizer.transform([query_document])
        cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

        # Skip empty tiers: the mean of an empty slice is undefined.
        document_score = sum(
            weight * cosine_similarities[lo:hi].mean()
            for weight, lo, hi in tier_bounds
            if hi > lo
        )
        total_score += document_score

    return total_score / len(query_documents)

In [None]:
# Extract the text of the documents to evaluate and score them against the
# reference corpus. Replace the placeholder entry with real PDF paths.

list_of_your_company_documents = [""]
query_documents = list(map(extract_text_from_pdf, list_of_your_company_documents))
score = calculate_score(query_documents)
print(f"Score Médio: {score:.2f}")