In [1]:
from pdf_reader import PDFReader
from preprocessor import Preprocessor
from bert_model import BertModel
from analyzer import Analyzer
from gensim.models import KeyedVectors
import numpy as np
import os
import torch

In [2]:
def document_to_vec(words, model):
    """Average the embedding vectors of the known words in a document.

    Args:
        words: Iterable of tokens to look up.
            NOTE(review): iterating a plain ``str`` yields single characters;
            confirm callers pass a token sequence.
        model: Lookup object supporting ``token in model``, ``model[token]``
            and exposing ``vector_size``.

    Returns:
        The element-wise mean (``axis=0``) of the vectors of every token found
        in ``model``, or a zero vector of length ``model.vector_size`` when no
        token is found.
    """
    known_vectors = [model[token] for token in words if token in model]

    # Nothing matched the vocabulary: fall back to an all-zero vector.
    if not known_vectors:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)


def process_documents(document_paths, pdf_reader, preprocessor, bert_model, word2vec_model):
    """Extract, analyse and embed every document in ``document_paths``.

    For each path the text is extracted with ``pdf_reader.extract_text`` and
    passed through ``preprocessor.process``. The processed text then feeds the
    preprocessor's analysis methods, ``bert_model.get_embeddings`` and
    ``document_to_vec``.

    Args:
        document_paths: Iterable of paths handed to ``pdf_reader.extract_text``.
        pdf_reader: Object providing ``extract_text(path)``.
        preprocessor: Object providing ``process``, ``analyze_details``,
            ``analyze_complexity``, ``analyze_style``, ``analyze_vocabulary``,
            ``extract_key_terms`` and ``analyze_topics``.
        bert_model: Object providing ``get_embeddings(processed_text)``; the
            result must be a torch tensor (``detach``/``cpu``/``numpy`` are
            called on it).
        word2vec_model: Word-vector lookup forwarded to ``document_to_vec``.

    Returns:
        A tuple ``(embeddings, document_analyses)``:
        ``embeddings`` is a list with one CPU torch tensor per document (the
        BERT embedding concatenated with the averaged word vector);
        ``document_analyses`` is a list of ``(path, analysis)`` tuples in the
        same order, where ``analysis`` is a dict with the keys ``details``,
        ``complexity``, ``style``, ``vocabulary_diversity``, ``key_terms`` and
        ``topics``.
    """
    document_analyses = []
    embeddings = []

    for path in document_paths:
        text = pdf_reader.extract_text(path)
        processed_text = preprocessor.process(text)

        # Dict values are evaluated top to bottom, so the preprocessor is
        # queried in this order: details, complexity, style, vocabulary,
        # key terms, topics.
        analysis = {
            "details": preprocessor.analyze_details(processed_text),
            "complexity": preprocessor.analyze_complexity(processed_text),
            "style": preprocessor.analyze_style(processed_text),
            "vocabulary_diversity": preprocessor.analyze_vocabulary(processed_text),
            "key_terms": preprocessor.extract_key_terms(processed_text),
            "topics": preprocessor.analyze_topics(processed_text),
        }

        bert_embedding = bert_model.get_embeddings(processed_text)
        w2v_embedding = document_to_vec(processed_text, word2vec_model)

        # Tensor.numpy() only works on CPU tensors; .cpu() is a no-op when the
        # embedding already lives on the CPU and a device copy otherwise.
        bert_vector = bert_embedding.detach().cpu().numpy()

        # assumes both vectors are 1-D so they can be joined end to end
        # — TODO confirm against BertModel.get_embeddings
        combined_embedding = torch.from_numpy(np.concatenate((bert_vector, w2v_embedding)))

        embeddings.append(combined_embedding)
        document_analyses.append((path, analysis))

    return embeddings, document_analyses


def generate_feedback(test_embedding, train_embeddings, threshold=0.21):
    """Return a feedback sentence based on the distance to the training set.

    The 2-norm distance (``torch.dist(..., 2)``) between ``test_embedding``
    and each training embedding is averaged and compared with ``threshold``.

    Args:
        test_embedding: Torch tensor of the document under evaluation.
        train_embeddings: Non-empty iterable of torch tensors to compare with.
        threshold: Average distance above which the document is reported as
            needing improvement. Defaults to ``0.21``.

    Returns:
        The "can be improved" message when the average distance is strictly
        greater than ``threshold``, otherwise the "aligned" message.

    Raises:
        ValueError: If ``train_embeddings`` is empty, since no average
            distance can be computed.
    """
    distances = [torch.dist(test_embedding, train_emb, 2).item() for train_emb in train_embeddings]

    # Fail with an explicit message instead of a bare ZeroDivisionError below.
    if not distances:
        raise ValueError("train_embeddings must contain at least one embedding")

    avg_distance = sum(distances) / len(distances)
    if avg_distance > threshold:
        return "O documento pode ser melhorado em [aspecto específico]."
    return "O documento está alinhado com os padrões de treino."


def get_document_paths(directory):
    """List the PDF files directly inside ``directory``.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        A list of ``os.path.join(directory, name)`` paths for every entry
        whose name ends in ``.pdf`` (case-insensitive), sorted by file name.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order (and therefore the printed report) stable across runs.
    doc_names = sorted(
        filename
        for filename in os.listdir(directory)
        if filename.lower().endswith('.pdf')  # also accept ".PDF"
    )
    return [os.path.join(directory, filename) for filename in doc_names]


def calculate_benchmarks(train_analyses):
    """Summarise sentiment and sentence counts over the training analyses.

    Args:
        train_analyses: Sequence of ``(path, analysis)`` pairs where
            ``analysis['details']['sentiment']['compound']`` and
            ``analysis['details']['num_sentences']`` are numeric.

    Returns:
        A dict with the keys ``avg_sentiment``, ``std_sentiment``,
        ``avg_num_sentences`` and ``std_num_sentences``. Averages are plain
        ``sum / len``; standard deviations come from ``np.std``.

    Raises:
        ZeroDivisionError: If ``train_analyses`` is empty.
    """
    def _mean_and_std(values):
        # Mean first, so an empty list fails on the division.
        return sum(values) / len(values), np.std(values)

    compound_scores = []
    for _path, report in train_analyses:
        compound_scores.append(report['details']['sentiment']['compound'])

    sentence_counts = []
    for _path, report in train_analyses:
        sentence_counts.append(report['details']['num_sentences'])

    sentiment_mean, sentiment_std = _mean_and_std(compound_scores)
    sentences_mean, sentences_std = _mean_and_std(sentence_counts)

    return dict(
        avg_sentiment=sentiment_mean,
        std_sentiment=sentiment_std,
        avg_num_sentences=sentences_mean,
        std_num_sentences=sentences_std,
    )


def main():
    """Embed the train/test PDFs and print a feedback report per test document.

    Builds the reader, preprocessor, BERT wrapper, analyzer and word2vec
    vectors, processes every PDF under ``train/`` and ``test/``, derives
    benchmarks from the training analyses and then, for each test document,
    prints the analyzer's average distance, its average similarity and a
    multi-line feedback message.
    """
    pdf_reader = PDFReader()
    preprocessor = Preprocessor()
    bert_model = BertModel('./finetuned_model')
    analyzer = Analyzer()
    word2vec_model = KeyedVectors.load_word2vec_format('word2Vec_models/cbow_s1000.txt', binary=False)

    # The same four collaborators are used for both document sets.
    pipeline = (pdf_reader, preprocessor, bert_model, word2vec_model)

    train_doc_paths = get_document_paths('train/')
    test_doc_paths = get_document_paths('test/')

    train_embeddings, train_analyses = process_documents(train_doc_paths, *pipeline)
    test_embeddings, test_analyses = process_documents(test_doc_paths, *pipeline)

    benchmarks = calculate_benchmarks(train_analyses)

    for test_embedding, test_analysis in zip(test_embeddings, test_analyses):
        # Each entry is a (path, analysis) pair.
        test_path, analysis_data = test_analysis

        # Average distance to the training embeddings, as reported by the analyzer.
        avg_distance = analyzer.average_distance_to_train_docs(test_embedding, train_embeddings)
        print(f"Documento: {test_path}, Distância média para documentos de treino = {avg_distance:.3f}")

        # Similarity analysis.
        analysis_result = analyzer.analyze(test_embedding, train_embeddings)
        print(f"Documento: {test_path}, Média da similaridade por cosseno: {analysis_result['average_similarity']:.3f}")

        # The report is collected piece by piece and joined once at the end;
        # every piece after the first carries its own leading newline.
        feedback_parts = [generate_feedback(test_embedding, train_embeddings)]

        # Detailed-analysis feedback.
        detailed_feedback = f"Análise Detalhada: Sentimento - {analysis_data['details']['sentiment']}, Número de Frases - {analysis_data['details']['num_sentences']}"
        feedback_parts.append(f"\n{detailed_feedback}")

        # Deviation from the training benchmarks; only mentioned when it
        # exceeds one standard deviation.
        sentiment_diff = analysis_data['details']['sentiment']['compound'] - benchmarks['avg_sentiment']
        num_sentences_diff = analysis_data['details']['num_sentences'] - benchmarks['avg_num_sentences']

        if abs(sentiment_diff) > benchmarks['std_sentiment']:
            feedback_parts.append(f"\nO sentimento deste documento é significativamente {'positivo' if sentiment_diff > 0 else 'negativo'} em comparação com os documentos de treino.")
        if abs(num_sentences_diff) > benchmarks['std_num_sentences']:
            feedback_parts.append(f"\nEste documento tem {'mais' if num_sentences_diff > 0 else 'menos'} frases do que a média dos documentos de treino.")

        # Remaining per-document metrics.
        feedback_parts.append(f"\nComplexidade do Texto (Flesch-Kincaid): {analysis_data['complexity']}")
        feedback_parts.append(f"\nEstilo de Escrita: {'Passiva' if analysis_data['style'] > 0 else 'Ativa'}")
        feedback_parts.append(f"\nDiversidade de Vocabulário: {analysis_data['vocabulary_diversity']:.2f}")
        feedback_parts.append(f"\nTermos-chave identificados: {', '.join(analysis_data['key_terms'])}")
        feedback_parts.append(f"\nTópicos principais: {', '.join(analysis_data['topics'])}")

        feedback = "".join(feedback_parts)

        # Print final feedback.
        print(f"Documento: {test_path}, Feedback Final: {feedback}\n")


In [3]:
# Entry point: runs the whole pipeline when this cell (or the exported script)
# is executed directly, but not when the code is imported as a module.
if __name__ == "__main__":
    main()

Documento: test/risk.pdf, Distância média para documentos de treino = 0.246
Documento: test/risk.pdf, Média da similaridade por cosseno: 0.953
Documento: test/risk.pdf, Feedback Final: O documento pode ser melhorado em [aspecto específico].
Análise Detalhada: Sentimento - {'neg': 0.001, 'neu': 0.988, 'pos': 0.011, 'compound': 0.9898}, Número de Frases - 552
Complexidade do Texto (Flesch-Kincaid): 27.3
Estilo de Escrita: Ativa
Diversidade de Vocabulário: 0.26
Termos-chave identificados: risco
Tópicos principais: risco, empresa, Riscos, Governança, evidenciação

Documento: test/politicaSegurancaInformacao.pdf, Distância média para documentos de treino = 0.207
Documento: test/politicaSegurancaInformacao.pdf, Média da similaridade por cosseno: 0.958
Documento: test/politicaSegurancaInformacao.pdf, Feedback Final: O documento está alinhado com os padrões de treino.
Análise Detalhada: Sentimento - {'neg': 0.007, 'neu': 0.986, 'pos': 0.007, 'compound': -0.3182}, Número de Frases - 133
O senti