In [1]:
from pdf_reader import PDFReader
from preprocessor import Preprocessor
from bert_model import BertModel
from analyzer import Analyzer
from gensim.models import KeyedVectors
import numpy as np
import os
import torch

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def document_to_vec(words, model):
    vectors = [model[word] for word in words if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)


def process_documents(document_paths, pdf_reader, preprocessor, bert_model, word2vec_model):
    document_analyses = []
    embeddings = []

    for path in document_paths:
        text = pdf_reader.extract_text(path)
        processed_text = preprocessor.process(text)

        # Detailed analysis
        analysis_details = preprocessor.analyze_details(processed_text)
        complexity_score = preprocessor.analyze_complexity(processed_text)
        style_score = preprocessor.analyze_style(processed_text)
        vocabulary_diversity = preprocessor.analyze_vocabulary(processed_text)

        # Extract key-terms
        key_terms = preprocessor.extract_key_terms(processed_text)

        # Calculate topics
        topics = preprocessor.analyze_topics(processed_text)

        analysis = {
            "details": analysis_details,
            "complexity": complexity_score,
            "style": style_score,
            "vocabulary_diversity": vocabulary_diversity,
            "key_terms": key_terms,
            "topics": topics
        }

        bert_embedding = bert_model.get_embeddings(processed_text)
        w2v_embedding = document_to_vec(processed_text, word2vec_model)
        combined_embedding = torch.from_numpy(np.concatenate(
            (bert_embedding.detach().numpy(), w2v_embedding)))

        embeddings.append(combined_embedding)
        document_analyses.append((path, analysis))

    return embeddings, document_analyses


def generate_feedback(test_embedding, train_embeddings):
    distances = [torch.dist(test_embedding, train_emb, 2).item()
                 for train_emb in train_embeddings]
    avg_distance = sum(distances) / len(distances)
    if avg_distance > 0.21:
        return "The document is not aligned with the training standards."
    else:
        return "The document is aligned with the training standards."


def get_document_paths(directory):
    doc_paths = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                doc_paths.append(os.path.join(root, file))
    return doc_paths


def calculate_benchmarks(train_analyses):
    # Calculating averages and standard deviations
    sentiment_scores = [analysis['details']['sentiment']
                        ['compound'] for _, analysis in train_analyses]
    num_sentences = [analysis['details']['num_sentences']
                     for _, analysis in train_analyses]

    avg_sentiment = sum(sentiment_scores) / len(sentiment_scores)
    std_sentiment = np.std(sentiment_scores)
    avg_num_sentences = sum(num_sentences) / len(num_sentences)
    std_num_sentences = np.std(num_sentences)

    return {
        "avg_sentiment": avg_sentiment,
        "std_sentiment": std_sentiment,
        "avg_num_sentences": avg_num_sentences,
        "std_num_sentences": std_num_sentences
    }


def main(model_path):
    pdf_reader = PDFReader()
    preprocessor = Preprocessor()
    bert_model = BertModel('jackaduma/SecBERT')
    analyzer = Analyzer()
    word2vec_model = KeyedVectors.load_word2vec_format(
        model_path, binary=False)

    train_doc_paths = get_document_paths('train/')
    test_doc_paths = get_document_paths('test/')

    train_embeddings, train_analyses = process_documents(
        train_doc_paths, pdf_reader, preprocessor, bert_model, word2vec_model)
    test_embeddings, test_analyses = process_documents(
        test_doc_paths, pdf_reader, preprocessor, bert_model, word2vec_model)

    benchmarks = calculate_benchmarks(train_analyses)

    for test_embedding, test_analysis in zip(test_embeddings, test_analyses):
        test_path = test_analysis[0]  # Doc path
        analysis_data = test_analysis[1]

        print(f"\nAnalyzing document: {test_path}")

        # Euclidian distance
        avg_distance = analyzer.average_distance_to_train_docs(
            test_embedding, train_embeddings)
        print(f"Average distance to training documents = {
              avg_distance:.3f}")

        # Similarity analysis
        analysis_result = analyzer.analyze(test_embedding, train_embeddings)
        print(f"Average cosine similarity: {
              analysis_result['average_similarity']:.3f}")

        # Embedding-based feedback
        feedback = generate_feedback(test_embedding, train_embeddings)

        # Detailed analysis feedback
        detailed_feedback = f"Detailed Analysis: Sentiment - {
            analysis_data['details']['sentiment']}, Number of Sentences - {analysis_data['details']['num_sentences']}"
        feedback += f"\n{detailed_feedback}"

        # Compare benchmarks and add feedbacks
        sentiment_diff = analysis_data['details']['sentiment']['compound'] - \
            benchmarks['avg_sentiment']
        num_sentences_diff = analysis_data['details']['num_sentences'] - \
            benchmarks['avg_num_sentences']

        if abs(sentiment_diff) > benchmarks['std_sentiment']:
            feedback += f"\nThe sentiment of this document is significantly {
                'positive' if sentiment_diff > 0 else 'negative'} compared to the training documents."
        if abs(num_sentences_diff) > benchmarks['std_num_sentences']:
            feedback += f"\nThis document has {'more' if num_sentences_diff >
                                               0 else 'fewer'} sentences than the average of the training documents."

        # Add new feedbacks based on the analysis
        feedback += f"\nText Complexity (Flesch-Kincaid): {
            analysis_data['complexity']}"
        feedback += f"\nWriting Style: {
            'Passive' if analysis_data['style'] > 0 else 'Active'}"
        feedback += f"\nVocabulary Diversity: {
            analysis_data['vocabulary_diversity']:.2f}"
        feedback += f"\nIdentified Key Terms: {
            ', '.join(analysis_data['key_terms'])}"
        feedback += f"\nMain Topics: {', '.join(analysis_data['topics'])}"

        # Print final feedback
        print(f"Final Feedback: {feedback}")

In [3]:
if __name__ == "__main__":
    main('word2Vec_models/your_word22Vec_model')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at jackaduma/SecBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Analyzing document: test/documentCategory1/test2.pdf
Average distance to training documents = 0.095
Average cosine similarity: 0.982
Final Feedback: The document is aligned with the training standards.
Detailed Analysis: Sentiment - {'neg': 0.0, 'neu': 0.999, 'pos': 0.001, 'compound': 0.128}, Number of Sentences - 190
Text Complexity (Flesch-Kincaid): 21.5
Writing Style: Active
Vocabulary Diversity: 0.34
Identified Key Terms: segurança, ataque, risco, vulnerabilidade
Main Topics: ETIR, Segurança, Informação, incidente, serviço

Analyzing document: test/documentCategory1/test1.pdf
Average distance to training documents = 0.100
Average cosine similarity: 0.982
Final Feedback: The document is aligned with the training standards.
Detailed Analysis: Sentiment - {'neg': 0.009, 'neu': 0.985, 'pos': 0.006, 'compound': -0.7556}, Number of Sentences - 206
The sentiment of this document is significantly negative compared to the training documents.
Text Complexity (Flesch-Kincaid): 17.4
Writing S