In [6]:
import os
import fitz  
import nltk
import re
import numpy as np
import torch
import json
import tkinter as tk
from tkinter import filedialog, simpledialog
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import LongformerModel, LongformerTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
import gym
from gym import spaces

nltk.download('punkt')


LEARNING_STORAGE_PATH = "learning_data.json"


sbert = SentenceTransformer('paraphrase-MiniLM-L6-v2')


longformer = LongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')


def ensureLearningStorage():
    if not os.path.exists(LEARNING_STORAGE_PATH):
        with open(LEARNING_STORAGE_PATH, "w") as f:
            json.dump({"documents": []}, f, indent=4)

ensureLearningStorage()


def loadLearningData():
    with open(LEARNING_STORAGE_PATH, "r") as f:
        return json.load(f)


def saveLearningData(new_data):
    with open(LEARNING_STORAGE_PATH, "w") as f:
        json.dump(new_data, f, indent=4)


def cleanText(text):
    text = text.lower()
    text = re.sub(r'\n+', ' ', text) 
    text = re.sub(r'[^\w\s]', '', text) 
    text = re.sub(r'\b(?:introduction|abstract|conclusion|references)\b', '', text, flags=re.IGNORECASE)  
    return text


def extractText(pdfPath):
    doc = fitz.open(pdfPath)
    return [" ".join([b[4] for b in page.get_text("blocks") if len(b[4].split()) > 5]) for page in doc]


def splitSentences(text):
    return sent_tokenize(text)


def getSentenceEmbeddings(sentences):
    sbertEmbeddings = sbert.encode(sentences)
    
    longformerEmbeddings = []
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = longformer(**inputs)
        longformerEmbeddings.append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())

    return np.hstack((sbertEmbeddings, np.array(longformerEmbeddings)))


class HighlightSelector(gym.Env):
    def _init_(self, scores):
        super(HighlightSelector, self)._init_()
        self.scores = np.array(scores)
        self.numSentences = len(scores)

        self.observation_space = spaces.Box(low=0, high=1, shape=(self.numSentences,), dtype=np.float32)
        self.action_space = spaces.Discrete(self.numSentences)

        self.state = np.zeros(self.numSentences)
        self.currentStep = 0

    def reset(self):
        self.state = np.zeros(self.numSentences)
        self.currentStep = 0
        return self.state.copy()

    def step(self, action):
        reward = self.scores[action] if self.state[action] == 0 else -1  
        self.state[action] = 1
        self.currentStep += 1
        return self.state.copy(), reward, self.currentStep >= self.numSentences, {}


def selectSentences(sentences, scores, maxHighlights=20):
    if len(sentences) < 2:
        return sentences  

    env = DummyVecEnv([lambda: HighlightSelector(scores)])
    ppo = PPO("MlpPolicy", env, verbose=0)

    selectedSentences = []
    state = env.reset()

    for _ in range(maxHighlights):
        action, _ = ppo.predict(state, deterministic=True)
        if state[0][action] == 0:
            selectedSentences.append(sentences[action])
        state, _, done, _ = env.step(action)
        if done:
            break

    return selectedSentences


def applyHighlights(originalPdf, importantSentences, outputPdf):
    doc = fitz.open(originalPdf)

    for page in doc:
        for sentence in importantSentences:
            areas = page.search_for(sentence)
            if not areas:
                words = sentence.split()
                for i in range(len(words) - 4):
                    phrase = " ".join(words[i:i+4])
                    areas = page.search_for(phrase)
                    if areas:
                        break

            for rect in areas:
                highlight = page.add_highlight_annot(rect)
                highlight.set_colors(stroke=(1, 1, 0))  
                highlight.update()

    doc.save(outputPdf, garbage=4, deflate=True)
    print(f" Highlighted PDF saved as: {outputPdf}")

# Store learnings
def storeLearnings(pdfPath, importantSentences, scores):
    learningData = loadLearningData()

    learningEntry = {
        "pdf": pdfPath,
        "sentences": importantSentences,
        "scores": scores.tolist()
    }

    learningData["documents"].append(learningEntry)
    saveLearningData(learningData)
    print(" Learning data updated!")


def getUserFeedback(highlighted_sentences):
    print("\n💡 Please rate the highlighted sentences:")
    feedback = {}
    for sentence in enumerate(highlighted_sentences, 1):
        rating = simpledialog.askinteger("Feedback", f"Rate this highlight (1-5):\n{sentence}", minvalue=1, maxvalue=5)
        feedback[sentence] = rating
    return feedback


def run():
    root = tk.Tk()
    root.withdraw()
    pdfPath = filedialog.askopenfilename(title="Select PDF", filetypes=[("PDF Files", "*.pdf")])

    if not pdfPath:
        print(" No PDF selected. Exiting.")
        return

    print(f" Selected PDF: {pdfPath}")
    pageTexts = extractText(pdfPath)
    
    allHighlights = []

    for text in pageTexts:
        text = cleanText(text)
        sentences = splitSentences(text)
        for sent in sentences:
            print(sent)
        continue
        if not sentences:
            continue  

        embeddings = getSentenceEmbeddings(sentences)
        scores = cosine_similarity(embeddings).mean(axis=1)

        importantSentences = [sentences[i] for i in np.argsort(scores)[::-1][:20]]
        rlSelectedSentences = selectSentences(sentences, scores, maxHighlights=20)
        finalHighlights = list(set(importantSentences + rlSelectedSentences))
        allHighlights.extend(finalHighlights)
    return
    outputPdf = os.path.splitext(pdfPath)[0] + "_highlighted.pdf"
    applyHighlights(pdfPath, allHighlights, outputPdf)
    storeLearnings(pdfPath, allHighlights, scores)

if __name__ == '__main__':
    run()

[nltk_data] Downloading package punkt to C:\Users\Vasantha
[nltk_data]     Raj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


 Selected PDF: C:/Users/Vasantha Raj/Desktop/Road Pothole Detection Using YoloV8-1.pdf
an intensive project on road damage detection   using yolov8 and additive optimization  techiniques along with the survey on various   n r gladiss merlin  department of ads  rmk engineering college chennai   kaviyarasu s  department of ads  rmk engineering college chennai india   keshavardhan d r  department of ads  rmk engineering college chennai india   ashraf deen a  department of ads  rmk engineering college chennai india   jeyanth s  department of ads  rmk engineering college chennai india     deep learning has gained significant traction in the  realm of computer technology with road damage detection  technology emerging as a crucial component for road maintenance  and harmony in traffic timely identification of road defects such  as potholes and cracks are imperative for ensuring road safety  facilitating prompt repairs and ensuring the quality infrastructure  for public consequently road dama

In [7]:
# pip install pymupdf

In [8]:
# !pip install fitz

In [9]:
# %pip install gym

In [10]:
# pip install sentence-transformers

In [11]:
# pip install fitz

In [None]:
import pandas as pd

In [14]:
data=pd.read_csv('test22.csv')
data.head()

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [20]:
len(data['article'][1].split())

311

In [24]:
print(len(sent_tokenize(data['article'][1])))

14


In [25]:
print(len(sent_tokenize(data['highlights'][1])))

3
