In [2]:
!pip install datasets
!pip install rouge-score
!pip install PyPDF2
!pip install bert-score

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [1]:
import json
import os
import numpy as np
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, AutoTokenizer, AutoModel
from datasets import Dataset
from rouge_score import rouge_scorer
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import PyPDF2
from io import BytesIO
import re
import nltk
from nltk.tokenize import sent_tokenize
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer as BertTokenizer, AutoModel as BertModel
from bert_score import score

from google.colab import drive
drive.mount('/content/drive')
# Ensure NLTK resources are downloaded
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# File paths
DRIVE_PATH = '/content/drive/MyDrive/nlp_project'
PAPERS_FILE = os.path.join(DRIVE_PATH, "cached_papers.json")
EMBEDDINGS_FILE = os.path.join(DRIVE_PATH, "cached_embeddings.npy")
MODEL_PATH = os.path.join(DRIVE_PATH, "pegasus_arxiv_finetuned_model")
TEST_JSON = os.path.join(DRIVE_PATH, "intermediate_training_batch_4.json")
FALLBACK_MODEL = "google/pegasus-arxiv"

# Verify model directory and load model
pegasus_model = None
pegasus_tokenizer = None
print(f"Attempting to load model from: {MODEL_PATH}")
if os.path.exists(MODEL_PATH) and os.path.exists(os.path.join(MODEL_PATH, "config.json")):
    try:
        print(f"Loading fine-tuned model from {MODEL_PATH}")
        pegasus_tokenizer = PegasusTokenizer.from_pretrained(MODEL_PATH)
        pegasus_model = PegasusForConditionalGeneration.from_pretrained(MODEL_PATH).to(device)
        print("Fine-tuned model loaded successfully")
    except Exception as e:
        print(f"Error loading fine-tuned model from {MODEL_PATH}: {e}")
        print(f"Falling back to {FALLBACK_MODEL}")
else:
    print(f"Model directory not found at {MODEL_PATH} or missing config.json")
    # Check alternative path
    alternative_path = os.path.join(DRIVE_PATH, "pegasus_arxiv_finetuned")
    if os.path.exists(alternative_path) and os.path.exists(os.path.join(alternative_path, "config.json")):
        try:
            print(f"Trying alternative path: {alternative_path}")
            pegasus_tokenizer = PegasusTokenizer.from_pretrained(alternative_path)
            pegasus_model = PegasusForConditionalGeneration.from_pretrained(alternative_path).to(device)
            print("Fine-tuned model loaded from alternative path")
        except Exception as e:
            print(f"Error loading model from {alternative_path}: {e}")
            print(f"Falling back to {FALLBACK_MODEL}")
    else:
        print(f"No fine-tuned model found. Falling back to {FALLBACK_MODEL}")

# Load fallback model if necessary
if pegasus_model is None or pegasus_tokenizer is None:
    try:
        print(f"Loading fallback model: {FALLBACK_MODEL}")
        pegasus_tokenizer = PegasusTokenizer.from_pretrained(FALLBACK_MODEL)
        pegasus_model = PegasusForConditionalGeneration.from_pretrained(FALLBACK_MODEL).to(device)
        print("Fallback model loaded successfully")
    except Exception as e:
        print(f"Error loading fallback model {FALLBACK_MODEL}: {e}")
        print("Cannot proceed without a model. Exiting.")
        exit()

# Provided functions (unchanged)
def download_arxiv_paper(arxiv_id):
    if "arxiv.org" in arxiv_id:
        arxiv_id = arxiv_id.split('/')[-1].split('v')[0]
    api_url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
    response = requests.get(api_url)
    soup = BeautifulSoup(response.content, 'xml')
    abstract = soup.find('summary').text.strip() if soup.find('summary') else ""
    title = soup.find('title').text.strip() if soup.find('title') else ""
    pdf_url = f"http://arxiv.org/pdf/{arxiv_id}.pdf"
    response = requests.get(pdf_url)
    if response.status_code == 200:
        try:
            pdf_file = BytesIO(response.content)
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            full_text = ""
            for page in pdf_reader.pages:
                full_text += page.extract_text()
            cleaned_text = clean_extracted_text(full_text)
            introduction = ""
            intro_patterns = [
                r"(?i)(?:1\.?\s*|I\.?\s*)?Introduction(.*?)(?:\n\d\.|\n[A-Z]\.|\nII\.)",
                r"(?i)(?:1\.?\s*|I\.?\s*)?Introduction(.*?)(?=\n2\.|\nII\.)"
            ]
            for pattern in intro_patterns:
                intro_match = re.search(pattern, cleaned_text, re.DOTALL)
                if intro_match:
                    introduction = clean_extracted_text(intro_match.group(1).strip())
                    break
            conclusion = ""
            concl_patterns = [
                r"(?i)(?:\d\.?\s*|[IVX]+\.?\s*)?Conclusion[s]?(.*?)(?:\n\d\.|\n[A-Z]\.|\nReferences|\n[IVX]+\.)",
                r"(?i)(?:\d\.?\s*|[IVX]+\.?\s*)?Discussion(?:s)?(.*?)(?:\n\d\.|\n[A-Z]\.|\nReferences|\n[IVX]+\.)"
            ]
            for pattern in concl_patterns:
                concl_match = re.search(pattern, cleaned_text, re.DOTALL)
                if concl_match:
                    conclusion = clean_extracted_text(concl_match.group(1).strip())
                    break
            return {
                "title": title,
                "abstract": abstract,
                "introduction": introduction,
                "conclusion": conclusion,
                "full_text": cleaned_text
            }
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return {"title": title, "abstract": abstract, "introduction": "", "conclusion": "", "full_text": ""}
    return {"title": title, "abstract": abstract, "introduction": "", "conclusion": "", "full_text": ""}

def clean_extracted_text(text):
    if not text:
        return ""
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[a-zA-Z0-9]+[=><+\-*/^()[\]{}]+[a-zA-Z0-9]+|\$[^$]*\$|\$\$[^$]*\$\$', '', text)
    ref_patterns = [
        r'References\s*\n', r'REFERENCES\s*\n', r'Bibliography\s*\n',
        r'BIBLIOGRAPHY\s*\n', r'Works Cited\s*\n', r'REFERENCES CITED\s*\n'
    ]
    ref_start = len(text)
    for pattern in ref_patterns:
        matches = list(re.finditer(pattern, text))
        if matches:
            ref_start = min(ref_start, matches[-1].start())
    text = text[:ref_start]
    ligature_map = {
        '\ufb01': 'fi', '\ufb02': 'fl', '\ufb00': 'ff', '\ufb03': 'ffi', '\ufb04': 'ffl',
        '\u2019': "'", '\u2018': "'", '\u201c': '"', '\u201d': '"', '\u2014': '-', '\u2013': '-', '\u0003': ''
    }
    for ligature, replacement in ligature_map.items():
        text = text.replace(ligature, replacement)
    text = re.sub(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{2,4}[/-]\d{1,2}[/-]\d{1,2}\b', '', text)
    text = re.sub(r'\b\d+\.\d+%?\b|\b\d{2,}\b|\[\d+\]', '', text)
    text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)
    text = re.sub(r'([a-z])([A-Z])|([a-z])([0-9])|([0-9])([a-z])', r'\1 \2', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s+([.,;:!?)])', r'\1', text)
    text = re.sub(r'([.,;:!?])([a-zA-Z])', r'\1 \2', text)
    return text.strip()

def get_bert_embeddings(texts, model_name="bert-base-uncased"):
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name).to(device)
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(embedding[0])
    return np.array(embeddings)

def highlight_query_relevant_sentences(text, query, top_n=10):
    text = clean_extracted_text(text)
    sentences = sent_tokenize(text)
    if not sentences:
        return []
    all_embeddings = get_bert_embeddings(sentences + [query])
    sentence_embeddings = all_embeddings[:-1]
    query_embedding = all_embeddings[-1]
    similarities = cosine_similarity(sentence_embeddings, query_embedding.reshape(1, -1)).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]
    return [(sentences[i], float(similarities[i]), i) for i in top_indices]

def get_paragraph_context(sentences, index, context_size=2):
    start_idx = max(0, index - context_size)
    end_idx = min(len(sentences) - 1, index + context_size)
    context_paragraph = " ".join(sentences[start_idx:end_idx + 1])
    return clean_extracted_text(context_paragraph)

def extract_relevant_content(paper_data, query, max_sentences_per_section=10, context_size=2):
    result = {
        "title": paper_data["title"],
        "abstract": paper_data["abstract"],
        "relevant_sections": []
    }
    sections = {
        "introduction": paper_data.get("introduction", ""),
        "conclusion": paper_data.get("conclusion", ""),
        "full_text": paper_data.get("full_text", "")
    }
    for section_name, section_text in sections.items():
        if not section_text:
            continue
        cleaned_text = clean_extracted_text(section_text)
        all_sentences = sent_tokenize(cleaned_text)
        relevant_sentences = highlight_query_relevant_sentences(cleaned_text, query, top_n=max_sentences_per_section)
        if relevant_sentences:
            processed_results = []
            relevant_sentences.sort(key=lambda x: x[2])
            processed_indices = set()
            for sent, score, idx in relevant_sentences:
                if idx in processed_indices:
                    continue
                context_paragraph = get_paragraph_context(all_sentences, idx, context_size)
                for i in range(max(0, idx - context_size), min(len(all_sentences), idx + context_size + 1)):
                    processed_indices.add(i)
                processed_results.append({
                    "text": context_paragraph,
                    "relevance_score": score,
                    "core_sentence": sent
                })
            result["relevant_sections"].append({
                "section_name": section_name,
                "sentences": processed_results
            })
    return result

def process_top_papers(query, top_papers, max_papers=5, context_size=2):
    results = []
    for i, paper in enumerate(top_papers[:max_papers]):
        paper_link = paper['link']
        paper_data = download_arxiv_paper(paper_link)
        relevant_content = extract_relevant_content(paper_data, query, context_size=context_size)
        results.append({
            "paper_index": i+1,
            "title": paper['title'],
            "link": paper['link'],
            "extracted_content": relevant_content
        })
    return results

# Load cached papers and embeddings
if os.path.exists(PAPERS_FILE) and os.path.exists(EMBEDDINGS_FILE):
    with open(PAPERS_FILE, "r") as f:
        all_papers = json.load(f)
    paper_embeddings = np.load(EMBEDDINGS_FILE)
else:
    print("Cached data not found in Google Drive.")
    exit()

# Load Specter model
try:
    specter_tokenizer = AutoTokenizer.from_pretrained("allenai/specter")
    specter_model = AutoModel.from_pretrained("allenai/specter").to(device)
    print("Specter model loaded successfully")
except Exception as e:
    print(f"Error loading Specter model: {e}")
    exit()

# Load test data
if not os.path.exists(TEST_JSON):
    print(f"Test JSON file not found at {TEST_JSON}.")
    exit()
with open(TEST_JSON, "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Process test data
val_predictions = []
val_references = []

for item in test_data['training']:
    query = item['query']
    reference_summary = item['summary']

    # Generate query embedding
    def get_embedding(text):
        inputs = specter_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            output = specter_model(**inputs)
        return output.last_hidden_state.mean(dim=1).cpu().numpy()

    query_embedding = get_embedding(query)
    similarities = cosine_similarity(query_embedding, paper_embeddings)
    sorted_indices = np.argsort(similarities[0])[::-1]
    top_papers = [all_papers[i] for i in sorted_indices]

    # Process top papers
    extraction_results = process_top_papers(query, top_papers, max_papers=5, context_size=2)

    # Concatenate relevant sections
    combined_text = []
    for paper in extraction_results:
        for section in paper['extracted_content']['relevant_sections']:
            for sentence in section['sentences']:
                combined_text.append(sentence['text'])
    input_text = " ".join(combined_text).strip()

    if not input_text:
        print(f"No relevant content extracted for query: {query}")
        continue

    # Generate summary
    inputs = pegasus_tokenizer(input_text, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    summary_ids = pegasus_model.generate(
        inputs["input_ids"],
        max_length=128,
        num_beams=4,
        early_stopping=True,
    )
    summary = pegasus_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    val_predictions.append(summary)
    val_references.append(reference_summary)

# Compute ROUGE scores
def compute_rouge_scores(predictions, references):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge1, rouge2, rougeL = [], [], []
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        rouge1.append(scores['rouge1'].fmeasure)
        rouge2.append(scores['rouge2'].fmeasure)
        rougeL.append(scores['rougeL'].fmeasure)
    return {
        "rouge1": np.mean(rouge1),
        "rouge2": np.mean(rouge2),
        "rougeL": np.mean(rougeL)
    }

# Compute ROUGE scores
rouge_scores = compute_rouge_scores(val_predictions, val_references)
print("\nROUGE Scores:")
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")

# Compute BERTScore
P, R, F1 = score(val_predictions, val_references, lang="en", verbose=False)
print("\nBERTScore:")
print(f"Precision: {P.mean().item():.4f}")
print(f"Recall: {R.mean().item():.4f}")
print(f"F1: {F1.mean().item():.4f}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Attempting to load model from: /content/drive/MyDrive/nlp_project/pegasus_arxiv_finetuned_model
Loading fine-tuned model from /content/drive/MyDrive/nlp_project/pegasus_arxiv_finetuned_model
Error loading fine-tuned model from /content/drive/MyDrive/nlp_project/pegasus_arxiv_finetuned_model: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /content/drive/MyDrive/nlp_project/pegasus_arxiv_finetuned_model.
Falling back to google/pegasus-arxiv
Loading fallback model: google/pegasus-arxiv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-arxiv and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Fallback model loaded successfully
Specter model loaded successfully


[0, IndirectObject(96, 0, 137749382742928)]
[0, IndirectObject(98, 0, 137749382742928)]
[0, IndirectObject(100, 0, 137749382742928)]
[0, IndirectObject(102, 0, 137749382742928)]
[0, IndirectObject(104, 0, 137749382742928)]
[0, IndirectObject(106, 0, 137749382742928)]
[0, IndirectObject(108, 0, 137749382742928)]
[0, IndirectObject(110, 0, 137749382742928)]
[0, IndirectObject(112, 0, 137749382742928)]
[0, IndirectObject(114, 0, 137749382742928)]
[0, IndirectObject(116, 0, 137749382742928)]
[0, IndirectObject(118, 0, 137749382742928)]
[0, IndirectObject(120, 0, 137749382742928)]
[0, IndirectObject(122, 0, 137749382742928)]
[0, IndirectObject(124, 0, 137749382742928)]
[0, IndirectObject(126, 0, 137749382742928)]



ROUGE Scores:
ROUGE-1: 0.1446
ROUGE-2: 0.0114
ROUGE-L: 0.0948


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BERTScore:
Precision: 0.7841
Recall: 0.8027
F1: 0.7929


Mounted at /content/drive
Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Error loading model from /content/drive/MyDrive/nlp_project/pegasus_arxiv_finetuned_model: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /content/drive/MyDrive/nlp_project/pegasus_arxiv_finetuned_model.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

NameError: name 'pegasus_model' is not defined