In [2]:
!pip install datasets
!pip install rouge-score
!pip install PyPDF2
!pip install bert-score
!pip install transformers


from google.colab import drive
drive.mount('/content/drive')
from huggingface_hub import login
# login("HF_TOKEN")
import json
import os
import numpy as np
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import Dataset
from rouge_score import rouge_scorer
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import PyPDF2
from io import BytesIO
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer as BertTokenizer, AutoModel as BertModel
from bert_score import score

# Ensure NLTK resources are downloaded
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# File paths
DRIVE_PATH = '/content/drive/MyDrive/nlp_project'
PAPERS_FILE = os.path.join(DRIVE_PATH, "cached_papers.json")
EMBEDDINGS_FILE = os.path.join(DRIVE_PATH, "cached_embeddings.npy")
MODEL_PATH = os.path.join(DRIVE_PATH, "bart_finetuned_model")
TEST_JSON = os.path.join(DRIVE_PATH, "intermediate_training_batch_4.json")  # Replace with your test JSON file

# Provided functions (unchanged)
def download_arxiv_paper(arxiv_id):
    if "arxiv.org" in arxiv_id:
        arxiv_id = arxiv_id.split('/')[-1].split('v')[0]
    api_url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
    response = requests.get(api_url)
    soup = BeautifulSoup(response.content, 'xml')
    abstract = soup.find('summary').text.strip() if soup.find('summary') else ""
    title = soup.find('title').text.strip() if soup.find('title') else ""
    pdf_url = f"http://arxiv.org/pdf/{arxiv_id}.pdf"
    response = requests.get(pdf_url)
    if response.status_code == 200:
        try:
            pdf_file = BytesIO(response.content)
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            full_text = ""
            for page in pdf_reader.pages:
                full_text += page.extract_text()
            cleaned_text = clean_extracted_text(full_text)
            introduction = ""
            intro_patterns = [
                r"(?i)(?:1\.?\s*|I\.?\s*)?Introduction(.*?)(?:\n\d\.|\n[A-Z]\.|\nII\.)",
                r"(?i)(?:1\.?\s*|I\.?\s*)?Introduction(.*?)(?=\n2\.|\nII\.)"
            ]
            for pattern in intro_patterns:
                intro_match = re.search(pattern, cleaned_text, re.DOTALL)
                if intro_match:
                    introduction = clean_extracted_text(intro_match.group(1).strip())
                    break
            conclusion = ""
            concl_patterns = [
                r"(?i)(?:\d\.?\s*|[IVX]+\.?\s*)?Conclusion[s]?(.*?)(?:\n\d\.|\n[A-Z]\.|\nReferences|\n[IVX]+\.)",
                r"(?i)(?:\d\.?\s*|[IVX]+\.?\s*)?Discussion(?:s)?(.*?)(?:\n\d\.|\n[A-Z]\.|\nReferences|\n[IVX]+\.)"
            ]
            for pattern in concl_patterns:
                concl_match = re.search(pattern, cleaned_text, re.DOTALL)
                if concl_match:
                    conclusion = clean_extracted_text(concl_match.group(1).strip())
                    break
            return {
                "title": title,
                "abstract": abstract,
                "introduction": introduction,
                "conclusion": conclusion,
                "full_text": cleaned_text
            }
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return {"title": title, "abstract": abstract, "introduction": "", "conclusion": "", "full_text": ""}
    return {"title": title, "abstract": abstract, "introduction": "", "conclusion": "", "full_text": ""}

def clean_extracted_text(text):
    if not text:
        return ""
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[a-zA-Z0-9]+[=><+\-*/^()[\]{}]+[a-zA-Z0-9]+|\$[^$]*\$|\$\$[^$]*\$\$', '', text)
    ref_patterns = [
        r'References\s*\n', r'REFERENCES\s*\n', r'Bibliography\s*\n',
        r'BIBLIOGRAPHY\s*\n', r'Works Cited\s*\n', r'REFERENCES CITED\s*\n'
    ]
    ref_start = len(text)
    for pattern in ref_patterns:
        matches = list(re.finditer(pattern, text))
        if matches:
            ref_start = min(ref_start, matches[-1].start())
    text = text[:ref_start]
    ligature_map = {
        '\ufb01': 'fi', '\ufb02': 'fl', '\ufb00': 'ff', '\ufb03': 'ffi', '\ufb04': 'ffl',
        '\u2019': "'", '\u2018': "'", '\u201c': '"', '\u201d': '"', '\u2014': '-', '\u2013': '-', '\u0003': ''
    }
    for ligature, replacement in ligature_map.items():
        text = text.replace(ligature, replacement)
    text = re.sub(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{2,4}[/-]\d{1,2}[/-]\d{1,2}\b', '', text)
    text = re.sub(r'\b\d+\.\d+%?\b|\b\d{2,}\b|\[\d+\]', '', text)
    text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)
    text = re.sub(r'([a-z])([A-Z])|([a-z])([0-9])|([0-9])([a-z])', r'\1 \2', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s+([.,;:!?)])', r'\1', text)
    text = re.sub(r'([.,;:!?])([a-zA-Z])', r'\1 \2', text)
    return text.strip()

def get_bert_embeddings(texts, model_name="bert-base-uncased"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(embedding[0])
    return np.array(embeddings)

def highlight_query_relevant_sentences(text, query, top_n=10):
    text = clean_extracted_text(text)
    sentences = sent_tokenize(text)
    if not sentences:
        return []
    all_embeddings = get_bert_embeddings(sentences + [query])
    sentence_embeddings = all_embeddings[:-1]
    query_embedding = all_embeddings[-1]
    similarities = cosine_similarity(sentence_embeddings, query_embedding.reshape(1, -1)).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]
    return [(sentences[i], float(similarities[i]), i) for i in top_indices]

def get_paragraph_context(sentences, index, context_size=2):
    start_idx = max(0, index - context_size)
    end_idx = min(len(sentences) - 1, index + context_size)
    context_paragraph = " ".join(sentences[start_idx:end_idx + 1])
    return clean_extracted_text(context_paragraph)

def extract_relevant_content(paper_data, query, max_sentences_per_section=10, context_size=2):
    result = {
        "title": paper_data["title"],
        "abstract": paper_data["abstract"],
        "relevant_sections": []
    }
    sections = {
        "introduction": paper_data.get("introduction", ""),
        "conclusion": paper_data.get("conclusion", ""),
        "full_text": paper_data.get("full_text", "")
    }
    for section_name, section_text in sections.items():
        if not section_text:
            continue
        cleaned_text = clean_extracted_text(section_text)
        all_sentences = sent_tokenize(cleaned_text)
        relevant_sentences = highlight_query_relevant_sentences(cleaned_text, query, top_n=max_sentences_per_section)
        if relevant_sentences:
            processed_results = []
            relevant_sentences.sort(key=lambda x: x[2])
            processed_indices = set()
            for sent, score, idx in relevant_sentences:
                if idx in processed_indices:
                    continue
                context_paragraph = get_paragraph_context(all_sentences, idx, context_size)
                for i in range(max(0, idx - context_size), min(len(all_sentences), idx + context_size + 1)):
                    processed_indices.add(i)
                processed_results.append({
                    "text": context_paragraph,
                    "relevance_score": score,
                    "core_sentence": sent
                })
            result["relevant_sections"].append({
                "section_name": section_name,
                "sentences": processed_results
            })
    return result

def process_top_papers(query, top_papers, max_papers=5, context_size=2):
    results = []
    for i, paper in enumerate(top_papers[:max_papers]):
        paper_link = paper['link']
        paper_data = download_arxiv_paper(paper_link)
        relevant_content = extract_relevant_content(paper_data, query, context_size=context_size)
        results.append({
            "paper_index": i+1,
            "title": paper['title'],
            "link": paper['link'],
            "extracted_content": relevant_content
        })
    return results

# Load cached papers and embeddings
if os.path.exists(PAPERS_FILE) and os.path.exists(EMBEDDINGS_FILE):
    with open(PAPERS_FILE, "r") as f:
        all_papers = json.load(f)
    paper_embeddings = np.load(EMBEDDINGS_FILE)
else:
    print("Cached data not found in Google Drive.")
    exit()

# Load Specter model
specter_tokenizer = AutoTokenizer.from_pretrained("allenai/specter")
specter_model = AutoModel.from_pretrained("allenai/specter").to(device)

# Load fine-tuned BART model
bart_tokenizer = BartTokenizer.from_pretrained(MODEL_PATH)
bart_model = BartForConditionalGeneration.from_pretrained(MODEL_PATH).to(device)

# Load test data
with open(TEST_JSON, "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Process test data
val_predictions = []
val_references = []

for item in test_data['training']:
    query = item['query']
    reference_summary = item['summary']

    # Generate query embedding
    def get_embedding(text):
        inputs = specter_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            output = specter_model(**inputs)
        return output.last_hidden_state.mean(dim=1).cpu().numpy()

    query_embedding = get_embedding(query)
    similarities = cosine_similarity(query_embedding, paper_embeddings)
    sorted_indices = np.argsort(similarities[0])[::-1]
    top_papers = [all_papers[i] for i in sorted_indices]

    # Process top papers
    extraction_results = process_top_papers(query, top_papers, max_papers=5, context_size=2)

    # Concatenate relevant sections
    combined_text = []
    for paper in extraction_results:
        for section in paper['extracted_content']['relevant_sections']:
            for sentence in section['sentences']:
                combined_text.append(sentence['text'])
    input_text = " ".join(combined_text).strip()

    if not input_text:
        print(f"No relevant content extracted for query: {query}")
        continue

    # Generate summary
    inputs = bart_tokenizer(input_text, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    summary_ids = bart_model.generate(
        inputs["input_ids"],
        max_length=256,
        num_beams=4,
        early_stopping=True,
    )
    summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    val_predictions.append(summary)
    val_references.append(reference_summary)

# Compute ROUGE scores
def compute_rouge_scores(predictions, references):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge1, rouge2, rougeL = [], [], []
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        rouge1.append(scores['rouge1'].fmeasure)
        rouge2.append(scores['rouge2'].fmeasure)
        rougeL.append(scores['rougeL'].fmeasure)
    return {
        "rouge1": np.mean(rouge1),
        "rouge2": np.mean(rouge2),
        "rougeL": np.mean(rougeL)
    }

# Compute ROUGE scores
rouge_scores = compute_rouge_scores(val_predictions, val_references)
print("\nROUGE Scores:")
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")

# Compute BERTScore
P, R, F1 = score(val_predictions, val_references, lang="en", verbose=False)
print("\nBERTScore:")
print(f"Precision: {P.mean().item():.4f}")
print(f"Recall: {R.mean().item():.4f}")
print(f"F1: {F1.mean().item():.4f}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[0, IndirectObject(96, 0, 132696685759952)]
[0, IndirectObject(98, 0, 132696685759952)]
[0, IndirectObject(100, 0, 132696685759952)]
[0, IndirectObject(102, 0, 132696685759952)]
[0, IndirectObject(104, 0, 132696685759952)]
[0, IndirectObject(106, 0, 132696685759952)]
[0, IndirectObject(108, 0, 132696685759952)]
[0, IndirectObject(110, 0, 132696685759952)]
[0, IndirectObject(112, 0, 132696685759952)]
[0, IndirectObject(114, 0, 132696685759952)]
[0, IndirectObject(116, 0, 132696685759952)]
[0, IndirectObject(118, 0, 132696685759952)]
[0, IndirectObject(120, 0, 132696685759952)]
[0, IndirectObject(122, 0, 132696685759952)]
[0, IndirectObject(124, 0, 132696685759952)]
[0, IndirectObject(126, 0, 132696685759952)]



ROUGE Scores:
ROUGE-1: 0.0150
ROUGE-2: 0.0010
ROUGE-L: 0.0120


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BERTScore:
Precision: 0.2727
Recall: 0.2729
F1: 0.2725


