# 📘 ArIES Research Paper Evaluator
Automated publishability assessment, justification generation, and Q&A system.

In [None]:
# ✅ Install Required Packages
!pip install sentence-transformers scikit-learn pandas joblib tqdm pymupdf
!pip install transformers accelerate
!pip install langchain faiss-cpu




## 📄 Step 1: Extract Text from PDFs

In [40]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join(page.get_text() for page in doc)


In [41]:
import os
os.makedirs("models", exist_ok=True)


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import joblib
import os

# Sample data
X = ["this is a document", "this is another document", "text data for classification"]
y = ["class1", "class1", "class2"]

# Step 1: Create the model pipeline
clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Step 2: Train the model
clf.fit(X, y)

# Step 3: Save the model
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/classifier.pkl")


['models/classifier.pkl']

In [44]:
import os
os.makedirs("models", exist_ok=True)  # ✅ create the folder if it doesn't exist

joblib.dump(clf, "models/classifier.pkl")


['models/classifier.pkl']

## 🧠 Step 2: Train Classifier from 15 Labeled Papers

In [45]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import joblib

df = pd.read_csv("data/labels.csv")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

X, y = [], []
for _, row in df.iterrows():
    text = extract_text_from_pdf(f"data/papers/{row['filename']}")
    embedding = embedder.encode(text)
    X.append(embedding)
    y.append(1 if row['label'] == "Publishable" else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

joblib.dump(clf, "models/classifier.pkl")


Accuracy: 0.6666666666666666
F1 Score: 0.6666666666666666


['models/classifier.pkl']

In [None]:
!mkdir -p data/papers


In [None]:
!pip install fpdf




In [None]:
from fpdf import FPDF

def create_dummy_pdf(filename, text):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, text)
    pdf.output(f"data/papers/{filename}")

# Generate 15 dummy PDFs matching your labels.csv
for i in range(1, 16):
    filename = f"paper{i:02d}.pdf"
    text = f"This is the content of {filename}. It simulates a research paper for testing purposes."
    create_dummy_pdf(filename, text)


In [None]:
import os

print(os.listdir("data/papers"))


['paper15.pdf', 'paper06.pdf', 'paper07.pdf', 'paper03.pdf', 'paper02.pdf', 'paper08.pdf', 'paper01.pdf', 'paper10.pdf', 'paper14.pdf', 'paper13.pdf', 'paper09.pdf', 'paper04.pdf', 'paper11.pdf', 'paper12.pdf', 'paper05.pdf']


In [None]:
from google.colab import files
uploaded = files.upload()


Saving labels.csv to labels.csv


In [None]:
!mkdir -p data
!mv labels.csv data/


## 🏷️ Step 3: Classify a Paper

In [None]:
import joblib

model = joblib.load("models/classifier.pkl")

def classify_paper(text):
    embedding = embedder.encode([text])
    label = model.predict(embedding)[0]
    return "Publishable" if label == 1 else "Non-Publishable"


## 🧾 Step 4: Generate Justification

In [31]:
generator = pipeline("text-generation", model="tiiuae/falcon-7b-instruct", device_map="auto")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


In [36]:
def generate_justification(text, label):
    # your logic here
    return f"This paper was classified as {label} because ..."


In [37]:
import os

os.makedirs("results", exist_ok=True)


In [38]:
import os
os.makedirs("results", exist_ok=True)

df_results = pd.DataFrame(results, columns=["Filename", "Publishability", "Justification"])
df_results.to_csv("results/results.csv", index=False)
df_results.head()


Unnamed: 0,Filename,Publishability,Justification


In [46]:
from google.colab import sheets
sheet = sheets.InteractiveSheet(df=df_results)

https://docs.google.com/spreadsheets/d/1iB0o0g3OzEx0bwOoXCbLKYfdd_z4J9mTJhCGcvhhWa8/edit#gid=0


## 🔄 Step 5: Full Pipeline Evaluation

In [47]:
import os

results = []
paper_dir = "data/papers"

for filename in os.listdir(paper_dir):
    path = os.path.join(paper_dir, filename)
    text = extract_text_from_pdf(path)
    label = classify_paper(text)
    reason = generate_justification(text, label)
    results.append([filename, label, reason])

df_results = pd.DataFrame(results, columns=["Filename", "Publishability", "Justification"])
df_results.to_csv("results/results.csv", index=False)
df_results.head()


Unnamed: 0,Filename,Publishability,Justification
0,paper15.pdf,Non-Publishable,This paper was classified as Non-Publishable b...
1,paper06.pdf,Publishable,This paper was classified as Publishable becau...
2,paper07.pdf,Non-Publishable,This paper was classified as Non-Publishable b...
3,paper03.pdf,Non-Publishable,This paper was classified as Non-Publishable b...
4,paper02.pdf,Publishable,This paper was classified as Publishable becau...


## ❓ Step 6: Interactive Q&A System

In [48]:
!pip install -U langchain-community




In [49]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

def load_chunks(text):
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    return splitter.create_documents([text])

def build_qa_chain(documents):
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(documents, embeddings)

    pipe = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1", device_map="auto")
    llm = HuggingFacePipeline(pipeline=pipe)

    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")
    return qa

def interactive_qa(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    chunks = load_chunks(text)
    qa = build_qa_chain(chunks)

    print(f"\nInteractive Q&A for {pdf_path}")
    print("Type 'exit' to quit.")
    while True:
        query = input("\nAsk your question: ")
        if query.lower() == "exit":
            break
        result = qa.run(query)
        print(f"\nAnswer: {result}")


In [53]:
from google.colab import files

uploaded = files.upload()  # This will prompt you to upload the ZIP file from your computer


Saving EARC Dataset-20250417T115257Z-0012.zip to EARC Dataset-20250417T115257Z-0012.zip


In [54]:
import os

print(os.listdir('.'))  # Should show your ZIP file in the output


['.config', 'EARC Dataset-20250417T115257Z-001.zip', 'models', 'EARC Dataset-20250417T115257Z-0012.zip', 'data', 'results', 'sample_data']


In [55]:
import zipfile

zip_path = 'EARC Dataset-20250417T115257Z-0012.zip'  # Replace with your actual file name
extract_path = './extracted_papers'

os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Extraction complete. Files are in:", extract_path)


Extraction complete. Files are in: ./extracted_papers


In [56]:
print(os.listdir(extract_path))


['EARC Dataset']


In [57]:
!pip install pymupdf




In [59]:
import os

# Check extracted files
extract_path = './extracted_papers'
print("Contents of extraction folder:", os.listdir(extract_path))


Contents of extraction folder: ['EARC Dataset']


In [60]:
# Find first PDF in nested directories
def find_first_pdf(root_dir):
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if file.lower().endswith('.pdf'):
                return os.path.join(root, file)
    return None

sample_pdf = find_first_pdf(extract_path)

if not sample_pdf:
    raise FileNotFoundError("No PDF found in extracted files")


In [61]:
# Use absolute paths for clarity
sample_pdf = os.path.abspath(sample_pdf)
print("Attempting to open:", sample_pdf)


Attempting to open: /content/extracted_papers/EARC Dataset/Papers/P017.pdf


In [62]:
import fitz

try:
    with fitz.open(sample_pdf) as doc:
        print(f"Successfully opened {os.path.basename(sample_pdf)}")
        print("Number of pages:", len(doc))
except Exception as e:
    print(f"Failed to open PDF: {str(e)}")
    print("Common fixes:")
    print("- Re-upload ZIP file")
    print("- Check file integrity")
    print("- Use `!apt install libxml2-dev libxslt-dev` for dependencies")


Successfully opened P017.pdf
Number of pages: 9


In [69]:
import os
import zipfile
import fitz  # PyMuPDF

# Revised extraction and processing
extract_path = './extracted_papers'
os.makedirs(extract_path, exist_ok=True)

# Extract with directory awareness
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Process all PDFs
for root, dirs, files in os.walk(extract_path):
    for file in files:
        if file.endswith('.pdf'):
            pdf_path = os.path.join(root, file)
            try:
                with fitz.open(pdf_path) as doc:  # This line needs indented block
                    # Minimal required processing (e.g., page count)
                    print(f"Processing {file} ({len(doc)} pages)")
                    # Add your actual PDF parsing logic here

            except fitz.FileDataError:
                print(f"Skipping corrupted/non-PDF file: {pdf_path}")


Processing P017.pdf (9 pages)
Processing P078.pdf (5 pages)
Processing P059.pdf (8 pages)
Processing P070.pdf (11 pages)
Processing P106.pdf (5 pages)
Processing P073.pdf (14 pages)
Processing P131.pdf (3 pages)
Processing P067.pdf (4 pages)
Processing P126.pdf (8 pages)
Processing P040.pdf (3 pages)
Processing P113.pdf (4 pages)
Processing P096.pdf (14 pages)
Processing P097.pdf (14 pages)
Processing P048.pdf (14 pages)
Processing P074.pdf (3 pages)
Processing P087.pdf (6 pages)
Processing P015.pdf (4 pages)
Processing P031.pdf (5 pages)
Processing P076.pdf (11 pages)
Processing P082.pdf (3 pages)
Processing P125.pdf (9 pages)
Processing P008.pdf (5 pages)
Processing P051.pdf (5 pages)
Processing P075.pdf (5 pages)
Processing P115.pdf (5 pages)
Processing P081.pdf (8 pages)
Processing P112.pdf (5 pages)
Processing P089.pdf (4 pages)
Processing P094.pdf (14 pages)
Processing P056.pdf (13 pages)
Processing P062.pdf (7 pages)
Processing P032.pdf (14 pages)
Processing P117.pdf (9 pages)
P

In [71]:
# Verify PDF readability
def is_valid_pdf(path):
    try:
        with fitz.open(path) as doc:
            return len(doc) > 0
    except:
        return False

print("Valid PDF check:", is_valid_pdf(sample_pdf))


Valid PDF check: True


In [74]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m153.6/232.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [79]:
from google.colab import drive
drive.mount('/content/drive')  # Ensure proper mounting


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [80]:
import os

# Correct path format for Colab
zip_path = '/content/drive/MyDrive/EARC Dataset-20250417T115257Z-0012.zip'

# Verify file existence
if not os.path.exists(zip_path):
    print("Error: File not found. Check:")
    print(f"1. Filename matches exactly: {os.path.basename(zip_path)}")
    print(f"2. File exists in Google Drive's root folder")
else:
    print("File verified successfully")


Error: File not found. Check:
1. Filename matches exactly: EARC Dataset-20250417T115257Z-0012.zip
2. File exists in Google Drive's root folder


In [81]:
import zipfile

extract_path = '/content/extracted_papers'
os.makedirs(extract_path, exist_ok=True)

try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Extract only PDFs from Papers folder
        for file in zip_ref.namelist():
            if 'Papers/' in file and file.endswith('.pdf'):
                zip_ref.extract(file, extract_path)

    print(f"Extracted to: {extract_path}")
except FileNotFoundError:
    print(f"Critical error: Re-upload ZIP file to Google Drive")


Critical error: Re-upload ZIP file to Google Drive


In [82]:
# Find actual extracted path
final_path = os.path.join(extract_path, 'Papers') if os.path.exists(os.path.join(extract_path, 'Papers')) else extract_path

print("Extracted PDFs:", len([f for f in os.listdir(final_path) if f.endswith('.pdf')]))


Extracted PDFs: 0


In [83]:
# Full processing pipeline
for root, dirs, files in os.walk(extract_path):
    for file in files:
        if file.endswith('.pdf'):
            pdf_path = os.path.join(root, file)
            try:
                # Add your PDF processing logic here
                print(f"Processing {file}")
            except Exception as e:
                print(f"Skipped {file}: {str(e)}")


Processing P017.pdf
Processing P078.pdf
Processing P059.pdf
Processing P070.pdf
Processing P106.pdf
Processing P073.pdf
Processing P131.pdf
Processing P067.pdf
Processing P126.pdf
Processing P040.pdf
Processing P113.pdf
Processing P096.pdf
Processing P097.pdf
Processing P048.pdf
Processing P074.pdf
Processing P087.pdf
Processing P015.pdf
Processing P031.pdf
Processing P076.pdf
Processing P082.pdf
Processing P125.pdf
Processing P008.pdf
Processing P051.pdf
Processing P075.pdf
Processing P115.pdf
Processing P081.pdf
Processing P112.pdf
Processing P089.pdf
Processing P094.pdf
Processing P056.pdf
Processing P062.pdf
Processing P032.pdf
Processing P117.pdf
Processing P103.pdf
Processing P086.pdf
Processing P093.pdf
Processing P116.pdf
Processing P100.pdf
Processing P118.pdf
Processing P023.pdf
Processing P005.pdf
Processing P091.pdf
Processing P123.pdf
Processing P130.pdf
Processing P033.pdf
Processing P055.pdf
Processing P069.pdf
Processing P039.pdf
Processing P013.pdf
Processing P011.pdf


In [85]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)  # Force remount if needed


Mounted at /content/drive


In [86]:
import os

# Correct path format (note "My Drive" vs "MyDrive")
zip_path = '/content/drive/My Drive/EARC Dataset-20250417T115257Z-0012.zip'

# Verify existence
if not os.path.exists(zip_path):
    print(f"Error: File not found. Check:")
    print(f"1. Filename matches exactly (including spaces/numbers)")
    print(f"2. File is in Google Drive's root folder")
else:
    print("File verified successfully")


Error: File not found. Check:
1. Filename matches exactly (including spaces/numbers)
2. File is in Google Drive's root folder


In [88]:
# Use quotes for paths with spaces
zip_path = '/content/drive/My Drive/EARC Dataset-20250417T115257Z-0012.zip'
extract_path = '/content/extracted_papers'

# Verify path exists
if not os.path.exists(zip_path):
    print(f"Error: File not found. Check path: {zip_path}")



Error: File not found. Check path: /content/drive/My Drive/EARC Dataset-20250417T115257Z-0012.zip


In [89]:
!pip install gdown

# Get shareable link from Google Drive (replace FILE_ID)
FILE_ID = "1YOUR_FILE_ID_HERE"  # From Drive's shareable link
!gdown "https://drive.google.com/uc?id={FILE_ID}"

# Now use local path
zip_path = "/content/EARC Dataset-20250417T115257Z-0012.zip"


Failed to retrieve file url:

	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses.
	Check FAQ in https://github.com/wkentaro/gdown?tab=readme-ov-file#faq.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=1YOUR_FILE_ID_HERE

but Gdown can't. Please check connections and permissions.


In [91]:
# Clean install latest version
!pip install --upgrade --no-cache-dir gdown




In [92]:
!mkdir -p ~/.cache/gdown/
!cp /content/cookies.txt ~/.cache/gdown/


cp: cannot stat '/content/cookies.txt': No such file or directory


In [93]:
# Replace YOUR_FILE_ID with actual ID from shareable link
FILE_ID = "1YOUR_FILE_ID_HERE"
!gdown "https://drive.google.com/uc?id={FILE_ID}"


Failed to retrieve file url:

	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses.
	Check FAQ in https://github.com/wkentaro/gdown?tab=readme-ov-file#faq.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=1YOUR_FILE_ID_HERE

but Gdown can't. Please check connections and permissions.


In [96]:
import os
if os.path.exists("EARC Dataset-20250417T115257Z-0012.zip"):
    print("Download successful!")
else:
    print("Retry with cookies/wget method")



Download successful!


In [98]:
# Step 1: Mount Google Drive and install dependencies
from google.colab import drive
drive.mount('/content/drive')

!pip install pymupdf scipdf_parser transformers

# Step 2: Extract Papers folder from ZIP
import os
import zipfile

zip_path = '/content/drive/MyDrive/EARC Dataset-20250417T115257Z-0012.zip'  # Update path
extract_path = '/content/extracted_papers'

# Create extraction directory
os.makedirs(extract_path, exist_ok=True)

# Extract only Papers folder
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    for file in zip_ref.namelist():
        if file.startswith('Papers/') and file.endswith('.pdf'):
            zip_ref.extract(file, extract_path)

# Verify extraction
papers_dir = os.path.join(extract_path, 'Papers')
print(f"Extracted {len(os.listdir(papers_dir))} PDFs")

# Step 3: PDF Processing and Feature Extraction
import fitz  # PyMuPDF

def process_paper(pdf_path):
    """Extract structured content from PDF"""
    doc = fitz.open(pdf_path)
    paper_data = {
        'text': '',
        'sections': {},
        'abstract': '',
        'references': []
    }

    # Extract text and sections
    for page in doc:
        paper_data['text'] += page.get_text()

    # Simple section detection (customize as needed)
    sections = ['abstract', 'introduction', 'methodology', 'results', 'conclusion']
    for sec in sections:
        if sec in paper_data['text'].lower():
            start = paper_data['text'].lower().find(sec)
            paper_data['sections'][sec] = paper_data['text'][start:start+1000]

    return paper_data

# Step 4: Publishability Classifier
from transformers import pipeline

classifier = pipeline("text-classification",
                    model="mistralai/Mistral-7B-Instruct-v0.2")

def classify_paper(paper_data):
    """Classify paper using LLM"""
    prompt = f"""
    Analyze this research paper for publishability:
    Abstract: {paper_data.get('abstract', '')[:500]}
    Sections: {list(paper_data['sections'].keys())}

    Consider:
    - Methodological soundness
    - Result validity
    - Academic rigor
    - Logical coherence

    Decision (Publishable=1/Non-Publishable=0):"""

    result = classifier(prompt, max_length=10)
    return 1 if "1" in result[0]['generated_text'] else 0

# Step 5: Generate Rationale
def generate_rationale(paper_data, prediction):
    """Generate justification using LLM"""
    justifier = pipeline("text-generation",
                       model="mistralai/Mistral-7B-Instruct-v0.2")

    prompt = f"""
    Provide a 100-word rationale for {'publish' if prediction else 'reject'} decision:
    Paper Sections: {list(paper_data['sections'].keys())}
    Key Content: {paper_data['text'][:1000]}

    Focus on:
    - Methodology clarity
    - Result validation
    - Contribution significance
    - Logical flow"""

    return justifier(prompt, max_length=200)[0]['generated_text']

# Step 6: Process All Papers
import pandas as pd

results = []

for idx, pdf_file in enumerate(os.listdir(papers_dir)):
    if not pdf_file.endswith('.pdf'):
        continue

    pdf_path = os.path.join(papers_dir, pdf_file)
    try:
        paper_data = process_paper(pdf_path)
        prediction = classify_paper(paper_data)
        rationale = generate_rationale(paper_data, prediction)

        results.append({
            'Paper ID': f"P{str(idx+1).zfill(3)}",
            'Publishable': prediction,
            'Rationale': rationale
        })
    except Exception as e:
        print(f"Error processing {pdf_file}: {str(e)}")

# Step 7: Save Results
df = pd.DataFrame(results)
df.to_csv('/content/results.csv', index=False)
print("Results saved to results.csv")

# Step 8: Evaluation (Using 15 Reference Papers)
# Assuming first 15 papers are labeled
reference_labels = [...]  # Add actual labels from problem statement

if len(reference_labels) == 15:
    from sklearn.metrics import accuracy_score, f1_score

    y_true = reference_labels
    y_pred = df['Publishable'].values[:15]

    print(f"Accuracy: {accuracy_score(y_true, y_pred):.2f}")
    print(f"F1 Score: {f1_score(y_true, y_pred):.2f}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/EARC Dataset-20250417T115257Z-0012.zip'

In [101]:
import os
import zipfile
import fitz  # PyMuPDF

# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# 2. Search for the ZIP file in My Drive (case-insensitive, partial match)
search_term = "EARC Dataset-20250417T115257Z-0012"  # Change if your ZIP uses a different prefix
zip_path = None
for root, dirs, files in os.walk('/content/drive/My Drive'):
    for file in files:
        if file.lower().endswith('.zip') and search_term.lower() in file.lower():
            zip_path = os.path.join(root, file)
            break
    if zip_path:
        break

if not zip_path:
    raise FileNotFoundError(f"Could not find a ZIP file containing '{search_term}' in its name in your Google Drive.")

print(f"Found ZIP file: {zip_path}")

# 3. Extract only the Papers folder from the ZIP
extract_path = '/content/extracted_papers'
os.makedirs(extract_path, exist_ok=True)
papers_folder = None

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # Find all files in Papers/ and extract them
    papers_files = [f for f in zip_ref.namelist() if f.startswith('Papers/') and f.endswith('.pdf')]
    if not papers_files:
        raise FileNotFoundError("No PDFs found in a 'Papers/' folder inside the ZIP.")
    for f in papers_files:
        zip_ref.extract(f, extract_path)
    papers_folder = os.path.join(extract_path, 'Papers')

print(f"Extracted {len(papers_files)} PDFs to {papers_folder}")

# 4. Process each PDF: print filename and first 500 characters as a test
pdf_files = [f for f in os.listdir(papers_folder) if f.lower().endswith('.pdf')]
if not pdf_files:
    raise FileNotFoundError(f"No PDFs found in extracted Papers folder: {papers_folder}")

for pdf_file in pdf_files:
    pdf_path = os.path.join(papers_folder, pdf_file)
    try:
        with fitz.open(pdf_path) as doc:
            text = ""
            for page in doc:
                text += page.get_text()
            print(f"\n--- {pdf_file} ---\n{text[:500]}\n{'-'*40}")
    except Exception as e:
        print(f"Error processing {pdf_file}: {e}")

print(f"\nProcessed {len(pdf_files)} PDFs successfully.")


Mounted at /content/drive


FileNotFoundError: Could not find a ZIP file containing 'EARC Dataset-20250417T115257Z-0012' in its name in your Google Drive.

In [102]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
