In [None]:
import concurrent.futures
import csv
import os

import google.generativeai as genai
import requests

# Set up Google Gemini API.
# The key is read from the GOOGLE_API_KEY environment variable so that the
# credential never lives in source control. Any key that was previously
# committed in this file should be treated as leaked and revoked.
API_KEY = os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    print("⚠️ GOOGLE_API_KEY is not set; Gemini requests are expected to fail.")
genai.configure(api_key=API_KEY)
# Labels a paper may receive; classify_paper lists them in its prompt and
# validates the model's reply against them.
CATEGORIES = [
    "Deep Learning",
    "Natural Language Processing",
    "Computer Vision",
    "Reinforcement Learning",
    "Optimization & Theory",
]

# Function to read PDF content from URL
def read_pdf_from_url(pdf_url):
    """Download ``pdf_url`` and return the start of the response body as text.

    Args:
        pdf_url: Address to fetch with an HTTP GET (15 second timeout).

    Returns:
        The first 2000 characters of the decoded body when the server answers
        with status 200. Otherwise a message string starting with "Error":
        one for a non-200 status, another carrying the exception text when
        anything raises during the request.
    """
    char_limit = 2000  # Limit text to avoid token overflow
    try:
        reply = requests.get(pdf_url, timeout=15)
        if reply.status_code == 200:
            # NOTE(review): .text is the raw body decoded as text; for a
            # binary PDF this is presumably not the readable page text --
            # confirm whether a PDF parser is needed here.
            return reply.text[:char_limit]
        return "Error: Unable to download PDF"
    except Exception as err:
        return f"Error fetching PDF: {err}"

# Function to classify the paper
def classify_paper(pdf_content):
    """Ask Gemini to assign the given paper text to one of ``CATEGORIES``.

    Args:
        pdf_content: Text of the paper to classify.

    Returns:
        The matching entry of ``CATEGORIES`` (in its canonical spelling),
        ``"Uncategorized"`` when the reply names none of them, or ``"Error"``
        when the API call raises.
    """
    # Every category name has more than one word, so the model is asked for
    # the exact name; a one-word reply could never pass the validation below.
    prompt = f"""
    Read this PDF content:
    {pdf_content}

    Assign it to the most suitable category from these: {', '.join(CATEGORIES)}.
    Return only the category name, exactly as written above.
    """

    try:
        model = genai.GenerativeModel('gemini-pro')
        response = model.generate_content(prompt)
        reply = response.text.strip()
    except Exception as e:
        print(f"Error classifying paper: {e}")
        return "Error"

    # Validate category, tolerating cosmetic differences in the reply:
    # letter case, surrounding quotes/markdown emphasis, trailing full stop.
    normalized = reply.strip(" \t\r\n\"'`*.").casefold()
    for category in CATEGORIES:
        if category.casefold() == normalized:
            return category
    return "Uncategorized"

# Function to process a single row (Parallel Execution)
def process_row(row, pdf_link_index):
    """Label one CSV row and return it with the label as an extra last cell.

    Args:
        row: List of cell values for one paper. It is extended in place,
            except when it is too short to contain the link column, in which
            case a new list is returned.
        pdf_link_index: Position of the "PDF Link" cell within ``row``.

    Returns:
        The row followed by exactly one label cell: the result of
        ``classify_paper``, ``"Missing Data"`` when the row has no usable
        link, or ``"Error"`` when the PDF could not be downloaded.
    """
    if len(row) <= pdf_link_index:
        print("⚠️ Skipping row due to missing columns:", row)
        return row + ["Missing Data"]

    pdf_link = row[pdf_link_index]

    if not pdf_link or "http" not in pdf_link:
        label = "Missing Data"
    else:
        pdf_text = read_pdf_from_url(pdf_link)
        # read_pdf_from_url reports failures as "Error ..." strings. Passing
        # those on would make the model classify the error message instead of
        # the paper, so failed downloads are labelled without an API call.
        download_failed = pdf_text.startswith(
            ("Error: Unable to download PDF", "Error fetching PDF:")
        )
        if download_failed:
            print(f"⚠️ {pdf_text} ({pdf_link})")
            label = "Error"
        else:
            label = classify_paper(pdf_text)

    row.append(label)
    print(f"Processed: {pdf_link} -> {label}")
    return row

# Function to annotate dataset
def annotate_papers(input_file, output_file):
    """Label every paper listed in a CSV and write the result to a new CSV.

    Each data row is handed to ``process_row`` on a pool of five worker
    threads. The output keeps the rows in their input order.

    Args:
        input_file: Path of the source CSV; its header must contain a
            "PDF Link" column.
        output_file: Path of the CSV to write.

    Returns:
        None. Nothing is written when the input is empty or has no
        "PDF Link" column.
    """
    # Read CSV file. newline='' lets the csv module handle line endings
    # itself, as its documentation recommends.
    with open(input_file, mode='r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        headers = next(reader, None)
        if headers is None:
            print("❌ Input CSV is empty.")
            return

        # Find the index of "PDF Link" column dynamically
        try:
            pdf_link_index = headers.index("PDF Link")
        except ValueError:
            print("❌ 'PDF Link' column not found in CSV.")
            return

        # Add "Label" column if not present; otherwise remember where it is so
        # new labels land under the existing header.
        if "Label" in headers:
            label_index = headers.index("Label")
        else:
            label_index = None
            headers.append("Label")

        rows = list(reader)

    # Use ThreadPoolExecutor for parallel processing
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [
            executor.submit(process_row, row, pdf_link_index) for row in rows
        ]
        # Collect in submission order (not completion order) so the output
        # rows line up with the input rows.
        labelled_rows = [future.result() for future in futures]

    if label_index is not None:
        # process_row puts the label in one extra trailing cell; move it into
        # the existing "Label" column instead of leaving a headerless column.
        for row in labelled_rows:
            label = row.pop()
            row.extend([""] * (label_index + 1 - len(row)))
            row[label_index] = label

    # Write new CSV file with the label column
    with open(output_file, mode='w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(headers)
        writer.writerows(labelled_rows)

    print("✅ Labeling complete! New file saved.")

# Run annotation
CSV_FILE = "C:\\Users\\Lenovo\\Downloads\\Untitled spreadsheet - Sheet1.csv"
OUTPUT_FILE = "C:\\Users\\Lenovo\\Desktop\\neurips_papers_labeled.csv"

# Guarded so that importing this module does not start downloads and API
# calls; running it as a script or notebook cell behaves as before.
if __name__ == "__main__":
    annotate_papers(CSV_FILE, OUTPUT_FILE)
