**##Chat 643##**

###Research Paper Retrieval###

Pratik Mahajan ( 23M1731 )
Himanshu Maurya ( 24M1509 )





In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import json

In [None]:
!pip install faiss-cpu



In [None]:
!pip install sentence_transformers



Code for ACL query retrieval

In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
import aiohttp
import asyncio
from bs4 import BeautifulSoup

import nest_asyncio

# Apply nest_asyncio to allow nested event loops (useful for Jupyter)
nest_asyncio.apply()

# Load a pre-trained sentence transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Asynchronous function to fetch abstract from a paper's page
async def fetch_paper_abstract(session, paper_url):
    try:
        async with session.get(paper_url) as response:
            if response.status == 200:
                html = await response.text()
                soup = BeautifulSoup(html, "html.parser")

                # Extract abstract - we assume it is in spans without class
                all_spans = soup.find_all("span")
                abstract = " ".join([span.text.strip() for span in all_spans if not span.has_attr('class')])

                return abstract
            return None
    except Exception:
        return None

# Main asynchronous function to handle ACL papers and extract PDF links
async def acl_papers(years):
    url = f"https://aclanthology.org/events/acl-{years}/"
    titles = []
    abstracts = []
    abs_links = []
    pdf_links = []  # To store the PDF links

    # Create an asynchronous session
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status == 200:
                html = await response.text()
                soup = BeautifulSoup(html, "html.parser")

                # Extract paper titles and their URLs
                strong = soup.find_all('strong')
                for paper_entry in soup.find_all('p', class_='d-sm-flex align-items-stretch'):
                    title_element = paper_entry.find('strong')
                    if title_element:
                        title = title_element.find_next('a').text
                        abs_link = "https://aclanthology.org/" + title_element.find_next('a').get('href')

                        # Check for conditions to filter unwanted links
                        if "https://github.com/baidu" not in abs_link and "pdf\n" not in title:
                            titles.append(title)
                            abs_links.append(abs_link)

                            # Find the corresponding PDF link within the same paper entry
                            pdf_link_element = paper_entry.find('span', class_="d-block mr-2 text-nowrap list-button-row")
                            if pdf_link_element:
                                pdf_link = pdf_link_element.find('a').get('href')
                                if "https://aclanthology.org/" not in pdf_link:
                                    pdf_links.append("https://aclanthology.org/" + pdf_link)
                                else:
                                    pdf_link = pdf_link.replace('.bib', '')
                                    pdf_links.append(pdf_link)
                            else:
                                # Append None if PDF link is not found for this paper
                                pdf_links.append(None)

        # Fetch all paper abstracts asynchronously
        tasks = [fetch_paper_abstract(session, paper_url) for paper_url in abs_links]
        abstracts = await asyncio.gather(*tasks)

    # Ensure pdf_links is the same length as the titles
    # Fill missing pdf_links with None if there are fewer pdf links than titles
    ''' while len(pdf_links) < len(titles):
        pdf_links.append(None)'''

    # Create DataFrame
    df_acl = pd.DataFrame({
        'title': titles,
        'abstract': abstracts,
        'pdf_link': pdf_links
    })
    return df_acl

# Wrapper to run the asynchronous function
def run_acl_papers(years):
    try:
        return asyncio.run(acl_papers(years))
    except RuntimeError:
        # For environments with an already running event loop (like Jupyter notebooks)
        return asyncio.get_event_loop().run_until_complete(acl_papers(years))

# Step 1: Fetch papers and create a DataFrame
df_acl = run_acl_papers(year)

# Step 2: Generate embeddings for the paper abstracts
def embed_papers(df_acl):
    # Join title and abstract to generate a combined embedding
    df_acl['abstract'] = df_acl['abstract'].astype(str)  # Convert the 'abstract' column to string type
    combined_texts = df_acl['title'] + ' ' + df_acl['abstract']
    embeddings = model.encode(combined_texts.tolist(), convert_to_tensor=False)
    return embeddings

# Step 3: Create a FAISS index for fast retrieval
def create_faiss_index(embeddings):
    dim = embeddings.shape[1]  # Dimensionality of the embedding
    index = faiss.IndexFlatL2(dim)  # Using L2 (Euclidean) distance
    index.add(embeddings)  # Add embeddings to the index
    return index

# Step 4: Query-based retrieval using FAISS
def query_papers(query, index, df, num_results=5):
    # Convert query into an embedding
    query_embedding = model.encode([query], convert_to_tensor=False)

    # Search the FAISS index for the most similar papers
    distances, indices = index.search(np.array(query_embedding), num_results)

    # Retrieve the corresponding titles, abstracts, and PDF links
    results = df_acl.iloc[indices[0]].copy()
    results['distance'] = distances[0]
    return results

# Step 5: Putting it all together
# Embed the papers
paper_embeddings = embed_papers(df_acl)
# Convert embeddings to a numpy array
paper_embeddings = np.array(paper_embeddings)

# Create a FAISS index with these embeddings
faiss_index_acl = create_faiss_index(paper_embeddings)





In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
import aiohttp
import asyncio
from bs4 import BeautifulSoup

import nest_asyncio

# Apply nest_asyncio to allow nested event loops (useful for Jupyter)
nest_asyncio.apply()

# Load a pre-trained sentence transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Asynchronous function to fetch abstract from a paper's page
async def fetch_paper_abstract(session, paper_url):
    try:
        async with session.get(paper_url) as response:
            if response.status == 200:
                html = await response.text()
                soup = BeautifulSoup(html, "html.parser")

                # Extract abstract - we assume it is in spans without class
                all_spans = soup.find_all("span")
                abstract = " ".join([span.text.strip() for span in all_spans if not span.has_attr('class')])

                return abstract
            return None
    except Exception:
        return None

# Main asynchronous function to handle ACL papers and extract PDF links
async def acl_papers(years):
    url = f"https://aclanthology.org/events/acl-{years}/"
    titles = []
    abstracts = []
    abs_links = []
    pdf_links = []  # To store the PDF links

    # Create an asynchronous session
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status == 200:
                html = await response.text()
                soup = BeautifulSoup(html, "html.parser")

                # Extract paper titles and their URLs
                strong = soup.find_all('strong')
                for paper_entry in soup.find_all('p', class_='d-sm-flex align-items-stretch'):
                    title_element = paper_entry.find('strong')
                    if title_element:
                        title = title_element.find_next('a').text
                        abs_link = "https://aclanthology.org/" + title_element.find_next('a').get('href')

                        # Check for conditions to filter unwanted links
                        if "https://github.com/baidu" not in abs_link and "pdf\n" not in title:
                            titles.append(title)
                            abs_links.append(abs_link)

                            # Find the corresponding PDF link within the same paper entry
                            pdf_link_element = paper_entry.find('span', class_="d-block mr-2 text-nowrap list-button-row")
                            if pdf_link_element:
                                pdf_link = pdf_link_element.find('a').get('href')
                                if "https://aclanthology.org/" not in pdf_link:
                                    pdf_links.append("https://aclanthology.org/" + pdf_link)
                                else:
                                    pdf_link = pdf_link.replace('.bib', '')
                                    pdf_links.append(pdf_link)
                            else:
                                # Append None if PDF link is not found for this paper
                                pdf_links.append(None)

        # Fetch all paper abstracts asynchronously
        tasks = [fetch_paper_abstract(session, paper_url) for paper_url in abs_links]
        abstracts = await asyncio.gather(*tasks)

    # Create DataFrame
    df_acl = pd.DataFrame({
        'title': titles,
        'abstract': abstracts,
        'pdf_link': pdf_links
    })
    return df_acl

# Wrapper to run the asynchronous function
def run_acl_papers(years):
    try:
        return asyncio.run(acl_papers(years))
    except RuntimeError:
        # For environments with an already running event loop (like Jupyter notebooks)
        return asyncio.get_event_loop().run_until_complete(acl_papers(years))

# Step 1: Fetch papers and create a DataFrame
df_acl = run_acl_papers(year)

# Step 2: Generate embeddings for the paper abstracts
def embed_papers(df_acl):
    # Join title and abstract to generate a combined embedding
    df_acl['abstract'] = df_acl['abstract'].astype(str)  # Convert the 'abstract' column to string type
    combined_texts = df_acl['title'] + ' ' + df_acl['abstract']
    embeddings = model.encode(combined_texts.tolist(), convert_to_tensor=False)
    return embeddings

# Step 3: Create a FAISS index for fast retrieval
def create_faiss_index(embeddings):
    dim = embeddings.shape[1]  # Dimensionality of the embedding
    index = faiss.IndexFlatL2(dim)  # Using L2 (Euclidean) distance
    index.add(embeddings)  # Add embeddings to the index
    return index

# Step 4: Query-based retrieval using FAISS
def query_papers(query, index, df, top_k=5):
    # Convert query into an embedding
    query_embedding = model.encode([query], convert_to_tensor=False)

    # Search the FAISS index for the most similar papers
    distances, indices = index.search(np.array(query_embedding), top_k)

    # Normalize distances
    min_distance = distances.min()
    max_distance = distances.max()

    # Apply min-max normalization
    normalized_distances = (distances - min_distance) / (max_distance - min_distance)

    # Retrieve the corresponding titles, abstracts, and PDF links
    results = df.iloc[indices[0]].copy()
    results['normalized_distance'] = normalized_distances[0]
    return results.head(top_k)

# Step 5: Putting it all together
# Embed the papers
paper_embeddings = embed_papers(df_acl)
# Convert embeddings to a numpy array
paper_embeddings = np.array(paper_embeddings)

# Create a FAISS index with these embeddings
faiss_index_acl = create_faiss_index(paper_embeddings)

# Example of querying papers
# results = query_papers("Your query here", faiss_index_acl, df_acl, num_results=5)




In [None]:
6# Example query
query_sentence = '''query''' "transformers in NLP"
retrieved_papers = query_papers(query_sentence, faiss_index_acl, df_acl,top_k=5)

# Output results to CSV
retrieved_papers.to_csv('relevant_papers_with_pdfs.csv', index=False)

# Alternatively, output to Excel or JSON
retrieved_papers.to_excel('relevant_papers_with_pdfs.xlsx', index=False)
retrieved_papers.to_json('relevant_papers_with_pdfs.json', orient='records')

retrieved_papers


Unnamed: 0,title,abstract,pdf_link,normalized_distance
124,DialSQL: Dialogue Based Structured Query Gener...,The recent advance in deep learning and semant...,https://aclanthology.org/P18-1124.pdf,0.0
34,Semantic Parsing with Syntax- and Table-Aware ...,We present a generative model to map natural l...,https://aclanthology.org/P18-1034.pdf,0.622805
368,Personalized Language Model for Query Auto-Com...,Query auto-completion is a search engine featu...,https://aclanthology.org/P18-2111.pdf,0.653376
600,The Annotated Transformer,A major goal of open-source NLP is to quickly ...,https://aclanthology.org/W18-2509.pdf,0.855199
536,Phrase2VecGLM: Neural generalized language mod...,"In this work, we develop a novel, completely u...",https://aclanthology.org/W18-2313.pdf,1.0


Code for ICLR based query retrieval

In [None]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import pandas as pd
import nest_asyncio
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
import requests  # Added for synchronous requests

# Apply nest_asyncio to allow nested event loops (useful for Jupyter)
nest_asyncio.apply()

# Load pre-trained model from SentenceTransformers for embedding
model = SentenceTransformer('all-MiniLM-L6-v2')  # Compact model for fast embedding

# Asynchronous function to fetch a single paper's abstract and PDF link
async def fetch_paper_abstract_and_pdf(session, link):
    try:
        async with session.get(link) as response:
            if response.status == 200:
                html = await response.text()
                soup = BeautifulSoup(html, "html.parser")

                # Extract abstract
                abstract_div = soup.find('div', id='abstractExample')
                abstract = abstract_div.find('p').text.strip() if abstract_div and abstract_div.find('p') else 'Abstract not available'

                # Extract PDF link
                pdf_link = None
                pdf_tag = soup.find('a', href=lambda x: x and '.pdf' in x)  # Look for anchor tag with '.pdf' anywhere in href
                if pdf_tag:
                    pdf_link = pdf_tag.get('href')
                    print(f"Found PDF link: {pdf_link}")  # Debugging: print the found PDF link
                    if not pdf_link.startswith('http'):
                        pdf_link = "https://iclr.cc" + pdf_link  # Make the URL absolute if it's relative
                    print(f"Final PDF link: {pdf_link}")  # Debugging: print the final PDF link
                else:
                    '''print(f"No PDF link found for {link}")'''
                    pass
                return abstract, link
            else:
                return 'Abstract not available', None
    except Exception as e:
        return f'Error fetching abstract: {e}', None

# Main function to handle ICLR papers, now incorporating your scraping logic
async def iclr_papers_with_pdf(year):
    titles = []
    links = []
    abstracts = []
    pdf_links = []  # List to store PDF links

    # Using the scraping logic you provided
    url = f"https://iclr.cc/virtual/{year}/papers.html?filter=titles"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    title_list = soup.find('ul', class_="nav nav-pills")
    titles_data = title_list.find_next('ul').find_all('a')

    for head in titles_data:
        titles.append(head.text.strip())  # Extract title
        full_url = "https://iclr.cc/" + head.get('href')
        links.append(full_url)

    # Create an asynchronous session
    async with aiohttp.ClientSession() as session:
        # Fetch all paper abstracts and PDF links asynchronously
        tasks = [fetch_paper_abstract_and_pdf(session, paper_link) for paper_link in links]
        paper_data = await asyncio.gather(*tasks)

        # Unpack the fetched data
        for abstract, pdf_link in paper_data:
            abstracts.append(abstract)
            pdf_links.append(pdf_link)

    # Create DataFrame
    df = pd.DataFrame({
        'title': titles,
        'abstract': abstracts,
        'pdf_link': pdf_links  # Add PDF links to the DataFrame
    })
    df.reset_index(drop=True, inplace=True)
    return df

# Wrapper to run the asynchronous function
def run_iclr_papers_with_pdf(year):
    try:
        return asyncio.run(iclr_papers_with_pdf(year))
    except RuntimeError:
        # For environments with an already running event loop (like Jupyter notebooks)
        return asyncio.get_event_loop().run_until_complete(iclr_papers_with_pdf(year))

# Embedding the papers and creating FAISS index
def create_faiss_index(papers_df):
    titles = papers_df['title'].tolist()
    abstracts = papers_df['abstract'].tolist()

    # Combine titles and abstracts
    documents = [f"{title} {abstract}" for title, abstract in zip(titles, abstracts)]

    # Embed documents
    embeddings = model.encode(documents, show_progress_bar=True)

    # Normalize embeddings for cosine similarity
    normalized_embeddings = np.array([embedding / np.linalg.norm(embedding) for embedding in embeddings])

    # Initialize FAISS index with embedding dimension size
    dimension = normalized_embeddings.shape[1]  # Embedding size from the model
    faiss_index = faiss.IndexFlatIP(dimension)  # Using Inner Product (dot product, akin to cosine similarity)

    # Add embeddings to the FAISS index
    faiss_index.add(normalized_embeddings)

    return faiss_index, papers_df

# Query-based retrieval function with PDF links
def query_faiss_with_pdf(faiss_index, query, papers_df, top_k=5):
    # Embed the query
    query_embedding = model.encode([query])[0]

    # Normalize the query embedding
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    # Search the FAISS index
    distances, indices = faiss_index.search(np.array([query_embedding]), top_k)

    # Retrieve relevant papers
    results = papers_df.iloc[indices[0]]
    results['similarity_score'] = distances[0]  # Add similarity scores to the result

    return results

# Example call to retrieve papers based on a query
def run_query_based_retrieval_with_pdf_iclr(year, query, top_k=5):
    # Fetch the papers from the ICLR conference (as done previously)
    papers_df = run_iclr_papers_with_pdf(year)

    # Create the FAISS index for these papers
    faiss_index, papers_df = create_faiss_index(papers_df)

    # Retrieve the top-k most relevant papers
    relevant_papers = query_faiss_with_pdf(faiss_index, query, papers_df, top_k=top_k)

    # Save to CSV (or you can output to JSON or XLSX as needed)
    relevant_papers.to_csv(f"relevant_papers_with_pdf_iclr{year}.csv", index=False)

    return relevant_papers




In [None]:
# Example query to retrieve papers with PDF links and similarity scores
query = query ''' "how can we incorporate neural network for optimization" '''
year=2019
relevant_papers_df = run_query_based_retrieval_with_pdf_iclr(year, query, top_k=5)
relevant_papers_df


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['similarity_score'] = distances[0]  # Add similarity scores to the result


Unnamed: 0,title,abstract,pdf_link,similarity_score
197,"Deep, Skinny Neural Networks are not Universal...",In order to choose a neural network architectu...,https://iclr.cc//virtual/2019/poster/905,0.535856
267,Self-Tuning Networks: Bilevel Optimization of ...,Hyperparameter optimization can be formulated ...,https://iclr.cc//virtual/2019/poster/824,0.532122
245,Neural network gradient-based learning of blac...,Deep neural networks work well at approximatin...,https://iclr.cc//virtual/2019/poster/849,0.52304
156,Gradient Descent Provably Optimizes Over-param...,Abstract not available,https://iclr.cc//virtual/2019/poster/956,0.515727
145,Initialized Equilibrium Propagation for Backpr...,Deep neural networks are almost universally tr...,https://iclr.cc//virtual/2019/poster/968,0.507854


NIPS Query Based retrieval

In [None]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import pandas as pd
import nest_asyncio
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Load pre-trained model from SentenceTransformers for embedding
model = SentenceTransformer('all-MiniLM-L6-v2')

# Helper function to retry requests
async def fetch_with_retries(session, url, retries=3, delay=2):
    for attempt in range(retries):
        try:
            async with session.get(url) as response:
                text = await response.text()
                return text
        except aiohttp.ClientConnectionError as e:
            print(f"Connection error: {e}, retrying... ({attempt + 1}/{retries})")
        except aiohttp.ClientResponseError as e:
            print(f"Response error: {e}, retrying... ({attempt + 1}/{retries})")
        except asyncio.TimeoutError:
            print(f"Timeout on {url}, retrying... ({attempt + 1}/{retries})")

    raise aiohttp.ServerDisconnectedError(f"Failed to fetch {url} after {retries} retries")

# Asynchronous function to fetch a single paper's title, abstract, and PDF link
async def fetch_paper_data(session, url):
    try:
        text = await fetch_with_retries(session, url)
        soup = BeautifulSoup(text, "html.parser")

        # Get the paper title
        title_tag = soup.find('title')
        paper_title = title_tag.text.strip() if title_tag else "No Title"

        # Replace abstract extraction logic with your code
        # Find the 'Abstract' heading, then get the next paragraph containing the actual abstract
        abstract_heading = soup.find('p')
        if abstract_heading:
            abstract_tag = abstract_heading.find_next('p').find_next('p')
            abstract = abstract_tag.text if abstract_tag else "No Abstract"
        else:
            abstract = "No Abstract"

        # Generate the PDF link
        pdf_tag = soup.find('a', href=lambda x: x and x.endswith('.pdf'))
        pdf_link = pdf_tag['href'] if pdf_tag else "No PDF Link"

        if not pdf_link.startswith('http'):
            pdf_link = f"https://papers.nips.cc{pdf_link}"

        return paper_title, abstract, pdf_link
    except Exception as e:
        print(f"Failed to fetch paper data from {url}: {e}")
        return "No Title", "No Abstract", "No PDF Link"

# Function to fetch batch of papers
async def fetch_batch_papers(session, batch):
    tasks = [fetch_paper_data(session, url) for url in batch]
    paper_data = await asyncio.gather(*tasks)
    return paper_data

# Function to fetch all papers for the specified year in batches
async def fetch_all_papers_in_batches(base_url, year, batch_size=10, delay_between_batches=5):
    url = f"{base_url}/{year}"
    async with aiohttp.ClientSession() as session:
        text = await fetch_with_retries(session, url)
        soup = BeautifulSoup(text, "html.parser")
        titles = soup.find_all('a', href=True)

        # Prepare the URLs for the paper details
        paper_urls = [f"https://papers.nips.cc{title.get('href')}" for title in titles]

        # Split the paper URLs into batches
        batches = [paper_urls[i:i+batch_size] for i in range(0, len(paper_urls), batch_size)]

        all_paper_data = []

        for batch in batches:
            # Fetch papers in the current batch
            batch_paper_data = await fetch_batch_papers(session, batch)
            all_paper_data.extend(batch_paper_data)

            # Wait for a delay between batches to avoid server overload
            await asyncio.sleep(delay_between_batches)

        papers = [data[0] for data in all_paper_data]
        abstracts = [data[1] for data in all_paper_data]
        pdf_links = [data[2] for data in all_paper_data]

        df = pd.DataFrame({'Title': papers, 'Abstract': abstracts, 'PDF Link': pdf_links})
        return df

# Wrapper to run the asynchronous function
def nips_papers(year, batch_size=10):
    base_url = "https://papers.nips.cc/paper_files/paper"
    loop = asyncio.get_event_loop()
    df = loop.run_until_complete(fetch_all_papers_in_batches(base_url, year, batch_size))
    return df

# Function to create FAISS index for NIPS papers
def create_faiss_index(papers_df):
    titles = papers_df['Title'].tolist()
    abstracts = papers_df['Abstract'].tolist()

    # Combine titles and abstracts for embedding
    documents = [f"{title} {abstract}" for title, abstract in zip(titles, abstracts)]

    # Embed documents
    embeddings = model.encode(documents, show_progress_bar=True)

    # Normalize embeddings for cosine similarity
    normalized_embeddings = np.array([embedding / np.linalg.norm(embedding) for embedding in embeddings])

    # Initialize FAISS index with embedding dimension size
    dimension = normalized_embeddings.shape[1]  # Embedding size from the model
    faiss_index = faiss.IndexFlatIP(dimension)  # Using Inner Product (dot product, akin to cosine similarity)

    # Add embeddings to the FAISS index
    faiss_index.add(normalized_embeddings)

    return faiss_index, papers_df

# Query-based retrieval function with PDF links for NIPS papers
def query_faiss_with_pdf(faiss_index, query, papers_df, top_k=5):
    # Embed the query
    query_embedding = model.encode([query])[0]

    # Normalize the query embedding
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    # Search the FAISS index
    distances, indices = faiss_index.search(np.array([query_embedding]), top_k)

    # Retrieve relevant papers
    results = papers_df.iloc[indices[0]]
    results['similarity_score'] = distances[0]  # Add similarity scores to the result

    return results

# Function to fetch NIPS papers and perform query-based retrieval
def run_query_based_retrieval_with_pdf_nips1(year, query, top_k=5):
    # Fetch NIPS papers for the given year
    papers_df = nips_papers(year, batch_size=2000)

    # Create FAISS index for the papers
    faiss_index, papers_df = create_faiss_index(papers_df)

    # Perform query-based retrieval
    relevant_papers = query_faiss_with_pdf(faiss_index, query, papers_df, top_k=top_k)

    # Save results to CSV (or output in other formats like JSON/XLSX as needed)
    relevant_papers.to_csv(f"relevant_nips_papers_with_pdf_{year}.csv", index=False)

    return relevant_papers

# Example usage for query-based retrieval





In [None]:
query =  "deep learning for time series forecasting"  # Example query
year=2018
relevant_papers = run_query_based_retrieval_with_pdf_nips1(year, query, top_k=5)
print(f"Top relevant papers for NIPS 2018:\n")
relevant_papers

Connection error: Cannot connect to host papers.nips.cchttps:443 ssl:default [Name or service not known], retrying... (1/3)
Connection error: Cannot connect to host papers.nips.cchttps:443 ssl:default [Name or service not known], retrying... (1/3)
Connection error: Cannot connect to host papers.nips.cchttps:443 ssl:default [Name or service not known], retrying... (2/3)
Connection error: Cannot connect to host papers.nips.cchttps:443 ssl:default [Name or service not known], retrying... (3/3)
Failed to fetch paper data from https://papers.nips.cchttps://www.proceedings.com/search-result/?search_query=nips: Failed to fetch https://papers.nips.cchttps://www.proceedings.com/search-result/?search_query=nips after 3 retries
Connection error: Cannot connect to host papers.nips.cchttps:443 ssl:default [Name or service not known], retrying... (2/3)
Connection error: Cannot connect to host papers.nips.cchttps:443 ssl:default [Name or service not known], retrying... (3/3)
Failed to fetch paper dat

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Top relevant papers for NIPS 2018:



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['similarity_score'] = distances[0]  # Add similarity scores to the result


Unnamed: 0,Title,Abstract,PDF Link,similarity_score
383,Deep State Space Models for Time Series Foreca...,We present a novel approach to probabilistic t...,https://papers.nips.cc/paper_files/paper/2018/...,0.740821
124,Learning filter widths of spectral decompositi...,Time series classification using deep neural n...,https://papers.nips.cc/paper_files/paper/2018/...,0.503503
700,"FastGRNN: A Fast, Accurate, Stable and Tiny Ki...",This paper develops the FastRNN and FastGRNN a...,https://papers.nips.cc/paper_files/paper/2018/...,0.496754
883,Approximating Real-Time Recurrent Learning wit...,Despite all the impressive advances of recurre...,https://papers.nips.cc/paper_files/paper/2018/...,0.495714
409,Complex Gated Recurrent Neural Networks,Complex numbers have long been favoured for di...,https://papers.nips.cc/paper_files/paper/2018/...,0.489789


ECCV Query Based retrieval

In [None]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import pandas as pd
import nest_asyncio
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Load pre-trained model from SentenceTransformers for embedding
model = SentenceTransformer('all-MiniLM-L6-v2')

# Asynchronous function to fetch a single paper's abstract
async def fetch_paper_abstract(session, link):
    try:
        async with session.get(link) as response:
            if response.status == 200:
                html = await response.text()
                paper_soup = BeautifulSoup(html, "html.parser")

                # Extract abstract from the div with id='abstract'
                abstract_div = paper_soup.find('div', id='abstract')
                if abstract_div:
                    return abstract_div.text.strip()
                else:
                    return 'Abstract not available'
            else:
                return f'Error: {response.status}'
    except Exception as e:
        return f'Error fetching abstract: {e}'

# Asynchronous function to fetch ECCV papers for a given year
async def eccv_papers(year):
    year = f'eccv_{year}'
    paper_titles = []
    paper_links = []
    paper_pdfs = []
    paper_abstracts = []

    # URL of the main ECCV page
    url = "https://www.ecva.net/papers.php"

    # Create an asynchronous session
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status == 200:
                html = await response.text()
                soup = BeautifulSoup(html, "html.parser")

                # Find all divs with class 'accordion-content'
                titles = soup.find_all('div', class_='accordion-content')

                # Loop over all titles to extract paper titles and links
                for title in titles:
                    anchors = title.find_all('a')
                    for anchor in anchors:
                        href = anchor.get('href')
                        if href:
                            # If it's a paper link and contains the specified year
                            if href.endswith('.php') and year in href:
                                paper_title = anchor.text.strip()
                                if paper_title:
                                    paper_titles.append(paper_title)
                                    full_url = f'https://www.ecva.net/{href}'
                                    paper_links.append(full_url)
                                    '''pdf_link = full_url.replace('.php', '.pdf')  # Replace '.php' with '.pdf'
                                    paper_pdfs.append(pdf_link)'''
                                    # Add PDF link (if available, assuming the same structure)
                                if href.endswith('.php') and year in href and not href.endswith('-supp.pdf'):
                                    pdf_link =  f'https://www.ecva.net/{href}'
                                    pdf_link.replace('.php', '.pdf')
                                    paper_pdfs.append(pdf_link)

        # Fetch all paper abstracts asynchronously
        tasks = [fetch_paper_abstract(session, link) for link in paper_links]
        paper_abstracts = await asyncio.gather(*tasks)

    # Create DataFrame with all extracted data
    df = pd.DataFrame({
        'Title': paper_titles,
        'Abstract': paper_abstracts,
        'PDF Link': paper_pdfs
    })
    df.reset_index(drop=True, inplace=True)
    return df

# Wrapper to run the asynchronous function based on the environment
def run_eccv_papers(year):
    try:
        return asyncio.run(eccv_papers(year))
    except RuntimeError:
        # For environments with an already running event loop (like Jupyter notebooks)
        return asyncio.get_event_loop().run_until_complete(eccv_papers(year))

# Function to create FAISS index for ECCV papers
def create_faiss_index(papers_df):
    titles = papers_df['Title'].tolist()
    abstracts = papers_df['Abstract'].tolist()

    # Combine titles and abstracts for embedding
    documents = [f"{title} {abstract}" for title, abstract in zip(titles, abstracts)]

    # Embed documents
    embeddings = model.encode(documents, show_progress_bar=True)

    # Normalize embeddings for cosine similarity
    normalized_embeddings = np.array([embedding / np.linalg.norm(embedding) for embedding in embeddings])

    # Initialize FAISS index with embedding dimension size
    dimension = normalized_embeddings.shape[1]  # Embedding size from the model
    faiss_index = faiss.IndexFlatIP(dimension)  # Using Inner Product (dot product, akin to cosine similarity)

    # Add embeddings to the FAISS index
    faiss_index.add(normalized_embeddings)

    return faiss_index, papers_df

# Query-based retrieval function with PDF links for ECCV papers
def query_faiss_with_pdf(faiss_index, query, papers_df, top_k=5):
    # Embed the query
    query_embedding = model.encode([query])[0]

    # Normalize the query embedding
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    # Search the FAISS index
    distances, indices = faiss_index.search(np.array([query_embedding]), top_k)

    # Retrieve relevant papers
    results = papers_df.iloc[indices[0]]
    results['similarity_score'] = distances[0]  # Add similarity scores to the result

    return results

# Function to fetch ECCV papers and perform query-based retrieval
def run_query_based_retrieval_with_pdf_eccv(year, query, top_k=5):
    # Fetch ECCV papers for the given year
    papers_df = run_eccv_papers(year)

    # Create FAISS index for the papers
    faiss_index, papers_df = create_faiss_index(papers_df)

    # Perform query-based retrieval
    relevant_papers = query_faiss_with_pdf(faiss_index, query, papers_df, top_k=top_k)

    # Save results to CSV (or output in other formats like JSON/XLSX as needed)
    relevant_papers.to_csv(f"relevant_eccv_papers_with_pdf_{year}.csv", index=False)

    return relevant_papers





In [None]:
query = "deep learning for image recognition"  # Example query
year=2018
relevant_papers = run_query_based_retrieval_with_pdf_eccv(year, query, top_k=5)
print(f"Top relevant papers for ECCV 2018:\n")
relevant_papers

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Top relevant papers for ECCV 2018:



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['similarity_score'] = distances[0]  # Add similarity scores to the result


Unnamed: 0,Title,Abstract,PDF Link,similarity_score
99,Diverse feature visualizations reveal invarian...,Visualizing features in deep neural networks (...,https://www.ecva.net/papers/eccv_2018/papers_E...,0.538293
664,Evaluating Capability of Deep Neural Networks ...,Inspired by the pioneering work of information...,https://www.ecva.net/papers/eccv_2018/papers_E...,0.535041
123,Transductive Semi-Supervised Deep Learning usi...,"In this paper, we propose Transductive Semi-Su...",https://www.ecva.net/papers/eccv_2018/papers_E...,0.526344
53,DeepKSPD: Learning Kernel-matrix-based SPD Rep...,"As a second-order pooled representation, covar...",https://www.ecva.net/papers/eccv_2018/papers_E...,0.525246
223,Semi-Supervised Deep Learning with Memory,We consider the semi-supervised multi-class cl...,https://www.ecva.net/papers/eccv_2018/papers_E...,0.521009


ICML Query Based Retrieval


In [None]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import pandas as pd
import nest_asyncio
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
import requests

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Load pre-trained model from SentenceTransformers for embedding
model = SentenceTransformer('all-MiniLM-L6-v2')

# Dictionary for ICML year and corresponding volume number
icml_volumes = {
    2016: 48,
    2017: 70,
    2018: 80,
    2019: 97,
    2020: 119,
    2021: 139,
    2022: 162,
    2023: 202,
    2024: 250  # Hypothetical for 2024, update when available
}

# Asynchronous function to fetch a single paper's abstract
async def fetch_paper_abstract(session, link):
    try:
        async with session.get(link) as response:
            if response.status == 200:
                html = await response.text()
                paper_soup = BeautifulSoup(html, "html.parser")

                # Extract abstract from the div with id='abstract'
                abstract_div = paper_soup.find('div', id='abstract')
                if abstract_div:
                    return abstract_div.text.strip()
                else:
                    return 'Abstract not available'
            else:
                return f'Error: {response.status}'
    except Exception as e:
        return f'Error fetching abstract: {e}'

# Function to fetch ICML papers for a given year
def icml_papers(year):
    paper_titles = []
    paper_links = []
    pdf_links = []
    paper_years = []
    abstracts = []

    # Get the volume number for the year
    volume = icml_volumes.get(year)
    if volume is None:
        print(f"Volume for ICML {year} is not available.")
        return None

    # URL of the ICML page for a specific year
    url = f"https://proceedings.mlr.press/v{volume}/"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all paper listings
    papers = soup.find_all('div', class_='paper')

    # Check if papers were found
    if not papers:
        print(f"No papers found for ICML {year} at {url}")
        return None

    # Loop over all papers
    for paper_div in papers:
        # Get the title from 'p' tag with class 'title'
        title = paper_div.find('p', class_='title').text.strip()
        # Get the paper link from 'a' tag (if available)
        paper_link = paper_div.find('a').get('href')
        if not paper_link.startswith('http'):
            paper_link = f"https://proceedings.mlr.press{paper_link}"

        # Get the PDF link (from 'a' tag with text 'Download PDF')
        pdf_link = paper_div.find('a', text="Download PDF")
        if pdf_link:
            pdf_link = pdf_link.get('href')
            if not pdf_link.startswith('http'):
                pdf_link = f"https://proceedings.mlr.press{pdf_link}"
        else:
            pdf_link = 'No PDF link available'

        # Append the title, link, PDF, and year
        paper_titles.append(title)
        paper_links.append(paper_link)
        pdf_links.append(pdf_link)
        paper_years.append(year)

    # Create DataFrame with scraped data
    df = pd.DataFrame({
        'year': paper_years,
        'title': paper_titles,
        'paper link': paper_links,
        'pdf link': pdf_links,
        'Conference': 'ICML'
    })

    # Explicitly ensure 'year' column is of integer type
    df['year'] = df['year'].astype(int)

    return df

# Function to create FAISS index for ICML papers
def create_faiss_index_icml(papers_df):
    titles = papers_df['title'].tolist()

    # Embed titles for FAISS indexing
    embeddings = model.encode(titles, show_progress_bar=True)

    # Normalize embeddings for cosine similarity
    normalized_embeddings = np.array([embedding / np.linalg.norm(embedding) for embedding in embeddings])

    # Initialize FAISS index with embedding dimension size
    dimension = normalized_embeddings.shape[1]  # Embedding size from the model
    faiss_index = faiss.IndexFlatIP(dimension)  # Using Inner Product (dot product, akin to cosine similarity)

    # Add embeddings to the FAISS index
    faiss_index.add(normalized_embeddings)

    return faiss_index, papers_df

# Query-based retrieval function with PDF links for ICML papers
def query_faiss_with_pdf_icml(faiss_index, query, papers_df, top_k=5):
    # Embed the query
    query_embedding = model.encode([query])[0]

    # Normalize the query embedding
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    # Search the FAISS index
    distances, indices = faiss_index.search(np.array([query_embedding]), top_k)

    # Retrieve relevant papers
    results = papers_df.iloc[indices[0]]
    results['similarity_score'] = distances[0]  # Add similarity scores to the result

    return results

# Function to fetch ICML papers and perform query-based retrieval for 2018
def run_query_based_retrieval_with_pdf_icml(year, query, top_k=5):
    # Fetch ICML papers for the given year
    papers_df = icml_papers(year)

    # Create FAISS index for the papers
    faiss_index, papers_df = create_faiss_index_icml(papers_df)

    # Fetch abstracts asynchronously for the papers
    async def fetch_all_abstracts(session):
        tasks = [fetch_paper_abstract(session, link) for link in papers_df['paper link']]
        return await asyncio.gather(*tasks)

    # Fetch abstracts asynchronously
    async def fetch_abstracts():
        async with aiohttp.ClientSession() as session:
            return await fetch_all_abstracts(session)

    # Run the async function to fetch abstracts
    abstracts = asyncio.run(fetch_abstracts())
    papers_df['abstract'] = abstracts

    # Perform query-based retrieval
    relevant_papers = query_faiss_with_pdf_icml(faiss_index, query, papers_df, top_k=top_k)

    # Save results to CSV
    relevant_papers.to_csv(f"relevant_icml_papers_with_pdf_{year}.csv", index=False)

    return relevant_papers





In [None]:
query = "deep learning for image recognition"  # Example query
year=2018
relevant_papers = run_query_based_retrieval_with_pdf_icml(year, query, top_k=5)
print(f"Top relevant papers for ICML 2018:\n")
relevant_papers

  pdf_link = paper_div.find('a', text="Download PDF")


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Top relevant papers for ICML 2018:



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['similarity_score'] = distances[0]  # Add similarity scores to the result


Unnamed: 0,year,title,paper link,pdf link,Conference,abstract,similarity_score
541,2018,Deep Predictive Coding Network for Object Reco...,https://proceedings.mlr.press/v80/wen18a.html,http://proceedings.mlr.press/v80/wen18a/wen18a...,ICML,Based on the predictive coding theory in neuro...,0.622312
430,2018,Gradually Updated Neural Networks for Large-Sc...,https://proceedings.mlr.press/v80/qiao18b.html,http://proceedings.mlr.press/v80/qiao18b/qiao1...,ICML,Depth is one of the keys that make neural netw...,0.583673
611,2018,Understanding Generalization and Optimization ...,https://proceedings.mlr.press/v80/zhou18a.html,http://proceedings.mlr.press/v80/zhou18a/zhou1...,ICML,This work aims to provide understandings on th...,0.548413
445,2018,Learning to Reweight Examples for Robust Deep ...,https://proceedings.mlr.press/v80/ren18a.html,http://proceedings.mlr.press/v80/ren18a/ren18a...,ICML,Deep neural networks have been shown to be ver...,0.533541
55,2018,To Understand Deep Learning We Need to Underst...,https://proceedings.mlr.press/v80/belkin18a.html,http://proceedings.mlr.press/v80/belkin18a/bel...,ICML,Generalization performance of classifiers in d...,0.530446


ICCV Query Based Retrieval

In [None]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import pandas as pd
import nest_asyncio
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
import requests

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Load pre-trained model from SentenceTransformers for embedding
model = SentenceTransformer('all-MiniLM-L6-v2')

class ConferenceScraper:
    def __init__(self, conf, years):
        self.conf = conf
        self.years = years if isinstance(years, list) else [years]  # Ensure years is a list
        self.data = []

    def get_all_papers(self, year):
        base_url = f"https://openaccess.thecvf.com/{self.conf}{year}"
        url = f"{base_url}?day=all"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")

        paper_titles = soup.find_all('dt', class_='ptitle')

        for title in paper_titles:
            paper_title = title.find('a').text.strip()
            a_tag = title.find('a')
            if a_tag:
                paper_link = a_tag.get('href')
                paper_link = f"http://openaccess.thecvf.com/{paper_link}"

                # Find corresponding PDF link
                pdf_link = None
                pdf_tag = title.find_next_sibling('dd')
                if pdf_tag and pdf_tag.find('a'):
                    pdf_link = pdf_tag.find('a').get('href')
                    if pdf_link.endswith('.pdf'):
                        pdf_link = f"http://openaccess.thecvf.com/{pdf_link}"

                # Fetch the abstract
                abstract = self.get_abstract(paper_link)

                # Add the data to the list
                self.data.append({
                    "year": year,
                    "title": paper_title,
                    "link": paper_link,
                   # "pdf link": pdf_link,
                   # "abstract": abstract,
                    "Conference": self.conf
                })

    def get_abstract(self, paper_link):
        try:
            response = requests.get(paper_link)
            soup = BeautifulSoup(response.text, "html.parser")
            abstract_div = soup.find('blockquote', class_='abstract')
            if abstract_div:
                return abstract_div.text.strip()
            else:
                return 'Abstract not available'
        except Exception as e:
            return f'Error fetching abstract: {e}'

    def fetch_conference_papers(self):
        for year in self.years:
            print(f"Fetching papers for {self.conf} {year}...")
            self.get_all_papers(year)

        # Create DataFrame with the titles, links, PDF links, abstracts, and year
        df = pd.DataFrame(self.data)
        return df

# Function to create FAISS index for ICCV papers
def create_faiss_index_iccv(papers_df):
    titles = papers_df['title'].tolist()

    # Embed titles for FAISS indexing
    embeddings = model.encode(titles, show_progress_bar=True)

    # Normalize embeddings for cosine similarity
    normalized_embeddings = np.array([embedding / np.linalg.norm(embedding) for embedding in embeddings])

    # Initialize FAISS index with embedding dimension size
    dimension = normalized_embeddings.shape[1]  # Embedding size from the model
    faiss_index = faiss.IndexFlatIP(dimension)  # Using Inner Product (dot product, akin to cosine similarity)

    # Add embeddings to the FAISS index
    faiss_index.add(normalized_embeddings)

    return faiss_index, papers_df

# Query-based retrieval function with PDF links for ICCV papers
def query_faiss_with_pdf_iccv(faiss_index, query, papers_df, top_k=5):
    # Embed the query
    query_embedding = model.encode([query])[0]

    # Normalize the query embedding
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    # Search the FAISS index
    distances, indices = faiss_index.search(np.array([query_embedding]), top_k)

    # Retrieve relevant papers
    results = papers_df.iloc[indices[0]]
    results['similarity_score'] = distances[0]  # Add similarity scores to the result

    return results

# Function to run the entire process for ICCV conference
def run_query_based_retrieval_with_pdf_iccv(year, query, top_k=5):
    # Fetch ICCV papers for the given year
    scraper = ConferenceScraper(conf="ICCV", years=[year])
    papers_df = scraper.fetch_conference_papers()

    # Create FAISS index for the papers
    faiss_index, papers_df = create_faiss_index_iccv(papers_df)

    # Perform query-based retrieval
    relevant_papers = query_faiss_with_pdf_iccv(faiss_index, query, papers_df, top_k=top_k)

    # Save results to CSV
    relevant_papers.to_csv(f"relevant_iccv_papers_with_pdf_{year}.csv", index=False)

    return relevant_papers






In [None]:
query = "deep learning for image recognition"
year=2023  # Example query
relevant_papers = run_query_based_retrieval_with_pdf_iccv(year, query, top_k=5)
print(f"Top relevant papers for ICCV 2018:\n")
relevant_papers

CVPR Query Based Retrieval

In [None]:
import requests
import pandas as pd
import numpy as np
import faiss
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import nest_asyncio

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Load pre-trained model from SentenceTransformers for embedding
model = SentenceTransformer('all-MiniLM-L6-v2')

class ConferenceScraper:
    def __init__(self, conf, years):
        self.conf = conf
        self.years = years if isinstance(years, list) else [years]  # Ensure years is a list
        self.data = []

    def get_all_papers(self, year):
        base_url = f"https://openaccess.thecvf.com/{self.conf}{year}"
        url = f"{base_url}?day=all"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")

        paper_titles = soup.find_all('dt', class_='ptitle')

        for title in paper_titles:
            paper_title = title.find('a').text.strip()
            a_tag = title.find('a')
            if a_tag:
                paper_link = a_tag.get('href')
                paper_link = f"http://openaccess.thecvf.com/{paper_link}"

                # Find corresponding PDF link
                pdf_link = None
                pdf_tag = title.find_next_sibling('dd')
                if pdf_tag and pdf_tag.find('a'):
                    pdf_link = pdf_tag.find('a').get('href')
                    if pdf_link.endswith('.pdf'):
                        pdf_link = f"http://openaccess.thecvf.com/{pdf_link}"

                # Fetch the abstract
                abstract = self.get_abstract(paper_link)

                # Add the data to the list
                self.data.append({
                    "year": year,
                    "title": paper_title,
                    "link": paper_link,
                   # "pdf link": pdf_link,
                    #"abstract": abstract,
                    "Conference": self.conf
                })

    def get_abstract(self, paper_link):
        try:
            response = requests.get(paper_link)
            soup = BeautifulSoup(response.text, "html.parser")
            abstract_div = soup.find('blockquote', class_='abstract')
            if abstract_div:
                return abstract_div.text.strip()
            else:
                return 'Abstract not available'
        except Exception as e:
            return f'Error fetching abstract: {e}'

    def fetch_conference_papers(self):
        for year in self.years:
            print(f"Fetching papers for {self.conf} {year}...")
            self.get_all_papers(year)

        # Create DataFrame with the titles, links, PDF links, abstracts, and year
        df = pd.DataFrame(self.data)
        return df

# Function to create FAISS index for CVPR papers
def create_faiss_index_cvpr(papers_df):
    titles = papers_df['title'].tolist()

    # Embed titles for FAISS indexing
    embeddings = model.encode(titles, show_progress_bar=True)

    # Normalize embeddings for cosine similarity
    normalized_embeddings = np.array([embedding / np.linalg.norm(embedding) for embedding in embeddings])

    # Initialize FAISS index with embedding dimension size
    dimension = normalized_embeddings.shape[1]  # Embedding size from the model
    faiss_index = faiss.IndexFlatIP(dimension)  # Using Inner Product (dot product, akin to cosine similarity)

    # Add embeddings to the FAISS index
    faiss_index.add(normalized_embeddings)

    return faiss_index, papers_df

# Query-based retrieval function with PDF links for CVPR papers
def query_faiss_with_pdf_cvpr(faiss_index, query, papers_df, top_k=5):
    # Embed the query
    query_embedding = model.encode([query])[0]

    # Normalize the query embedding
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    # Search the FAISS index
    distances, indices = faiss_index.search(np.array([query_embedding]), top_k)

    # Retrieve relevant papers
    results = papers_df.iloc[indices[0]]
    results['similarity_score'] = distances[0]  # Add similarity scores to the result

    return results

# Function to run the entire process for CVPR conference
def run_query_based_retrieval_with_pdf_cvpr(year, query, top_k=5):
    # Fetch CVPR papers for the given year
    scraper = ConferenceScraper(conf="CVPR", years=[year])
    papers_df = scraper.fetch_conference_papers()

    # Create FAISS index for the papers
    faiss_index, papers_df = create_faiss_index_cvpr(papers_df)

    # Perform query-based retrieval
    relevant_papers = query_faiss_with_pdf_cvpr(faiss_index, query, papers_df, top_k=top_k)

    # Save results to CSV
    relevant_papers.to_csv(f"relevant_cvpr_papers_with_pdf_{year}.csv", index=False)

    return relevant_papers

# Run the retrieval for CVPR 2018 with a sample query


In [None]:
year = 2022
query = "deep learning"  # Example query
relevant_papers = run_query_based_retrieval_with_pdf_cvpr(year, query)

# Print the relevant papers retrieved
relevant_papers


In [None]:
conference=input("Enter the conference name: ")
year=int(input("Enter the year: "))
query=input("Enter the query:  ")
if conference=="ICML":
    relevant_papers=run_query_based_retrieval_with_pdf_icml(year,query,top_k=5)
elif conference=="CVPR":
    relevant_papers=run_query_based_retrieval_with_pdf_cvpr(year,query)
elif conference=="ICCV":
    relevant_papers=run_query_based_retrieval_with_pdf_iccv(year,query,top_k=5)
elif conference=="ACL":
    relevant_papers=query_papers(query, faiss_index_acl, df_acl,year)
elif conference=="NIPS":
    relevant_papers=run_query_based_retrieval_with_pdf_nips1(year,query,top_k=5)
elif conference=="ECCV":
    relevant_papers=run_query_based_retrieval_with_pdf_eccv(year,query,top_k=5)
elif conference=="ICLR":
    relevant_papers=run_query_based_retrieval_with_pdf_iclr(year,query,top_k=5)
else:

    print("Invalid conference name")



relevant_papers


Enter the conference name: NIPS
Enter the year: 2022
Enter the query:  uses of machine learning
Connection error: Cannot connect to host papers.nips.cchttps:443 ssl:default [Name or service not known], retrying... (1/3)
Connection error: Cannot connect to host papers.nips.cchttps:443 ssl:default [Name or service not known], retrying... (2/3)
Connection error: Cannot connect to host papers.nips.cchttps:443 ssl:default [Name or service not known], retrying... (3/3)
Failed to fetch paper data from https://papers.nips.cchttps://www.proceedings.com/search-result/?search_query=nips: Failed to fetch https://papers.nips.cchttps://www.proceedings.com/search-result/?search_query=nips after 3 retries
Connection error: Server disconnected, retrying... (1/3)
Connection error: Server disconnected, retrying... (1/3)
Connection error: Server disconnected, retrying... (1/3)
Connection error: Server disconnected, retrying... (1/3)
Connection error: Cannot connect to host papers.nips.cchttps:443 ssl:defa

Batches:   0%|          | 0/89 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['similarity_score'] = distances[0]  # Add similarity scores to the result


Unnamed: 0,Title,Abstract,PDF Link,similarity_score
1696,A Dataset for Efforts Towards Achieving the Su...,Among United Nations' 17 Sustainable Developme...,https://papers.nips.cc/paper_files/paper/2022/...,0.451049
2662,Fuzzy Learning Machine,Classification is one of the most important pr...,https://papers.nips.cc/paper_files/paper/2022/...,0.404125
1341,Modeling the Machine Learning Multiverse,Amid mounting concern about the reliability an...,https://papers.nips.cc/paper_files/paper/2022/...,0.400395
1020,A Theory of PAC Learnability under Transformat...,Transformation invariances are present in many...,https://papers.nips.cc/paper_files/paper/2022/...,0.385254
2082,Knowledge Distillation: Bad Models Can Be Good...,Large neural networks trained in the overparam...,https://papers.nips.cc/paper_files/paper/2022/...,0.384567
