In [1]:
from autosearch.functions.text_analysis import chunk_pdf

from autosearch.database.paper_database import PaperDatabase
from autosearch.analysis.document_analyzer import DocumentAnalyzer
from autosearch.research_project import ResearchProject
from autosearch.write_blog import WriteBlog

import autogen
from typing import List, Dict, Any



In [2]:
import os
from dotenv import load_dotenv
from azure.core.exceptions import HttpResponseError

# Populate the process environment (python-dotenv) before reading settings from it.
load_dotenv()

# Document Intelligence key and endpoint come from the environment rather than
# being hard-coded; a variable that is not set yields None here.
config = dict(
    doc_api_key=os.environ.get("DOCUMENT_INTELLIGENCE_KEY"),
    doc_endpoint=os.environ.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
)

# NOTE(review): presumably consumed by the Hugging Face tokenizers library — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "True"

In [3]:
# Title passed as `title=` to the (currently commented-out) test_project.run(...) call.
title = "Exploring the Intricacies of Polymer Representation: Unraveling Complexity"

In [4]:
# Build the research project. Every keyword below is forwarded to
# ResearchProject unchanged; see that class for the exact meaning of each one.
test_project = ResearchProject(
    project_id="project_test",
    version="0.2",
    config=config,
    config_file="OAI_CONFIG_LIST-sweden-505",
    initiate_db=False,
    funcClsList=[
        "FactualCheck",
        "GetPDF",
        "GetPDFs",
        "UrlCheck",
        "AcademicRetriever",
        "AcademicSearch",
        "WriteSection",
        "PlotFigure",
    ],
    communiteList=[
        "outline_agents",
        "write_section_agents",
        "instructor_agents",
    ],
    local_papers_dir="./papers",
    models=["gpt-35-turbo", "gpt-35-turbo-16k"],
)

# Configuration object that is later handed to process_arxiv_papers.
project_config = test_project.ProjectConfig
# print(project_config.logging_session_id)
# test_project.run(
#     title=title,
#     target_audience="expert in experimental polymer science and machine learning experts",
# )

Logging session ID: fe1dd17f-70f0-4e58-adeb-67694cb5b0d2
Equipping function 'academic_retriever' to agent 'blog_editor-in-chief'
Equipping function 'academic_search' to agent 'blog_editor-in-chief'
Equipping function 'get_pdf' to agent 'blog_editor-in-chief'
Equipping function 'get_pdfs' to agent 'blog_editor-in-chief'
Equipping function 'factual_check' to agent 'content_strategist'
Equipping function 'academic_retriever' to agent 'content_strategist'
Equipping function 'academic_search' to agent 'content_strategist'
Equipping function 'get_pdf' to agent 'content_strategist'


The return type of the function 'wrapper' is not annotated. Although annotating it is optional, the function should return either a string, a subclass of 'pydantic.BaseModel'.


Equipping function 'get_pdfs' to agent 'content_strategist'
Equipping function 'write_section' to agent 'writing_coordinator'
Equipping function 'factual_check' to agent 'content_review_specialist'
Equipping function 'academic_retriever' to agent 'content_review_specialist'
Equipping function 'academic_search' to agent 'content_review_specialist'
Equipping function 'get_pdf' to agent 'content_review_specialist'


The return type of the function 'wrapper' is not annotated. Although annotating it is optional, the function should return either a string, a subclass of 'pydantic.BaseModel'.


Equipping function 'plot_figure' to agent 'visualization_specialist'
Equipping function 'academic_search' to agent 'topic_expert'
Equipping function 'academic_retriever' to agent 'topic_expert'
Equipping function 'academic_search' to agent 'research_resource_expert'
Equipping function 'academic_retriever' to agent 'research_resource_expert'
Equipping function 'get_pdf' to agent 'research_resource_expert'
Processing local PDFs...
Error: The folder ./papers does not exist.


In [5]:
import os

def list_files_in_directory(directory):
    """
    Return the names, without extension, of the ``.txt`` files in a directory.

    Only entries directly inside ``directory`` for which ``os.path.isfile`` is
    true are considered, so a sub-directory whose name ends in ``.txt`` is
    skipped. The result is sorted because ``os.listdir`` returns entries in
    arbitrary order, which would otherwise make the list differ between runs.

    Args:
    directory (str): Path of the directory to scan.

    Returns:
    list: Sorted file names with the trailing ".txt" removed. An empty list is
        returned (after printing a message) when the directory does not exist
        or any other error occurs while listing it.
    """
    suffix = ".txt"
    try:
        return sorted(
            # Strip the extension by length; the endswith() check guarantees it is there.
            name[:-len(suffix)]
            for name in os.listdir(directory)
            # Cheap string test first, filesystem stat second.
            if name.endswith(suffix) and os.path.isfile(os.path.join(directory, name))
        )
    except FileNotFoundError:
        print(f"The directory {directory} does not exist.")
        return []
    except Exception as e:
        # Best-effort helper: report the problem and fall back to "no files".
        print(f"An error occurred: {e}")
        return []


In [6]:
from autosearch.api.arxiv_api import ArxivAPI
import pandas as pd
import json
import numpy as np
from typing import List
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

class NumpyEncoder(json.JSONEncoder):
    """
    JSON encoder that additionally understands NumPy scalars and arrays.

    Intended to be passed as ``cls=NumpyEncoder`` to ``json.dumps``.
    """

    def default(self, obj):
        """
        Convert NumPy objects that the standard encoder rejects.

        Args:
        obj: The object ``json`` could not serialise on its own.

        Returns:
        A built-in ``bool``, ``int``, ``float`` or (nested) ``list`` equivalent.

        Raises:
        TypeError: Raised by the base class for any other unsupported type.
        """
        # np.bool_ is neither a Python bool nor an np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

def _to_flat_value(value: Any) -> Any:
    """Return list/dict/NumPy values as a JSON string; pass every other value through unchanged."""
    if isinstance(value, (list, dict, np.integer, np.floating, np.ndarray)):
        return json.dumps(value, cls=NumpyEncoder)
    return value


def process_single_paper(arxiv_id: str, ProjectConfig, max_token_size: int, reference: bool = False) -> List[Dict[str, Any]]:
    """
    Process a single arXiv paper and return one flat record per chunk.

    The paper is looked up with ``ArxivAPI.get_paper_metadata`` and split with
    ``ProjectConfig.doc_analyzer.pdf2md_chunk``. Each record is built in this
    order: ``arxiv_id`` and ``page_content``, then every chunk metadata entry
    under a ``metadata_<key>`` name, then every field of ``paper.to_dict()``.
    Because paper fields are written last, a field with the same name as an
    earlier key replaces that key's value. List, dict and NumPy values are
    stored as JSON strings.

    Args:
    arxiv_id (str): Identifier handed to ``ArxivAPI.get_paper_metadata``.
    ProjectConfig: Object exposing ``doc_analyzer.pdf2md_chunk``.
    max_token_size (int): Forwarded unchanged to ``pdf2md_chunk``.
    reference (bool): Forwarded unchanged to ``pdf2md_chunk``.

    Returns:
    List[Dict[str, Any]]: One dict per chunk. An empty list is returned (after
        printing the error) if fetching, chunking or flattening raises.
    """
    arxiv_api = ArxivAPI()
    try:
        # Fetch the paper metadata
        paper = arxiv_api.get_paper_metadata(arxiv_id)

        # Get the chunks
        chunks = ProjectConfig.doc_analyzer.pdf2md_chunk(paper, max_token_size, reference)

        # The paper-level fields are identical for every chunk, so convert them once
        # instead of calling to_dict() and re-encoding on each loop iteration.
        paper_fields = {key: _to_flat_value(value) for key, value in paper.to_dict().items()}

        paper_data = []
        for chunk in chunks:
            chunk_data = {
                'arxiv_id': arxiv_id,
                'page_content': chunk.page_content,
            }

            # Unfold the chunk metadata into prefixed top-level keys.
            if hasattr(chunk, 'metadata'):
                for key, value in chunk.metadata.items():
                    chunk_data[f'metadata_{key}'] = _to_flat_value(value)

            # Paper fields go last, matching the original key order and overwrite rule.
            chunk_data.update(paper_fields)
            paper_data.append(chunk_data)

        return paper_data

    except Exception as e:
        # Best effort: one bad paper must not abort a batch run.
        print(f"Error processing paper {arxiv_id}: {str(e)}")
        return []



In [7]:
def process_arxiv_papers(arxiv_ids: List[str], ProjectConfig, output_dir: str = ".", max_workers: int = 5, max_token_size: int = 1200, reference: bool = False):
    """
    Process a list of arXiv papers in parallel, extract chunks, and combine them into a single DataFrame.

    Each ID is passed to ``process_single_paper`` on a thread pool. The rows
    are collected in completion order, the ``id`` column is dropped when
    present, and the result is written to ``arxiv_papers_chunks.csv`` inside
    ``output_dir``.

    Args:
    arxiv_ids (List[str]): List of arXiv IDs to process.
    ProjectConfig: Forwarded unchanged to ``process_single_paper``.
    output_dir (str): Directory to save the CSV file; created if missing. Defaults to current directory.
    max_workers (int): Maximum number of worker threads. Defaults to 5.
    max_token_size (int): Forwarded unchanged to ``process_single_paper``. Defaults to 1200.
    reference (bool): Forwarded unchanged to ``process_single_paper``. Defaults to False.

    Returns:
    tuple: ``(df_chunks, csv_filename)`` — the combined DataFrame of all chunks
        from all papers and the path of the CSV file it was saved to.
    """
    all_data = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_arxiv_id = {
            executor.submit(process_single_paper, arxiv_id, ProjectConfig, max_token_size, reference): arxiv_id
            for arxiv_id in arxiv_ids
        }
        for future in as_completed(future_to_arxiv_id):
            arxiv_id = future_to_arxiv_id[future]
            try:
                paper_data = future.result()
                all_data.extend(paper_data)
                if paper_data:
                    print(f"Processed paper: {arxiv_id}")
                else:
                    # process_single_paper reports its own failures by returning an
                    # empty list, so do not claim success when nothing came back.
                    print(f"No chunks extracted for paper: {arxiv_id}")
            except Exception as exc:
                print(f"Paper {arxiv_id} generated an exception: {exc}")

    # Create DataFrame
    df_chunks = pd.DataFrame(all_data)
    # Drop column "id" if it exists; it is absent when no paper produced any
    # rows, and a strict drop would raise KeyError in that case.
    df_chunks = df_chunks.drop(columns=["id"], errors="ignore")

    # Save to CSV
    os.makedirs(output_dir, exist_ok=True)
    csv_filename = os.path.join(output_dir, "arxiv_papers_chunks.csv")
    df_chunks.to_csv(csv_filename, index=False)

    print(f"Data saved to {csv_filename}")

    return df_chunks, csv_filename



# Example usage
# paper_dir is only needed by the commented-out directory scan on the next line.
paper_dir = "/home/azureuser/autogen_uscases/autosearch/notebooks/graphrag/input-txt"
# arxiv_ids = list_files_in_directory(paper_dir)
# NOTE(review): '751' is rejected as an invalid arXiv identifier in the recorded
# output of this cell — confirm the intended ID.
arxiv_ids = [
    '2209.14803v1', '2103.14174v1', '2105.05278v1', '2403.20021v2',
    '1002.2059v1', '2205.08619v1', '2311.14744v1', '2109.02794v1',
    '2102.08134v2', '2209.13557v2', '2209.01307v4', '2205.13757v1',
    '0912.3344v1', '2311.15481v3', '751', '2212.08945v1',
    '2011.00508v1', '2312.04013v3', '2406.04727v2', '2010.07683v1',
    '1812.11212v1',
]

print(f"Processing {len(arxiv_ids)} papers...")
output_directory = "./graphrag/input"
max_workers = 20

# Chunk every paper and write the combined CSV into output_directory.
df_chunks, csv_filename = process_arxiv_papers(
    arxiv_ids,
    project_config,
    output_dir=output_directory,
    max_workers=max_workers,
    max_token_size=1200,
    reference=False,
)

Processing 21 papers...
Error processing paper 751: Invalid ArXiv identifier or URL: 751
Processed paper: 751
Loading analysis result from ./project_test/0.2/output/json/2205.13757v1.pdf.json
Loading analysis result from ./project_test/0.2/output/json/2103.14174v1.pdf.json
Loading analysis result from ./project_test/0.2/output/json/2311.15481v3.pdf.json
Loading analysis result from ./project_test/0.2/output/json/2105.05278v1.pdf.json
Loading analysis result from ./project_test/0.2/output/json/0912.3344v1.pdf.json
Loading analysis result from ./project_test/0.2/output/json/2209.13557v2.pdf.json
Loading analysis result from ./project_test/0.2/output/json/2311.14744v1.pdf.json
Loading analysis result from ./project_test/0.2/output/json/2102.08134v2.pdf.json
Loading analysis result from ./project_test/0.2/output/json/2109.02794v1.pdf.json
Loading analysis result from ./project_test/0.2/output/json/2209.01307v4.pdf.json
Loading analysis result from ./project_test/0.2/output/json/2406.04727v