In [1]:
from autosearch.functions.text_analysis import chunk_pdf

from autosearch.database.paper_database import PaperDatabase
from autosearch.analysis.document_analyzer import DocumentAnalyzer
from autosearch.research_project import ResearchProject
from autosearch.write_blog import WriteBlog

import autogen
from typing import List, Dict, Any

In [2]:
import os
from dotenv import load_dotenv
from azure.core.exceptions import HttpResponseError

# Load environment variables before the settings below are read.
load_dotenv()

# Azure Document Intelligence credentials, taken from the environment.
# A variable that is not set yields None here.
config = dict(
    doc_api_key=os.environ.get("DOCUMENT_INTELLIGENCE_KEY"),
    doc_endpoint=os.environ.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
)

# Set the tokenizers parallelism flag for this process.
os.environ.update(TOKENIZERS_PARALLELISM="True")

In [3]:
# Blog title; in this notebook it is only referenced by the commented-out
# `test_project.run(...)` call.
title = "Exploring the Intricacies of Polymer Representation: Unraveling Complexity"

In [4]:
# Values passed as `funcClsList` and `communiteList` respectively.
function_class_names = [
    "FactualCheck",
    "GetPDF",
    "GetPDFs",
    "UrlCheck",
    "AcademicRetriever",
    "AcademicSearch",
    "WriteSection",
    "PlotFigure",
]
community_names = ["outline_agents", "write_section_agents", "instructor_agents"]

# All constructor arguments gathered in one place so they are easy to tweak.
project_kwargs = dict(
    project_id="project_test",
    version="0.2",
    config=config,
    config_file="OAI_CONFIG_LIST-sweden-505",
    initiate_db=False,
    funcClsList=function_class_names,
    communiteList=community_names,
    local_papers_dir="./papers",
    models=["gpt-35-turbo", "gpt-35-turbo-16k"],
)
test_project = ResearchProject(**project_kwargs)

# Configuration object reused by the cells further down.
project_config = test_project.ProjectConfig
# print(project_config.logging_session_id)
# test_project.run(
#     title=title,
#     target_audience="expert in experimental polymer science and machine learning experts",
# )

Logging session ID: 98477259-bc8d-4df9-ac50-6e2149524f68
Equipping function 'academic_retriever' to agent 'blog_editor-in-chief'
Equipping function 'academic_search' to agent 'blog_editor-in-chief'
Equipping function 'get_pdf' to agent 'blog_editor-in-chief'
Equipping function 'get_pdfs' to agent 'blog_editor-in-chief'
Equipping function 'factual_check' to agent 'content_strategist'
Equipping function 'academic_retriever' to agent 'content_strategist'
Equipping function 'academic_search' to agent 'content_strategist'
Equipping function 'get_pdf' to agent 'content_strategist'
Equipping function 'get_pdfs' to agent 'content_strategist'


The return type of the function 'wrapper' is not annotated. Although annotating it is optional, the function should return either a string, a subclass of 'pydantic.BaseModel'.


Equipping function 'write_section' to agent 'writing_coordinator'
Equipping function 'factual_check' to agent 'content_review_specialist'
Equipping function 'academic_retriever' to agent 'content_review_specialist'
Equipping function 'academic_search' to agent 'content_review_specialist'
Equipping function 'get_pdf' to agent 'content_review_specialist'


The return type of the function 'wrapper' is not annotated. Although annotating it is optional, the function should return either a string, a subclass of 'pydantic.BaseModel'.


Equipping function 'plot_figure' to agent 'visualization_specialist'
Equipping function 'academic_search' to agent 'topic_expert'
Equipping function 'academic_retriever' to agent 'topic_expert'
Equipping function 'academic_search' to agent 'research_resource_expert'
Equipping function 'academic_retriever' to agent 'research_resource_expert'
Equipping function 'get_pdf' to agent 'research_resource_expert'
Processing local PDFs...
Error: The folder ./papers does not exist.


In [None]:
# Manual test cases: each entry is a (function_name, kwargs) pair. The runner
# loop in this cell matches function_name against `func.name` of the entries in
# `test_project.functions` and calls the match with the kwargs.
# Only the 'write_section' case is active; the other cases are kept commented
# out so they can be re-enabled individually.
tests = [
    ('write_section',
     {
         "title": "Modern Approaches in Polymer Representation",
         "brief": "Emphasize accessible content on the impact of machine learning on polymer representation with detailed insights from primary sources or direct implementation case studies.",
         "mind_map": "digraph G {\n   node [shape=box, style=\"rounded,filled\", color=lightblue2];\n   // Define nodes\n   Article [label=\"Exploring the Intricacies of Polymer Representation\"];\n   Introduction [label=\"Introduction to Polymer Complexity\"];\n   HistoricalOverview [label=\"Historical Overview of Polymer Representation Techniques\"];\n   TheoreticalFoundations [label=\"Theoretical Foundations of Polymer Modeling\"];\n   ModernApproaches [label=\"Modern Approaches in Polymer Representation\"];\n   CaseStudies [label=\"Case Studies: Machine Learning in Polymer Representation\"];\n   ChallengesLimitations [label=\"Challenges and Limitations in Current Methodologies\"];\n   FutureDirections [label=\"Future Directions and Theoretical Implications\"];\n\n   // Connect nodes\n   Article -> Introduction;\n   Article -> HistoricalOverview;\n   Article -> TheoreticalFoundations;\n   Article -> ModernApproaches;\n   Article -> CaseStudies;\n   Article -> ChallengesLimitations;\n   Article -> FutureDirections;\n\n   // Legends and References\n   { rank=same; ModernApproaches -> Ref1 [label=\"Ref: Representing Polymers as Periodic Graphs [1]\"] }\n   { rank=same; ModernApproaches -> Ref2 [label=\"Ref: Potentials and challenges of polymer informatics [2]\"] }\n   { rank=same; CaseStudies -> Ref3 [label=\"Ref: Rethinking Interphase Representations [3]\"] }\n}\n"
     }
     ),
    # ('factual_check',
    #  {
    #      "text": "Polymers are substantial cornerstones in the fabric of modern materials science, ubiquitous in applications ranging from packaging and textiles to high-performance engineering plastics and biocompatible materials. The versatility of these macromolecules lies in the infinitude of their structures and compositions, which command their physical and chemical properties. To understand, predict, and manipulate these properties, scientists must adeptly represent polymers at both molecular and macroscopic levels. This section delves into the fundamental concepts of polymer representation, exploring the nuances of chemical structure, topology, and configuration crucial for building a comprehensive understanding of polymer science.",
    #      "paper_title": "Polymer Structure, Properties, and Applications",
    #      "paper_url": "https://example.com/polymer_structure_properties_applications",
    #      "reason": "To verify the general statements about polymer applications, their versatility, and the importance of representation at molecular and macroscopic levels."
    #  }
    #     #  {
    #  #     "text": "The collaboration between these fields also birthed differentiable programming frameworks for quantum chemistry, like TorchANI and TensorMol [4]. These latter-day frameworks enable researchers to quickly prototype neural networks that learn quantum mechanical laws directly from data, providing a valuable tool to accelerate discovery. This integration has offered promising results in tasks like molecular dynamics simulations, which are key for understanding chemical reactions and material properties.",
    #  #     "paper_title": "Automated Calculation of Thermal Rate Coefficients using Ring Polymer Molecular Dynamics and Machine-Learning Interatomic Potentials with Active Learning",
    #  #     "paper_url": "http://arxiv.org/pdf/1805.11924v3",
    #     #     "reason": "To confirm the factual information about differentiable programming frameworks and their impact on research in quantum chemistry and neural networks."
    #     # }
    #  ),
    # ('get_pdfs', {
    #     "urls": ["http://arxiv.org/pdf/2305.13267v1", "http://arxiv.org/pdf/2305.06530v1"],
    #     "reasons": ['factual_check'] * 2
    # }),
    # ('get_pdf', {
    #     "url": "https://arxiv.org/pdf/2110.13711",
    #     "reason": "factual_check",
    #     "part": "full"
    # }),
    # ('url_check',{
    #         "paper_url": "https://arxiv.org/pdf/2107.03012.pdf",
    #         "paper_title": "From algebra to analysis: new proofs of theorems by Ritt and Seidenberg"
    # }),
    # ('academic_retriever',{
    #     "queries":["Large Language Models", "Assessing Language Models", "AI safety and reliability"],
    #     "n_results":3
    # }),
    # ('academic_search',{"query":"Human-Centred Learning Analytics and AI in Education: a Systematic Literature Review"}),
]

# Run every configured test case against the project's registered functions.
for func_name, args in tests:
    # Collect every registered function carrying this name; as before, all
    # matches are executed rather than only the first one.
    matching_funcs = [func for func in test_project.functions if func.name == func_name]

    # Decide "not found" from the lookup itself, not from the return value:
    # a function that legitimately returns None must not be reported as missing.
    if not matching_funcs:
        print(f"Function {func_name} not found")
        continue

    for func in matching_funcs:
        print(f"Running {func_name} with args: {args}")
        results = func.func(**args)
        print(results)

In [None]:
from autosearch.data.paper import Paper

# Local PDF to run through the document analyzer.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
pdf_path = "/home/alibina/repo/usecases/autosearch_projects/polymer_representation/papers/schmid-2022-understanding-and-modeling-polymers-the-challenge-of-multiple-scales.pdf"

# The paper title is the file name with its extension stripped.
pdf_file = os.path.basename(pdf_path)
paper_title, _pdf_extension = os.path.splitext(pdf_file)
paper = Paper(title=paper_title, authors=[], url=pdf_path, source='local')

doc_analyzer = test_project.ProjectConfig.doc_analyzer
processed_paper = doc_analyzer.process_local_pdf(paper, project_config)

In [None]:
# Display the names of all functions registered on the project; these are the
# names the test-case tuples are matched against.
[func.name for func in test_project.functions]

In [None]:
from autosearch.functions.plot_figure import plot_figure


# Description of a sample line chart used to exercise plot_figure directly.
plot_request = {
    "plot_type": "line",
    "data_description": "Monthly sales data for the year 2023 for the following data Jan: 1000, Feb: 1200, Mar: 1300, Apr: 1100, May: 1400, Jun: 1500, Jul: 1600, Aug: 1700, Sep: 1800, Oct: 1900, Nov: 2000, Dec: 2100",
    "x_label": "Month",
    "y_label": "Sales ($)",
    "title": "Monthly Sales Trend - 2023",
    "additional_instructions": "Use a blue line with circular markers. Add a legend if possible.",
}

plot_result = plot_figure(project_config=project_config, **plot_request)

print(plot_result)

In [17]:
import os

def list_files_in_directory(directory):
    """
    Return the names of the ``.txt`` files in ``directory`` without the extension.

    Args:
    directory: path of the directory to scan (not recursive).

    Returns:
    list: file names with the trailing ``.txt`` removed, in ``os.listdir`` order.
    An empty list is returned (after printing a message) when the directory
    does not exist or any other error occurs while listing it.
    """
    suffix = ".txt"
    try:
        stems = []
        for entry in os.listdir(directory):
            # Sub-directories are skipped even if their name ends with ".txt".
            if not os.path.isfile(os.path.join(directory, entry)):
                continue
            if entry.endswith(suffix):
                stems.append(entry[:-len(suffix)])
    except FileNotFoundError:
        print(f"The directory {directory} does not exist.")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []
    return stems


In [18]:
import asyncio
import os
import pandas as pd

from graphrag.index import run_pipeline, run_pipeline_with_config
from graphrag.index.config import PipelineCSVInputConfig, PipelineWorkflowReference
from graphrag.index.input import load_input

async def index_chunks_with_graphrag(chucks_csv: str, base_dir: str):
    """
    Index chunks using graphrag package and save the result as a CSV file.

    Args:
    chucks_csv (str): csv file containing chunks to be indexed; passed to
        graphrag as the input ``file_pattern``.
    base_dir (str): base directory of the graphrag input configuration; the
        output file ``indexed_chunks.csv`` is also written here.

    Returns:
    None. Progress and a small sample of the result are printed.
    """
    # Configure input
    # NOTE(review): `chucks_csv` is forwarded as `file_pattern` while `base_dir`
    # is forwarded separately — confirm against the graphrag docs that a full
    # CSV path is an acceptable pattern for the installed version.
    input_config = PipelineCSVInputConfig(
        file_pattern=chucks_csv,
        base_dir=base_dir,
        source_column="url",  # Using 'url' as the source column
        text_column="page_content",
        timestamp_column="last_updated_date",  # Using 'last_updated_date' as the timestamp column
        timestamp_format="%Y-%m-%dT%H:%M:%S%z",  # Format for ISO 8601 timestamps
        title_column="title",
    )

    # Load input
    dataset = await load_input(input_config)

    # Define workflows
    workflows = [
        PipelineWorkflowReference(
            name="entity_extraction",
            config={
                "entity_extract": {
                    "strategy": {
                        "type": "nltk",
                    }
                }
            },
        ),
        PipelineWorkflowReference(
            name="entity_graph",
            config={
                "cluster_graph": {"strategy": {"type": "leiden"}},
                "embed_graph": {
                    "strategy": {
                        "type": "node2vec",
                        "num_walks": 10,
                        "walk_length": 40,
                        "window_size": 2,
                        "iterations": 3,
                        "random_seed": 597832,
                    }
                },
                "layout_graph": {
                    "strategy": {
                        "type": "umap",
                    },
                },
            },
        ),
    ]

    # Run pipeline and keep every emitted table; the last one is the final result.
    tables = []
    async for table in run_pipeline(dataset=dataset, workflows=workflows):
        tables.append(table)

    # A pipeline that yields nothing would make `tables[-1]` raise IndexError.
    if not tables:
        print("No results from indexing!")
        return

    pipeline_result = tables[-1]

    # Process and save results
    if pipeline_result.result is not None:
        result_df = pipeline_result.result

        # The original referenced an undefined `output_dir`; the documented
        # output location is `base_dir`, so the file is written there.
        os.makedirs(base_dir, exist_ok=True)
        output_path = os.path.join(base_dir, "indexed_chunks.csv")
        result_df.to_csv(output_path, index=False)
        print(f"Indexed chunks saved to {output_path}")

        # Print some sample results (skipped for an empty frame, where
        # `.iloc[0]` would raise).
        if not result_df.empty:
            first_result = result_df.head(1)
            print(f"level: {first_result['level'].iloc[0]}")
            print(f"embeddings: {first_result['embeddings'].iloc[0]}")
            print(f"entity_graph_positions: {first_result['node_positions'].iloc[0]}")
    else:
        print("No results from indexing!")

    # No temporary file is created by this function, so there is nothing to
    # clean up (the former `os.remove(temp_csv_path)` used an undefined name).



In [19]:
from autosearch.api.arxiv_api import ArxivAPI
import pandas as pd
import json
import numpy as np
from typing import List
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy integers, floats and arrays to built-in types."""

    def default(self, obj):
        """
        Return a JSON-serializable equivalent of ``obj``.

        NumPy arrays become (nested) lists, NumPy floating scalars become
        ``float`` and NumPy integer scalars become ``int``. Anything else is
        delegated to ``json.JSONEncoder.default``, which raises ``TypeError``.
        """
        # The three NumPy categories are mutually exclusive, so the order of
        # these guards does not matter.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        return super().default(obj)

def _jsonify_complex_value(value):
    """Return list/dict/NumPy values as a JSON string; return any other value unchanged."""
    if isinstance(value, (list, dict, np.integer, np.floating, np.ndarray)):
        return json.dumps(value, cls=NumpyEncoder)
    return value


def process_single_paper(arxiv_id: str, ProjectConfig, max_token_size: int) -> List[Dict[str, Any]]:
    """
    Process a single arXiv paper and return its chunks.

    Args:
    arxiv_id (str): arXiv ID passed to ``ArxivAPI.get_paper_metadata``.
    ProjectConfig: object whose ``doc_analyzer.pdf2md_chunck`` splits the paper into chunks.
    max_token_size (int): forwarded to ``pdf2md_chunck``.

    Returns:
    List[Dict[str, Any]]: one flat dict per chunk containing ``arxiv_id``,
    ``page_content``, the chunk metadata (keys prefixed with ``metadata_``) and
    the fields of ``paper.to_dict()``. List, dict and NumPy values are stored
    as JSON strings. An empty list is returned (after printing the error) when
    fetching or chunking the paper raises.
    """
    arxiv_api = ArxivAPI()
    try:
        # Fetch the paper metadata
        paper = arxiv_api.get_paper_metadata(arxiv_id)

        # Get the chunks
        chunks = ProjectConfig.doc_analyzer.pdf2md_chunck(paper, max_token_size)

        # The paper-level fields are the same for every chunk, so serialize
        # them once instead of once per chunk.
        paper_fields = {
            key: _jsonify_complex_value(value)
            for key, value in paper.to_dict().items()
        }

        paper_data = []
        for chunk in chunks:
            chunk_data = {
                'arxiv_id': arxiv_id,
                'page_content': chunk.page_content,
            }

            # Unfold the metadata into prefixed top-level keys
            if hasattr(chunk, 'metadata'):
                for key, value in chunk.metadata.items():
                    chunk_data[f'metadata_{key}'] = _jsonify_complex_value(value)

            # Paper fields are applied last, so on a key clash they win, as before.
            chunk_data.update(paper_fields)
            paper_data.append(chunk_data)

        return paper_data

    except Exception as e:
        # Best-effort: one failing paper must not abort the whole batch.
        print(f"Error processing paper {arxiv_id}: {str(e)}")
        return []



In [23]:
def process_arxiv_papers(arxiv_ids: List[str], ProjectConfig, output_dir: str = ".", max_workers: int = 5, max_token_size: int = 1200):
    """
    Process arXiv papers in parallel and write all of their chunks to one CSV file.

    Args:
    arxiv_ids (List[str]): List of arXiv IDs to process.
    ProjectConfig: forwarded unchanged to process_single_paper for every paper.
    output_dir (str): Directory to save the CSV file (created if missing). Defaults to current directory.
    max_workers (int): Maximum number of worker threads. Defaults to 5.
    max_token_size (int): forwarded to process_single_paper. Defaults to 1200.

    Returns:
    tuple: (pd.DataFrame with the chunks of all papers, path of the written CSV file).
    """
    collected_rows = []

    pool = ThreadPoolExecutor(max_workers=max_workers)
    with pool:
        # Map each submitted job back to the paper it belongs to.
        pending = {}
        for paper_id in arxiv_ids:
            job = pool.submit(process_single_paper, paper_id, ProjectConfig, max_token_size)
            pending[job] = paper_id

        # Gather results in completion order; a failing paper is reported and skipped.
        for finished in as_completed(pending):
            paper_id = pending[finished]
            try:
                collected_rows.extend(finished.result())
            except Exception as exc:
                print(f"Paper {paper_id} generated an exception: {exc}")
            else:
                print(f"Processed paper: {paper_id}")

    # Build the combined frame, then persist it.
    df_chunks = pd.DataFrame(collected_rows)

    os.makedirs(output_dir, exist_ok=True)
    csv_filename = os.path.join(output_dir, "arxiv_papers_chunks.csv")
    df_chunks.to_csv(csv_filename, index=False)

    print(f"Data saved to {csv_filename}")

    return df_chunks, csv_filename



# Example usage
# NOTE(review): machine-specific absolute path; it is only used by the
# commented-out directory listing on the next line.
paper_dir = "/home/azureuser/autogen_uscases/autosearch/notebooks/graphrag/input-txt"
# arxiv_ids = list_files_in_directory(paper_dir)
# NOTE(review): '751' does not look like the other IDs in this list — confirm it is intended.
arxiv_ids = [
    '2209.14803v1',
    '2103.14174v1',
    '2105.05278v1',
    '2403.20021v2',
    '1002.2059v1',
    '2205.08619v1',
    '2311.14744v1',
    '2109.02794v1',
    '2102.08134v2',
    '2209.13557v2',
    '2209.01307v4',
    '2205.13757v1',
    '0912.3344v1',
    '2311.15481v3',
    '751',
    '2212.08945v1',
    '2011.00508v1',
    '2312.04013v3',
    '2406.04727v2',
    '2010.07683v1',
    '1812.11212v1',
]

print(f"Processing {len(arxiv_ids)} papers...")
output_directory = "./graphrag/input"
max_workers = 10

# Run the process
df_chunks, csv_filename = process_arxiv_papers(
    arxiv_ids,
    project_config,
    output_dir=output_directory,
    max_workers=max_workers,
    max_token_size=1200,
)

Processing 21 papers...
PDF file already exists: ./project_test/0.2/output/2105.05278v1.pdf
PDF file already exists: ./project_test/0.2/output/2102.08134v2.pdf
PDF file already exists: ./project_test/0.2/output/2403.20021v2.pdf
PDF file already exists: ./project_test/0.2/output/2209.14803v1.pdf
PDF file already exists: ./project_test/0.2/output/2109.02794v1.pdf
PDF file already exists: ./project_test/0.2/output/2311.14744v1.pdf
PDF file already exists: ./project_test/0.2/output/1002.2059v1.pdf
PDF file already exists: ./project_test/0.2/output/2205.08619v1.pdf
Created 17 docs with a total of 6477 tokens. Largest doc has 954 tokens.
Processed paper: 2311.14744v1
Created 8 docs with a total of 3450 tokens. Largest doc has 1184 tokens.
Processed paper: 1002.2059v1
Created 20 docs with a total of 11033 tokens. Largest doc has 1185 tokens.
Processed paper: 2102.08134v2
PDF file already exists: ./project_test/0.2/output/2209.13557v2.pdf
PDF file already exists: ./project_test/0.2/output/2103

In [None]:
# Index the chunk CSV written by process_arxiv_papers. Top-level `await` only
# works inside a notebook / IPython session.
# NOTE(review): base_dir is a machine-specific absolute path — confirm before running elsewhere.
await index_chunks_with_graphrag(os.path.abspath(csv_filename), base_dir='/home/azureuser/autogen_uscases/autosearch/notebooks/output')

In [22]:
# Show the arXiv IDs that were submitted for processing.
print(arxiv_ids)

['2209.14803v1', '2103.14174v1', '2105.05278v1', '2403.20021v2', '1002.2059v1', '2205.08619v1', '2311.14744v1', '2109.02794v1', '2102.08134v2', '2209.13557v2', '2209.01307v4', '2205.13757v1', '0912.3344v1', '2311.15481v3', '751', '2212.08945v1', '2011.00508v1', '2312.04013v3', '2406.04727v2', '2010.07683v1', '1812.11212v1']


In [24]:
# Inspect the column names of the combined chunk DataFrame.
df_chunks.columns

Index(['arxiv_id', 'page_content', 'metadata_source', 'metadata_pages',
       'metadata_tokens', 'title', 'authors', 'url', 'pdf_url', 'local_path',
       'abstract', 'published_date', 'last_updated_date', 'source', 'summary',
       'id'],
      dtype='object')