In [1]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, validator
from typing import List, Dict
import re

# Schema for the "summary" answer: a three-bullet summary of the study plus the
# aspect of the built environment it examined.
# (Documented with comments on purpose: pydantic copies a class docstring into
# the generated JSON schema, which would change the parser's format instructions.)
class StudySummary(BaseModel):
    summary: List[str]
    aspect: str

    @validator('summary')
    def validate_summary(cls, v):
        """Accept only a summary of exactly three string bullet points.

        Returns:
            The unchanged list.

        Raises:
            ValueError: if the list does not hold exactly 3 items, or if any
                item is not a string.
        """
        if len(v) != 3:
            raise ValueError('Summary must contain exactly 3 bullet points')
        if any(not isinstance(bullet, str) for bullet in v):
            raise ValueError('Each bullet point in the summary must be a string')
        return v

    @validator('aspect')
    def validate_aspect(cls, v):
        """Accept only one of the predefined built-environment aspects.

        Returns:
            The unchanged aspect string.

        Raises:
            ValueError: if the value is not an exact member of the allowed list.
        """
        # Stays a list: it is interpolated verbatim into the error message.
        # NOTE(review): the long prompt later in this notebook asks the model to
        # answer "others:" followed by free text; only the bare 'others' passes
        # here — confirm which behaviour is intended.
        valid_aspects = [
            'transportation and mobility', 'health', 'landscape', 'public space',
            'street design', 'building design', 'infrastructure', 'walkability',
            'urban vitality', 'real estate', 'greenery', 'others',
        ]
        if v in valid_aspects:
            return v
        raise ValueError(f'Invalid aspect. Must be one of {valid_aspects}')

# Schema for the "study area" answer: a list of single-entry dictionaries and
# the spatial extent of the study.
# (Documented with comments on purpose: pydantic copies a class docstring into
# the generated JSON schema.)
class StudyArea(BaseModel):
    study_area: List[Dict[str, str]]
    extent: str

    @validator('study_area')
    def validate_study_area(cls, v):
        """Require every study-area entry to be a dict holding exactly one key.

        Raises:
            ValueError: if any entry is not a dict or has a size other than 1.
        """
        # NOTE(review): the long prompt later in this notebook asks for both
        # "Country" and "City" keys, which a one-entry dict cannot hold — confirm.
        for entry in v:
            if not (isinstance(entry, dict) and len(entry) == 1):
                raise ValueError('Each study area must be a dictionary with one entry')
        return v

    @validator('extent')
    def validate_extent(cls, v):
        """Accept only one of the predefined extent labels.

        Raises:
            ValueError: if the value is not an exact member of the allowed list.
        """
        # Stays a list: it is interpolated verbatim into the error message.
        # NOTE(review): the long prompt later in this notebook offers
        # "individual image-level" and "-level" suffixed labels; none of those
        # pass this check — confirm which label set is intended.
        valid_extents = [
            'building', 'neighborhood', 'district', 'city', 'country', 'not applicable',
        ]
        if v in valid_extents:
            return v
        raise ValueError(f'Invalid extent. Must be one of {valid_extents}')

# Schema for the image-data answer: a list of [type, source, number/volume]
# string triples.
# (Documented with comments on purpose: pydantic copies a class docstring into
# the generated JSON schema.)
class ImageData(BaseModel):
    image_data: List[List[str]]

    @validator('image_data')
    def validate_image_data(cls, v):
        """Check each record is a 3-string triple whose first item is a known image type.

        Returns:
            The unchanged list of records.

        Raises:
            ValueError: on a record of the wrong length, a non-string item, or
                an image type outside the allowed list.
        """
        # Stays a list: it is interpolated verbatim into the error message.
        valid_types = [
            'street view image', 'geo-tagged photos', 'aerial image',
            'video', 'virtual reality', 'others',
        ]
        for record in v:
            if len(record) != 3:
                raise ValueError('Each item in the image_data list must contain exactly 3 items: type, source, and number/volume')
            if any(not isinstance(field, str) for field in record):
                raise ValueError('Each item in the image_data list must be a string')
            image_type = record[0]
            if image_type not in valid_types:
                raise ValueError(f'Invalid image type. Must be one of {valid_types}')
        return v

# Schema for the perception-data answer: a list of
# [source, collection method, number of participants] string triples.
# (Documented with comments on purpose: pydantic copies a class docstring into
# the generated JSON schema.)
class PerceptionData(BaseModel):
    perception_data: List[List[str]]

    @validator('perception_data')
    def validate_perception_data(cls, v):
        """Check every perception-data record.

        Each record must hold exactly three strings: a data source from
        ``valid_sources``, a collection method from ``valid_methods``, and a
        participant count written as a non-negative decimal integer.

        Returns:
            The unchanged list of records.

        Raises:
            ValueError: if any record violates one of the rules above.
        """
        valid_sources = ['their own collection', 'publicly available data', 'others']
        valid_methods = ['survey/questionnaire', 'observation', 'physiological signals', 'others']
        for item in v:
            if len(item) != 3:
                raise ValueError('Each item in the perception_data list must contain exactly 3 items: source, method, and number of participants')
            if not all(isinstance(i, str) for i in item):
                raise ValueError('Each item in the perception_data list must be a string')
            source, method, participants = item
            if source not in valid_sources:
                raise ValueError(f'Invalid data source. Must be one of {valid_sources}')
            if method not in valid_methods:
                raise ValueError(f'Invalid collection method. Must be one of {valid_methods}')
            # isdecimal() is true only for strings int() can parse. The previous
            # isdigit() also accepted characters such as '²', which then made
            # int() fail with an unrelated "invalid literal" message. A string
            # of decimal digits can never be negative, so no sign check is needed.
            if not participants.isdecimal():
                raise ValueError('Number of participants must be a non-negative integer')
        return v

# Schema for the "other sensory data" answer: a non-empty list of strings.
# (Documented with comments on purpose: pydantic copies a class docstring into
# the generated JSON schema.)
class OtherSensoryData(BaseModel):
    other_data_sources: List[str]

    @validator('other_data_sources')
    def validate_other_data_sources(cls, v):
        """Require a non-empty list made only of strings.

        Returns:
            The unchanged list.

        Raises:
            ValueError: if the value is not a list, contains a non-string
                item, or is empty.
        """
        if not isinstance(v, list):
            raise ValueError('other_data_sources must be a list')
        if not all(isinstance(i, str) for i in v):
            raise ValueError('Each item in the other_data_sources list must be a string')
        if not v:
            raise ValueError('other_data_sources list cannot be empty')
        # A lone 'not applicable' entry is an ordinary non-empty list of strings
        # and passes like any other. The former dedicated branch returned the
        # same value as this fall-through, so it was removed.
        return v


# Schema for the research-type answer: quantitative/qualitative plus the method
# described as a list of strings.
# (Documented with comments on purpose: pydantic copies a class docstring into
# the generated JSON schema.)
class ResearchTypeAndMethod(BaseModel):
    research_type: str
    method: List[str]

    @validator('research_type')
    def validate_research_type(cls, v):
        """Accept only 'quantitative' or 'qualitative'.

        Raises:
            ValueError: for any other value.
        """
        # Stays a list: it is interpolated verbatim into the error message.
        valid_types = ['quantitative', 'qualitative']
        if v in valid_types:
            return v
        raise ValueError(f'Invalid research type. Must be one of {valid_types}')

    @validator('method')
    def validate_method(cls, v):
        """Require the method to be a list whose items are all strings.

        Raises:
            ValueError: if the value is not a list or holds a non-string item.
        """
        if not isinstance(v, list):
            raise ValueError('method must be a list')
        if any(not isinstance(step, str) for step in v):
            raise ValueError('Each item in the method list must be a string')
        return v


# Schema pairing each research type with its variables: a list of single-entry
# dictionaries of the form {research type: [variable, ...]}.
# (Documented with comments on purpose: pydantic copies a class docstring into
# the generated JSON schema.)
class ResearchTypeAndVariables(BaseModel):
    research_and_variables: List[Dict[str, List[str]]]

    @validator('research_and_variables')
    def validate_research_and_variables(cls, v):
        """Check every {research type: variables} entry.

        Rules applied to each entry, in order:
          * it is a dict with exactly one key;
          * the key is one of ``valid_types``;
          * the value is a list of strings;
          * regression / model development / index construction need at least
            one variable;
          * exploratory analysis must carry exactly ``['not applicable']``.

        Returns:
            The unchanged list.

        Raises:
            ValueError: on the first rule an entry breaks.
        """
        # Stays a list: it is interpolated verbatim into the error message.
        valid_types = ['regression', 'model development', 'index construction', 'exploratory analysis', 'others']
        types_requiring_variables = ('regression', 'model development', 'index construction')
        for entry in v:
            if not (isinstance(entry, dict) and len(entry) == 1):
                raise ValueError('Each item in the research_and_variables list must be a dictionary with one entry')
            # Exactly one pair is guaranteed by the length check above.
            ((research_type, variables),) = entry.items()
            if research_type not in valid_types:
                raise ValueError(f'Invalid research type. Must be one of {valid_types}')
            if not isinstance(variables, list) or any(not isinstance(name, str) for name in variables):
                raise ValueError('Variables must be a list of strings')
            if not variables and research_type in types_requiring_variables:
                raise ValueError(f'Variables cannot be empty for research type {research_type}')
            if research_type == 'exploratory analysis' and variables != ['not applicable']:
                raise ValueError('For exploratory analysis, variables must be ["not applicable"]')
        return v


# Schema for the computer-vision-model answer: a list of
# [name, purpose, training procedure] string triples.
# (Documented with comments on purpose: pydantic copies a class docstring into
# the generated JSON schema.)
class CVModelsData(BaseModel):
    cv_models: List[List[str]]

    @validator('cv_models')
    def validate_cv_models(cls, v):
        """Check each record is a 3-string triple with a known purpose and procedure.

        The first item (model name) is free text; the second and third must be
        members of ``valid_purposes`` and ``valid_procedures`` respectively.

        Returns:
            The unchanged list of records.

        Raises:
            ValueError: on a record of the wrong length, a non-string item, an
                unknown purpose, or an unknown training procedure.
        """
        # Both stay lists: they are interpolated verbatim into error messages.
        valid_purposes = [
            'object detection', 'semantic/instance segmentation',
            'image classification', 'feature extraction', 'others',
        ]
        valid_procedures = [
            'pre-trained without fine-tuning', 'pre-trained with fine-tuning',
            'trained from scratch by themselves', 'others',
        ]
        for record in v:
            if len(record) != 3:
                raise ValueError('Each item in the cv_models list must contain exactly 3 items: name, purpose, and training procedure')
            if any(not isinstance(field, str) for field in record):
                raise ValueError('Each item in the cv_models list must be a string')
            purpose, procedure = record[1], record[2]
            if purpose not in valid_purposes:
                raise ValueError(f'Invalid purpose. Must be one of {valid_purposes}')
            if procedure not in valid_procedures:
                raise ValueError(f'Invalid training procedure. Must be one of {valid_procedures}')
        return v


# Schema for the code-availability answer: either a GitHub URL or one of a
# fixed set of availability labels.
# (Documented with comments on purpose: pydantic copies a class docstring into
# the generated JSON schema.)
class CodeAvailability(BaseModel):
    code_availability: str

    @validator('code_availability')
    def validate_code_availability(cls, v):
        """Accept a GitHub URL or one of the predefined availability labels.

        A value passes when it starts with ``http(s)://github.com/`` (optionally
        with a ``www.`` prefix) or is an exact member of ``valid_options``.

        Returns:
            The unchanged string.

        Raises:
            ValueError: if the value is neither a GitHub URL nor a known label.
        """
        valid_options = ['code available upon request', 'code available with restrictions', 'code is not available', 'not mentioned', 'others']
        # The dot in "github.com" is escaped so it matches a literal '.'. Left
        # unescaped it matched any character, letting hosts such as
        # "githubXcom/" through.
        if re.match(r'^https?://(?:www\.)?github\.com/', v):
            return v
        elif v not in valid_options:
            raise ValueError(f'Invalid code_availability. Must be a URL starting with "https://github.com/" or one of {valid_options}')
        return v


# Schema for the data-availability answer: either a URL on a recognised data
# hosting service or one of a fixed set of availability labels.
# (Documented with comments on purpose: pydantic copies a class docstring into
# the generated JSON schema.)
class DataAvailability(BaseModel):
    data_availability: str

    @validator('data_availability')
    def validate_data_availability(cls, v):
        """Accept a data-host URL or one of the predefined availability labels.

        A value passes when it starts with ``http(s)://`` (optionally ``www.``)
        followed by one of the hosts in ``data_hosts`` and a slash, or when it
        is an exact member of ``valid_options``.

        Returns:
            The unchanged string.

        Raises:
            ValueError: if the value is neither a recognised URL nor a known label.
        """
        valid_options = ['data available upon request', 'data available with restrictions', 'data is not available', 'not mentioned', 'others']
        data_hosts = (
            'drive.google.com',
            'dataverse.harvard.edu',
            'figshare.com',
            'kaggle.com',
            'data.world',
            'zenodo.org',
            'osf.io',
            'data.gov',
        )
        # re.escape makes every '.' in a host name literal. The earlier
        # hand-written patterns left the dots unescaped, so each one matched
        # any character (e.g. "zenodoXorg/").
        host_alternation = '|'.join(re.escape(host) for host in data_hosts)
        if re.match(rf'^https?://(?:www\.)?(?:{host_alternation})/', v):
            return v
        elif v not in valid_options:
            raise ValueError(f'Invalid data_availability. Must be a URL from a data host service or one of {valid_options}')
        return v


# Schema for the ethics-approval answer: the literal string "Yes" or "No".
# (Documented with comments on purpose: pydantic copies a class docstring into
# the generated JSON schema.)
class IRBApproval(BaseModel):
    irb_approval: str

    @validator('irb_approval')
    def validate_irb_approval(cls, v):
        """Accept only the exact, case-sensitive strings 'Yes' or 'No'.

        Raises:
            ValueError: for any other value.
        """
        if v in ('Yes', 'No'):
            return v
        raise ValueError('Invalid irb_approval. Must be "Yes" or "No"')


# Schema for the limitations / future-work answer. Both fields are required
# lists of strings that must contain at least one item.
# (Documented with comments on purpose: pydantic copies a class docstring into
# the generated JSON schema.)
class StudyFeedback(BaseModel):
    # `default=...` (Ellipsis) marks the field as required.
    limitations: List[str] = Field(default=..., min_items=1)
    future_research_opportunities: List[str] = Field(default=..., min_items=1)
    
# Schema for the paper's DOI answer: a single string field with no format
# validation. This notebook passes it to PydanticOutputParser to generate the
# format instructions for the DOI question.
# (Documented with comments on purpose: pydantic copies a class docstring into
# the generated JSON schema, which would change those instructions.)
class DOI(BaseModel):
    doi: str

In [2]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import UnstructuredPDFLoader
import os
import dotenv
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, validator
from typing import List

# --- Credentials -----------------------------------------------------------
# Read variables from a local .env file, then pick up the OpenAI key.
dotenv.load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# --- Load and chunk the paper ----------------------------------------------
loader = UnstructuredPDFLoader("notebooks/ijgi-11-00628-v2.pdf")
documents = loader.load()
# NOTE(review): the recorded output shows many chunks longer than the requested
# 1000 characters, so chunk_size is not a hard cap with this splitter.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# --- Embed and index --------------------------------------------------------
embeddings = OpenAIEmbeddings()
docsearch = Chroma.from_documents(
    texts,
    embeddings,
    persist_directory="notebooks",
)

# --- Prompt with parser-generated format instructions -----------------------
# Set up a parser + inject instructions into the prompt template.
# NOTE(review): the parser only supplies format instructions here; the chain's
# answer is never passed through parser.parse(), so it is not validated.
parser = PydanticOutputParser(pydantic_object=DOI)
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
{format_instructions}
Answer:"""
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)
chain_type_kwargs = {"prompt": PROMPT}

# --- Retrieval QA chain -------------------------------------------------------
# "stuff" chain type with the custom prompt; source documents are returned
# alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(
        openai_api_key=OPENAI_API_KEY,
        temperature=0,
        model_name="gpt-3.5-turbo-16k",
    ),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

# --- Ask for the DOI ----------------------------------------------------------
query = """
1. What is DOI of the paper?
Example Answer: 
------
*XXX*
------
"""
output = qa({"query":query})

Created a chunk of size 1584, which is longer than the specified 1000
Created a chunk of size 1348, which is longer than the specified 1000
Created a chunk of size 2246, which is longer than the specified 1000
Created a chunk of size 1456, which is longer than the specified 1000
Created a chunk of size 2518, which is longer than the specified 1000
Created a chunk of size 1319, which is longer than the specified 1000
Created a chunk of size 1306, which is longer than the specified 1000
Created a chunk of size 1044, which is longer than the specified 1000
Created a chunk of size 1133, which is longer than the specified 1000
Created a chunk of size 1076, which is longer than the specified 1000
Created a chunk of size 1230, which is longer than the specified 1000
Created a chunk of size 1753, which is longer than the specified 1000
Created a chunk of size 1116, which is longer than the specified 1000
Created a chunk of size 1681, which is longer than the specified 1000
Created a chunk of s

In [4]:
# Display the raw chain response: the original query, the model's answer under
# 'result', and the retrieved chunks under 'source_documents'.
output

{'query': '\n1. What is DOI of the paper?\nExample Answer: \n------\n*XXX*\n------\n',
 'result': '{"limitations": "https://doi.org/10.3390/ijgi11120628"}',
 'source_documents': [Document(page_content='Publisher’s Note: MDPI stays neutral\n\nwith regard to jurisdictional claims in\n\npublished maps and institutional afﬁl-\n\niations.\n\nCopyright: © 2022 by the authors.\n\nLicensee MDPI, Basel, Switzerland.\n\nThis article is an open access article\n\ndistributed under\n\nthe terms and\n\nconditions of the Creative Commons\n\nAttribution (CC BY) license (https://\n\ncreativecommons.org/licenses/by/\n\n4.0/).\n\nISPRS Int. J. Geo-Inf. 2022, 11, 628. https://doi.org/10.3390/ijgi11120628\n\nhttps://www.mdpi.com/journal/ijgi\n\nISPRS Int. J. Geo-Inf. 2022, 11, 628\n\n2 of 23', metadata={'source': 'notebooks/ijgi-11-00628-v2.pdf'}),
  Document(page_content='International Journal ofGeo-Information\n\nArticle Information in Streetscapes—Research on Visual Perception Information Quantity of St

In [12]:
from langchain.chat_models import ChatOpenAI
import dotenv
import os
from langchain.document_loaders import TextLoader
from langchain import PromptTemplate, OpenAI, LLMChain


# Read variables from a local .env file, then pick up the OpenAI key.
dotenv.load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Root folder of the raw paper corpus. Set the VPR_RAW_DATA_DIR environment
# variable (e.g. in .env) to run on another machine; the default keeps the
# original location so existing runs behave the same.
RAW_DATA_DIR = os.getenv(
    "VPR_RAW_DATA_DIR",
    "/Users/koichiito/Documents/NUS PhD/Academic Matter/2023 Spring/ISM3/visual_perception_review/data/raw",
)
txt = os.path.join(RAW_DATA_DIR, "test_papers", "10.1016_j.aap.2014.02.008.txt")
loader = TextLoader(txt)

# NOTE(review): max_tokens=500 caps the reply while the prompt below asks for a
# dozen multi-part answers — confirm the response is not being truncated.
model = ChatOpenAI(client = None, openai_api_key=OPENAI_API_KEY, temperature = 0, model = "gpt-3.5-turbo-16k", max_tokens=500)
# Single prompt covering every extraction question for one paper. The only
# template variable is {context}, which receives the paper text.
# Fixes relative to the previous wording, all in text the model reads:
#   * "data vailable via URL" -> "data available via URL" (an option label the
#     model is asked to choose from);
#   * the study-area example had an unbalanced placeholder ("*XXX, City");
#   * "obtaine" -> "obtained" and a doubled "the the".
question = """
Please answer the following questions about the paper.
Answer DOI and title of the study. Answer 'not mentioned' if you cannot find answers for DOI and title.
Example Answer: 
------
DOI: *XXX*
Title: *XXX*
------

1. Summarize the study in 3 bullet points in a Python list format based on introduction, conclusion, and abstract. 3 points should cover: Purpose/aim/objective of the study, method, and findings. 2. Choose which aspect of the built environment this study examined: transportation and mobility, health, landscape, public space, street design, building design, infrastructure, walkability, urban vitality, real estate, greenery, others. If it's "others", please provide the appropriate aspect that the study examined after "others:".
Example Answer: 
------
Summary: 
- *PURPOSE*
- *METHOD*
- *FINDINGS*
Aspect: *XXX*
------

1. Answer in which city(s) this study was conducted (i.e. the study area/site(s) that this research collected data from) in a bullet-point list of python dictionary, where "Country" and "City" are the keys, and the values should be based on abstract, data, method. The number of bullet points may vary depending on the information in the source document. 2. Choose the extent/scale of the study area from the following options: individual image-level, building-level, neighborhood-level, district-level, city-level, country-level, or not applicable
Example Answer: 
------
Study area: 
- Country: *XXX*, City: *XXX*
Extent/scale of the study area: *XXX*-level
------

Answer the type, data source(s), and data size(s) of image data used to assess perception in a bullet-point list of python list format [TYPE,  IMAGE DATA SOURCE, NUMBER/VOLUME OF IMAGES]. For types of data, choose from the following: street view image, geo-tagged photos, aerial image, video, virtual reality, others. For the data source, provide sources of the images/videos: i.e. where they collected them, and specific service names if possible (e.g., Google Street View, Baidu Map). For the number/volume of images, answer how many images (e.g., street view image) were downloaded/obtained/used or how long were the videos. Use comma (,) between answers to return multiple answers if needed.
Example Answer: 
------
- [*TYPE*, *DATA_SOURCE*, *NUMBER/VOLUME_OF_IMAGES*]
------

Answer the data source(s), data collection method(s), and data size(s) of the subjective perception data in a bullet-point list of python list format [DATA SOURCE, COLLECTION METHOD, NUMBER OF PARTICIPANTS]. For the data source, choose from the following: their own collection, publicly available data, others. If it's publicly available data, provide their names or citation in parentheses. For the collection method, choose the method of how they collected the data: survey/questionnaire, observation, physiological signals, others. For the number of participants, provide the number of participants/raters in the survey/questionnaire. Use comma (,) between answers to return multiple answers if needed.
Example Answer: 
------
- [*DATA_SOURCE*, *COLLECTION_METHOD*, *NUMBER_OF_PARTICIPANTS*]
------

If the study used other sensory data, answer the data sources of other sensory data in bullet point. If not, answer "not applicable". Sensory data should include smell/olfactory data (e.g., air quality), texture (e.g., vibration), sound/auditory (e.g., noise/acoustic data) based on data and method.
Example Answer: 
------
- *OTHER_SENSORY*: *DATA_SOURCE*
------

1. Answer if this study is quantitative or qualitative research (i.e. type of research). 2. Explain the method in bullet points based on method. Include the following points and be as specific as possible: data collection, data processing, analysis (e.g., modelling)
Example Answer: 
------
Type of research: *XXX*
Method in bullet points: 
- *XXX*
- *XXX*
- *XXX*
------

Answer the type of this research in bullet points based on method. For the type of research, choose from the following options: regression, model development, index construction, exploratory analysis, others. If it's "others", please provide the appropriate research type after "others:".
Example Answer: 
------
- *TYPE_OF_RESEARCH*
- *TYPE_OF_RESEARCH*
------

If the study used computer vision models, answer the model architecture name(s), purpose(s), and training procedure(s) of the computer vision model(s) used in this study in a bullet-point list of python list format [NAME, PURPOSE, TRAINING PROCEDURE] in bullet point based on method. If not, answer "not applicable". For the model architecture names, provide the specific names of architectures (e.g., ResNet-50). For the purpose of the model, choose from the following options: object detection, semantic/instance segmentation, image classification, feature extraction, others. For training procedure, choose from the following option: pre-trained without fine-tuning, pre-trained with fine-tuning, trained from scratch by themselves, others. Use comma (,) between answers to return multiple answers if needed.
Example Answer: 
------
- [*NAME*, *PURPOSE*, *TRAINING_PROCEDURE*]
------

Choose the availability of the code used in this study from the following options: code available via URL (e.g. github), code available upon request, code available with restrictions, code is not available, not mentioned, others. If you choose "code available via URL", make sure the URL is a link to a version control repository, for example, github.
Example Answer: 
------
*XXX*
------

Choose the availability of the data used in this study from the following options: data available via URL, data available upon request, data available with restrictions, data is not available, not mentioned, others. The URL need to be a link to the whole dataset used by the author via a data host service, such as Google Drive (https://drive.google.com/), Harvard Dataverse (https://dataverse.harvard.edu/), Figshare (https://figshare.com/). Other type of URLs (e.g., social media data provider) should not be considered. 
Example Answer: 
------
*XXX*
------

Answer whether the study obtained research ethical approval from an institutional review board (IRB) with only "Yes" or "No".
Example Answer: 
------
*XXX*
------

1. Explain limitations of this study in bullet points based on results, discussion, and conclusion. 2. Explain future research opportunities based on the limitations and findings of this study in bullet points.
Example Answer: 
------
Limitations: 
- *XXX*
Future research opportunities: 
- *XXX*
------

Paper: {context}
"""
# Build a chain from the chat model and the single-variable prompt above.
llm_chain = LLMChain(
    llm=model,
    prompt=PromptTemplate.from_template(question),
)
# Run it on the text of the first loaded document. The returned dict is the
# cell's display value; as the recorded output shows, it echoes the full
# 'context' text.
llm_chain(
    inputs={"context": loader.load()[0].page_content},
)

{'context': "DOI: 10.1016/j.aap.2014.02.008\n\nTitle: Influence of built environment on pedestrian's crossing decision \n\nKeywords: Pedestrian, Perception, Road crossing decision, Built environment\n\nAbstract: The objective of this experimental study is to identify the differentiation made by pedestrians, in their crossing decision, between various urban environments, notably in terms of perception of walking pleasantness and safety. This experiment further aims to identify the environmental features that pedestrians take into account and the inferences they develop and use to explain their road crossing decision. Sets of photographs presenting five different environments (city center, inner suburbs, public housing in the outskirts, commercial zone in the outskirts and countryside) were presented to 77 participants divided up into three age groups (pre-adolescents, young and middle adults). Their decision to cross or not, their perception of pleasantness and safety, and the elements 

In [11]:
# Display the chain's repr to inspect its configuration (prompt template,
# input variables, and so on).
llm_chain

LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, prompt=PromptTemplate(input_variables=['context'], output_parser=None, partial_variables={}, template='\nPlease answer the following questions about the paper.\nAnswer DOI and title of the study. Answer \'not mentioned\' if you cannot find answers for DOI and title.\nExample Answer: \n------\nDOI: *XXX*\nTitle: *XXX*\n------\n\n1. Summarize the study in 3 bullet points in a Python list format based on introduction, conclusion, and abstract. 3 points should cover: Purpose/aim/objective of the study, method, and findings. 2. Choose which aspect of the built environment this study examined: transportation and mobility, health, landscape, public space, street design, building design, infrastructure, walkability, urban vitality, real estate, greenery, others. If it\'s "others", please provide the appropriate aspect that the study examined after "others:".\nExample Answer: \n------\nSummary:

In [11]:
from langchain.chat_models import ChatOpenAI
import dotenv
import os
from langchain.document_loaders import TextLoader, PyPDFLoader, MathpixPDFLoader, PyPDFium2Loader, PDFMinerLoader, PyMuPDFLoader, PDFPlumberLoader
from langchain import PromptTemplate, OpenAI, LLMChain


# Read variables from a local .env file, then pick up the OpenAI key.
dotenv.load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Root folder of the raw paper corpus. Set the VPR_RAW_DATA_DIR environment
# variable (e.g. in .env) to run on another machine; the default keeps the
# original location so existing runs behave the same.
RAW_DATA_DIR = os.getenv(
    "VPR_RAW_DATA_DIR",
    "/Users/koichiito/Documents/NUS PhD/Academic Matter/2023 Spring/ISM3/visual_perception_review/data/raw",
)
# Name kept as `txt` for continuity with the other cells, although this is a PDF.
txt = os.path.join(RAW_DATA_DIR, "all_papers", "Sfc159.pdf")
loader = PDFPlumberLoader(txt)
# Display the extracted documents to inspect what the PDFPlumber loader returns.
loader.load()

[Document(page_content='See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/43564565\nCommunity perceptions of landscape values in the South Island High Country.\nA literature review of current knowledge and evaluation of survey methods\nArticle in Science for Conservation · January 2000\nSource: OAI\nCITATIONS READS\n17 167\n2 authors, including:\nSimon Swaffield\nLincoln University New Zealand\n89 PUBLICATIONS 1,927 CITATIONS\nSEE PROFILE\nSome of the authors of this publication are also working on these related projects:\nPractices and Pathways View project\nPromoting urban comfort in a compact future: Developing Urban Comfort as an analytical tool View project\nAll content following this page was uploaded by Simon Swaffield on 21 June 2014.\nThe user has requested enhancement of the downloaded file.', metadata={'source': '/Users/koichiito/Documents/NUS PhD/Academic Matter/2023 Spring/ISM3/visual_perception_review/data/raw/al

In [3]:
# scipdf sends the PDF to a server on localhost:8070 (endpoint
# /api/processFulltextDocument). The recorded run of this cell failed with
# "Connection refused", i.e. nothing was listening. The commented-out lines
# below presumably start that server (serve_grobid.sh) — start it before
# running this cell.
# import subprocess
# command = "bash serve_grobid.sh"
# process = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
import os

import scipdf

# Root folder of the raw paper corpus. Set the VPR_RAW_DATA_DIR environment
# variable to run on another machine; the default keeps the original location.
RAW_DATA_DIR = os.getenv(
    "VPR_RAW_DATA_DIR",
    "/Users/koichiito/Documents/NUS PhD/Academic Matter/2023 Spring/ISM3/visual_perception_review/data/raw",
)
article_dict = scipdf.parse_pdf_to_dict(os.path.join(RAW_DATA_DIR, "all_papers", "Sfc159.pdf")) # return dictionary


ConnectionError: HTTPConnectionPool(host='localhost', port=8070): Max retries exceeded with url: /api/processFulltextDocument (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fcd0a7d35e0>: Failed to establish a new connection: [Errno 61] Connection refused'))