In [16]:
import glob
import pandas as pd
import os
# Find the papers listed in data/processed/3rd_run/input_df_with_title_doi_edited.xlsx
# that have no file in data/raw/all_papers, and export their citation records.
# base names of the files that are already on disk
file_list = [os.path.basename(file) for file in glob.glob("data/raw/all_papers/*")]
paper_df = pd.read_excel("data/processed/3rd_run/input_df_with_title_doi_edited.xlsx")
# column "0" holds the values that are compared against the file base names
paper_file_list = paper_df["0"].tolist()
# list the papers that are not in file_list (a set keeps each lookup O(1))
existing_files = set(file_list)
paper_list = [paper for paper in paper_file_list if paper not in existing_files]
# get rows of paper_df with paper_list
paper_df = paper_df[paper_df["0"].isin(paper_list)]
# load the initial input csv
initial_input_csv = "data/processed/3rd_run/citation_df.csv"
# inner join with the citation table on the shared "0" column
joined_df = pd.merge(paper_df, pd.read_csv(initial_input_csv), how="inner", on="0")
# save to data/processed/3rd_run/missing_papers.xlsx
# exist_ok=True creates the directory only when it is missing
os.makedirs("data/processed/3rd_run", exist_ok=True)
joined_df.to_excel("data/processed/3rd_run/missing_papers.xlsx", index=False)

In [10]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import HumanMessagePromptTemplate
from langchain.schema.messages import SystemMessage
from langchain.prompts import ChatPromptTemplate
import os
# Read the OpenAI key from the environment so it is not stored in the notebook.
openai_api_key = os.getenv('OPENAI_API_KEY')
# Human-message template: the placeholders {abstract}, {summary}, {method} and
# {question} are filled in per paper when the prompt is formatted.
TEMPLATE = """
Abstract: {abstract}
Summary: {summary}
Method: {method}
Question: {question}
Answer:
"""
# Chat prompt = fixed system instruction followed by the templated human message.
chat_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                "You are research assistant. You will be shown abstract, summary, and method of a paper. Please answer the questions asked by human."
            )
        ),
        HumanMessagePromptTemplate.from_template(TEMPLATE),
    ]
)

# Chat model used for every question; temperature is 0 and replies are capped at 2000 tokens.
llm = ChatOpenAI(client=None, openai_api_key=openai_api_key, temperature=0, model="gpt-3.5-turbo", max_tokens=2000)
# Question 1: which physical aspect of the built environment the study examined.
# The example answer asks for an "Aspect: *XXX*" line between dashed rules.
question_1 = """
Choose the most relevant physical aspect of the built environment this study examined: landscape, street design, public space, greenery, building design, infrastructure, others. If it's "others", please provide the appropriate aspect that the study examined after "others:".
Example Answer: 
------
Aspect: *XXX*
------
"""
# Question 2: which human perception the study examined.
# The example answer asks for a "Human perception: *XXX*" line between dashed rules.
question_2 = """
Choose the most relevant human perception this study examined: health, safety, walkability, urban vitality, transportation and mobility, real estate, others. If it's "others", please provide the appropriate human perception that the study examined after "others:".
Example Answer: 
------
Human perception: *XXX*
------
"""

In [24]:
import pandas as pd
from tqdm import tqdm
# Ask the LLM both classification questions for every paper, unless the answers
# file already exists (the file acts as a cache for this expensive cell).
answers_csv = "data/processed/3rd_run/answers.csv"
if not os.path.exists(answers_csv):
    # get abstract
    abstract_df = pd.read_csv("data/processed/3rd_run/citation_df.csv")
    # get summary
    summary_df = pd.read_csv("data/processed/3rd_run/summary.csv")
    # get method
    method_df = pd.read_csv("data/processed/3rd_run/type_of_research.csv")
    # join them on "0"
    joined_df = pd.merge(abstract_df, summary_df, how="inner", on="0")
    joined_df = pd.merge(joined_df, method_df, how="inner", on="0")
    # Collect the answers in row order and attach them as whole columns afterwards,
    # instead of enlarging the frame one cell at a time inside the loop. This also
    # guarantees both answer columns exist when the joined frame has no rows.
    answers_1 = []
    answers_2 = []
    for row in tqdm(joined_df.itertuples(), total=len(joined_df)):
        # abstract, summary, and method are shared by both prompts
        paper_fields = {"abstract": row.Abstract, "summary": row.summary, "method": row.method}
        answer_1 = llm(chat_template.format_messages(question=question_1, **paper_fields))
        answer_2 = llm(chat_template.format_messages(question=question_2, **paper_fields))
        answers_1.append(answer_1.content)
        answers_2.append(answer_2.content)
    joined_df["answer_1"] = answers_1
    joined_df["answer_2"] = answers_2
    # save the dataframe to data/processed/3rd_run/answers.csv
    joined_df.to_csv(answers_csv, index=False)

 11%|█         | 44/394 [01:17<09:46,  1.68s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
 12%|█▏        | 47/394 [11:26<8:43:32, 90.53s/it]  Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
 24%|██▍       | 95/394 [22:54<09:59,  2.00s/it]    Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
 34%|███▍      | 134/394 [33:59<06:50,  1.58s/it]   Retrying langchain.chat_models.openai.ChatOpenAI.

In [174]:
# clean the answers
import pandas as pd

def clean_answer_1(row):
    """Collapse the free-text LLM answer about the built-environment aspect into a label.

    The answer is lower-cased, the "aspect: " prefix is removed, and the result is
    matched against an ordered list of keyword rules; the first rule that matches
    decides the label. Order matters because several keywords overlap (e.g.
    "urban environment" is tested before the broader "environment").

    Args:
        row: object exposing ``answer_1`` (raw answer) and ``Abstract`` attributes,
            e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The canonical label when a rule matches; otherwise the lower-cased answer
        with the "aspect: " prefix removed. A non-string ``answer_1`` (such as NaN
        from an empty CSV cell) is returned unchanged instead of raising.
    """
    answer_1 = row.answer_1
    if not isinstance(answer_1, str):
        # Missing answers are read back from CSV as NaN; leave them missing.
        return answer_1
    # A missing abstract simply cannot contain "natural".
    abstract = row.Abstract if isinstance(row.Abstract, str) else ""
    answer_1 = answer_1.lower().replace("aspect: ", "")

    def mentions(*keywords):
        """Return True when any of the keywords occurs in the normalized answer."""
        return any(keyword in answer_1 for keyword in keywords)

    if mentions("green"):
        return "greenery"
    if mentions("signscapes", "urban function"):
        return "general urban environment"
    if mentions("street"):
        return "street design"
    if mentions("natur"):
        return "greenery"
    if mentions("urban blight density"):
        return "general urban environment"
    if mentions("illuminance", "visual properties of the built environment"):
        return "street design"
    if mentions("neighborhood"):
        return "general urban environment"
    if mentions("urban environment"):
        # The abstract is the tie-breaker for the generic "urban environment" answer.
        return "greenery" if "natural" in abstract else "general urban environment"
    if mentions("safety", "urban appearance"):
        return "general urban environment"
    if mentions("landscape"):
        return "landscape"
    if mentions("wet", "blue", "river", "water"):
        return "waterscapes"
    if mentions("sky", "vegetation"):
        return "greenery"
    if mentions("pedestrian", "sidewalk", "bike", "alleys"):
        return "street design"
    if mentions("infrastructure"):
        return "infrastructure"
    if mentions("environment"):
        return "general urban environment"
    if mentions("urban space"):
        return "building design" if mentions("architectural") else "general urban environment"
    if mentions("urban perception", "urban attributes", "urban form", "viewscape"):
        return "general urban environment"
    if mentions("enclosure", "park"):
        return "public space"
    # No rule matched: keep the normalized answer as-is.
    return answer_1
# health, safety, walkability, urban vitality, transportation and mobility, real estate, others
#TODO Impossible to classify due to too many different answers
def clean_answer_2(row):
    """Collapse the free-text LLM answer about the human perception into a label.

    The answer is lower-cased, the "human perception: " prefix is removed, and the
    result is matched against keyword groups in order; the first group that matches
    decides the label.

    Args:
        row: object exposing an ``answer_2`` attribute (raw answer), e.g. a
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The canonical label when a group matches; otherwise the lower-cased answer
        with the prefix removed. A non-string ``answer_2`` (such as NaN from an
        empty CSV cell) is returned unchanged instead of raising.
    """
    answer_2 = row.answer_2
    if not isinstance(answer_2, str):
        # Missing answers are read back from CSV as NaN; leave them missing.
        return answer_2
    answer_2 = answer_2.lower().replace("human perception: ", "")

    def mentions(*keywords):
        """Return True when any of the keywords occurs in the normalized answer."""
        return any(keyword in answer_2 for keyword in keywords)

    if mentions("safe", "fear", "crime"):
        return "safety"
    if mentions("vitality", "playability"):
        return "urban vitality"
    if mentions("quali", "aesthetic", "beauty", "visual", "wealth", "appearance",
                "street views", "attributes", "perception", "atmosphere"):
        return "general quality"
    if mentions("psycho", "well-being", "mental", "health", "wellbeing", "restor", "emotion"):
        return "health"
    if mentions("comfortable"):
        # NOTE(review): "comfortable" maps to "real estate" in the original rules - confirm this is intended.
        return "real estate"
    # No group matched: keep the normalized answer as-is.
    return answer_2

# Load the raw LLM answers produced by the answering cell.
answers_df = pd.read_csv("data/processed/3rd_run/answers.csv")
# Overwrite answer_1 with its cleaned label, row by row.
answers_df["answer_1"] = answers_df.apply(clean_answer_1, axis=1)
# Keep only the key column and the cleaned label (exported as "improved_aspect"),
# then write them to data/processed/3rd_run/aspect.csv.
aspect_table = answers_df[["0", "answer_1"]].rename(columns={"answer_1": "improved_aspect"})
aspect_table.to_csv("data/processed/3rd_run/aspect.csv", index=False)

In [159]:
# Inspect the abstract of the first row whose answer_1 contains "point",
# printing one sentence (split on ".") per line.
contains_point = answers_df["answer_1"].str.contains("point")
first_matching_abstract = answers_df.loc[contains_point, "Abstract"].iloc[0]
answer_1_example_list = first_matching_abstract.split(".")
for answer_1_example in answer_1_example_list:
    print(answer_1_example)

A tremendous amount of research use questionnaires to obtain individuals’ fear of crime and aggregate it to the neighborhood level to measure the spatial distribution of fear of crime
 However, the cost of using questionnaires to measure the large-scale spatial distribution of fear of crime is high
 The built environment is known to influence people’s perceptions, including fear of crime
 This study develops a machine learning model to link built environment extracted from street view images to fear of crime obtained from questionnaires, and then applies this model to extrapolate fear of crime for neighborhoods without the questionnaires
 Using massive street view images and a survey among 1,741 residents in 80 neighborhoods in Guangzhou, China, this study developed a novel systematic approach to measuring large-scale spatial fear of crime at the neighborhood level for 1,753 neighborhoods
 This is the first study to measure fear of crime at the neighborhood level for a metropolitan are

In [1]:
from src.models import predict_model
import os
# API key is taken from the environment rather than written into the notebook.
openai_api_key = os.getenv('OPENAI_API_KEY')

# Input files handed to predict_model.main
citation_csv = "data/external/asreview_dataset_all_visual-urban-perception-2023-07-09-2023-07-17.csv"
complementary_excel = "data/processed/3rd_run/input_df_with_title_doi_edited.xlsx"
reclibrated_aspect_csv = "data/processed/3rd_run/aspect.csv"
summary_csv = "data/processed/3rd_run/summary.csv"
limitation_opportunity_csv = "data/processed/3rd_run/limitation_future_opportunity.csv"

# Output file
output_csv_file_path = "data/processed/3rd_run/review_by_aspect.csv"

# Arguments are positional, so their order must stay exactly as below.
predict_model.main(
    citation_csv,
    complementary_excel,
    reclibrated_aspect_csv,
    summary_csv,
    limitation_opportunity_csv,
    output_csv_file_path,
    openai_api_key,
)

[nltk_data] Downloading package punkt to /Users/koichiito/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/koichiito/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
2023-11-03 18:59:10,639 - src.models.write_review - INFO - Module loaded.


In [1]:
# convert csv to ris
from src.data.asr_csv2ris import CSV2RISConverter
# Source CSV and destination RIS file for the conversion.
csv_filepath, ris_filepath = (
    "data/external/asreview_dataset_all_visual-urban-perception-2023-07-09-2023-07-17.csv",
    "data/external/3rd_run/references.ris",
)
# Build the converter for this pair of paths and run it.
csv2ris = CSV2RISConverter(csv_filepath, ris_filepath)
csv2ris.run()

Reading CSV...
Writing RIS...




In [12]:
import pandas as pd
import unidecode

from src.models.predict_model import remove_articles_and_prepositions
# create a matching table for old and new citations
def get_new_latex_citation(author: str, title: str, year: str) -> str:
    """Build the new-style citation key ``<author>_<title word>_<year>``.

    The author part is the first space-delimited token of the text before the
    first "., " in ``author``, lower-cased and transliterated with unidecode.
    The title part is the first space-delimited word of the title after
    ``remove_articles_and_prepositions`` has processed it, lower-cased and with
    hyphens removed.
    """
    first_author, _, _ = author.partition("., ")
    leading_token, _, _ = first_author.partition(" ")
    author_key = unidecode.unidecode(leading_token.lower())
    stripped_title = remove_articles_and_prepositions(title)
    title_key = stripped_title.partition(" ")[0].lower().replace("-", "")
    return f"{author_key}_{title_key}_{year}"

def get_old_latex_citation(author: str, title: str, year: str) -> str:
    """Build the old-style citation key ``<author>_<title word>_<year>``.

    The author part is the whole text before the first "., " in ``author`` with
    all spaces removed, lower-cased and transliterated with unidecode. The title
    part is the first space-delimited word of the title after
    ``remove_articles_and_prepositions`` has processed it, lower-cased and with
    hyphens removed.
    """
    first_author, _, _ = author.partition("., ")
    author_key = unidecode.unidecode(first_author.replace(" ", "").lower())
    stripped_title = remove_articles_and_prepositions(title)
    title_key = stripped_title.partition(" ")[0].lower().replace("-", "")
    return f"{author_key}_{title_key}_{year}"

def create_matching_table(df: pd.DataFrame, output_path: str):
    """Write a CSV that pairs each record's new citation key with its old one.

    Adds ``new_citations`` and ``old_citations`` columns to ``df`` in place,
    computed from its ``Authors``, ``Title`` and ``Year`` columns, then writes
    just those two columns (plus the index) to ``output_path``.
    """
    key_builders = {
        "new_citations": get_new_latex_citation,
        "old_citations": get_old_latex_citation,
    }
    for column_name, build_key in key_builders.items():
        # Re-select the source columns each time, as the frame grows by one column per pass.
        source_columns = df[['Authors', 'Year', 'Title']]
        df[column_name] = source_columns.apply(
            lambda row, build_key=build_key: build_key(row["Authors"], row["Title"], row["Year"]),
            axis=1,
        )
    # Only the two key columns are exported.
    matching_columns = df[["new_citations", "old_citations"]]
    matching_columns.to_csv(output_path)

# Build the old/new citation-key matching table from the citation CSV
# and store it under data/external/3rd_run.
output_path = "data/external/3rd_run/matching_table.csv"
df = pd.read_csv("data/external/asreview_dataset_all_visual-urban-perception-2023-07-09-2023-07-17.csv")
create_matching_table(df, output_path)

In [13]:
# use the matching table to replace the citations in data/external/3rd_run/paper.txt
# Context managers close both files deterministically; the original left the
# handles open, so the written text was not guaranteed to be flushed to disk.
with open("data/external/3rd_run/paper.txt", "r") as paper_file:
    text = paper_file.read()
matching_table = pd.read_csv("data/external/3rd_run/matching_table.csv")
# replace a list of old citations with a list of new citations
old_citations = matching_table["old_citations"].tolist()
new_citations = matching_table["new_citations"].tolist()
# Replacements run sequentially, in the row order of the matching table.
for old_citation, new_citation in zip(old_citations, new_citations):
    text = text.replace(old_citation, new_citation)
# save to data/external/3rd_run/paper_modified.txt
with open("data/external/3rd_run/paper_modified.txt", "w") as text_file:
    text_file.write(text)

97350