# Extraction of studies via Databases & Registers

In [2]:
# Standard Packages 
import pandas as pd
import unicodedata
import time
import re
import os
from dotenv import load_dotenv
from datetime import datetime
from typing import Dict, List, Optional, Iterable, Tuple

# API Call Packages
import urllib
import requests
import json

## Load API Keys

In [3]:
load_dotenv()
api_key_WoS = os.getenv("WEB_OF_SCIENCE_API_KEY")
api_key_SS = os.getenv("SEMANTIC_SCHOLAR_API_KEY")

# Check if API keys are loaded
print(f"Web of Science API Key: {api_key_WoS[:3]}...")  # Print first 4 characters
print(f"Semantic Scholar API Key: {api_key_SS[:3]}...")  # Print first 4 characters

Web of Science API Key: 7c0...
Semantic Scholar API Key: eU3...


## Web of Science API Call

**Search and field tags for Web of Science documents**
- `sort_field`: Order by field(s). 
    - Field name and order by clause separated by '+', use A for ASC and D for DESC, 
    - Example: `PY+D`. Multiple values are separated by comma. 
    - Supported fields:  * **LD** - Load Date * **PY** - Publication Year * **RS** - Relevance * **TC** - Times Cited  (optional)
- `...time_span`: Beginning and end dates must be specified in the yyyy-mm-dd format separated by '+' or ' ', e.g. 2023-01-01+2023-12-31. This parameter is not compatible with the all databases search, i.e. db=WOK is not compatible with this parameter. (optional)


In [4]:
# Import WoS API client
import clarivate.wos_starter.client
from clarivate.wos_starter.client.rest import ApiException
from pprint import pprint

In [5]:
# Set up Web of Science API client
BASE_WoS = "https://api.clarivate.com/apis/wos-starter/v1"
configuration = clarivate.wos_starter.client.Configuration(host = BASE_WoS)
configuration.api_key['ClarivateApiKeyAuth'] = api_key_WoS

In [None]:
# Define search terms

# 1. LLM Block
LLM_Block = 'TS=("large language model*" OR "foundation model*" OR LLM OR LLMs OR GPT OR LLaMA* OR Mistral OR Mixtral OR Claude* OR Gemini OR PaLM OR Qwen OR DeepSeek OR "Falcon 180B" OR "Phi-3" OR "GPT-3" OR "GPT-4" OR "PaLM 2" OR "Anthropic")'

# 2. Survey Block
Survey_Block = 'TS=(survey* OR "survey data" OR "survey response*" OR questionnaire* OR question* OR "opinion poll*" OR "public opinion*" OR attitude* OR value* OR norm* OR moral* OR "feeling thermometer*" OR "open-ended" OR "open ended" OR nonresponse OR "non-response" OR respondent* OR participant* OR interview* OR "self-report*" OR "data collection" OR bias OR demographic*)'

# 3. Simulation Block (merged A+B)
Simulation_BlockA = 'TS=((simulat* OR emulat* OR predict* OR imput* OR "missing data" OR nonresponse OR "non-response" OR "item nonresponse" OR "unit nonresponse" OR "synthetic respondent*" OR "synthetic participant*" OR "artificial respondent*" OR "artificial participant*" OR "virtual respondent*" OR "virtual participant*" OR persona* OR "role play*") NEAR/5 (survey* OR questionnaire* OR respondent* OR response* OR interview* OR "self-report*" OR "data collection" OR opinion* OR poll*))'

# 3. 2nd version of  Simulation Block (more comprehensive)
Simulation_BlockB = 'TS=( ( simulat* OR emulat* OR predict* OR imput* OR "synthetic data" OR "missing data" OR nonresponse OR "non-response" OR "item nonresponse" OR "synthetic respondent*" OR "synthetic participant*" OR "artificial respondent*" OR "artificial participant*" OR "virtual respondent*" OR "virtual participant*" OR persona* OR "role play*" OR "as a respondent" OR "LLM as respondent" OR "model as respondent" OR proxy OR surrogate OR "stand-in" OR "stand in" OR replac* OR substitut* OR represent* OR fidelit* OR faithful* OR doppelg* OR ("Synthetic Voice*" NEAR/5 (persona* OR respondent* OR survey* OR "public opinion*" OR opinion*)) OR ("representing people" NEAR/3 (survey* OR respondent* OR persona* OR opinion*)) OR ("LLM-generated persona*" OR "LLM generated persona*") ) NEAR/5 (survey* OR questionnaire* OR respondent* OR response* OR interview* OR "self-report*" OR "data collection" OR opinion* OR poll* OR attitude* OR value* OR norm* OR "public opinion*") )'

# 3. 3rd version of  Simulation Block (fixed phrases)
Simulation_BlockC = 'TS=("survey simulation" OR "simulated participant*" OR "simulated respondent*" OR "synthetic data" OR "synthetic survey data" OR "synthetic respondent*" OR "synthetic participant*" OR "artificial respondent*" OR "artificial participant*" OR "virtual respondent*" OR "virtual participant*" OR "LLM as respondent" OR "model as respondent" OR "as a respondent" OR "role play*" OR persona*)'

# 4. Model Training Block (optional)
Methods_Block = 'TS=( prompt* OR "few-shot" OR "few-shot learning" OR "zero-shot" OR "zero-shot learning" OR "in-context learning" OR ICL OR "chain of thought" OR "self-consistency" OR "system message" OR persona OR personas OR "role prompt*" OR "instruction-tun*" OR "instruction prompt*" OR "fine-tun*" OR ("reinforcement learning with human feedback" OR RLHF) OR ("reinforcement learning with AI feedback" OR RLAIF) OR "temperature parameter" OR "temperature setting" OR "nucleus sampling" OR "top-p sampling" OR "active learning" OR "transfer learning" OR "meta learning" OR "meta-learning" OR "representation learning" OR "continual learning" OR "lifelong learning" )'

# Optional: title block to catch missing papers
Title_Block = 'TI=("synthetic data" OR "doppelgänger" OR "synthetic voices" OR persona* OR "human-like response*" OR "LLM-generated persona*")'

# Create combinations of search blocks with Exclusion Block
LLM_and_Survey = f'{LLM_Block} AND {Survey_Block}'
LLM_and_Survey_and_Methods = f'{LLM_Block} AND {Survey_Block} AND {Methods_Block}'
LLM_and_SimulationA = f'{LLM_Block} AND {Simulation_BlockA}'
LLM_and_SimulationB = f'{LLM_Block} AND {Simulation_BlockB}'
LLM_and_SimulationC = f'{LLM_Block} AND {Simulation_BlockC}'
LLM_and_Methods = f'{LLM_Block} AND {Methods_Block}'
LLM_and_Survey_and_SimulationA = f'{LLM_Block} AND {Survey_Block} AND {Simulation_BlockA}'
LLM_and_Survey_and_SimulationB = f'{LLM_Block} AND {Survey_Block} AND {Simulation_BlockB}'
LLM_and_Survey_and_SimulationC = f'{LLM_Block} AND {Survey_Block} AND {Simulation_BlockC}'
LLM_and_SimulationA_and_Methods = f'{LLM_Block} AND {Simulation_BlockA} AND {Methods_Block}'
LLM_and_SimulationB_and_Methods = f'{LLM_Block} AND {Simulation_BlockB} AND {Methods_Block}'
LLM_and_SimulationC_and_Methods = f'{LLM_Block} AND {Simulation_BlockC} AND {Methods_Block}'
LLMSurvey_or_LLMSimulationA = f'({LLM_and_Survey}) OR ({LLM_and_SimulationA})'
LLMSurvey_or_LLMSimulationB = f'({LLM_and_Survey}) OR ({LLM_and_SimulationB})'
LLMSurvey_or_LLMSimulationC = f'({LLM_and_Survey}) OR ({LLM_and_SimulationC})'
Survey_and_SimulationA = f'{Survey_Block} AND {Simulation_BlockA}'
Survey_and_SimulationB = f'{Survey_Block} AND {Simulation_BlockB}'
Survey_and_SimulationC = f'{Survey_Block} AND {Simulation_BlockC}'

LLM_and_Survey_OR_SimulationB = f'{LLM_Block} AND ({Survey_Block} OR {Simulation_BlockB})'

In [12]:
# DEFINE Final Search Blocks
wos_queries = {
    "LLM and Survey": LLM_and_Survey,
    "LLM and Survey and Methods": LLM_and_Survey_and_Methods,
    "LLM and SimulationA": LLM_and_SimulationA,
    "LLM and SimulationB": LLM_and_SimulationB,
    "LLM and SimulationC": LLM_and_SimulationC,
    "LLM and Methods": LLM_and_Methods,
    "LLM and Survey and SimulationA": LLM_and_Survey_and_SimulationA,
    "LLM and Survey and SimulationB": LLM_and_Survey_and_SimulationB,
    "LLM and Survey and SimulationC": LLM_and_Survey_and_SimulationC,
    "LLM and SimulationA and Methods": LLM_and_SimulationA_and_Methods,
    "LLM and SimulationB and Methods": LLM_and_SimulationB_and_Methods,
    "LLM and SimulationC and Methods": LLM_and_SimulationC_and_Methods,
    "LLMSurvey or LLMSimulationA": LLMSurvey_or_LLMSimulationA,
    "LLMSurvey or LLMSimulationB": LLMSurvey_or_LLMSimulationB,
    "LLMSurvey or LLMSimulationC": LLMSurvey_or_LLMSimulationC,
    "Survey and SimulationA": Survey_and_SimulationA,
    "Survey and SimulationB": Survey_and_SimulationB,
    "Survey and SimulationC": Survey_and_SimulationC
}

wos_queries_subset = {
    "LLM_and_SimulationA": LLM_and_SimulationA,
    "LLM_and_SimulationB": LLM_and_SimulationB,
    "LLM_and_Survey_and_SimulationA": LLM_and_Survey_and_SimulationA,
    "LLM_and_Survey_and_SimulationB": LLM_and_Survey_and_SimulationB,
    "LLM_and_SimulationA_and_Methods": LLM_and_SimulationA_and_Methods,
    "LLM_and_SimulationB_and_Methods": LLM_and_SimulationB_and_Methods,
    "LLMSurvey or LLMSimulationA": LLMSurvey_or_LLMSimulationA,
    "LLMSurvey or LLMSimulationB": LLMSurvey_or_LLMSimulationB,
}

In [13]:
# Define Functions

# Function to run WoS API query
def run_wos_api(
        q,                          # Search query in WOS search syntax
        db='WOS',                       # Choice of Database
        limit=50,                       # Set limit of records on page (1-50) (default to 10)
        page=1,                         # Set the result page 
        sort_field='RS+D',              # Order by Field(s), option: LD, PY, RS, TC
        modified_time_span=None,        # Date range in which results were most recently modified.
        tc_modified_time_span=None,     # Date range in which times cited counts were modified.
        detail=None,                    # Set to returns full data by default, alternative: detail=short
        configuration=configuration ):

    with clarivate.wos_starter.client.ApiClient(configuration) as api_client:
        api_instance = clarivate.wos_starter.client.DocumentsApi(api_client)
        try:
            api_response = api_instance.documents_get(
                q,
                db=db,
                limit=limit,
                page=page,
                sort_field=sort_field,
                modified_time_span=modified_time_span,
                tc_modified_time_span=tc_modified_time_span,
                detail=detail
            )
            return api_response
        
        except ApiException as e:
            print(f"Exception when calling DocumentsApi->documents_get: {e}")
            return None
        
# Function to fetch multiple pages
def wos_fetch_pages(q: str, limit: int = 50) -> pd.DataFrame:
    all_hits = []

    for p in range(1, 21):  # pages 1 to 20 
        resp = run_wos_api(q, page=p, limit=limit)
        if resp is None:
            print(f"[WARN] No response for page {p}")
            continue
        hits = getattr(resp, "hits", []) or []
        all_hits.extend(h.to_dict() for h in hits)

    if not all_hits:
        return pd.DataFrame()

    df = pd.DataFrame(all_hits)
    if "uid" in df.columns:
        df = df.drop_duplicates(subset=["uid"]).reset_index(drop=True)
    return df

def wos_fetch_all_pages(q: str, limit: int = 50) -> pd.DataFrame:
    # Step 1: Fetch the first page to get the total number of records
    resp = run_wos_api(q, page=1, limit=limit)
    if resp is None:
        print(f"[WARN] No response for the first page of query: {q}")
        return pd.DataFrame()

    total_records = getattr(resp.metadata, "total", 0)  # Get the total number of records
    if total_records == 0:
        print(f"[WARN] No records found for query: {q}")
        return pd.DataFrame()

    # Step 2: Calculate the number of pages required
    total_pages = (total_records + limit - 1) // limit  # equivalent to math.ceil(total_records / limit)

    # Step 3: Loop through all pages and collect the records
    all_hits = []
    for page in range(1, total_pages + 1):
        resp = run_wos_api(q, page=page, limit=limit)
        if resp is None:
            print(f"[WARN] No response for page {page} of query: {q}")
            continue

        hits = getattr(resp, "hits", []) or []
        all_hits.extend(h.to_dict() for h in hits)

    if not all_hits:
        return pd.DataFrame()

    # Step 4: Convert the results to a DataFrame
    df = pd.DataFrame(all_hits)
    
    # Deduplicate based on 'uid' (unique identifier)
    if "uid" in df.columns:
        df = df.drop_duplicates(subset=["uid"]).reset_index(drop=True)

    return df

# Function to get total record counts for each query
def wos_query_totals(wos_queries: dict) -> pd.DataFrame:
    results = []
    for name, q in wos_queries.items():
        resp = run_wos_api(q, page=1, limit=1)
        print(f"Processing query: {name}")
        
        if resp is None:
            results.append({"QueryName": name, "TotalRecords": None})
            continue

        total = getattr(resp.metadata, "total", None)
        results.append({"QueryName": name, "TotalRecords": total})

    return pd.DataFrame(results).sort_values("TotalRecords", ascending=False).reset_index(drop=True)

In [14]:
# Example run of the API query `run_wos_api`
q = LLM_and_Methods

#api_response = run_wos_api(q, page=1, limit=50)
#df_WoS_sampleResults = pd.DataFrame([item.to_dict() for item in api_response.hits])
#df_WoS_sampleResults

In [None]:
# RUN to get total records for each queries
df_WoS_totals = wos_query_totals(wos_queries_subset)
df_WoS_totals

# Get the total records possible for 'Simulation_BlockB'ApiException
total_simulationB = df_WoS_totals[df_WoS_totals['QueryName'] == 'LLM_and_SimulationB']['TotalRecords'].values[0]

Processing query: LLM_and_SimulationA
Processing query: LLM_and_SimulationB
Processing query: LLM_and_Survey_and_SimulationA
Processing query: LLM_and_Survey_and_SimulationB
Processing query: LLM_and_SimulationA_and_Methods
Processing query: LLM_and_SimulationB_and_Methods
Processing query: LLMSurvey or LLMSimulationA
Processing query: LLMSurvey or LLMSimulationB


Unnamed: 0,QueryName,TotalRecords
0,LLMSurvey or LLMSimulationB,45020
1,LLMSurvey or LLMSimulationA,44875
2,LLM_and_SimulationB,2278
3,LLM_and_Survey_and_SimulationB,1769
4,LLM_and_SimulationA,830
5,LLM_and_Survey_and_SimulationA,466
6,LLM_and_SimulationB_and_Methods,321
7,LLM_and_SimulationA_and_Methods,159


In [9]:
# RUN Fetch for all queries
dfs_WoS = {}
for name, query in wos_queries.items():
    print(f"\nFetching WoS results for: {name}")
    df = wos_fetch_pages(query, limit=50)
    print(f"{name}: {len(df)} rows")
    dfs_WoS[name] = df


Fetching WoS results for: LLM and Survey
LLM and Survey: 150 rows

Fetching WoS results for: LLM and Survey and Methods
LLM and Survey and Methods: 150 rows

Fetching WoS results for: LLM and SimulationA


: 

: 

In [None]:
# Also bind each to a variable for convenience
df_WoS_LLM_and_Survey = dfs_WoS["LLM and Survey"]
df_WoS_LLM_and_Survey_and_Methods = dfs_WoS["LLM and Survey and Methods"]
df_WoS_LLM_and_SimulationA = dfs_WoS["LLM and SimulationA"]
df_WoS_LLM_and_SimulationB = dfs_WoS["LLM and SimulationB"]
df_WoS_LLM_and_SimulationC = dfs_WoS["LLM and SimulationC"]
df_WoS_LLM_and_Methods = dfs_WoS["LLM and Methods"]
df_WoS_LLM_and_Survey_and_SimulationA = dfs_WoS["LLM and Survey and SimulationA"]
df_WoS_LLM_and_Survey_and_SimulationB = dfs_WoS["LLM and Survey and SimulationB"]
df_WoS_LLM_and_Survey_and_SimulationC = dfs_WoS["LLM and Survey and SimulationC"]
df_WoS_LLM_and_SimulationA_and_Methods = dfs_WoS["LLM and SimulationA and Methods"]
df_WoS_LLM_and_SimulationB_and_Methods = dfs_WoS["LLM and SimulationB and Methods"]
df_WoS_LLM_and_SimulationC_and_Methods = dfs_WoS["LLM and SimulationC and Methods"]
df_WoS_LLMSurvey_or_LLMSimulationA = dfs_WoS["LLMSurvey or LLMSimulationA"]
df_WoS_LLMSurvey_or_LLMSimulationB = dfs_WoS["LLMSurvey or LLMSimulationB"]
df_WoS_LLMSurvey_or_LLMSimulationC = dfs_WoS["LLMSurvey or LLMSimulationC"]
df_WoS_Survey_and_SimulationA = dfs_WoS["Survey and SimulationA"]
df_WoS_Survey_and_SimulationB = dfs_WoS["Survey and SimulationB"]
df_WoS_Survey_and_SimulationC = dfs_WoS["Survey and SimulationC"]

In [31]:
# RUN Fetch for subset of queries
dfs_WoS_subset = {}
for query_name, query in wos_queries_subset.items():
    print(f"Fetching records for query: {query_name}")
    df_results = wos_fetch_all_pages(query)
    dfs_WoS_subset[query_name] = df_results

Fetching records for query: LLM_and_SimulationA
Fetching records for query: LLM_and_Survey_and_SimulationA
Fetching records for query: LLM_and_Survey_and_SimulationB
Fetching records for query: LLM_and_SimulationA_and_Methods
Fetching records for query: LLM_and_SimulationB_and_Methods


In [None]:
df_LLM_and_SimulationA = dfs_WoS_subset["LLM_and_SimulationA"]
df_LLM_and_SimulationB = dfs_WoS_subset["LLM_and_SimulationB"]
df_LLM_and_Survey_and_SimulationA = dfs_WoS_subset["LLM_and_Survey_and_SimulationA"]
df_LLM_and_Survey_and_SimulationB = dfs_WoS_subset["LLM_and_Survey_and_SimulationB"]
df_LLM_and_SimulationA_and_Methods = dfs_WoS_subset["LLM_and_SimulationA_and_Methods"]
df_LLM_and_SimulationB_and_Methods = dfs_WoS_subset["LLM_and_SimulationB_and_Methods"]
df_LLMSurvey_or_LLMSimulationA = dfs_WoS_subset["LLMSurvey or LLMSimulationA"]
df_LLMSurvey_or_LLMSimulationB = dfs_WoS_subset["LLMSurvey or LLMSimulationB"]

# save all these dataframes to CSV files
output_dir = "wos_results"
os.makedirs(output_dir, exist_ok=True)  # Create directory if it doesn't exist
for name, df in dfs_WoS.items():
    file_path = os.path.join(output_dir, f"{name.replace(' ', '_')}_WoS_results.csv")
    df.to_csv(file_path, index=False)
    print(f"Saved {name} results to {file_path}")

In [41]:
df_LLM_and_SimulationB

Unnamed: 0,uid,title,types,sourceTypes,source,names,links,citations,identifiers,keywords
0,WOS:001378237107016,From Values to Opinions: Predicting Human Beha...,[Meeting],[Proceedings Paper],{'sourceTitle': '2023 CONFERENCE ON EMPIRICAL ...,"{'authors': [{'displayName': 'Kane, Dongjun', ...",{'record': 'https://www.webofscience.com/api/g...,"[{'db': 'WOS', 'count': 0}]",{'eisbn': '979-8-89176-060-8'},{'authorKeywords': []}
1,WOS:000260660300019,Predictive value of PET-CT imaging versus AGO-...,[Article],[Article],{'sourceTitle': 'EUROPEAN JOURNAL OF OBSTETRIC...,"{'authors': [{'displayName': 'Lenhard, S. M.',...",{'record': 'https://www.webofscience.com/api/g...,"[{'db': 'WOS', 'count': 9}]","{'doi': '10.1016/j.ejogrb.2008.05.006', 'issn'...","{'authorKeywords': ['Ovarian cancer', 'Relapse..."
2,WOS:001424346400002,Benchmarking histopathology foundation models ...,[Article],[Article],"{'sourceTitle': 'DISCOVER ONCOLOGY', 'publishY...","{'authors': [{'displayName': 'Mallya, Mayur', ...",{'record': 'https://www.webofscience.com/api/g...,"[{'db': 'WOS', 'count': 0}]","{'doi': '10.1007/s12672-025-01973-x', 'eissn':...","{'authorKeywords': ['Ovarian cancer', 'Bevaciz..."
3,WOS:001533429100001,Using generative AI for interview simulations ...,[Article],[Article],{'sourceTitle': 'JOURNAL OF MICROBIOLOGY & BIO...,"{'authors': [{'displayName': 'Millen, Jonathan...",{'record': 'https://www.webofscience.com/api/g...,"[{'db': 'WOS', 'count': 0}]","{'doi': '10.1128/jmbe.00122-25', 'issn': '1935...","{'authorKeywords': ['interview simulation', 'g..."
4,WOS:001552274000019,Pan-cancer immunotherapy response prediction u...,[Abstract],[Meeting Abstract],"{'sourceTitle': 'CLINICAL CANCER RESEARCH', 'p...","{'authors': [{'displayName': 'Fomin, Vitalay',...",{'record': 'https://www.webofscience.com/api/g...,"[{'db': 'WOS', 'count': 0}]","{'doi': '10.1158/1557-3265.AIMACHINE-B011', 'i...",{'authorKeywords': []}
...,...,...,...,...,...,...,...,...,...,...
2273,WOS:001332158500001,"Secure in Diversity? Transborder Ethnicity, Tr...",[Article],[Article],{'sourceTitle': 'JOURNAL OF GLOBAL SECURITY ST...,"{'authors': [{'displayName': 'Smith, David J.'...",{'record': 'https://www.webofscience.com/api/g...,"[{'db': 'WOS', 'count': 1}]","{'doi': '10.1093/jogss/ogae033', 'issn': '2057...","{'authorKeywords': ['ontological security', 'm..."
2274,WOS:000247502100002,Phase transitions for the long-time behavior o...,[Article],[Article],"{'sourceTitle': 'ANNALS OF PROBABILITY', 'publ...","{'authors': [{'displayName': 'Greven, A.', 'wo...",{'record': 'https://www.webofscience.com/api/g...,"[{'db': 'WOS', 'count': 23}]","{'doi': '10.1214/009117906000001060', 'issn': ...","{'authorKeywords': ['interacting diffusions', ..."
2275,WOS:001484393900001,"""Hope the Russians Love Their Children Too"": R...",[Article],[Article],{'sourceTitle': 'JOURNAL OF GLOBAL SECURITY ST...,"{'authors': [{'displayName': 'Smetana, Michal'...",{'record': 'https://www.webofscience.com/api/g...,"[{'db': 'WOS', 'count': 1}]","{'doi': '10.1093/jogss/ogaf012', 'issn': '2057...","{'authorKeywords': ['nuclear taboo', 'survey e..."
2276,WOS:000231590700001,Detailed review of transgenic rodent mutation ...,[Review],[Review],{'sourceTitle': 'MUTATION RESEARCH-REVIEWS IN ...,"{'authors': [{'displayName': 'Lambert, LB', 'w...",{'record': 'https://www.webofscience.com/api/g...,"[{'db': 'WOS', 'count': 286}]","{'doi': '10.1016/j.mrrev.2005.04.002', 'issn':...","{'authorKeywords': ['genetic toxicology', 'mut..."


In [22]:
# Inspect available attributes to find the correct one
#print(type(api_response))
#print(dir(api_response))

In [11]:
# Clean and standardize a single WoS dataframe

def authors_from_names(names_obj):
    if isinstance(names_obj, dict):
        people = names_obj.get("authors") or []
        out = []
        for a in people:
            if isinstance(a, dict):
                dn = a.get("displayName") or a.get("wosStandard") or a.get("full_name") or ""
                if dn:
                    out.append(dn)
        return "; ".join(out)
    return ""

def keywords_from_obj(keywords_obj):
    if isinstance(keywords_obj, dict):
        ak = keywords_obj.get("authorKeywords")
        if isinstance(ak, list):
            return "; ".join([k for k in ak if isinstance(k, str)])
        if isinstance(ak, str):
            return ak
    return ""

def doi_from_identifiers(ident_obj):
    if isinstance(ident_obj, dict):
        doi = ident_obj.get("doi")
        if doi:
            return doi
        # fallback if API returns a list
        dois = ident_obj.get("dois")
        if isinstance(dois, list) and len(dois) > 0:
            return dois[0]
    return None

def year_from_source(src_obj):
    if isinstance(src_obj, dict):
        return src_obj.get("publishYear") or src_obj.get("publishedYear")
    return None

def first_source_type(st_list):
    if isinstance(st_list, list) and st_list:
        return st_list[0]
    return None


def clean_wos_df(df_raw: pd.DataFrame) -> pd.DataFrame:
    if df_raw is None or df_raw.empty:
        return pd.DataFrame(columns=["title", "authors", "doi", "Year", "keywords", "sourceType"])

    df = df_raw.copy()

    # Compute desired fields
    df["authors"] = df["names"].apply(authors_from_names) if "names" in df.columns else ""
    df["doi"] = df["identifiers"].apply(doi_from_identifiers) if "identifiers" in df.columns else None
    df["Year"] = df["source"].apply(year_from_source) if "source" in df.columns else None
    df["keywords"] = df["keywords"].apply(keywords_from_obj) if "keywords" in df.columns else ""
    df["sourceType"] = df["sourceTypes"].apply(first_source_type) if "sourceTypes" in df.columns else None

    # Drop intermediate/noisy columns
    to_drop = ["uid", "types", "sourceTypes", "source", "names", "links", "citations", "identifiers"]
    df = df.drop(columns=[c for c in to_drop if c in df.columns], errors="ignore")

    # Reorder columns (keep others after the key fields)
    key_cols = [c for c in ["title", "authors", "doi", "Year", "keywords", "sourceType"] if c in df.columns]
    other_cols = [c for c in df.columns if c not in key_cols]
    df = df[key_cols + other_cols]

    return df

In [None]:
# Apply cleaning to all 9 dataframes
dfs_WoS_clean = {name: clean_wos_df(df) for name, df in dfs_WoS.items()}

# Bind cleaned dataframes to variables
df_WoS_LLM_and_Survey_clean = dfs_WoS_clean["LLM and Survey"]
df_WoS_LLM_and_Survey_and_Methods_clean = dfs_WoS_clean["LLM and Survey and Methods"]
df_WoS_LLM_and_SimulationA_clean = dfs_WoS_clean["LLM and SimulationA"]
df_WoS_LLM_and_SimulationB_clean = dfs_WoS_clean["LLM and SimulationB"]
df_WoS_LLM_and_SimulationC_clean = dfs_WoS_clean["LLM and SimulationC"]
df_WoS_LLM_and_Methods_clean = dfs_WoS_clean["LLM and Methods"]
df_WoS_LLM_and_Survey_and_SimulationA_clean = dfs_WoS_clean["LLM and Survey and SimulationA"]
df_WoS_LLM_and_Survey_and_SimulationB_clean = dfs_WoS_clean["LLM and Survey and SimulationB"]
df_WoS_LLM_and_Survey_and_SimulationC_clean = dfs_WoS_clean["LLM and Survey and SimulationC"]
df_WoS_LLM_and_SimulationA_and_Methods_clean = dfs_WoS_clean["LLM and SimulationA and Methods"]
df_WoS_LLM_and_SimulationB_and_Methods_clean = dfs_WoS_clean["LLM and SimulationB and Methods"]
df_WoS_LLM_and_SimulationC_and_Methods_clean = dfs_WoS_clean["LLM and SimulationC and Methods"]
df_WoS_LLMSurvey_or_LLMSimulationA_clean = dfs_WoS_clean["LLMSurvey or LLMSimulationA"]
df_WoS_LLMSurvey_or_LLMSimulationB_clean = dfs_WoS_clean["LLMSurvey or LLMSimulationB"]
df_WoS_LLMSurvey_or_LLMSimulationC_clean = dfs_WoS_clean["LLMSurvey or LLMSimulationC"]
df_WoS_Survey_and_SimulationA_clean = dfs_WoS_clean["Survey and SimulationA"]
df_WoS_Survey_and_SimulationB_clean = dfs_WoS_clean["Survey and SimulationB"]
df_WoS_Survey_and_SimulationC_clean = dfs_WoS_clean["Survey and SimulationC"]

## ArXiv API Call

In [4]:
import arxiv

In [6]:
# 2) Define Search 
def fetch_results(query, max_results=200, page_size=100):
    client = arxiv.Client(
        page_size=page_size,      # results per page from API
        delay_seconds=3,          # be nice to arXiv
        num_retries=3
    )
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
        sort_order=arxiv.SortOrder.Descending,
    )
    seen = set()
    rows = []
    for r in client.results(search):
        if r.entry_id in seen:
            continue
        seen.add(r.entry_id)
        rows.append({
            "arxiv_id": r.get_short_id() if hasattr(r, "get_short_id") else r.entry_id.split('/')[-1],
            "title": r.title.strip(),
            "published": r.published.strftime("%Y-%m-%d") if r.published else "",
            "updated": r.updated.strftime("%Y-%m-%d") if r.updated else "",
            "primary_category": getattr(r, "primary_category", ""),
            "categories": ", ".join(getattr(r, "categories", []) or []),
            "authors": ", ".join(a.name for a in r.authors),
            "summary": r.summary.strip(),
            "pdf_url": r.pdf_url,
            "abs_url": r.entry_id,
        })
    return pd.DataFrame(rows)


In [None]:
# Define Query Blocs
LLM_TERMS = [
    "large language model", "foundation model", "LLM", "LLMs", "GPT", "GPT-3", 
    "GPT-3.5", "GPT-4", "ChatGPT", "LLaMA", "Llama 2", "Llama 3", "Mistral", 
    "Mixtral", "Claude", "Gemini", "PaLM", "Qwen", "Falcon 180B", "Phi-3", 
    "DeepSeek", "AI language model", "AI model", "chatbot", "chat bot", 
    "transformer-based model", "transformer language model"
]

SURVEY_TERMS = [
    "survey", "questionnaire", "opinion poll", "survey data", "public opinion", 
    "feeling thermometer", "open ended", "nonresponse", "non-response", "Likert", 
    "rating scale", "ranking question", "matrix question", "vignette", 
    "anchoring vignette", "conjoint", "discrete choice", "DCE", "self-report", 
    "respondent data", "human judgment", "item nonresponse", 'nonresponse', 
    "unit nonresponse", "missing data"
]

SIMULATION_CORE = [
    "simulate", "simulation", "simulating", "emulate", "emulation", "synthetic", "imputation", "impute",
    "synthetic data", "response generation", "as a respondent", "model as respondent", 
    "LLM as respondent", "synthetic respondent", "artificial respondent", "virtual participant", 
    "synthetic participant", "proxy respondent", "surrogate respondent", "persona", 
    "role play", "role prompt", "persona prompt"
]


# Function to check if any terms from a list are in the text (used in the query generation)
def any_tiabs(terms):
    # Placeholder function to demonstrate matching behavior
    return " OR ".join([f"({term})" for term in terms])

# Blocks that match the different term categories
LLM_BLOCK = any_tiabs(LLM_TERMS)
SURVEY_BLOCK = any_tiabs(SURVEY_TERMS)
SIM_BLOCK = any_tiabs(SIMULATION_CORE)

# Query blocks for combining different categories
query_blocks = {
    "Simulation Block + Survey Block": f"({SIM_BLOCK}) AND ({SURVEY_BLOCK})",
    "LLM Block + Survey Block": f"({LLM_BLOCK}) AND ({SURVEY_BLOCK})",
    "All Blocks": f"({LLM_BLOCK}) AND ({SURVEY_BLOCK}) AND ({SIM_BLOCK})"
}

In [None]:
### RUN SEARCHES AND STORE RESULTS

dfs = {}
for label, query in query_blocks.items():
    print(f"Fetching results for: {label}")
    df = fetch_results(query, max_results=100, page_size=100)
    dfs[label] = df
    print(f"Found {len(df)} results for {label}")

df_sim_survey = dfs["Simulation Block + Survey Block"]
df_llm_survey = dfs["LLM Block + Survey Block"]
df_all_blocks = dfs["All Blocks"]

## Semantic Scholar API Call

In [11]:
from semanticscholar import SemanticScholar
import os
from dotenv import load_dotenv

In [12]:
# DEFINE Search Parameters

FIELDS = ["paperId", "title", "year", "authors", "abstract", "url", "citationCount"]
YEAR_FILTER = "2023-"
BULK_SORT = "citationCount:desc"
MAX_PAPERS_PER_GROUP = 3000

QUERY_GROUPS = {
    # 1) LLM_TERMS
    "g1_llm_terms":
        '( "large language model" | "foundation model" | LLM | LLMs | GPT | "GPT-3" | "GPT-3.5" | "GPT-4" | ChatGPT | LLaMA | "Llama 2" | "Llama 3" | Mistral | Mixtral | Claude | Gemini | PaLM | Qwen | "Falcon 180B" | "Phi-3" | DeepSeek | "AI language model" | "AI model" | chatbot | "chat bot" | "transformer-based model" | "transformer language model" )',

    # 2) SURVEY_TERMS
    "g2_survey_terms":
        '( survey | questionnaire | "opinion poll" | "survey data" | "public opinion" | "feeling thermometer" | "open ended" | nonresponse | "non-response" | Likert | "rating scale" | "ranking question" | "matrix question" | vignette | "anchoring vignette" | conjoint | "discrete choice" | DCE | "self-report" | "respondent data" | "human judgment" )',

    # 3) SIMULATION_CORE
    "g3_simulation_core":
        '( simulate | simulation | simulating | emulate | emulation | synthetic | "synthetic data" | "response generation" | "as a respondent" | "model as respondent" | "LLM as respondent" | "synthetic respondent" | "artificial respondent" | "virtual participant" | "synthetic participant" | "proxy respondent" | "surrogate respondent" | persona | "role play" | "role prompt" | "persona prompt" | predict* | prediction | predict | imput* | imputation | impute )'
}

QUERY_GROUPS.update({
    # 1) LLM + Survey
    "g4_llm_and_survey":
        f'( {QUERY_GROUPS["g1_llm_terms"]} ) + ( {QUERY_GROUPS["g2_survey_terms"]} )',

    # 2) Survey + Simulation
    "g5_survey_and_simulation":
        f'( {QUERY_GROUPS["g2_survey_terms"]} ) + ( {QUERY_GROUPS["g3_simulation_core"]} )',

    # 3) LLM + Simulation
    "g6_llm_and_simulation":
        f'( {QUERY_GROUPS["g1_llm_terms"]} ) + ( {QUERY_GROUPS["g3_simulation_core"]} )',

    # 4) All three together
    "g7_llm_and_survey_and_simulation":
        f'( {QUERY_GROUPS["g1_llm_terms"]} ) + ( {QUERY_GROUPS["g2_survey_terms"]} ) + ( {QUERY_GROUPS["g3_simulation_core"]} )'
})

In [13]:
# DEFINE Search Parameters
FIELDS = ["paperId", "title", "year", "authors", "abstract", "url", "citationCount"]
YEAR_FILTER = "2023-"
BULK_SORT = "citationCount:desc"
MAX_PAPERS_PER_GROUP = 3000

LLM_Block = (
    '("large language model*" | "foundation model*" | LLM | LLMs | GPT | "GPT-3" | "GPT-3.5" | "GPT-4" | '
    'LLaMA* | "Llama 2" | "Llama 3" | Mistral | Mixtral | Claude* | Gemini | PaLM | Qwen | DeepSeek | '
    '"Falcon 180B" | "Phi-3")'
)

Survey_Block = (
    '(survey* | "survey data" | "survey response*" | questionnaire* | question* | "opinion poll*" | '
    '"public opinion*" | attitude* | value* | norm* | moral* | "feeling thermometer*" | "open-ended" | '
    '"open ended" | nonresponse | "non-response" | respondent* | participant* | interview* | '
    '"self-report*" | "data collection")'
)

Simulation_BlockA = (
    '((simulat* | emulat* | predict* | imput* | "missing data" | nonresponse | "non-response" | '
    '"item nonresponse" | "synthetic respondent*" | "synthetic participant*" | "artificial respondent*" | '
    '"artificial participant*" | "virtual respondent*" | "virtual participant*" | persona* | "role play*") '
    '+ (survey* | questionnaire* | respondent* | response* | interview* | "self-report*" | '
    '"data collection" | opinion* | poll*))'
)

Simulation_BlockB = (
    '('
    '(simulat* | emulat* | predict* | imput* | "synthetic data" | "missing data" | nonresponse | "non-response" | '
    '"item nonresponse" | "synthetic respondent*" | "synthetic participant*" | "artificial respondent*" | '
    '"artificial participant*" | "virtual respondent*" | "virtual participant*" | persona* | "role play*" | '
    '"as a respondent" | "LLM as respondent" | "model as respondent" | proxy | surrogate | "stand-in" | "stand in" | '
    'replac* | substitut* | represent* | fidelit* | faithful* | doppelg* | "Synthetic Voice*" | '
    '"LLM-generated persona*" | "LLM generated persona*"'
    ') + '
    '(survey* | questionnaire* | respondent* | response* | interview* | "self-report*" | "data collection" | '
    'opinion* | poll* | attitude* | value* | norm* | "public opinion*") '
    '| ("representing people"~3) | ("Synthetic Voice*"~5)'
    ')'
)

Simulation_BlockC = (
    '("survey simulation" | "simulated participant*" | "simulated respondent*" | "synthetic data" | '
    '"synthetic survey data" | "synthetic respondent*" | "synthetic participant*" | "artificial respondent*" | '
    '"artificial participant*" | "virtual respondent*" | "virtual participant*" | "LLM as respondent" | '
    '"model as respondent" | "as a respondent" | "role play*" | persona*)'
)

Methods_Block = (
    '(prompt* | "few-shot" | "few-shot learning" | "zero-shot" | "zero-shot learning" | '
    '"in-context learning" | ICL | "chain of thought" | "self-consistency" | "system message" | '
    'persona | personas | "role prompt*" | "instruction-tun*" | "instruction prompt*" | "fine-tun*" | '
    '("reinforcement learning with human feedback" | RLHF) | ("reinforcement learning with AI feedback" | RLAIF) | '
    '"temperature parameter" | "temperature setting" | "nucleus sampling" | "top-p sampling" | '
    '"active learning" | "transfer learning" | "meta learning" | "meta-learning" | '
    '"representation learning" | "continual learning" | "lifelong learning")'
)

# Combinations using + for AND and | for OR
QUERY_GROUPS = {
    # pairs
    "ss_llm_and_survey":           f'{LLM_Block} + {Survey_Block}',
    "ss_llm_and_simA":             f'{LLM_Block} + {Simulation_BlockA}',
    "ss_llm_and_simB":             f'{LLM_Block} + {Simulation_BlockB}',
    "ss_llm_and_simC":             f'{LLM_Block} + {Simulation_BlockC}',
    "ss_llm_and_methods":          f'{LLM_Block} + {Methods_Block}',
    "ss_survey_and_simA":          f'{Survey_Block} + {Simulation_BlockA}',
    "ss_survey_and_simB":          f'{Survey_Block} + {Simulation_BlockB}',
    "ss_survey_and_simC":          f'{Survey_Block} + {Simulation_BlockC}',

    # triples
    "ss_llm_survey_methods":       f'{LLM_Block} + {Survey_Block} + {Methods_Block}',
    "ss_llm_survey_simA":          f'{LLM_Block} + {Survey_Block} + {Simulation_BlockA}',
    "ss_llm_survey_simB":          f'{LLM_Block} + {Survey_Block} + {Simulation_BlockB}',
    "ss_llm_survey_simC":          f'{LLM_Block} + {Survey_Block} + {Simulation_BlockC}',
    "ss_llm_simA_methods":         f'{LLM_Block} + {Simulation_BlockA} + {Methods_Block}',
    "ss_llm_simB_methods":         f'{LLM_Block} + {Simulation_BlockB} + {Methods_Block}',
    "ss_llm_simC_methods":         f'{LLM_Block} + {Simulation_BlockC} + {Methods_Block}',

    # unions
    "ss_llm_survey_or_llm_simA":   f'({LLM_Block} + {Survey_Block}) | ({LLM_Block} + {Simulation_BlockA})',
    "ss_llm_survey_or_llm_simB":   f'({LLM_Block} + {Survey_Block}) | ({LLM_Block} + {Simulation_BlockB})',
    "ss_llm_survey_or_llm_simC":   f'({LLM_Block} + {Survey_Block}) | ({LLM_Block} + {Simulation_BlockC})',
}



In [14]:
# Define Functions
def author_names(paper_authors):
    if not paper_authors:
        return ""
    names = []
    for a in paper_authors:
        # supports Author objects and dicts
        names.append(getattr(a, "name", a.get("name") if isinstance(a, dict) else None))
    return ", ".join([n for n in names if n])

def paper_row(p):
    return {
        "paperId": getattr(p, "paperId", None),
        "title": getattr(p, "title", None),
        "year": getattr(p, "year", None),
        "authors": author_names(getattr(p, "authors", None)),
        "abstract": getattr(p, "abstract", None),
        "url": getattr(p, "url", None),
        "citationCount": getattr(p, "citationCount", None),
    }

def fetch_bulk_group(sch: SemanticScholar, query: str,
                     year_filter: str, fields: list,
                     max_papers: int, sort: str | None = None):
    """
    Runs a bulk search and yields up to max_papers Paper objects.
    Prints the API estimated total and progress as it goes.
    """
    results = sch.search_paper(
        query=query,
        year=year_filter,     # e.g., "2023-"
        fields=fields,
        bulk=True,            # /graph/v1/paper/search/bulk
        sort=sort,            # only works with bulk=True
    )
    est_total = getattr(results, "total", None)
    print(f"Estimated total: {est_total if est_total is not None else 'n/a'}")

    count = 0
    for p in results:        # iterates across pages automatically
        yield p
        count += 1
        if count >= max_papers:
            break

def fetch_group_df(sch: SemanticScholar, 
                   tag: str, 
                   max_papers_override=None) -> pd.DataFrame:
    """Fetch a single query group and return a DataFrame."""
    if tag not in QUERY_GROUPS:
        valid = ", ".join(QUERY_GROUPS.keys())
        raise ValueError(f"Unknown group '{tag}'. Valid keys: {valid}")

    query = QUERY_GROUPS[tag]
    rows = []
    for paper in fetch_bulk_group(
        sch,
        query=query,
        year_filter=YEAR_FILTER,
        fields=FIELDS,
        max_papers=max_papers_override if max_papers_override is not None else MAX_PAPERS_PER_GROUP,
        sort=BULK_SORT,
    ):
        rows.append(paper_row(paper))

    df = pd.DataFrame(rows, columns=FIELDS)
    return df


# ---------- Main ----------
def main(tag: str | None = None, 
         max_papers_override=None):

    sch = SemanticScholar(api_key=api_key_SS, timeout=45, retry=True)

    if tag is not None:
        return fetch_group_df(sch, tag, max_papers_override=max_papers_override)

    out = {}
    for k in QUERY_GROUPS:
        out[k] = fetch_group_df(sch, k, max_papers_override=max_papers_override)
    return out

In [15]:
# Common combos
df_ss_llm_svy       = main("ss_llm_and_survey")
df_ss_llm_simB      = main("ss_llm_and_simB")
df_ss_llm_svy_simC  = main("ss_llm_survey_simC")
df_union_svy_or_sim = main("ss_llm_survey_or_llm_simB")

Estimated total: 38949
Estimated total: 12772


: 

: 

In [None]:
df_SS_g1 = main("g1_llm_terms")
df_SS_g2 = main("g2_survey_terms")
df_SS_g3 = main("g3_simulation_core")
df_SS_g4 = main("g4_llm_and_survey")
df_SS_g5 = main("g5_survey_and_simulation")
df_SS_g6 = main("g6_llm_and_simulation")
df_SS_g7 = main("g7_llm_and_survey_and_simulation")

Estimated total: 182534
Estimated total: 914318
Estimated total: 1977706
Estimated total: 12909
Estimated total: 215629
Estimated total: 34780
Estimated total: 2853


# Measure Precision & Recall

## Load Refence Dataset & Functions

In [15]:
# Load the CSV of Zotero list of papers
gold_df = pd.read_csv("data/LLM - Survey Proxies.csv")

gold_df = gold_df[["Title", "Item Type", "Abstract Note"]].dropna().drop_duplicates().reset_index(drop=True)
gold_df["preprint_flag"] = gold_df["Item Type"].apply(lambda x: "preprint" if x == "preprint" else "non-preprint")

print(f"Gold list size: {len(gold_df)}")

Gold list size: 21


In [13]:
# Define normalization function (normalize_title)
def normalize_title(s: str) -> str:
    # Unicode normalize
    s = unicodedata.normalize("NFKC", str(s))
    # Lowercase
    s = s.lower()
    # Remove punctuation-like characters
    s = re.sub(r"[^\w\s]", " ", s)   # keep letters, numbers, underscore, whitespace
    # Collapse whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [19]:
# Build lookup from normalized -> original for the gold set
gold_df["norm"] = gold_df["Title"].map(normalize_title)

# If duplicates normalize to the same string, keep the first original as the representative
gold_norm_to_orig = dict(zip(gold_df["norm"], gold_df["Title"]))
gold_norm_to_preprint_flag = dict(zip(gold_df["norm"], gold_df["preprint_flag"]))
gold_norm_set = set(gold_df["norm"])

# Separate sets for preprint and non-preprint
gold_preprint_set = set(gold_df[gold_df["preprint_flag"] == "preprint"]["norm"])
gold_non_preprint_set = set(gold_df[gold_df["preprint_flag"] == "non-preprint"]["norm"])

# remove three known bad entries (rows) from the 'gold_non_preprint_set'
bad_titles = [
    "Algorithmic Fidelity of Large Language Models in Generating Synthetic German Public Opinions: A Case Study",
    "Extracting Affect Aggregates from Longitudinal Social Media Data with Temporal Adapters for Large Language Models",
    "The Potential and Challenges of Evaluating Attitudes, Opinions, and Values in Large Language Models"
]

for bad_title in bad_titles:
    norm_bad = normalize_title(bad_title)
    if norm_bad in gold_non_preprint_set:
        gold_non_preprint_set.remove(norm_bad)
        print(f"Removed known bad entry from gold_non_preprint_set: {bad_title}")

NameError: name 'gold_df' is not defined

In [18]:
# Define function to calculate recall
def calc_recall(df, 
                gold_norm_set, 
                gold_preprint_set, 
                gold_non_preprint_set):
    df = df.copy()
    df["norm_title"] = df["title"].map(normalize_title)
    
    found_norms = gold_norm_set.intersection(set(df['norm_title']))
    recall = len(found_norms) / len(gold_norm_set) if len(gold_norm_set) > 0 else 0

    found_preprints = gold_preprint_set.intersection(set(df['norm_title']))
    recall_preprint = len(found_preprints) / len(gold_preprint_set) if len(gold_preprint_set) > 0 else 0

    found_non_preprints = gold_non_preprint_set.intersection(set(df['norm_title']))
    recall_non_preprint = len(found_non_preprints) / len(gold_non_preprint_set) if len(gold_non_preprint_set) > 0 else 0

    return pd.DataFrame({
        "Number of Papers Retrieved": [int(len(df))],
        "Recall (out of 21)": [f"{recall:.2%}"],
        "Recall (journalArticels & other)": [f"{recall_non_preprint:.2%}"],
        "Recall (preprints)": [f"{recall_preprint:.2%}"],
    })

# Define function to calculate recall w/ missing titles
def calc_recall_with_missing(df, 
                             gold_norm_set = gold_norm_set, 
                             gold_preprint_set = gold_preprint_set, 
                             gold_non_preprint_set = gold_non_preprint_set, 
                             norm_to_orig = gold_norm_to_orig):
    df = df.copy()
    if "title" not in df.columns:
        df["title"] = ""
    df["norm_title"] = df["title"].map(normalize_title)

    df_norms = set(df["norm_title"])

    found_norms = gold_norm_set.intersection(df_norms)
    found_preprints = gold_preprint_set.intersection(df_norms)
    found_non_preprints = gold_non_preprint_set.intersection(df_norms)

    recall = len(found_norms) / len(gold_norm_set) if len(gold_norm_set) > 0 else 0
    recall_preprint = len(found_preprints) / len(gold_preprint_set) if len(gold_preprint_set) > 0 else 0
    recall_non_preprint = len(found_non_preprints) / len(gold_non_preprint_set) if len(gold_non_preprint_set) > 0 else 0

    # Missing normalized titles
    missing_preprint_norms = gold_preprint_set - df_norms
    missing_non_preprint_norms = gold_non_preprint_set - df_norms

    # Map back to original titles
    missing_preprint_titles = [norm_to_orig.get(n, n) for n in sorted(missing_preprint_norms)]
    missing_non_preprint_titles = [norm_to_orig.get(n, n) for n in sorted(missing_non_preprint_norms)]

    # return a dataframe with recall stats and missing titles
    return pd.DataFrame([{
        "Number of Papers Retrieved": len(df),
        "Recall (out of 21)": f"{recall:.2%}",
        "Recall (journalArticels & other)": f"{recall_non_preprint:.2%}",
        "Recall (preprints)": f"{recall_preprint:.2%}",
        "Missing Articles": "; ".join(missing_non_preprint_titles),
        "Missing Preprint": "; ".join(missing_preprint_titles),
    }])

## Recall Rate - WoS

In [91]:
wos_recall_results = {
    "LLM and Survey": calc_recall_with_missing(df_WoS_LLM_and_Survey_clean),
    "LLM and Survey and Methods": calc_recall_with_missing(df_WoS_LLM_and_Survey_and_Methods_clean),
    "LLM and SimulationA": calc_recall_with_missing(df_WoS_LLM_and_SimulationA_clean),
    "LLM and SimulationB": calc_recall_with_missing(df_WoS_LLM_and_SimulationB_clean),
    "LLM and SimulationC": calc_recall_with_missing(df_WoS_LLM_and_SimulationC_clean),
    "LLM and Methods": calc_recall_with_missing(df_WoS_LLM_and_Methods_clean),
    "LLM and Survey and SimulationA": calc_recall_with_missing(df_WoS_LLM_and_Survey_and_SimulationA_clean),
    "LLM and Survey and SimulationB": calc_recall_with_missing(df_WoS_LLM_and_Survey_and_SimulationB_clean),
    "LLM and Survey and SimulationC": calc_recall_with_missing(df_WoS_LLM_and_Survey_and_SimulationC_clean),
    "LLM and SimulationA and Methods": calc_recall_with_missing(df_WoS_LLM_and_SimulationA_and_Methods_clean),
    "LLM and SimulationB and Methods": calc_recall_with_missing(df_WoS_LLM_and_SimulationB_and_Methods_clean),
    "LLM and SimulationC and Methods": calc_recall_with_missing(df_WoS_LLM_and_SimulationC_and_Methods_clean),
    "LLMSurvey or LLMSimulationA": calc_recall_with_missing(df_WoS_LLMSurvey_or_LLMSimulationA_clean),
    "LLMSurvey or LLMSimulationB": calc_recall_with_missing(df_WoS_LLMSurvey_or_LLMSimulationB_clean),
    "LLMSurvey or LLMSimulationC": calc_recall_with_missing(df_WoS_LLMSurvey_or_LLMSimulationC_clean),
    "Survey and SimulationA": calc_recall_with_missing(df_WoS_Survey_and_SimulationA_clean),
    "Survey and SimulationB": calc_recall_with_missing(df_WoS_Survey_and_SimulationB_clean),
    "Survey and SimulationC": calc_recall_with_missing(df_WoS_Survey_and_SimulationC_clean),
}

In [None]:
recall_table_WoS = pd.concat(wos_recall_results.values(), 
                             keys=wos_recall_results.keys()).reset_index(level=1, 
                             drop=True).reset_index().rename(columns={"index": "Query"})

recall_table_WoS = recall_table_WoS.merge(df_WoS_totals.rename(columns={"QueryName": "Query", "TotalRecords": "Total Records in WoS"}),
                                          on="Query", how="left")

recall_table_WoS

In [None]:
wos_recall_results_subset = {
    "LLM and SimulationA": calc_recall_with_missing(df_LLM_and_SimulationA),
    "LLM and SimulationB": calc_recall_with_missing(df_LLM_and_SimulationB),
    "LLM and Survey and SimulationA": calc_recall_with_missing(df_LLM_and_Survey_and_SimulationA),
    "LLM and Survey and SimulationB": calc_recall_with_missing(df_LLM_and_Survey_and_SimulationB),
    "LLM and SimulationA and Methods": calc_recall_with_missing(df_LLM_and_SimulationA_and_Methods),
    "LLM and SimulationB and Methods": calc_recall_with_missing(df_LLM_and_SimulationB_and_Methods),
    "LLMSurvey or LLMSimulationA": calc_recall_with_missing(df_LLMSurvey_or_LLMSimulationA),
    "LLMSurvey or LLMSimulationB": calc_recall_with_missing(df_LLMSurvey_or_LLMSimulationB)
}
recall_table_WoS_subset = pd.concat(wos_recall_results_subset.values(), 
                                   keys=wos_recall_results_subset.keys()).reset_index(level=1, drop=True).reset_index().rename(columns={"index": "Query"})


# Print the number of records in preprint gold set
print(f"Number of records in preprint gold set: {len(gold_preprint_set)}")
# print the number of records in non-preprint gold set
print(f"Number of records in non-preprint gold set: {len(gold_non_preprint_set)}")
recall_table_WoS_subset

# save the recall_table_WoS_subset tables to CSV files
recall_table_WoS_subset.to_csv("wos_results/recall_table_WoS_subset.csv", index=False)

Unnamed: 0,Query,Number of Papers Retrieved,Recall (out of 21),Recall (journalArticels & other),Recall (preprints),Missing Articles,Missing Preprint
0,LLM and SimulationA,825,19.05%,36.36%,0.00%,Algorithmic Fidelity of Large Language Models ...,Addressing Systematic Non-response Bias with S...
1,LLM and SimulationB,2278,28.57%,54.55%,0.00%,Algorithmic Fidelity of Large Language Models ...,Addressing Systematic Non-response Bias with S...
2,LLM and Survey and SimulationA,463,19.05%,36.36%,0.00%,Algorithmic Fidelity of Large Language Models ...,Addressing Systematic Non-response Bias with S...
3,LLM and Survey and SimulationB,1762,28.57%,54.55%,0.00%,Algorithmic Fidelity of Large Language Models ...,Addressing Systematic Non-response Bias with S...
4,LLM and SimulationA and Methods,159,14.29%,27.27%,0.00%,Algorithmic Fidelity of Large Language Models ...,Addressing Systematic Non-response Bias with S...
5,LLM and SimulationB and Methods,320,23.81%,45.45%,0.00%,Algorithmic Fidelity of Large Language Models ...,Addressing Systematic Non-response Bias with S...


In [None]:
recall_table_WoS_subset

# check the current directory of this notebook
os.getcwd()

# exit the venv in terminal
# deact

In [36]:
# Compare Missing Journal Articles between two Querys in the recall_table_WoS_subse or recall_table_WoS

def compare_missing_journal_articles(df_recall, query1, query2):
    row1 = df_recall[df_recall["Query"] == query1]
    row2 = df_recall[df_recall["Query"] == query2]

    # print error for missing queries
    if row1.empty:
        print(f"Error: Query '{query1}' not found in the recall table.")
        return None
    if row2.empty:
        print(f"Error: Query '{query2}' not found in the recall table.")
        return None
    
    missing1 = set(row1.iloc[0]["Missing Articles"].split("; ")) if pd.notna(row1.iloc[0]["Missing Articles"]) else set()
    missing2 = set(row2.iloc[0]["Missing Articles"].split("; ")) if pd.notna(row2.iloc[0]["Missing Articles"]) else set()
    
    only_in_1 = missing1 - missing2
    only_in_2 = missing2 - missing1
    in_both = missing1.intersection(missing2)
    
    # output them in a bullet points like
    print(f"Comparison of Missing Journal Articles between '{query1}' and '{query2}':\n")
    print(f"ONly Missing in '{query1}' ({len(only_in_1)} articles):")
    for title in sorted(only_in_1):
        print(f" - {title}")    
    print(f"\nOnly Missing in '{query2}' ({len(only_in_2)} articles):")
    for title in sorted(only_in_2):
        print(f" - {title}")
    print(f"\nMissing In both ({len(in_both)} articles):")
    for title in sorted(in_both):
        print(f" - {title}")

compare_missing_journal_articles(recall_table_WoS_subset, 
                                 "LLM and Survey and SimulationB", "LLM and SimulationB and Methods")

Comparison of Missing Journal Articles between 'LLM and Survey and SimulationB' and 'LLM and SimulationB and Methods':

ONly Missing in 'LLM and Survey and SimulationB' (0 articles):

Only Missing in 'LLM and SimulationB and Methods' (1 articles):
 - Out of One, Many: Using Language Models to Simulate Human Samples

Missing In both (5 articles):
 - Algorithmic Fidelity of Large Language Models in Generating Synthetic German Public Opinions: A Case Study
 - Extracting Affect Aggregates from Longitudinal Social Media Data with Temporal Adapters for Large Language Models
 - LLM-Based Doppelgänger Models: Leveraging Synthetic Data for Human-Like Responses in Survey Simulations
 - Synthetic Voices: Evaluating the Fidelity of LLM-Generated Personas in Representing People’s Financial Wellbeing
 - The Potential and Challenges of Evaluating Attitudes, Opinions, and Values in Large Language Models


In [43]:
compare_missing_journal_articles(recall_table_WoS_subset, 
                                 "LLM and SimulationB", "LLM and Survey and SimulationB")

Comparison of Missing Journal Articles between 'LLM and SimulationB' and 'LLM and Survey and SimulationB':

ONly Missing in 'LLM and SimulationB' (0 articles):

Only Missing in 'LLM and Survey and SimulationB' (0 articles):

Missing In both (5 articles):
 - Algorithmic Fidelity of Large Language Models in Generating Synthetic German Public Opinions: A Case Study
 - Extracting Affect Aggregates from Longitudinal Social Media Data with Temporal Adapters for Large Language Models
 - LLM-Based Doppelgänger Models: Leveraging Synthetic Data for Human-Like Responses in Survey Simulations
 - Synthetic Voices: Evaluating the Fidelity of LLM-Generated Personas in Representing People’s Financial Wellbeing
 - The Potential and Challenges of Evaluating Attitudes, Opinions, and Values in Large Language Models


## Recall Rate - ArXiV

In [None]:
recall_summary = []
for label, dframe in zip(
    ["Simulation Block + Survey Block", "LLM Block + Survey Block", "All Blocks"],
    [df_sim_survey, df_llm_survey, df_all_blocks]
):
    stats = calc_recall(dframe, 
                        gold_norm_set, 
                        gold_preprint_set, 
                        gold_non_preprint_set)
    
    stats["Search Block"] = label
    recall_summary.append(stats)

recall_table = pd.DataFrame(recall_summary)[["Search Block", "Number of Papers Retrieved", "Recall (out of 21)", "Recall (journalArticels & other)", "Recall (preprints)"]]
recall_table["Number of Papers Retrieved"] = recall_table["Number of Papers Retrieved"].astype(int)
recall_table

## Recall Rate - Scholar

In [None]:
recall_summary_SS = []
for label, dframe in zip(
    ["LLM Terms", "Survey Terms", "Simulation Terms",
     "LLM + Survey Terms", "Survey + Simulation Terms", "LLM + Simulation Terms", 
     "All Terms"],
    [df_SS_g1, df_SS_g2, df_SS_g3, df_SS_g4, df_SS_g5, df_SS_g6, df_SS_g7]
):
    stats = calc_recall(dframe, 
                        gold_norm_set, 
                        gold_preprint_set, 
                        gold_non_preprint_set)
    
    stats["Search Group"] = label
    recall_summary_SS.append(stats)
    
recall_table_SS = pd.DataFrame(recall_summary_SS)[["Search Group", "Number of Papers Retrieved", "Recall (out of 21)", "Recall (journalArticels & other)", "Recall (preprints)"]]
recall_table_SS["Number of Papers Retrieved"] = recall_table_SS["Number of Papers Retrieved"].astype(int)
recall_table_SS

Unnamed: 0,Search Group,Number of Papers Retrieved,Recall (out of 21),Recall (non-preprints),Recall (preprints)
0,LLM Terms,3000,0.00%,0.00%,0.00%
1,Survey Terms,3000,14.29%,18.18%,10.00%
2,Simulation Terms,3000,0.00%,0.00%,0.00%
3,LLM + Survey Terms,3000,28.57%,27.27%,30.00%
4,Survey + Simulation Terms,3000,19.05%,18.18%,20.00%
5,LLM + Simulation Terms,3000,23.81%,27.27%,20.00%
6,All Terms,2853,57.14%,54.55%,60.00%


In [None]:
# I want to see the list of papers from the gold list that were not found in df_SS_g7
df_SS_g7_norms = set(df_SS_g7["title"].map(normalize_title))
missing_norms = gold_norm_set - df_SS_g7_norms 
missing_titles = [gold_norm_to_orig[n] for n in missing_norms]
missing_preprint_flags = [gold_norm_to_preprint_flag[n] for n in missing_norms]
missing_df = pd.DataFrame({
    "Title": missing_titles,
    "preprint_flag": missing_preprint_flags
})

# filter for non-preprints only
missing_df = missing_df[missing_df["preprint_flag"] == "non-preprint"].reset_index(drop=True)

# add in the abstracts from the gold list into the respective rows of missing_df
missing_df = missing_df.merge(gold_df[["Title", "Abstract Note"]], on="Title", how="left")
missing_df = missing_df.rename(columns={"Abstract Note": "Abstract"})
missing_df

Unnamed: 0,Title,preprint_flag,Abstract
0,"Out of One, Many: Using Language Models to Sim...",non-preprint,We propose and explore the possibility that la...
1,AI–Human Hybrids for Marketing Research: Lever...,non-preprint,The authors’ central premise is that a human–L...
2,Extracting Affect Aggregates from Longitudinal...,non-preprint,This paper proposes temporally aligned Large L...
3,"Vox Populi, Vox AI? Using Large Language Model...",non-preprint,“Synthetic samples” generated by large languag...
4,The Potential and Challenges of Evaluating Att...,non-preprint,Recent advances in Large Language Models (LLMs...


## Recall Rate - Elicit A.I.

In [24]:
# Load Elicit A.I. Search Results (Elicit prompt 1.csv to Elicit prompt 4.csv) each into their respective dataframe
df_Elicit1 = pd.read_csv("data/Elicit prompt 1.csv")
df_Elicit2 = pd.read_csv("data/Elicit prompt 2.csv")
df_Elicit3 = pd.read_csv("data/Elicit prompt 3.csv")
df_Elicit4 = pd.read_csv("data/Elicit prompt 4.csv")

# rename all Title to title
df_Elicit1 = df_Elicit1.rename(columns={"Title": "title"})
df_Elicit2 = df_Elicit2.rename(columns={"Title": "title"})
df_Elicit3 = df_Elicit3.rename(columns={"Title": "title"})
df_Elicit4 = df_Elicit4.rename(columns={"Title": "title"})

In [27]:
# The list of papers that exist in all four Elicit dataframe df1 to df4
common_titles = set(df_Elicit1["title"]).intersection(set(df_Elicit2["title"])).intersection(set(df_Elicit3["title"])).intersection(set(df_Elicit4["title"]))
common_titles

{'Are Large Language Models Chameleons? An Attempt to Simulate Social Surveys',
 'Can Large Language Models Capture Public Opinion about Global Warming? An Empirical Assessment of Algorithmic Fidelity and Bias',
 'Do LLMs Exhibit Human-like Response Biases? A Case Study in Survey Design',
 'Donald Trumps in the Virtual Polls: Simulating and Predicting Public Opinions in Surveys Using Large Language Models',
 'Evaluating the Moral Beliefs Encoded in LLMs',
 'Examining the Feasibility of Large Language Models as Survey Respondents',
 'Frontiers: Can Large Language Models Capture Human Preferences?',
 'Human Preferences in Large Language Model Latent Space: A Technical Analysis on the Reliability of Synthetic Data in Voting Outcome Prediction',
 'Large Language Models Show Human-like Social Desirability Biases in Survey Responses',
 'Large Language Models as Subpopulation Representative Models: A Review',
 'Large language models display human-like social desirability biases in Big Five pe

In [28]:
# calculate recall for df_Elicit1, df_Elicit2, df_Elicit3, df_Elicit4 dataframes with missing titles
elicit_recall_results = {
    "Elicit Prompt 1": calc_recall_with_missing(df_Elicit1, gold_norm_set, gold_preprint_set, gold_non_preprint_set, gold_norm_to_orig),
    "Elicit Prompt 2": calc_recall_with_missing(df_Elicit2, gold_norm_set, gold_preprint_set, gold_non_preprint_set, gold_norm_to_orig),
    "Elicit Prompt 3": calc_recall_with_missing(df_Elicit3, gold_norm_set, gold_preprint_set, gold_non_preprint_set, gold_norm_to_orig),
    "Elicit Prompt 4": calc_recall_with_missing(df_Elicit4, gold_norm_set, gold_preprint_set, gold_non_preprint_set, gold_norm_to_orig),
} 

recall_table_Elicit = pd.concat(elicit_recall_results.values(), keys=elicit_recall_results.keys()).reset_index(level=1, drop=True).reset_index().rename(columns={"index": "Query"})
recall_table_Elicit

Unnamed: 0,Query,Number of Papers Retrieved,Recall (out of 21),Recall (journalArticels & other),Recall (preprints),Missing Articles,Missing Preprint
0,Elicit Prompt 1,104,42.86%,27.27%,60.00%,AI–Human Hybrids for Marketing Research: Lever...,Addressing Systematic Non-response Bias with S...
1,Elicit Prompt 2,104,42.86%,27.27%,60.00%,AI–Human Hybrids for Marketing Research: Lever...,Addressing Systematic Non-response Bias with S...
2,Elicit Prompt 3,104,42.86%,36.36%,50.00%,AI–Human Hybrids for Marketing Research: Lever...,Addressing Systematic Non-response Bias with S...
3,Elicit Prompt 4,104,33.33%,18.18%,50.00%,AI–Human Hybrids for Marketing Research: Lever...,Addressing Systematic Non-response Bias with S...


In [None]:
# calculate recall rates for each dataframe and summarize in a table
recall_summary_Elicit = []
for label, dframe in zip( ["Elicit Prompt 1", "Elicit Prompt 2", "Elicit Prompt 3", "Elicit Prompt 4"],
                          [df_Elicit1, df_Elicit2, df_Elicit3, df_Elicit4]):
    
    # calculate recall with missing titles

    stats = calc_recall(dframe, 
                        gold_norm_set, 
                        gold_preprint_set, 
                        gold_non_preprint_set)
    
    stats["Search Prompt"] = label
    recall_summary_Elicit.append(stats)


recall_table_Elicit = pd.DataFrame(recall_summary_Elicit)[["Search Prompt", "Number of Papers Retrieved", 
                                                           "Recall (out of 21)", "Recall (journalArticels & other)", "Recall (preprints)"]]
recall_table_Elicit["Number of Papers Retrieved"] = recall_table_Elicit["Number of Papers Retrieved"].astype(int)
recall_table_Elicit

ValueError: Must pass 2-d input. shape=(4, 1, 7)

# Screening of Search Results

In [183]:
# Web of Science Search Results
df_WoS_results = pd.read_excel("data/savedrecs (2).xls")
df_WoS_results = df_WoS_results.rename(columns={"Article Title": "title"})

# Semantic Scholar Search Results
df_SS_g7

df_WoS_results

Unnamed: 0,Publication Type,Authors,Book Authors,Book Editors,Book Group Authors,Author Full Names,Book Author Full Names,Group Authors,title,Source Title,...,Web of Science Index,Research Areas,IDS Number,Pubmed Id,Open Access Designations,Highly Cited Status,Hot Paper Status,Date of Export,UT (Unique WOS ID),Web of Science Record
0,C,"Kaiser, C; Kaiser, J; Manewitsch, V; Rau, L; S...",,,ACM,"Kaiser, Carolin; Kaiser, Jakob; Manewitsch, Vl...",,,Simulating Human Opinions with Large Language ...,,...,,,,,,,,,,0
1,J,"Ferreira, G; Amidei, J; Nieto, R; Kaltenbrunne...",,,,"Ferreira, Gregorio; Amidei, Jacopo; Nieto, Rub...",,,How Well Do Simulated Population Samples with ...,,...,,,,,,,,,,0
2,C,"Kane, D; Parke, J; Jo, Y; Bak, J",,"Bouamor, H; Pino, J; Bali, K",,"Kane, Dongjun; Parke, JoonSuk; Jo, Yohan; Bak,...",,,From Values to Opinions: Predicting Human Beha...,,...,,,,,,,,,,0
3,J,"Arora, N; Chakraborty, I; Nishimura, Y",,,,"Arora, Neeraj; Chakraborty, Ishita; Nishimura,...",,,AI-Human Hybrids for Marketing Research: Lever...,,...,,,,,,,,,,0
4,J,"Antal, M; Beder, N",,,,"Antal, Margit; Beder, Norbert",,,Eysenck Personality Questionnaire: A Comparati...,,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,J,"Moscoso, V; Albernaz, AL; Salomao, RDP",,,,"Moscoso, Valdenice; Albernaz, Ana Luisa; Salom...",,,Niche modelling for twelve plant species (six ...,,...,,,,,,,,,,0
120,J,"Domenach, P; Krause, KR; Malmartel, A; Ravaud,...",,,,"Domenach, Paul; Krause, Karolin R.; Malmartel,...",,,Identifying psychosocial and contextual marker...,,...,,,,,,,,,,0
121,J,"Lim, MC; Lukman, KA; Giloi, N; Lim, JF; Salleh...",,,,"Lim, Mei Ching; Lukman, Khamisah Awang; Giloi,...",,,Landscaping Work: Work-related Musculoskeletal...,,...,,,,,,,,,,0
122,J,"King, RJ; Cordon-Rosales, C; Cox, J; Davies, C...",,,,"King, Raymond J.; Cordon-Rosales, Celia; Cox, ...",,,Triatoma dimidiata Infestation in Chagas Disea...,,...,,,,,,,,,,0


In [None]:
# Normalize titles for matching
df_WoS_results['norm_title'] = df_WoS_results['title'].map(normalize_title)
df_SS_g7['norm_title'] = df_SS_g7['title'].map(normalize_title)

# Find intersection of normalized titles
common_norm_titles = set(df_WoS_results['norm_title']).intersection(set(df_SS_g7['norm_title']))

# Filter both dataframes to only those with common titles
df_common = df_WoS_results[df_WoS_results['norm_title'].isin(common_norm_titles)].copy()
df_common = df_common.merge(
    df_SS_g7[df_SS_g7['norm_title'].isin(common_norm_titles)],
    on='norm_title',
    suffixes=('_WoS', '_SS')
)

df_common

Unnamed: 0,Publication Type,Authors,Book Authors,Book Editors,Book Group Authors,Author Full Names,Book Author Full Names,Group Authors,title_WoS,Source Title,...,UT (Unique WOS ID),Web of Science Record,norm_title,paperId,title_SS,year,authors,abstract,url,citationCount
0,J,"Ferreira, G; Amidei, J; Nieto, R; Kaltenbrunne...",,,,"Ferreira, Gregorio; Amidei, Jacopo; Nieto, Rub...",,,How Well Do Simulated Population Samples with ...,,...,,0,how well do simulated population samples with ...,25f383b7a807392696073801959dcc1c1aadd2bb,How Well Do Simulated Population Samples with ...,2025,"Gregorio Ferreira, Jacopo Amidei, Rubén Nieto,...",Background: Advances in artificial intelligenc...,https://www.semanticscholar.org/paper/25f383b7...,0
1,C,"Kane, D; Parke, J; Jo, Y; Bak, J",,"Bouamor, H; Pino, J; Bali, K",,"Kane, Dongjun; Parke, JoonSuk; Jo, Yohan; Bak,...",,,From Values to Opinions: Predicting Human Beha...,,...,,0,from values to opinions predicting human behav...,52e963c40a5083d5403cebf4d4782271aaa06994,From Values to Opinions: Predicting Human Beha...,2023,"Dongjun Kang, Joonsuk Park, Yohan Jo, Jinyeong...",Being able to predict people's opinions on iss...,https://www.semanticscholar.org/paper/52e963c4...,4
2,J,"Bisbee, J; Clinton, JD; Dorff, C; Kenkel, B; L...",,,,"Bisbee, James; Clinton, Joshua D.; Dorff, Cass...",,,Synthetic Replacements for Human Survey Data? ...,,...,,0,synthetic replacements for human survey data t...,58d735a54d3aba79ad3bffbfa2433d8e5ee27313,Synthetic Replacements for Human Survey Data? ...,2024,"James Bisbee, Joshua D. Clinton, C. Dorff, Bre...",\n Large language models (LLMs) offer new rese...,https://www.semanticscholar.org/paper/58d735a5...,74
3,J,"Liu, HJ; Cao, Y; Wu, X; Qiu, C; Gu, JG; Liu, M...",,,,"Liu, Haijiang; Cao, Yong; Wu, Xun; Qiu, Chen; ...",,,Towards realistic evaluation of cultural value...,,...,,0,towards realistic evaluation of cultural value...,3ab59b3d4a4b2e89f7eda93a950eeaa77b37332e,Towards realistic evaluation of cultural value...,2025,"Haijiang Liu, Yong Cao, Xun Wu, Chen Qiu, Jing...",,https://www.semanticscholar.org/paper/3ab59b3d...,2
4,J,"Boelaert, J; Coavoux, S; Ollion, E; Petev, I; ...",,,,"Boelaert, Julien; Coavoux, Samuel; Ollion, Eti...",,,Machine Bias. How Do Generative Language Model...,,...,,0,machine bias how do generative language models...,45f9ea8d0dc1a7e6c56ff6e1f23c8e632687d2a7,Machine Bias. How Do Generative Language Model...,2025,"J. Boelaert, Samuel Coavoux, Étienne Ollion, I...",Generative artificial intelligence (AI) is inc...,https://www.semanticscholar.org/paper/45f9ea8d...,9
5,J,"Qu, Y; Wang, J",,,,"Qu, Yao; Wang, Jue",,,Performance and biases of Large Language Model...,,...,,0,performance and biases of large language model...,e6d14d140c4faaf8f3d9f47e61cc5c6091bccf1e,Performance and Biases of Large Language Model...,2024,"Yao Qu, Jue Wang",,https://www.semanticscholar.org/paper/e6d14d14...,46
6,C,"Nguyen, H; Nguyen, V; López-Fierro, S; Ludovis...",,,ASSOC COMPUTING MACHINERY,"Ha Nguyen; Nguyen, Victoria; Lopez-Fierro, Sar...",,,Simulating Climate Change Discussion with Larg...,,...,,0,simulating climate change discussion with larg...,dd95064d28ee5d123a6a284422bbba3d443f0416,Simulating Climate Change Discussion with Larg...,2024,"Ha Nguyen, Victoria Nguyen, Saríah López-Fierr...",Large language models (LLMs) have shown promis...,https://www.semanticscholar.org/paper/dd95064d...,16
7,J,"Salecha, A; Ireland, ME; Subrahmanya, S; Sedoc...",,,,"Salecha, Aadesh; Ireland, Molly E.; Subrahmany...",,,Large language models display human-like socia...,,...,,0,large language models display human like socia...,8253104f5b1481d8557380d2dc5dab03ff9a7716,Large language models display human-like socia...,2024,"Aadesh Salecha, Molly E. Ireland, Shashanka Su...",Abstract Large language models (LLMs) are beco...,https://www.semanticscholar.org/paper/8253104f...,25
8,J,"Yao, JC; Zhang, HJ; Ou, J; Zuo, DY; Yang, Z; D...",,,,"Yao, Junchi; Zhang, Hongjie; Ou, Jie; Zuo, Din...",,,Social opinions prediction utilizes fusing dyn...,,...,,0,social opinions prediction utilizes fusing dyn...,392de716c8f6610f080ba655e885935c20ac6c73,Social opinions prediction utilizes fusing dyn...,2024,"Junchi Yao, Hongjie Zhang, Jie Ou, Dingyi Zuo,...",In the context where social media emerges as a...,https://www.semanticscholar.org/paper/392de716...,5
9,C,"Hämäläinen, P; Tavast, M; Kunnari, A",,,ACM,"Hamalainen, Perttu; Tavast, Mikke; Kunnari, Anton",,,Evaluating Large Language Models in Generating...,,...,,0,evaluating large language models in generating...,0ffd57884d7957f6b5634b9fa24843dc3759668f,Evaluating Large Language Models in Generating...,2023,"Perttu Hämäläinen, Mikke Tavast, Anton Kunnari",Collecting data is one of the bottlenecks of H...,https://www.semanticscholar.org/paper/0ffd5788...,218


In [185]:
# create a new dataframe that combines papers from both df_WoS_results and df_SS_g7 but remove duplicates based on 'norm_title' column
df_combined = pd.concat([df_WoS_results, df_SS_g7], ignore_index=True).drop_duplicates(subset=['norm_title']).reset_index(drop=True)
df_combined

Unnamed: 0,Publication Type,Authors,Book Authors,Book Editors,Book Group Authors,Author Full Names,Book Author Full Names,Group Authors,title,Source Title,...,Date of Export,UT (Unique WOS ID),Web of Science Record,norm_title,paperId,year,authors,abstract,url,citationCount
0,C,"Kaiser, C; Kaiser, J; Manewitsch, V; Rau, L; S...",,,ACM,"Kaiser, Carolin; Kaiser, Jakob; Manewitsch, Vl...",,,Simulating Human Opinions with Large Language ...,,...,,,0.0,simulating human opinions with large language ...,,,,,,
1,J,"Ferreira, G; Amidei, J; Nieto, R; Kaltenbrunne...",,,,"Ferreira, Gregorio; Amidei, Jacopo; Nieto, Rub...",,,How Well Do Simulated Population Samples with ...,,...,,,0.0,how well do simulated population samples with ...,,,,,,
2,C,"Kane, D; Parke, J; Jo, Y; Bak, J",,"Bouamor, H; Pino, J; Bali, K",,"Kane, Dongjun; Parke, JoonSuk; Jo, Yohan; Bak,...",,,From Values to Opinions: Predicting Human Beha...,,...,,,0.0,from values to opinions predicting human behav...,,,,,,
3,J,"Arora, N; Chakraborty, I; Nishimura, Y",,,,"Arora, Neeraj; Chakraborty, Ishita; Nishimura,...",,,AI-Human Hybrids for Marketing Research: Lever...,,...,,,0.0,ai human hybrids for marketing research levera...,,,,,,
4,J,"Antal, M; Beder, N",,,,"Antal, Margit; Beder, Norbert",,,Eysenck Personality Questionnaire: A Comparati...,,...,,,0.0,eysenck personality questionnaire a comparativ...,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,,,,,,,,,Generating Interpretations of Policy Announcem...,,...,,,,generating interpretations of policy announcem...,00837a426339a384df537eaaac69e52480c8e8b5,2024.0,"Andreas Marfurt, Ashley Thornton, David Sylvan...",Recent advances in language modeling have focu...,https://www.semanticscholar.org/paper/00837a42...,0.0
2915,,,,,,,,,Demystifying diagnosis: an efficient deep lear...,,...,,,,demystifying diagnosis an efficient deep learn...,0081eedf01655a7c541e52c8fb6a04b8da18e9f4,2025.0,"Ahmed Alzahrani, Muhammad Ali Raza, Muhammad Z...","As per a WHO survey conducted in 2023, more th...",https://www.semanticscholar.org/paper/0081eedf...,0.0
2916,,,,,,,,,Usability Testing of ChatGPT Website as a Medi...,,...,,,,usability testing of chatgpt website as a medi...,00798a978fa3f62624668109bb414bb4add1ff32,2023.0,"Harry Ma'ruf, Bayu Rima Aditya, Elis Hernawati...",This study aims to determine the level of usab...,https://www.semanticscholar.org/paper/00798a97...,0.0
2917,,,,,,,,,Artificial Intelligence for Urban Safety: A Ca...,,...,,,,artificial intelligence for urban safety a cas...,0043df60e07f3c5f6d8aece33aa999f036c35c00,2024.0,"Alessandro Marceddu, Massimo Miccoli, Alessand...",Abstract. This study explores the application ...,https://www.semanticscholar.org/paper/0043df60...,0.0
