In [113]:
import pandas as pd
import numpy as np
from os.path import join
import requests

In [114]:
# Load the table of algorithms and their identifiers.
# The path is relative to the notebook's working directory.
df = pd.read_csv("data/dr_algorithms-2.csv")

In [115]:
# Names of the columns in the input table, kept in one place so later cells
# do not repeat the raw header strings.

# Up to three DOI columns per row.
doi_col_1 = 'DOI or ISBN for method'
doi_col_2 = 'Secondary DOI'
doi_col_3 = 'Tertiary DOI'

# "Single" identifier columns; these are the ones the query builders below read.
single_doi_col = 'Single DOI'
single_ssid_col = 'Single Semantic Scholar ID'
single_wos_col = 'Single WOS UT'

# Other identifier columns.
isbn_col = 'ISBN'
ss_col = 'Semantic Scholar ID'

# Bibliographic metadata columns.
title_col = 'Title of method'
year_col = 'Year of method'
author_col = 'Authors of method'

In [116]:
# Flag rows whose free-text Notes mention "exclude" (case-insensitive; missing
# notes stringify to "nan" and therefore never match), then keep only the rest.
notes_lowercased = df['Notes'].map(lambda note: str(note).lower())
df['Exclude'] = notes_lowercased.map(lambda text: "exclude" in text)
df = df.loc[~df['Exclude']]

In [117]:
# Show the column names available after filtering.
df.columns.tolist()

['Acronym',
 'Name',
 'DOI or ISBN for method',
 'Secondary DOI',
 'Tertiary DOI',
 'Semantic Scholar ID',
 'Single DOI',
 'Single Semantic Scholar ID',
 'Single WOS UT',
 'ISBN',
 'Notes',
 'Caveats',
 'WOS not found',
 'Authors of method',
 'Title of method',
 'Year of method',
 'Citation count for method',
 'Source',
 'Exclude']

In [118]:
def get_ss_query_val(row):
    """Build the Semantic Scholar lookup key for one row of the table.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must be indexable by ``single_doi_col`` and ``single_ssid_col``.

    Returns
    -------
    The Semantic Scholar ID unchanged when it is present; otherwise the DOI
    prefixed with ``DOI:``; otherwise NaN when both are missing.
    """
    doi, ssid = row[single_doi_col], row[single_ssid_col]

    # The explicit Semantic Scholar ID wins over the DOI when both exist.
    if pd.isna(ssid):
        return np.nan if pd.isna(doi) else f"DOI:{doi}"
    return ssid

In [119]:
# Compute the Semantic Scholar lookup key for every row (NaN where no identifier exists).
df['ss_query_val'] = df.apply(get_ss_query_val, axis='columns')

In [120]:
def get_wos_query_val(row):
    """Build the Web of Science query term for one row of the table.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must be indexable by ``single_doi_col`` and ``single_wos_col``.

    Returns
    -------
    ``UT=<value>`` when the WOS identifier is present; otherwise
    ``DO=<doi>``; otherwise NaN when both are missing.
    """
    doi, wos_ut = row[single_doi_col], row[single_wos_col]

    # The explicit WOS identifier wins over the DOI when both exist.
    if pd.isna(wos_ut):
        return np.nan if pd.isna(doi) else f"DO={doi}"
    return f"UT={wos_ut}"

In [121]:
# Compute the Web of Science query term for every row (NaN where no identifier exists).
df['wos_query_val'] = df.apply(get_wos_query_val, axis='columns')

In [122]:
# Renumber rows 0..n-1 after the exclusion filter: a later cell writes API results
# back with df.at[i, ...] where i is the position in the response list, so index
# labels must equal row positions.
df = df.reset_index(drop=True)

In [123]:
# Lookup keys in row order; this list is sent as the "ids" payload of the batch request.
paper_query_vals = df['ss_query_val'].tolist()
paper_query_vals

['02552a8b40f3a82a5353f596264db71d899a9b4a',
 'DOI:10.1037/0033-2909.85.2.410',
 'DOI:10.1109/VISUAL.1996.567787',
 'DOI:10.1142/S0218001415510088',
 'DOI:10.1109/72.554199',
 'DOI:10.1016/j.acha.2006.04.006',
 'DOI:10.1016/j.physrep.2004.03.006',
 'DOI:10.2307/1412107',
 'DOI:10.1057/palgrave.ivs.9500054',
 'd86648cb3ec497347d6e918116003b0e42910694',
 'DOI:10.1162/089976600300014980',
 '9193b90a944ecf4ceb3432e35caf3d391830a4b0',
 'DOI:10.1162/089976698300017953',
 'DOI:10.1016/0165-1684(91)90079-X',
 'DOI:10.1109/ISCAS.1999.777510',
 'DOI:10.1017/CBO9780511624148.005',
 'DOI:10.1117/12.650880',
 'DOI:10.1126/science.290.5500.2319',
 'DOI:10.1109/IGARSS.2006.144',
 'DOI:10.1109/TPAMI.2009.100',
 'DOI:10.1109/TVCG.2015.2464797',
 'DOI:10.1109/TVCG.2011.220',
 'DOI:10.1111/j.1469-1809.1936.tb02137.x',
 '9d16c547d15a08091e68c86a99731b14366e3f0d',
 'a0ac8c6271b4b10dcee2a0aa68f5284ee4b306df',
 'DOI:10.1126/science.290.5500.2323',
 'DOI:10.1073/pnas.1031596100',
 '0b060fdbd92cbcc66b383bcaa9b

In [124]:
# Resolve all lookup keys to Semantic Scholar records with one batch request.
# `fields` selects which attributes are returned for each paper.
r_ids = requests.post(
    'https://api.semanticscholar.org/graph/v1/paper/batch',
    params={'fields': 'citationCount,title,corpusId'},
    json={"ids": paper_query_vals},
    # Without a timeout, requests can wait indefinitely on a stalled connection.
    timeout=60,
)
# Stop here on an HTTP error (e.g. rate limiting or a rejected id) instead of
# letting an error payload flow into the cells that parse the response.
r_ids.raise_for_status()

In [125]:
# Decode the JSON body; the next cell iterates over it and treats None entries as "not found".
response_vals = r_ids.json()

In [126]:
# Copy the returned identifiers back onto the table.
# Assumes response_vals is positionally aligned with the rows of df (whose index
# is 0..n-1); entries that are None are skipped, leaving those cells unset.
for row_pos, paper in enumerate(response_vals):
    if paper is None:
        continue
    df.at[row_pos, 'ss_id'] = paper['paperId']
    df.at[row_pos, 'ss_corpus_id'] = paper['corpusId']

In [128]:
# Sanity check: list rows that did not receive a corpus id (an empty result means every row matched).
df.loc[df['ss_corpus_id'].isna()]

Unnamed: 0,Acronym,Name,DOI or ISBN for method,Secondary DOI,Tertiary DOI,Semantic Scholar ID,Single DOI,Single Semantic Scholar ID,Single WOS UT,ISBN,...,Authors of method,Title of method,Year of method,Citation count for method,Source,Exclude,ss_query_val,wos_query_val,ss_id,ss_corpus_id


In [132]:
# Store corpus ids as plain digit strings (e.g. "262637400").
# NOTE: astype(int) raises if any value is still NaN, so this relies on the
# previous check having shown no unmatched rows.
df['ss_corpus_id'] = df['ss_corpus_id'].astype(int).astype(str)

In [133]:
# Persist the table with the resolved identifiers.
# The row index is written as an unnamed first column because index=False is not passed.
df.to_csv('data/dr_algorithms_with_ids.csv')

In [134]:
# Preview the first rows, including the newly added identifier columns.
df.head()

Unnamed: 0,Acronym,Name,DOI or ISBN for method,Secondary DOI,Tertiary DOI,Semantic Scholar ID,Single DOI,Single Semantic Scholar ID,Single WOS UT,ISBN,...,Authors of method,Title of method,Year of method,Citation count for method,Source,Exclude,ss_query_val,wos_query_val,ss_id,ss_corpus_id
0,AE,Autoencoder,10.1126/science.1127647,,,02552a8b40f3a82a5353f596264db71d899a9b4a,10.1126/science.1127647,02552a8b40f3a82a5353f596264db71d899a9b4a,,,...,,,,,Espadoto et al.,False,02552a8b40f3a82a5353f596264db71d899a9b4a,DO=10.1126/science.1127647,02552a8b40f3a82a5353f596264db71d899a9b4a,262637400
1,CCA,Canonical correlation analysis,10.1037/0033-2909.85.2.410,,,,10.1037/0033-2909.85.2.410,,,,...,,,,,Espadoto et al.,False,DOI:10.1037/0033-2909.85.2.410,DO=10.1037/0033-2909.85.2.410,085049bec04020baecdb17bec196442a4abaedab,144661565
2,CHL,Chalmers,10.1109/VISUAL.1996.567787,,,,10.1109/VISUAL.1996.567787,,,,...,,,,,Espadoto et al.,False,DOI:10.1109/VISUAL.1996.567787,DO=10.1109/VISUAL.1996.567787,0f73d5a2669143d6fcbf4db0a4a9495bab115eb8,789498
3,CLM,ClassiMap,10.1142/S0218001415510088,,,,10.1142/S0218001415510088,,,,...,,,,,Espadoto et al.,False,DOI:10.1142/S0218001415510088,DO=10.1142/S0218001415510088,8d70b1981fc2947867a33f20256e88dc87fbb422,44333344
4,CuCA,Curvilinear component analysis,10.1109/72.554199,,,,10.1109/72.554199,,,,...,,,,,Espadoto et al.,False,DOI:10.1109/72.554199,DO=10.1109/72.554199,bfb5410593385a279cc62844bd395f744dec2302,6520113


In [135]:
# Web of Science query terms in row order; entries may be NaN for rows without identifiers.
wos_query_vals = df['wos_query_val'].tolist()

In [136]:
# Combine all available terms into one OR-query string, dropping missing entries.
" OR ".join(term for term in wos_query_vals if not pd.isna(term))

'DO=10.1126/science.1127647 OR DO=10.1037/0033-2909.85.2.410 OR DO=10.1109/VISUAL.1996.567787 OR DO=10.1142/S0218001415510088 OR DO=10.1109/72.554199 OR DO=10.1016/j.acha.2006.04.006 OR DO=10.1016/j.physrep.2004.03.006 OR DO=10.2307/1412107 OR DO=10.1057/palgrave.ivs.9500054 OR DO=10.1145/568271.223812 OR DO=10.1162/089976600300014980 OR UT=WOS:000225309500042 OR DO=10.1162/089976698300017953 OR DO=10.1016/0165-1684(91)90079-X OR DO=10.1109/ISCAS.1999.777510 OR DO=10.1017/CBO9780511624148.005 OR DO=10.1117/12.650880 OR DO=10.1126/science.290.5500.2319 OR DO=10.1109/IGARSS.2006.144 OR DO=10.1109/TPAMI.2009.100 OR DO=10.1109/TVCG.2015.2464797 OR DO=10.1109/TVCG.2011.220 OR DO=10.1111/j.1469-1809.1936.tb02137.x OR UT=WOS:000180520100073 OR DO=10.1126/science.290.5500.2323 OR DO=10.1073/pnas.1031596100 OR UT=WOS:000270824200005 OR DO=10.1016/j.neucom.2014.07.071 OR UT=WOS:000225309500020 OR DO=10.1109/TVCG.2007.70443 OR DO=10.1007/S11741-004-0051-1 OR DO=10.1016/j.neucom.2006.11.007 OR DO=

In [137]:
# TODO: Use the query string above to run an Advanced Search on the Web of Science (WOS) website.
# Then add all results to a new "Marked List".
# Then download the Marked List as an Excel file containing the WOS: identifier for each paper.
# Join these IDs back into the table of identifiers.
# Check which references were not found in Web of Science. For example, UMAP will not be found because it is a preprint and is therefore not included in the WOS "Core Collection".