In [84]:
import pandas as pd
import numpy as np
from os.path import join
import requests

In [85]:
df = pd.read_csv("data/dr_algorithms.csv")

In [86]:
doi_col_1 = 'DOI or ISBN for method'
doi_col_2 = 'Secondary DOI'
doi_col_3 = 'Tertiary DOI'
isbn_col = 'ISBN'
ss_col = 'Semantic Scholar ID'

title_col = 'Title of method'
year_col = 'Year of method'
author_col = 'Authors of method'

In [87]:
df['Exclude'] = df['Notes'].apply(lambda val: "exclude" in str(val).lower())
df = df.loc[~df['Exclude']]

In [88]:
out_df = pd.DataFrame(columns=['acronym', 'doi', 'ss_id'], index=[], data=[])

In [89]:
for row_i, row in df.iterrows():
    doi_1 = row[doi_col_1]
    doi_2 = row[doi_col_2]
    doi_3 = row[doi_col_3]
    

In [90]:
df.columns.tolist()

['Acronym',
 'Name',
 'DOI or ISBN for method',
 'Secondary DOI',
 'Tertiary DOI',
 'Semantic Scholar ID',
 'ISBN',
 'Notes',
 'Authors of method',
 'Title of method',
 'Year of method',
 'Citation count for method',
 'Source',
 'Exclude']

In [91]:
df.head()

Unnamed: 0,Acronym,Name,DOI or ISBN for method,Secondary DOI,Tertiary DOI,Semantic Scholar ID,ISBN,Notes,Authors of method,Title of method,Year of method,Citation count for method,Source,Exclude
0,AE,Autoencoder,10.1126/science.1127647,,,02552a8b40f3a82a5353f596264db71d899a9b4a,,,,,,,Espadoto et al.,False
1,CCA,Canonical correlation analysis,10.1037/0033-2909.85.2.410,,,,,,,,,,Espadoto et al.,False
2,CHL,Chalmers,10.1109/VISUAL.1996.567787,,,,,,,,,,Espadoto et al.,False
3,CLM,ClassiMap,10.1142/S0218001415510088,,,,,,,,,,Espadoto et al.,False
4,CuCA,Curvilinear component analysis,10.1109/72.554199,,,,,,,,,,Espadoto et al.,False


In [112]:
wide_df = df[['Acronym', doi_col_1, doi_col_2, doi_col_3, ss_col]]

In [113]:
narrow_df = wide_df.melt(id_vars='Acronym').dropna(subset=['value'])
narrow_df = narrow_df.reset_index(drop=True)
narrow_df

Unnamed: 0,Acronym,variable,value
0,AE,DOI or ISBN for method,10.1126/science.1127647
1,CCA,DOI or ISBN for method,10.1037/0033-2909.85.2.410
2,CHL,DOI or ISBN for method,10.1109/VISUAL.1996.567787
3,CLM,DOI or ISBN for method,10.1142/S0218001415510088
4,CuCA,DOI or ISBN for method,10.1109/72.554199
...,...,...,...
105,SDR,Semantic Scholar ID,02b8ee23d604da1271d6af0cbd08c8c0db2098bb
106,SMA,Semantic Scholar ID,45e288d93a674b10009ff0be9b7848b8d22493c8
107,SNE,Semantic Scholar ID,14d46c6396837986bb4b9a14024cb64797b8c6c0
108,T-SNE,Semantic Scholar ID,1c46943103bd7b7a2c7be86859995a4144d1938b


In [114]:
def get_ss_query_val(row):
    if row['variable'] == ss_col:
        return row['value']
    doi_val = row['value']
    doi_lower = doi_val.lower()
    if 'arxiv' in doi_lower:
        query_val = f"arXiv:{doi_val[doi_lower.index('arxiv')+6:]}"
    else:
        query_val = doi_val

    return query_val

In [115]:
narrow_df['query_val'] = narrow_df.apply(get_ss_query_val, axis='columns')

In [116]:
narrow_df['ss_id'] = np.nan

In [117]:
paper_query_vals = narrow_df['query_val'].tolist()

In [118]:
r_ids = requests.post(
    'https://api.semanticscholar.org/graph/v1/paper/batch',
    params={'fields': 'citationCount,title'},
    json={"ids": paper_query_vals}
)

In [119]:
response_vals = r_ids.json()

In [120]:
for i, r_val in enumerate(response_vals):
    if r_val is not None:
        narrow_df.at[i, 'ss_id'] = r_val['paperId']

  narrow_df.at[i, 'ss_id'] = r_val['paperId']


In [121]:
# Assume we have manually identified SS IDs for DOIs that return None
narrow_df = narrow_df.dropna(subset=['ss_id'])

In [122]:
narrow_df.to_csv('data/ss_id.csv')