# Data Input Modules

## DOI Retreival using title fuzzy match and author name

In [3]:
import pandas as pd
import json
import requests
import inputmodules

In [4]:
inputmodules.getdoi("Choi", "Investigating the importance of trust on adopting an autonomous vehicle")

'10.1080/10447318.2015.1070549'

### Programatically from a csv

Make sure the cells above have been run. Make sure encoding is correctly set, or function will not run properly

In [1]:
csv_file = "csv-Calcitonin-set-filtered.csv"
df = pd.read_csv(csv_file, encoding = 'utf-8')
df = df.head(10)
df

NameError: name 'pd' is not defined

In [None]:
df2 = pd.DataFrame()

df2['title'] = df.Title

doi = []
for i in range(len(df)):
    doi.append('')

df2['retreived doi'] = doi

for i in range (0, len(df)):
    author = df['First Author'][i]
    title = df['Title'][i]
    df2['retreived doi'][i] = inputmodules.getdoi(author, title)

In [None]:
df2.to_csv('Title_to_DOI.csv')

## PubMed to DOI

In [None]:
import xmltodict

In [None]:
def pubmedtodoi(id):
    q = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=' + str(id) + '&version=2.0'
    response = requests.get(q)
    val = "Not Found"
    if response.status_code == 200:
        x = response.content.decode('utf-8')
        x = xmltodict.parse(x)
        try:
            x = x['eSummaryResult']['DocumentSummarySet']['DocumentSummary']['ArticleIds']['ArticleId']
            for i in x:
                if i['IdType'] == "doi":
                    val = i['Value']
                    break

        except:
            return val

    return val

In [3]:
inputmodules.pubmedtodoi("35690723")

'10.1186/s10194-022-01431-x'

In [None]:
csv_file = "csv-Calcitonin-set-filtered.csv"
df = pd.read_csv(csv_file, encoding = 'utf-8')
df = df.head(10)
df2 = pd.DataFrame()

df2['pmid'] = df.PMID

doi = []
for i in range(len(df)):
    doi.append('')

df2['doi'] = doi

for i in range (0, len(df)):
    pmid = df['PMID'][i]
    df2['doi'][i] = inputmodules.pubmedtodoi(pmid)

In [None]:
df2.to_csv('PMID_to_DOI.csv')

# BIBTEX + RIS

In [None]:
import bibtexparser

In [None]:
def bibtodf(input):
    y = open(input, "r", encoding = 'utf-8')
    y = y.read()
    y = bibtexparser.loads(y)
    df = pd.DataFrame(y.entries)
    return df

In [None]:
inputmodules.bibtodf("Salt_SRRs.bib")

In [None]:
import rispy

In [None]:
def ristodf(input):
    y = open(input, "r", encoding = "utf-8")
    y = rispy.load(y)
    df = pd.DataFrame(y)
    return df

In [None]:
inputmodules.ristodf("Salt_SRRs.ris")

# Raw Text

In [13]:
def getdoifromcite(citation):
    q = "https://api.crossref.org/works/?query.bibliographic=" + citation
    val = "Not Found"
    response = requests.get(q)
    if response.status_code == 200:
        x = json.loads(response.content.decode('utf-8'))['message']
        x = x['items']
        for i in x:
            print(i['DOI'])
        val = x[0]['DOI']
        
    return val

In [14]:
getdoifromcite()

10.1186/s10194-022-01527-4
10.1007/s11916-022-01083-1
10.1007/s11916-019-0820-y
10.1186/s10194-022-01437-5
10.1007/s11916-019-0826-5
10.1007/s10194-008-0081-3
10.1186/s10194-022-01406-y
10.1186/s10194-022-01404-0
10.1111/head.v62.10
10.1111/head.14395
10.1007/s11916-022-01080-4
10.1186/s10194-021-01371-y
10.1186/s10194-022-01462-4
10.1111/head.14164
10.1186/s10194-022-01525-6
10.1111/j.1526-4610.1983.hed2301010.x
10.1007/s11916-022-01079-x
10.1007/s11916-022-01086-y
10.1007/s11916-019-0824-7
10.1007/s11916-019-0825-6


'10.1186/s10194-022-01527-4'

In [2]:
csv_file = "csv-Calcitonin-set-filtered.csv"
df = pd.read_csv(csv_file, encoding = 'utf-8')
df = df.head(10)
df

df2 = pd.DataFrame()

df2['Citation'] = df.Citation

doi = []
for i in range(len(df)):
    doi.append('')

df2['retreived cite'] = doi

for i in range (0, len(df)):
    citation = df['Citation'][i]
    df2['retreived cite'][i] = inputmodules.getdoifromcite(citation)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df2['retreived cite'][i] = inputmodules.getdoifromcite(citation)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update 

In [3]:
df2.to_csv('Citation_to_DOI.csv')