Prototype for parsing the `cite_as` field with [manubot](https://github.com/greenelab/manubot). See [Issue 88](https://github.com/kipoi/website/issues/77).

I installed manubot from 

In [2]:
import kipoi

In [17]:
import manubot

In [18]:
manubot.__version__

'0.1.0'

In [169]:
df = kipoi.list_models()

In [136]:
df.head()

Unnamed: 0,source,model,version,authors,contributors,doc,type,inputs,targets,veff_score_variants,license,cite_as,trained_on,training_procedure,tags
0,kipoi,extended_coda,0.1,[Author(name='Pang We...,[Author(name='Johnny ...,Single bp resolution ...,keras,[H3K27AC_subsampled],[H3K27ac],False,MIT,https://doi.org/10.10...,Described in https://...,Described in https://...,[Histone modification]
1,kipoi,FactorNet/FOXA1/multi...,0.1,[Author(name='Daniel ...,[Author(name='Ziga Av...,FactorNet: a deep lea...,keras,"[seq, seq_rc]",is_binding_site,False,MIT,https://doi.org/10.11...,,,[DNA binding]
2,kipoi,FactorNet/FOXA1/onePe...,0.1,[Author(name='Daniel ...,[Author(name='Ziga Av...,FactorNet: a deep lea...,keras,"[seq, seq_rc]",is_binding_site,False,MIT,https://doi.org/10.11...,,,[DNA binding]
3,kipoi,FactorNet/JUND/meta_U...,0.1,[Author(name='Daniel ...,[Author(name='Ziga Av...,FactorNet: a deep lea...,keras,"[seq, seq_rc, meta_fe...",is_binding_site,False,MIT,https://doi.org/10.11...,,,[DNA binding]
4,kipoi,FactorNet/JUND/meta_U...,0.1,[Author(name='Daniel ...,[Author(name='Ziga Av...,FactorNet: a deep lea...,keras,"[seq, seq_rc, meta_fe...",is_binding_site,False,MIT,https://doi.org/10.11...,,,[DNA binding]


In [20]:
links = list(df.cite_as.dropna().unique())
links

['https://doi.org/10.1093/bioinformatics/btx243',
 'https://doi.org/10.1101/151274',
 'https://doi.org/10.1093/nar/gkv1249',
 'https://doi.org/10.1186/s13059-017-1189-z, https://doi.org/10.5281/zenodo.1094823',
 'https://doi.org/10.1016/j.cell.2015.09.054',
 'https://doi.org/10.1101/185868',
 'https://doi.org/10.1101/328138',
 'https://doi.org/10.1101/gr.200535.115',
 'https://doi.org/10.1038/nmeth.3547',
 'https://doi.org/10.1093/bioinformatics/btx727',
 'https://doi.org/10.1101/gr.227819.117',
 'https://doi.org/10.1093/bioinformatics/btw142',
 'https://arxiv.org/pdf/1603.09123.pdf',
 'https://doi.org/10.1038/nbt.3300',
 'https://doi.org/10.1089/1066527041410418',
 'https://doi.org/10.1093/nar/gkx177']

In [173]:
import logging

def parse_cite_as(cite_as):
    """Convert a model's `cite_as` field into a prefixed citation id.

    When several links are listed (separated by whitespace, commas,
    semicolons or asterisks) only the first one is used.

    Args:
      cite_as: string holding one or more URLs, e.g.
        "https://doi.org/10.1101/151274".

    Returns:
      "doi:<id>" for https://doi.org/ links, "arxiv:<id>" for
      https://arxiv.org/abs/ and https://arxiv.org/pdf/ links (a trailing
      ".pdf" is dropped), or None when `cite_as` is not a string, is
      blank, or its first link has an unsupported prefix.
    """
    import re

    # Missing values (None, NaN) cannot be split; report "not parseable".
    if not isinstance(cite_as, str):
        return None

    # Take the first non-empty token so that leading whitespace or a
    # leading separator does not yield an empty "first link".
    tokens = [tok for tok in re.split(r"[\s,;*]+", cite_as) if tok]
    if not tokens:
        return None
    url = tokens[0]

    prefixes = (
        ("https://doi.org/", "doi:"),
        ("https://arxiv.org/pdf/", "arxiv:"),
        ("https://arxiv.org/abs/", "arxiv:"),
    )
    for prefix, scheme in prefixes:
        if not url.startswith(prefix):
            continue
        # Strip only the leading prefix; later occurrences are part of the id.
        identifier = url[len(prefix):]
        if prefix == "https://arxiv.org/pdf/" and identifier.endswith(".pdf"):
            identifier = identifier[:-len(".pdf")]
        return scheme + identifier
    return None


def parse_author(author_dict):
    """Build a kipoi Author from a single author entry.

    Args:
      author_dict: container with either both "given" and "family"
        keys, or a "literal" key.

    Returns:
      kipoi.specs.Author named "<given> <family>" when both parts are
      present, otherwise named after the "literal" value.

    Raises:
      ValueError: if neither naming scheme is available.
    """
    has_split_name = "given" in author_dict and "family" in author_dict
    if has_split_name:
        full_name = author_dict['given'] + " " + author_dict['family']
    elif "literal" in author_dict:
        full_name = author_dict['literal']
    else:
        raise ValueError("Author name not found")
    return kipoi.specs.Author(full_name)

def get_authors(cite_as):
    """Look up the authors of the publication referenced by `cite_as`.

    The link is converted with `parse_cite_as`, resolved through
    manubot's `citation_to_citeproc`, and every entry of the resulting
    'author' list is turned into an Author via `parse_author`.

    This is best-effort: any exception raised along the way is logged
    as a warning and an empty list is returned instead.

    Args:
      cite_as: cite_as field of a model.

    Returns:
      List of authors, or [] when they could not be retrieved.
    """
    from manubot.cite import citation_to_citeproc

    try:
        citation_id = parse_cite_as(cite_as)
        csl_item = citation_to_citeproc(citation_id)
        found = []
        for entry in csl_item['author']:
            found.append(parse_author(entry))
    except Exception as e:
        logging.warning("Unable to get the authors for: {}\n{}".format(cite_as, e))
        return []
    return found
    
    
def update_authors(authors, cite_as):
    """Merge the declared authors with those scraped from `cite_as`.

    The scraped list determines the content and order of the result.
    Whenever a scraped author matches a declared one by name (dots are
    ignored in the comparison), the declared object is kept in its
    place. Declared authors with no scraped counterpart are reported
    with a warning and do not appear in the result.

    Args:
      authors: a list of kipoi.specs.Author
      cite_as: cite_as field of a model

    Returns:
      The merged list, or `authors` itself when `cite_as` is None or no
      authors could be scraped.
    """
    def match_by_name(needle, candidates):
        # First candidate whose name equals the needle's, ignoring dots.
        return next(
            (cand for cand in candidates
             if cand.name.replace(".", "") == needle.name.replace(".", "")),
            None,
        )

    scraped_authors = get_authors(cite_as) if cite_as is not None else []

    # Nothing scraped: keep whatever was declared.
    if not scraped_authors:
        return authors

    # Sanity check: every declared author should also be a scraped one.
    for declared in authors:
        if match_by_name(declared, scraped_authors) is None:
            logging.warning("specified author: {} not found in parsed authors:\n {}".format(declared, scraped_authors))

    # Walk the scraped authors in order, preferring the declared object
    # whenever one with the same name exists.
    merged = []
    for scraped in scraped_authors:
        declared = match_by_name(scraped, authors)
        merged.append(scraped if declared is None else declared)
    return merged

In [167]:
get_authors("https://arxiv.org/pdf/1603.09123.pdf")

[Author(name='Byunghan Lee', github=None, email=None),
 Author(name='Junghwan Baek', github=None, email=None),
 Author(name='Seunghyun Park', github=None, email=None),
 Author(name='Sungroh Yoon', github=None, email=None)]

In [168]:
from manubot.cite import citation_to_citeproc

In [164]:
# Check that every collected link can be parsed; fail on the first one that
# cannot, naming the offending link in the error.
for link in links:
    if parse_cite_as(link) is None:
        raise ValueError("Unable to parse cite_as link: {}".format(link))

In [112]:
get_authors(links[0])

[Author(name='Pang Wei Koh', github=None, email=None),
 Author(name='Emma Pierson', github=None, email=None),
 Author(name='Anshul Kundaje', github=None, email=None)]

In [137]:
dfb = df[df.model=="Basenji"]

In [120]:
update_authors(df.iloc[0].authors, df.iloc[0].cite_as)

[Author(name='Pang Wei Koh', github='kohpangwei', email=None),
 Author(name='Emma Pierson', github=None, email=None),
 Author(name='Anshul Kundaje', github='akundaje', email=None)]

In [138]:
update_authors(dfb.iloc[0].authors, dfb.iloc[0].cite_as)

[Author(name='David R. Kelley', github='davek44', email=None),
 Author(name='Yakir A. Reshef', github=None, email=None),
 Author(name='Maxwell Bileschi', github=None, email=None),
 Author(name='David Belanger', github=None, email=None),
 Author(name='Cory Y. McLean', github=None, email=None),
 Author(name='Jasper Snoek', github=None, email=None)]

In [171]:
# Keep one representative row (the first) per distinct `cite_as` value.
# Deduplicating the frame directly avoids passing index labels to the
# positional `.iloc` indexer, which is only correct for a default 0..n-1 index.
dfu = df.drop_duplicates(subset="cite_as")
dfu

Unnamed: 0,source,model,version,authors,contributors,doc,type,inputs,targets,veff_score_variants,license,cite_as,trained_on,training_procedure,tags
0,kipoi,extended_coda,0.1,[Author(name='Pang We...,[Author(name='Johnny ...,Single bp resolution ...,keras,[H3K27AC_subsampled],[H3K27ac],False,MIT,https://doi.org/10.10...,Described in https://...,Described in https://...,[Histone modification]
1,kipoi,FactorNet/FOXA1/multi...,0.1,[Author(name='Daniel ...,[Author(name='Ziga Av...,FactorNet: a deep lea...,keras,"[seq, seq_rc]",is_binding_site,False,MIT,https://doi.org/10.11...,,,[DNA binding]
31,kipoi,pwm_HOCOMOCO/human/FOXA1,0.1,[Author(name='Ivan V....,[Author(name='Ziga Av...,'''Simple PWM-scannin...,keras,seq,pwm_match,True,MIT,https://doi.org/10.10...,,,[DNA binding]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1088,kipoi,DeepBind/Neurospora_c...,0.1,[Author(name='Babak A...,[Author(name='Johnny ...,Abstract: Knowing the...,keras,seq,binding_prob,True,BSD 3-Clause,https://doi.org/10.10...,,,[]
2019,kipoi,CpGenie/MCF_10A_ENCSR...,0.1,[Author(name='Haoyang...,[Author(name='Roman K...,Abstract: DNA methyla...,keras,seq,methylation_prob,True,Apache License v2,https://doi.org/10.10...,RRBS (restricted repr...,RMSprop,[DNA methylation]
2022,kipoi,CpGenie/GM19239_ENCSR...,0.1,[Author(name='Haoyang...,[Author(name='Roman K...,Abstract: DNA methyla...,keras,seq,methylation_prob,True,Apache License v2,https://doi.org/10.10...,RRBS (restricted repr...,RMSprop,[DNA methylation]


In [172]:
# Print the merged author list for each row of `dfu`, separated by a rule.
separator = "-"*40
for _idx, row in dfu.iterrows():
    print(f"model: {row.model}, link: {row.cite_as}")
    merged_authors = update_authors(row.authors, row.cite_as)
    print(merged_authors)
    print(separator)

model: extended_coda, link: https://doi.org/10.1093/bioinformatics/btx243
[Author(name='Pang Wei Koh', github='kohpangwei', email=None), Author(name='Emma Pierson', github=None, email=None), Author(name='Anshul Kundaje', github='akundaje', email=None)]
----------------------------------------
model: FactorNet/FOXA1/multiTask_DGF, link: https://doi.org/10.1101/151274
[Author(name='Daniel Quang', github='daquang', email=None), Author(name='Xiaohui Xie', github=None, email=None)]
----------------------------------------
model: pwm_HOCOMOCO/human/FOXA1, link: https://doi.org/10.1093/nar/gkv1249
[Author(name='Ivan V. Kulakovskiy', github=None, email=None), Author(name='Ilya E. Vorontsov', github=None, email=None), Author(name='Ivan S. Yevshin', github=None, email=None), Author(name='Anastasiia V. Soboleva', github=None, email=None), Author(name='Artem S. Kasianov', github=None, email=None), Author(name='Haitham Ashoor', github=None, email=None), Author(name='Wail Ba-alawi', github=None, ema