In [125]:
import pandas as pd
import requests
import json
import numpy as np

In [2]:
def pretty_json(json_object): print(json.dumps(json_object, indent=2))

In [3]:
api_key = ""
with open ("apikey.txt", "r") as apikey_file:
    api_key=apikey_file.readlines()[0].strip()
api_endpoint = "https://api.core.ac.uk/v3/"

In [4]:
def query_api(url_fragment, query,is_scroll=False, limit=100, scrollId=None):
    headers={"Authorization":"Bearer "+api_key}
    query = {"q":query, "limit":limit}
    if not is_scroll:
        response = requests.post(f"{api_endpoint}{url_fragment}",data = json.dumps(query), headers=headers)
    elif not scrollId:
        query["scroll"]="true"
        response = requests.post(f"{api_endpoint}{url_fragment}",data = json.dumps(query),headers=headers)
    else:
        query["scrollId"]=scrollId
        response = requests.post(f"{api_endpoint}{url_fragment}",data = json.dumps(query),headers=headers)
    if response.status_code ==200:
        return response.json(), response.elapsed.total_seconds()
    else:
        print(f"Error code {response.status_code}, {response.content}")

def scroll(search_url, query, extract_info_callback=None):
    allresults = []
    count = 0
    scrollId=None
    while True:
        result, elapsed =query_api(search_url, query, is_scroll=True, scrollId=scrollId)
        scrollId=result["scrollId"]
        totalhits = result["totalHits"]
        result_size = len(result["results"])
        if result_size==0:
            break
        for hit in result["results"]:
            if extract_info_callback:
              allresults.append(extract_info_callback(hit))
            else:
              allresults.append(hit)
        count+=result_size
        print(f"{count}/{totalhits} {elapsed}s")
    return allresults

In [5]:
results , elapsed = query_api("search/works", "doi:10.1371/journal.pone.0019981")

pretty_json(results)

{
  "totalHits": 1,
  "limit": 100,
  "offset": 0,
  "scrollId": null,
  "results": [
    {
      "acceptedDate": "2011-05-24T00:00:00",
      "arxivId": null,
      "authors": [
        {
          "name": "A Kacelnik"
        },
        {
          "name": "A Papoulis"
        },
        {
          "name": "AD Briscoe"
        },
        {
          "name": "AJ Moore"
        },
        {
          "name": "B Luttbeg"
        },
        {
          "name": "BG Svensson"
        },
        {
          "name": "BM Dowds"
        },
        {
          "name": "CIM Healey"
        },
        {
          "name": "EB Mallon"
        },
        {
          "name": "EB Mallon"
        },
        {
          "name": "EJH Robinson"
        },
        {
          "name": "EJH Robinson"
        },
        {
          "name": "EJH Robinson"
        },
        {
          "name": "Elva J. H. Robinson"
        },
        {
          "name": "EO Wilson"
        },
        {
          "name": "F Ra

In [6]:
results , elapsed = query_api("search/outputs", "doi:10.1371/journal.pone.0019981")

pretty_json(results)

{
  "totalHits": 9,
  "limit": 100,
  "offset": 0,
  "scrollId": null,
  "results": [
    {
      "acceptedDate": "",
      "authors": [
        {
          "name": "Robinson, Elva J. H."
        },
        {
          "name": "Franks, Nigel R."
        },
        {
          "name": "Ellis, Samuel"
        },
        {
          "name": "Okuda, Saki"
        },
        {
          "name": "Marshall, James A. R."
        }
      ],
      "contributors": [],
      "createdDate": "2012-07-08T14:39:10+01:00",
      "dataProvider": {
        "id": 150,
        "name": "PubMed Central",
        "url": "https://api.core.ac.uk/v3/data-providers/150",
        "logo": "https://api.core.ac.uk/data-providers/150/logo"
      },
      "depositedDate": "",
      "documentType": "research",
      "doi": "10.1371/journal.pone.0019981",
      "downloadUrl": "https://core.ac.uk/download/8569611.pdf",
      "fullText": "",
      "id": 8569611,
      "identifiers": {
        "doi": "10.1371/journal.pone.0

In [7]:
results.keys()

dict_keys(['totalHits', 'limit', 'offset', 'scrollId', 'results', 'tooks', 'esTook'])

In [8]:
results['results']

[{'acceptedDate': '',
  'authors': [{'name': 'Robinson, Elva J. H.'},
   {'name': 'Franks, Nigel R.'},
   {'name': 'Ellis, Samuel'},
   {'name': 'Okuda, Saki'},
   {'name': 'Marshall, James A. R.'}],
  'contributors': [],
  'createdDate': '2012-07-08T14:39:10+01:00',
  'dataProvider': {'id': 150,
   'name': 'PubMed Central',
   'url': 'https://api.core.ac.uk/v3/data-providers/150',
   'logo': 'https://api.core.ac.uk/data-providers/150/logo'},
  'depositedDate': '',
  'documentType': 'research',
  'doi': '10.1371/journal.pone.0019981',
  'downloadUrl': 'https://core.ac.uk/download/8569611.pdf',
  'fullText': '',
  'id': 8569611,
  'identifiers': {'doi': '10.1371/journal.pone.0019981',
   'oai': 'oai:pubmedcentral.nih.gov:3101226'},
  'title': 'A Simple Threshold Rule Is Sufficient to Explain Sophisticated Collective Decision-Making',
  'language': {'code': 'en', 'name': 'English'},
  'publishedDate': '2011-05-24T01:00:00+01:00',
  'publisher': "'Public Library of Science (PLoS)'",
  're

In [172]:
def get_entity(url_fragment):
    headers={"Authorization":"Bearer "+api_key}
    response = requests.get(api_endpoint + url_fragment, headers=headers)
    if response.status_code == 200:
        return response.json(), response.elapsed.total_seconds()
    else:
        print(f"Error code {response.status_code}, {response.content}")

In [173]:
data_provider, elapsed = get_entity("outputs/5225915")
pretty_json(data_provider)

{
  "acceptedDate": "2011-05-24T00:00:00+01:00",
  "authors": [
    {
      "name": "Robinson, Elva J. H."
    },
    {
      "name": "Franks, Nigel R."
    },
    {
      "name": "Ellis, Samuel"
    },
    {
      "name": "Okuda, Saki"
    },
    {
      "name": "Marshall, James A. R."
    }
  ],
  "contributors": [],
  "createdDate": "2012-06-28T04:00:54+01:00",
  "dataProvider": {
    "id": 140,
    "name": "White Rose Research Online",
    "url": "https://api.core.ac.uk/v3/data-providers/140",
    "logo": "https://api.core.ac.uk/data-providers/140/logo"
  },
  "depositedDate": "2011-05-24T00:00:00+01:00",
  "documentType": "unknown",
  "doi": "10.1371/journal.pone.0019981",
  "downloadUrl": "https://core.ac.uk/download/5225915.pdf",
  "fullText": "This is a repository copy of A Simple Threshold Rule Is Sufficient to Explain Sophisticated Collective Decision-Making.White Rose Research Online URL for this paper:http://eprints.whiterose.ac.uk/63768/Version: Published VersionArticle:Ro

In [11]:
type(data_provider)

dict

In [13]:
data_provider

dict_keys(['acceptedDate', 'authors', 'contributors', 'createdDate', 'dataProvider', 'depositedDate', 'documentType', 'doi', 'downloadUrl', 'fullText', 'id', 'identifiers', 'title', 'language', 'publishedDate', 'publisher', 'references', 'sourceFulltextUrls', 'updatedDate', 'yearPublished', 'links', 'abstract', 'tags', 'fulltextStatus', 'subjects', 'oai', 'deleted', 'disabled', 'journals', 'repositories', 'repositoryDocument', 'urls', 'lastUpdate'])

In [7]:
data_provider['publishedDate']

'2011-05-24T00:00:00+01:00'

In [23]:
data_provider

{'acceptedDate': '2011-05-24T00:00:00+01:00',
 'authors': [{'name': 'Robinson, Elva J. H.'},
  {'name': 'Franks, Nigel R.'},
  {'name': 'Ellis, Samuel'},
  {'name': 'Okuda, Saki'},
  {'name': 'Marshall, James A. R.'}],
 'contributors': [],
 'createdDate': '2012-06-28T04:00:54+01:00',
 'dataProvider': {'id': 140,
  'name': 'White Rose Research Online',
  'url': 'https://api.core.ac.uk/v3/data-providers/140',
  'logo': 'https://api.core.ac.uk/data-providers/140/logo'},
 'depositedDate': '2011-05-24T00:00:00+01:00',
 'documentType': 'unknown',
 'doi': '10.1371/journal.pone.0019981',
 'downloadUrl': 'https://core.ac.uk/download/5225915.pdf',
 'fullText': 'This is a repository copy of A Simple Threshold Rule Is Sufficient to Explain Sophisticated Collective Decision-Making.White Rose Research Online URL for this paper:http://eprints.whiterose.ac.uk/63768/Version: Published VersionArticle:Robinson, Elva J. H. orcid.org/0000-0003-4914-9327, Franks, Nigel R., Ellis, Samuel et al.(2 more author

In [9]:
data_provider, elapsed = get_entity("outputs/81800077")

In [34]:
data_provider['references']

[]

In [10]:
data_provider

{'acceptedDate': '2014-07-29T00:00:00+01:00',
 'authors': [],
 'contributors': [],
 'createdDate': '2017-05-03T20:50:51+01:00',
 'dataProvider': {'id': 2612,
  'name': 'Springer - Publisher Connector',
  'url': 'https://api.core.ac.uk/v3/data-providers/2612',
  'logo': 'https://api.core.ac.uk/data-providers/2612/logo'},
 'depositedDate': '2014-01-01T00:00:00+00:00',
 'documentType': 'research',
 'doi': '10.12942/lrsp-2014-4',
 'downloadUrl': 'https://core.ac.uk/download/pdf/81800077.pdf',
 'id': 81800077,
 'identifiers': {'doi': '10.12942/lrsp-2014-4', 'oai': None},
 'title': 'Coronal Loops: Observations and Modeling of Confined Plasma',
 'language': None,
 'publishedDate': '',
 'publisher': 'Springer',
 'references': [],
 'sourceFulltextUrls': ['file:///data/core-remote/dit/data/Springer-OA/pdf/ec4/aHR0cDovL2xpbmsuc3ByaW5nZXIuY29tLzEwLjEyOTQyL2xyc3AtMjAxNC00LnBkZg==.pdf'],
 'updatedDate': '',
 'yearPublished': '2014',
 'links': [{'type': 'download',
   'url': 'https://core.ac.uk/downl

In [301]:
df = pd.read_csv('/home/roland/Projects/JP_citation_classification/feature_scraping/data/ACT2_dataset.tsv', sep='\t')

In [14]:
df.cited_abstract.isna().sum()

960

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   unique_id                 4000 non-null   object 
 1   core_id                   4000 non-null   int64  
 2   citation_offset           3995 non-null   float64
 3   total_doc_length          4000 non-null   int64  
 4   section_info              4000 non-null   object 
 5   citing_title              4000 non-null   object 
 6   citing_author             4000 non-null   object 
 7   citing_publication_info   3705 non-null   object 
 8   citing_abstract           3992 non-null   object 
 9   cited_title               4000 non-null   object 
 10  cited_author              4000 non-null   object 
 11  cited_abstract            3040 non-null   object 
 12  cited_doi                 3646 non-null   object 
 13  cited_publication_date    3888 non-null   object 
 14  cited_pu

In [17]:
cited_infos = df[['cited_title', 'cited_author', 'cited_abstract', 'cited_doi', 'cited_publication_date', 'cited_publication_info']]

In [22]:
cited_infos.cited_title[0]

'The English Village Community Examined in its Relation to the Manorial and Tribal Systems and to the Common or Open Field System of Husbandry'

In [24]:
def search_open_alex(search_value, search_parameter):
    url = 'https://api.openalex.org/works?filter=' + search_parameter +'.search:' + search_value
    try:
        open_alex_call = requests.get(url=url).json()['results'][0]
        result = open_alex_call
        return result
    except:
        return 0

In [26]:
x = search_open_alex('The English Village Community Examined in its Relation to the Manorial and Tribal Systems and to the Common or Open Field System of Husbandry', 'display_name')

In [27]:
type(x)

dict

In [29]:
cited_infos['oa'] = cited_infos.cited_title.apply(lambda x: search_open_alex(search_value=x, search_parameter = 'display_name'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cited_infos['oa'] = cited_infos.cited_title.apply(lambda x: search_open_alex(search_value=x, search_parameter = 'display_name'))


In [32]:
rest = cited_infos.loc[cited_infos.oa == 0]

In [43]:
rest['oa'] = rest.cited_doi.apply(lambda x: search_open_alex(search_value=x, search_parameter = 'doi'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rest['oa'] = rest.cited_doi.apply(lambda x: search_open_alex(search_value=x, search_parameter = 'doi'))


In [45]:
rest.loc[rest.oa == 0]

Unnamed: 0,cited_title,cited_author,cited_abstract,cited_doi,cited_publication_date,cited_publication_info,oa
2,Wien im Bild historischer Karten Die Entwicklu...,"[""Ferdinand Opll""]",,10.7767/boehlau.9783205114741,29/06/2004,Jahrhunderts,0
5,Les caract res originaux de l histoire rurale ...,"[""M Bloch""]",,,1931,Les caractères originaux de l'histoire rurale ...,0
6,Methods for spatial and temporal land use and ...,"[""Timothy W. Foresman"", ""Steward T. A. Pickett...",Understanding contemporary urban landscapes re...,10.1023/a:1018583729727,1997,Urban Ecosystems,0
7,Landschaft hat Geschichte Historische Entwickl...,"[""Forschungsinitiative Umweltgeschichte"", ""K E...",,,1999,CD-ROM,0
8,Channel planform change on the river dee meand...,"[""A. M. Gurnell"", ""S. R. Downward"", ""R. Jones""]",Channel planform change was investigated along...,10.1002/rrr.3450090402,1994-12,Regulated Rivers: Research & Management,0
...,...,...,...,...,...,...,...
3990,"Floods, fights and a fluid river: the Viennese...","[""Severin Hohensinner"", ""Bernhard Lager"", ""Chr...",Alluvial rivers can show unpredictable channel...,10.1007/s12685-013-0074-2,2013-7,Water History,0
3993,"ANABRANCHING RIVERS: THEIR CAUSE, CHARACTER AN...","[""GERALD C. NANSON"", ""A. DAVID KNIGHTON""]",,10.1002/(sici)1096-9837(199603)21:3<217::aid-e...,1996-3,Earth Surface Processes and Landforms,0
3994,Wien und die fr he Donaukartographie,"[""F Slezak""]",,10.1515/9783050074597-010,31/12/1997,Anthropologie und Geschichte,0
3995,Modelling channel evolution and floodplain mor...,"[""A D Howard""]",,,1996,Modelling channel evolution and floodplain mor...,0


In [52]:
cited_infos.oa[1]

{'id': 'https://openalex.org/W2332335873',
 'doi': 'https://doi.org/10.7767/dnrm.1977.22.3.167',
 'title': 'Die italienischen Begründer der Wiener Donaukartographie',
 'display_name': 'Die italienischen Begründer der Wiener Donaukartographie',
 'relevance_score': 279.1719,
 'publication_year': 1977,
 'publication_date': '1977-12-01',
 'ids': {'openalex': 'https://openalex.org/W2332335873',
  'doi': 'https://doi.org/10.7767/dnrm.1977.22.3.167',
  'mag': '2332335873'},
 'language': 'en',
 'primary_location': {'is_oa': False,
  'landing_page_url': 'https://doi.org/10.7767/dnrm.1977.22.3.167',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S4210205521',
   'display_name': 'Der Donauraum',
   'issn_l': '0012-5415',
   'issn': ['0012-5415', '2307-289X'],
   'is_oa': False,
   'is_in_doaj': False,
   'host_organization': 'https://openalex.org/P4310321073',
   'host_organization_name': 'Böhlau Verlag',
   'host_organization_lineage': ['https://openalex.org/P4310320561',
    'https

In [54]:
cited_infos['abstract_inverted_index'] = df['my_column'].apply(lambda x: extract_abstract_inverted_index(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cited_infos['abstract_inverted_index'] = cited_infos['oa'].apply(lambda x: extract_abstract_inverted_index(x))


In [57]:
cited_infos.abstract_inverted_index.isna().sum()

1297

In [83]:
cited_infos.loc[(cited_infos.cited_abstract.isna() == True) & (cited_infos.oa_abstract.isna() == False)]

Unnamed: 0,cited_title,cited_author,cited_abstract,cited_doi,cited_publication_date,cited_publication_info,oa,abstract_inverted_index,oa_title,oa_abstract
4,From memory to written record,"[""M T Clanchy""]",,,1993,From memory to written record,"{'id': 'https://openalex.org/W2071014039', 'do...","{'Introduction': [0], 'Part': [1, 36], 'I': [2...","From Memory to Written Record: England, 1066-1307",Introduction Part I The Making of Records 1 Me...
18,An agent-based model of collective nest choice...,"[""Stephen C. Pratt"", ""David J.T. Sumpter"", ""Ea...",,10.1016/j.anbehav.2005.01.022,2005-11,Animal Behaviour,"{'id': 'https://openalex.org/W2051178235', 'do...","{'Colonies': [0], 'of': [1, 14, 21, 45, 76, 10...",An agent-based model of collective nest choice...,Colonies of the ant Temnothorax (formerly Lept...
29,Consensus decision making in animals,"[""Larissa Conradt"", ""Timothy J. Roper""]",,10.1016/j.tree.2005.05.008,2005-8,Trends in Ecology & Evolution,"{'id': 'https://openalex.org/W2119382550', 'do...","{'Individual': [0, 145], 'animals': [1, 146, 4...",Consensus decision making in animals,Individual animals routinely face decisions th...
35,Rationality in collective decision-making by a...,"[""Susan C. Edwards"", ""Stephen C. Pratt""]",,10.1098/rspb.2009.0981,22/07/2009,Proceedings of the Royal Society B: Biological...,"{'id': 'https://openalex.org/W2155875500', 'do...","{'Economic': [0], 'models': [1], 'of': [2, 35,...",Rationality in collective decision-making by a...,Economic models of animal behaviour assume tha...
39,The effect of prior experience on nest site ev...,"[""Christiane I.M. Healey"", ""Stephen C. Pratt""]",,10.1016/j.anbehav.2008.02.016,2008-9,Animal Behaviour,"{'id': 'https://openalex.org/W2102592156', 'do...","{'Animals': [0], 'are': [1], 'expected': [2], ...",The effect of prior experience on nest site ev...,Animals are expected to follow decision-making...
...,...,...,...,...,...,...,...,...,...,...
3985,The Origin of Stories evolution Cognition and ...,"[""B Boyd""]",,,2009,"The Origin of Stories: evolution, Cognition, a...","{'id': 'https://openalex.org/W1604299710', 'do...","{'A': [0], 'century': [1], 'and': [2, 36, 53, ...","On the origin of stories: evolution, cognition...",A century and a half after the publication of ...
3987,Speech and gesture share the same communicatio...,"[""Paolo Bernardis"", ""Maurizio Gentilucci""]",,10.1016/j.neuropsychologia.2005.05.007,2006-1,Neuropsychologia,"{'id': 'https://openalex.org/W2057029517', 'do...","{'Humans': [0], 'speak': [1], 'and': [2, 13, 7...",Speech and gesture share the same communicatio...,Humans speak and produce symbolic gestures. Do...
3988,Gesture and the Nature of Language,"[""D F Armstrong""]",,10.2307/415935,1999,"Original Signs: Gesture, Sign, and the Source ...","{'id': 'https://openalex.org/W2061248998', 'do...","{'This': [0], 'book': [1], 'proposes': [2], 'a...",Gesture and the Nature of Language,This book proposes a radical alternative to do...
3989,From monkey-like action recognition to human l...,"[""Michael A. Arbib""]",,10.1017/s0140525x05000038,2005-4,Behavioral and Brain Sciences,"{'id': 'https://openalex.org/W2156256694', 'do...","{'The': [0, 47, 146], 'article': [1], 'analyze...",From monkey-like action recognition to human l...,The article analyzes the neural and functional...


In [59]:
def extract_title(dictionary):
    if isinstance(dictionary, dict) and 'title' in dictionary:
        return dictionary['title']
    else:
        return None

In [61]:
cited_infos['oa_title'] = cited_infos['oa'].apply(lambda x: extract_title(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cited_infos['oa_title'] = cited_infos['oa'].apply(lambda x: extract_title(x))


In [69]:
type(cited_infos.abstract_inverted_index[4])

dict

In [79]:
def generate_full_text(abstract_inverted_index):
    if abstract_inverted_index is None:
        return None  # Return empty string if the input is None

    # Sort the dictionary based on the location indices to ensure proper order
    sorted_index = sorted(abstract_inverted_index.items(), key=lambda x: x[1][0])

    # Reconstruct the full text
    full_text = ""
    for word, locations in sorted_index:
        full_text += word + " "
    
    return full_text.strip()

In [80]:
cited_infos['oa_abstract'] = cited_infos.abstract_inverted_index.apply(lambda x: generate_full_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cited_infos['oa_abstract'] = cited_infos.abstract_inverted_index.apply(lambda x: generate_full_text(x))


In [82]:
cited_infos.oa_abstract.isna().sum()

1297

In [119]:
def extract_display_names(concepts_list):
    if isinstance(concepts_list, list):
        display_names = [concept['display_name'] for concept in concepts_list]
        return ', '.join(display_names)
    else:
        return 'FUCK U'

In [120]:
cited_infos['concepts'] = cited_infos['oa'].apply(lambda x: x.get('concepts', []) if isinstance(x, dict) else [])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cited_infos['concepts'] = cited_infos['oa'].apply(lambda x: x.get('concepts', []) if isinstance(x, dict) else [])


In [121]:
cited_infos['concepts'] = cited_infos['concepts'].apply(extract_display_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cited_infos['concepts'] = cited_infos['concepts'].apply(extract_display_names)


In [126]:
cited_infos.concepts.replace('', np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cited_infos.concepts.replace('', np.nan, inplace=True)


In [128]:
cited_infos.concepts

0       Animal husbandry, Relation (database), Field (...
1       Download, Humanities, Geography, Art, World Wi...
2                                                     NaN
3       Cross section (physics), Section (typography),...
4       CONQUEST, Literacy, Mythology, History, Litera...
                              ...                        
3995                                                  NaN
3996                                                  NaN
3997                               Political science, Art
3998    Channel (broadcasting), Computer science, Tele...
3999    Aggregate (composite), Materials science, Slag...
Name: concepts, Length: 4000, dtype: object

In [129]:
cited_infos

Unnamed: 0,cited_title,cited_author,cited_abstract,cited_doi,cited_publication_date,cited_publication_info,oa,abstract_inverted_index,oa_title,oa_abstract,concepts
0,The English Village Community Examined in its ...,"[""Frederic Seebohm""]",,10.1017/cbo9781139094443.008,,The English Village Community Examined in its ...,"{'id': 'https://openalex.org/W4247988757', 'do...",,The English Village Community Examined in its ...,,"Animal husbandry, Relation (database), Field (..."
1,Die italienischen Begründer der Wiener Donauka...,"[""Friedrich Slezak""]",Die Beziehungen der Kaiserstadt an der Donau z...,10.7767/dnrm.1977.22.3.167,1977-12,Der Donauraum,"{'id': 'https://openalex.org/W2332335873', 'do...","{'No': [0], 'AccessDie': [1], 'italienischen':...",Die italienischen Begründer der Wiener Donauka...,No AccessDie italienischen Begründer der Wiene...,"Download, Humanities, Geography, Art, World Wi..."
2,Wien im Bild historischer Karten Die Entwicklu...,"[""Ferdinand Opll""]",,10.7767/boehlau.9783205114741,29/06/2004,Jahrhunderts,0,,,,
3,The hydraulic geometry of stream channels and ...,"[""Luna Bergere Leopold"", ""Thomas Maddock""]",Some hydraulic characteristics of stream chann...,10.3133/pp252,1953,Geological Survey Professional Paper,"{'id': 'https://openalex.org/W1587869157', 'do...","{'Some': [0], 'hydraulic': [1], 'characteristi...",The hydraulic geometry of stream channels and ...,Some hydraulic characteristics of stream chann...,"Cross section (physics), Section (typography),..."
4,From memory to written record,"[""M T Clanchy""]",,,1993,From memory to written record,"{'id': 'https://openalex.org/W2071014039', 'do...","{'Introduction': [0], 'Part': [1, 36], 'I': [2...","From Memory to Written Record: England, 1066-1307",Introduction Part I The Making of Records 1 Me...,"CONQUEST, Literacy, Mythology, History, Litera..."
...,...,...,...,...,...,...,...,...,...,...,...
3995,Modelling channel evolution and floodplain mor...,"[""A D Howard""]",,,1996,Modelling channel evolution and floodplain mor...,0,,,,
3996,Atlas of the Danube River Vienna A history of ...,"[""P Mohilla"", ""F Michlmayr""]",,,1996,Donauatlas Wien: Geschichte der Donauregulieru...,0,,,,
3997,Alte Grenzen im Wiener Raum,"[""F Opll""]",,,1986,"Kommentare zum Historischen Atlas von Wien, Vi...","{'id': 'https://openalex.org/W621383568', 'doi...",,Alte Grenzen im Wiener Raum,,"Political science, Art"
3998,Avulsive channel systems: characteristics and ...,"[""Keith Richards"", ""Shobhit Chandra"", ""Peter F...",,10.1144/gsl.sp.1993.075.01.12,1993,"Geological Society, London, Special Publications","{'id': 'https://openalex.org/W2145807268', 'do...",,Avulsive channel systems: characteristics and ...,,"Channel (broadcasting), Computer science, Tele..."


In [130]:
cited_infos['cited_abstract'].fillna(cited_infos['oa_abstract'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cited_infos['cited_abstract'].fillna(cited_infos['oa_abstract'], inplace=True)


In [133]:
cited_infos.cited_abstract.isna().sum()

493

In [136]:
df.core_id.value_counts()

93144659     120
81800077      93
82852800      86
81068384      74
52040220      58
            ... 
46954401       1
2191277        1
19783337       1
2185862        1
187054721      1
Name: core_id, Length: 229, dtype: int64

In [137]:
cited_infos

Unnamed: 0,cited_title,cited_author,cited_abstract,cited_doi,cited_publication_date,cited_publication_info,oa,abstract_inverted_index,oa_title,oa_abstract,concepts
0,The English Village Community Examined in its ...,"[""Frederic Seebohm""]",,10.1017/cbo9781139094443.008,,The English Village Community Examined in its ...,"{'id': 'https://openalex.org/W4247988757', 'do...",,The English Village Community Examined in its ...,,"Animal husbandry, Relation (database), Field (..."
1,Die italienischen Begründer der Wiener Donauka...,"[""Friedrich Slezak""]",Die Beziehungen der Kaiserstadt an der Donau z...,10.7767/dnrm.1977.22.3.167,1977-12,Der Donauraum,"{'id': 'https://openalex.org/W2332335873', 'do...","{'No': [0], 'AccessDie': [1], 'italienischen':...",Die italienischen Begründer der Wiener Donauka...,No AccessDie italienischen Begründer der Wiene...,"Download, Humanities, Geography, Art, World Wi..."
2,Wien im Bild historischer Karten Die Entwicklu...,"[""Ferdinand Opll""]",,10.7767/boehlau.9783205114741,29/06/2004,Jahrhunderts,0,,,,
3,The hydraulic geometry of stream channels and ...,"[""Luna Bergere Leopold"", ""Thomas Maddock""]",Some hydraulic characteristics of stream chann...,10.3133/pp252,1953,Geological Survey Professional Paper,"{'id': 'https://openalex.org/W1587869157', 'do...","{'Some': [0], 'hydraulic': [1], 'characteristi...",The hydraulic geometry of stream channels and ...,Some hydraulic characteristics of stream chann...,"Cross section (physics), Section (typography),..."
4,From memory to written record,"[""M T Clanchy""]",Introduction Part I The Making of Records 1 Me...,,1993,From memory to written record,"{'id': 'https://openalex.org/W2071014039', 'do...","{'Introduction': [0], 'Part': [1, 36], 'I': [2...","From Memory to Written Record: England, 1066-1307",Introduction Part I The Making of Records 1 Me...,"CONQUEST, Literacy, Mythology, History, Litera..."
...,...,...,...,...,...,...,...,...,...,...,...
3995,Modelling channel evolution and floodplain mor...,"[""A D Howard""]",,,1996,Modelling channel evolution and floodplain mor...,0,,,,
3996,Atlas of the Danube River Vienna A history of ...,"[""P Mohilla"", ""F Michlmayr""]",,,1996,Donauatlas Wien: Geschichte der Donauregulieru...,0,,,,
3997,Alte Grenzen im Wiener Raum,"[""F Opll""]",,,1986,"Kommentare zum Historischen Atlas von Wien, Vi...","{'id': 'https://openalex.org/W621383568', 'doi...",,Alte Grenzen im Wiener Raum,,"Political science, Art"
3998,Avulsive channel systems: characteristics and ...,"[""Keith Richards"", ""Shobhit Chandra"", ""Peter F...",,10.1144/gsl.sp.1993.075.01.12,1993,"Geological Society, London, Special Publications","{'id': 'https://openalex.org/W2145807268', 'do...",,Avulsive channel systems: characteristics and ...,,"Channel (broadcasting), Computer science, Tele..."


In [163]:
citing_paper = df[['core_id', 'citing_title']].drop_duplicates('core_id')

In [165]:
citing_paper['oa'] = citing_paper.citing_title.apply(lambda x: search_open_alex(x, 'display_name'))

In [183]:
citing_paper

Unnamed: 0,core_id,citing_title,oa
0,81153632,"Two steps back, one step forward: reconstructi...",0
13,50694047,Reproducibility of tender point examination in...,"{'id': 'https://openalex.org/W2118076555', 'do..."
15,5225915,A Simple Threshold Rule Is Sufficient to Expla...,"{'id': 'https://openalex.org/W2142581949', 'do..."
53,82875399,Interpreting Null Findings from Trials of Alco...,"{'id': 'https://openalex.org/W2112943852', 'do..."
67,82218905,Effects of rabeprazole on the antiplatelet eff...,"{'id': 'https://openalex.org/W1993013707', 'do..."
...,...,...,...
3856,52040210,Governance costs in foreign direct investments...,"{'id': 'https://openalex.org/W2012543815', 'do..."
3912,82867877,Pathology and pathogenesis of vascular cogniti...,"{'id': 'https://openalex.org/W2054985387', 'do..."
3913,1385821,Policy and practice in the use of root cause a...,"{'id': 'https://openalex.org/W2059046137', 'do..."
3936,82867364,Wandering tales: evolutionary origins of menta...,"{'id': 'https://openalex.org/W2002852594', 'do..."


In [184]:
test = citing_paper.loc[citing_paper.oa == 0]

In [185]:
test.to_csv('test.csv')

In [177]:
def get_core(id):
    id = str(id)
    data_provider, elapsed = get_entity("outputs/" + id)
    return data_provider

In [179]:
get_core(78263683)

{'acceptedDate': '2016-05-26T00:00:00+01:00',
 'authors': [{'name': 'Gurgiser, Wolfgang'},
  {'name': 'Juen, Irmgard'},
  {'name': 'Singer, Katrin'},
  {'name': 'Neuburger, Martina'},
  {'name': 'Schauwecker, Simone'},
  {'name': 'Hofer, Marlis'},
  {'name': 'Kaser, Georg'}],
 'contributors': [],
 'createdDate': '2017-02-26T12:47:44+00:00',
 'dataProvider': {'id': 345,
  'name': 'ZORA',
  'url': 'https://api.core.ac.uk/v3/data-providers/345',
  'logo': 'https://api.core.ac.uk/data-providers/345/logo'},
 'depositedDate': '2016-05-26T00:00:00+01:00',
 'documentType': 'unknown',
 'doi': '10.5194/esd-7-499-2016',
 'downloadUrl': 'http://www.zora.uzh.ch/127499/1/2016_Gurgiser_etal_2016_earth_syst_dynam.pdf',
 'fullText': '',
 'id': 78263683,
 'identifiers': {'doi': '10.5194/esd-7-499-2016',
  'oai': 'oai:www.zora.uzh.ch:127499'},
 'title': "Comparing peasants' perceptions of precipitation change with precipitation records in the tropical Callejón de Huaylas, Peru",
 'language': None,
 'publ

In [182]:
search_open_alex('W1918047623', 'id')

0

In [190]:
anno = pd.read_csv('book.csv', sep=";").drop(columns= 'Column1')

In [195]:
anno

Unnamed: 0,core_id,citing_title,oa
0,81153632,"Two steps back, one step forward: reconstructi...",W2026588215
1,81158461,"The see-saw mechanism: Neutrino mixing, leptog...",W2895630496
2,36200578,Evolutionary History of the Live-Bearing Endem...,W1527348357
3,36209468,Response of Atmospheric Structure to Global Wa...,W2102148809
4,20349879,Markedly divergent estimates of Amazon forest ...,W2136830425
5,33740162,"Interaction Effects of Light, Temperature and ...",W239601108
6,82105846,"Evidence on the role of prebiotics, probiotics...",W1984774224
7,76526352,From a Robotic Vacuum Cleaner to a Robot Compa...,W2014991815
8,186893859,Case Report Molecular Profiling: A Case of ZBT...,W2609015272
9,186894183,Clinical Study Circulating Prostate Cells Foun...,W2093703180


In [197]:
def search_oa_id(search_value):
    search_value = str(search_value)
    url = 'https://api.openalex.org/' + search_value
    try:
        open_alex_call = requests.get(url=url).json()#['results'][0]
        result = open_alex_call
        return result
    except:
        return 0

In [199]:
anno['new_data'] = anno.oa.apply(lambda x: search_oa_id(x))

In [200]:
anno

Unnamed: 0,core_id,citing_title,oa,new_data
0,81153632,"Two steps back, one step forward: reconstructi...",W2026588215,"{'id': 'https://openalex.org/W2026588215', 'do..."
1,81158461,"The see-saw mechanism: Neutrino mixing, leptog...",W2895630496,"{'id': 'https://openalex.org/W2895630496', 'do..."
2,36200578,Evolutionary History of the Live-Bearing Endem...,W1527348357,"{'id': 'https://openalex.org/W1527348357', 'do..."
3,36209468,Response of Atmospheric Structure to Global Wa...,W2102148809,"{'id': 'https://openalex.org/W2102148809', 'do..."
4,20349879,Markedly divergent estimates of Amazon forest ...,W2136830425,"{'id': 'https://openalex.org/W2136830425', 'do..."
5,33740162,"Interaction Effects of Light, Temperature and ...",W239601108,"{'id': 'https://openalex.org/W239601108', 'doi..."
6,82105846,"Evidence on the role of prebiotics, probiotics...",W1984774224,"{'id': 'https://openalex.org/W1984774224', 'do..."
7,76526352,From a Robotic Vacuum Cleaner to a Robot Compa...,W2014991815,"{'id': 'https://openalex.org/W2014991815', 'do..."
8,186893859,Case Report Molecular Profiling: A Case of ZBT...,W2609015272,"{'id': 'https://openalex.org/W2609015272', 'do..."
9,186894183,Clinical Study Circulating Prostate Cells Foun...,W2093703180,"{'id': 'https://openalex.org/W2093703180', 'do..."


In [204]:
citing_paper = pd.merge(left = citing_paper, right = anno[['core_id', 'new_data']], on = 'core_id', how = 'left')

In [206]:
citing_paper.oa.replace(0, np.nan, inplace=True)

In [213]:
citing_paper.oa.isna().sum()

0

In [211]:
citing_paper['oa'].fillna(citing_paper['new_data'], inplace=True)

In [212]:
citing_paper

Unnamed: 0,core_id,citing_title,oa,new_data
0,81153632,"Two steps back, one step forward: reconstructi...","{'id': 'https://openalex.org/W2026588215', 'do...","{'id': 'https://openalex.org/W2026588215', 'do..."
1,50694047,Reproducibility of tender point examination in...,"{'id': 'https://openalex.org/W2118076555', 'do...",
2,5225915,A Simple Threshold Rule Is Sufficient to Expla...,"{'id': 'https://openalex.org/W2142581949', 'do...",
3,82875399,Interpreting Null Findings from Trials of Alco...,"{'id': 'https://openalex.org/W2112943852', 'do...",
4,82218905,Effects of rabeprazole on the antiplatelet eff...,"{'id': 'https://openalex.org/W1993013707', 'do...",
...,...,...,...,...
224,52040210,Governance costs in foreign direct investments...,"{'id': 'https://openalex.org/W2012543815', 'do...",
225,82867877,Pathology and pathogenesis of vascular cogniti...,"{'id': 'https://openalex.org/W2054985387', 'do...",
226,1385821,Policy and practice in the use of root cause a...,"{'id': 'https://openalex.org/W2059046137', 'do...",
227,82867364,Wandering tales: evolutionary origins of menta...,"{'id': 'https://openalex.org/W2002852594', 'do...",


In [216]:
next_try = cited_infos.loc[cited_infos.oa == 0]

In [217]:
def search_open_alex(search_value, search_parameter):
    url = 'https://api.openalex.org/works?filter=' + search_parameter +'.search:' + search_value
    try:
        open_alex_call = requests.get(url=url).json()['results'][0]
        result = open_alex_call
        return result
    except:
        return 0

In [228]:
next_try#.cited_doi.isna().sum()

Unnamed: 0,cited_title,cited_author,cited_abstract,cited_doi,cited_publication_date,cited_publication_info,oa,abstract_inverted_index,oa_title,oa_abstract,concepts
2,Wien im Bild historischer Karten Die Entwicklu...,"[""Ferdinand Opll""]",,10.7767/boehlau.9783205114741,29/06/2004,Jahrhunderts,0,,,,
5,Les caract res originaux de l histoire rurale ...,"[""M Bloch""]",,,1931,Les caractères originaux de l'histoire rurale ...,0,,,,
6,Methods for spatial and temporal land use and ...,"[""Timothy W. Foresman"", ""Steward T. A. Pickett...",Understanding contemporary urban landscapes re...,10.1023/a:1018583729727,1997,Urban Ecosystems,0,,,,
7,Landschaft hat Geschichte Historische Entwickl...,"[""Forschungsinitiative Umweltgeschichte"", ""K E...",,,1999,CD-ROM,0,,,,
8,Channel planform change on the river dee meand...,"[""A. M. Gurnell"", ""S. R. Downward"", ""R. Jones""]",Channel planform change was investigated along...,10.1002/rrr.3450090402,1994-12,Regulated Rivers: Research & Management,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...
3990,"Floods, fights and a fluid river: the Viennese...","[""Severin Hohensinner"", ""Bernhard Lager"", ""Chr...",Alluvial rivers can show unpredictable channel...,10.1007/s12685-013-0074-2,2013-7,Water History,0,,,,
3993,"ANABRANCHING RIVERS: THEIR CAUSE, CHARACTER AN...","[""GERALD C. NANSON"", ""A. DAVID KNIGHTON""]",,10.1002/(sici)1096-9837(199603)21:3<217::aid-e...,1996-3,Earth Surface Processes and Landforms,0,,,,
3994,Wien und die fr he Donaukartographie,"[""F Slezak""]",,10.1515/9783050074597-010,31/12/1997,Anthropologie und Geschichte,0,,,,
3995,Modelling channel evolution and floodplain mor...,"[""A D Howard""]",,,1996,Modelling channel evolution and floodplain mor...,0,,,,


In [226]:
def search_oa_doi(search_value):
    search_value = str(search_value)
    url = 'https://api.openalex.org/works/https://doi.org/' + search_value
    try:
        open_alex_call = requests.get(url=url).json()#['results'][0]
        result = open_alex_call
        return result
    except:
        return 0

In [229]:
next_try['oa'] = next_try.cited_doi.apply(lambda x: search_oa_doi(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  next_try['oa'] = next_try.cited_doi.apply(lambda x: search_oa_doi(x))


In [232]:
next_try.loc[next_try.oa != 0]

Unnamed: 0,cited_title,cited_author,cited_abstract,cited_doi,cited_publication_date,cited_publication_info,oa,abstract_inverted_index,oa_title,oa_abstract,concepts
2,Wien im Bild historischer Karten Die Entwicklu...,"[""Ferdinand Opll""]",,10.7767/boehlau.9783205114741,29/06/2004,Jahrhunderts,"{'id': 'https://openalex.org/W1590955822', 'do...",,,,
6,Methods for spatial and temporal land use and ...,"[""Timothy W. Foresman"", ""Steward T. A. Pickett...",Understanding contemporary urban landscapes re...,10.1023/a:1018583729727,1997,Urban Ecosystems,"{'id': 'https://openalex.org/W2129426812', 'do...",,,,
8,Channel planform change on the river dee meand...,"[""A. M. Gurnell"", ""S. R. Downward"", ""R. Jones""]",Channel planform change was investigated along...,10.1002/rrr.3450090402,1994-12,Regulated Rivers: Research & Management,"{'id': 'https://openalex.org/W2153361544', 'do...",,,,
12,Hydromorphological characteristics of the Danu...,"[""S. Hohensinner"", ""H. Habersack"", ""M. Jungwir...",,10.1002/rra.719,10/12/2003,River Research and Applications,"{'id': 'https://openalex.org/W2043145277', 'do...",,,,
14,"Pressure pain threshold in pain-free subjects,...","[""Gerald Granges"", ""Geoffrey Littlejohn""]",We hypothesized that change in pain threshold ...,10.1002/art.1780360510,1993-5,Arthritis & Rheumatism,"{'id': 'https://openalex.org/W2066620877', 'do...",,,,
...,...,...,...,...,...,...,...,...,...,...,...
3970,Patterns of Brain Activity Supporting Autobiog...,"[""R. Nathan Spreng"", ""Cheryl L. Grady""]",,10.1162/jocn.2009.21282,01/06/2010,Journal of Cognitive Neuroscience,"{'id': 'https://openalex.org/W2009594384', 'do...",,,,
3984,The brain s default network anatomy function a...,"[""Randy L. Buckner"", ""Jessica R. Andrews-Hanna...",,10.1196/annals.1440.011,2008-3,Annals of the New York Academy of Sciences,"{'id': 'https://openalex.org/W2136435696', 'do...",,,,
3990,"Floods, fights and a fluid river: the Viennese...","[""Severin Hohensinner"", ""Bernhard Lager"", ""Chr...",Alluvial rivers can show unpredictable channel...,10.1007/s12685-013-0074-2,2013-7,Water History,"{'id': 'https://openalex.org/W2070146044', 'do...",,,,
3993,"ANABRANCHING RIVERS: THEIR CAUSE, CHARACTER AN...","[""GERALD C. NANSON"", ""A. DAVID KNIGHTON""]",,10.1002/(sici)1096-9837(199603)21:3<217::aid-e...,1996-3,Earth Surface Processes and Landforms,"{'id': 'https://openalex.org/W2097996280', 'do...",,,,


In [235]:
next_try['oa_new'] = next_try.oa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  next_try['oa_new'] = next_try.oa


In [234]:
cited_infos.merge(next_try['oa_new'], on

Unnamed: 0,cited_title,cited_author,cited_abstract,cited_doi,cited_publication_date,cited_publication_info,oa,abstract_inverted_index,oa_title,oa_abstract,concepts
0,The English Village Community Examined in its ...,"[""Frederic Seebohm""]",,10.1017/cbo9781139094443.008,,The English Village Community Examined in its ...,"{'id': 'https://openalex.org/W4247988757', 'do...",,The English Village Community Examined in its ...,,"Animal husbandry, Relation (database), Field (..."
1,Die italienischen Begründer der Wiener Donauka...,"[""Friedrich Slezak""]",Die Beziehungen der Kaiserstadt an der Donau z...,10.7767/dnrm.1977.22.3.167,1977-12,Der Donauraum,"{'id': 'https://openalex.org/W2332335873', 'do...","{'No': [0], 'AccessDie': [1], 'italienischen':...",Die italienischen Begründer der Wiener Donauka...,No AccessDie italienischen Begründer der Wiene...,"Download, Humanities, Geography, Art, World Wi..."
2,Wien im Bild historischer Karten Die Entwicklu...,"[""Ferdinand Opll""]",,10.7767/boehlau.9783205114741,29/06/2004,Jahrhunderts,0,,,,
3,The hydraulic geometry of stream channels and ...,"[""Luna Bergere Leopold"", ""Thomas Maddock""]",Some hydraulic characteristics of stream chann...,10.3133/pp252,1953,Geological Survey Professional Paper,"{'id': 'https://openalex.org/W1587869157', 'do...","{'Some': [0], 'hydraulic': [1], 'characteristi...",The hydraulic geometry of stream channels and ...,Some hydraulic characteristics of stream chann...,"Cross section (physics), Section (typography),..."
4,From memory to written record,"[""M T Clanchy""]",Introduction Part I The Making of Records 1 Me...,,1993,From memory to written record,"{'id': 'https://openalex.org/W2071014039', 'do...","{'Introduction': [0], 'Part': [1, 36], 'I': [2...","From Memory to Written Record: England, 1066-1307",Introduction Part I The Making of Records 1 Me...,"CONQUEST, Literacy, Mythology, History, Litera..."
...,...,...,...,...,...,...,...,...,...,...,...
3995,Modelling channel evolution and floodplain mor...,"[""A D Howard""]",,,1996,Modelling channel evolution and floodplain mor...,0,,,,
3996,Atlas of the Danube River Vienna A history of ...,"[""P Mohilla"", ""F Michlmayr""]",,,1996,Donauatlas Wien: Geschichte der Donauregulieru...,0,,,,
3997,Alte Grenzen im Wiener Raum,"[""F Opll""]",,,1986,"Kommentare zum Historischen Atlas von Wien, Vi...","{'id': 'https://openalex.org/W621383568', 'doi...",,Alte Grenzen im Wiener Raum,,"Political science, Art"
3998,Avulsive channel systems: characteristics and ...,"[""Keith Richards"", ""Shobhit Chandra"", ""Peter F...",,10.1144/gsl.sp.1993.075.01.12,1993,"Geological Society, London, Special Publications","{'id': 'https://openalex.org/W2145807268', 'do...",,Avulsive channel systems: characteristics and ...,,"Channel (broadcasting), Computer science, Tele..."


In [237]:
final = pd.merge(left=cited_infos, right=next_try['oa_new'], left_index=True, right_index=True, how = 'left')

In [239]:
final.oa.replace(0, np.nan, inplace=True)

In [241]:
final['oa'].fillna(final['oa_new'], inplace=True)

In [244]:
final.loc[final.oa != 0]

Unnamed: 0,cited_title,cited_author,cited_abstract,cited_doi,cited_publication_date,cited_publication_info,oa,abstract_inverted_index,oa_title,oa_abstract,concepts,oa_new
0,The English Village Community Examined in its ...,"[""Frederic Seebohm""]",,10.1017/cbo9781139094443.008,,The English Village Community Examined in its ...,"{'id': 'https://openalex.org/W4247988757', 'do...",,The English Village Community Examined in its ...,,"Animal husbandry, Relation (database), Field (...",
1,Die italienischen Begründer der Wiener Donauka...,"[""Friedrich Slezak""]",Die Beziehungen der Kaiserstadt an der Donau z...,10.7767/dnrm.1977.22.3.167,1977-12,Der Donauraum,"{'id': 'https://openalex.org/W2332335873', 'do...","{'No': [0], 'AccessDie': [1], 'italienischen':...",Die italienischen Begründer der Wiener Donauka...,No AccessDie italienischen Begründer der Wiene...,"Download, Humanities, Geography, Art, World Wi...",
2,Wien im Bild historischer Karten Die Entwicklu...,"[""Ferdinand Opll""]",,10.7767/boehlau.9783205114741,29/06/2004,Jahrhunderts,"{'id': 'https://openalex.org/W1590955822', 'do...",,,,,"{'id': 'https://openalex.org/W1590955822', 'do..."
3,The hydraulic geometry of stream channels and ...,"[""Luna Bergere Leopold"", ""Thomas Maddock""]",Some hydraulic characteristics of stream chann...,10.3133/pp252,1953,Geological Survey Professional Paper,"{'id': 'https://openalex.org/W1587869157', 'do...","{'Some': [0], 'hydraulic': [1], 'characteristi...",The hydraulic geometry of stream channels and ...,Some hydraulic characteristics of stream chann...,"Cross section (physics), Section (typography),...",
4,From memory to written record,"[""M T Clanchy""]",Introduction Part I The Making of Records 1 Me...,,1993,From memory to written record,"{'id': 'https://openalex.org/W2071014039', 'do...","{'Introduction': [0], 'Part': [1, 36], 'I': [2...","From Memory to Written Record: England, 1066-1307",Introduction Part I The Making of Records 1 Me...,"CONQUEST, Literacy, Mythology, History, Litera...",
...,...,...,...,...,...,...,...,...,...,...,...,...
3993,"ANABRANCHING RIVERS: THEIR CAUSE, CHARACTER AN...","[""GERALD C. NANSON"", ""A. DAVID KNIGHTON""]",,10.1002/(sici)1096-9837(199603)21:3<217::aid-e...,1996-3,Earth Surface Processes and Landforms,"{'id': 'https://openalex.org/W2097996280', 'do...",,,,,"{'id': 'https://openalex.org/W2097996280', 'do..."
3994,Wien und die fr he Donaukartographie,"[""F Slezak""]",,10.1515/9783050074597-010,31/12/1997,Anthropologie und Geschichte,"{'id': 'https://openalex.org/W4250409433', 'do...",,,,,"{'id': 'https://openalex.org/W4250409433', 'do..."
3997,Alte Grenzen im Wiener Raum,"[""F Opll""]",,,1986,"Kommentare zum Historischen Atlas von Wien, Vi...","{'id': 'https://openalex.org/W621383568', 'doi...",,Alte Grenzen im Wiener Raum,,"Political science, Art",
3998,Avulsive channel systems: characteristics and ...,"[""Keith Richards"", ""Shobhit Chandra"", ""Peter F...",,10.1144/gsl.sp.1993.075.01.12,1993,"Geological Society, London, Special Publications","{'id': 'https://openalex.org/W2145807268', 'do...",,Avulsive channel systems: characteristics and ...,,"Channel (broadcasting), Computer science, Tele...",


In [247]:
def extract_abstract_inverted_index(dictionary):
    if isinstance(dictionary, dict) and 'abstract_inverted_index' in dictionary:
        return dictionary['abstract_inverted_index']
    else:
        return None

In [254]:
final['abstract_inverted_index'] = final.oa.apply(lambda x: extract_abstract_inverted_index(x))

In [255]:
final.abstract_inverted_index.isna().sum()

778

In [250]:
def generate_full_text(abstract_inverted_index):
    if abstract_inverted_index is None:
        return None  # Return empty string if the input is None

    # Sort the dictionary based on the location indices to ensure proper order
    sorted_index = sorted(abstract_inverted_index.items(), key=lambda x: x[1][0])

    # Reconstruct the full text
    full_text = ""
    for word, locations in sorted_index:
        full_text += word + " "
    
    return full_text.strip()

In [252]:
final.oa_abstract.isna().sum()

1297

In [259]:
final['oa_abstract'] = final.abstract_inverted_index.apply(lambda x: generate_full_text(x))

In [260]:
final.oa_abstract.isna().sum()

778

In [261]:
final.cited_abstract.isna().sum()

493

In [262]:
final['cited_abstract'].fillna(final['oa_abstract'], inplace=True)

In [263]:
final.cited_abstract.isna().sum()

329

In [265]:
def extract_display_names(concepts_list):
    if isinstance(concepts_list, list):
        display_names = [concept['display_name'] for concept in concepts_list]
        return ', '.join(display_names)
    else:
        return None

In [266]:
final['concepts'] = final['oa'].apply(lambda x: x.get('concepts', []) if isinstance(x, dict) else [])

In [270]:
final['concepts'] = final['concepts'].apply(extract_display_names)

In [272]:
final.concepts.replace('', np.nan, inplace=True)

In [283]:
final = final.drop(columns = ['oa_new', 'abstract_inverted_index', 'oa_title', 'oa_abstract'])

In [291]:
final.rename(columns={'oa': 'cited_oa', 'concepts': 'cited_concepts'}, inplace=True)

In [293]:
final.cited_oa.replace(0, np.nan, inplace=True)

In [296]:
citing_paper.rename(columns={'oa': 'citing_oa'}, inplace=True)

In [305]:
df = pd.merge(left = df, right = final[['cited_abstract', 'cited_oa', 'cited_concepts']], left_index=True, right_index=True, how = 'left')

In [309]:
#sanity check

In [314]:
len(df.loc[(df.cited_abstract_x == df.cited_abstract_y) == False]) == df.cited_abstract_x.isna().sum()

True

In [316]:
df.rename(columns={'cited_abstract_y': 'cited_abstract'}, inplace=True)

In [318]:
df = df.drop(columns=['cited_abstract_x'])

In [319]:
df.cited_abstract.isna().sum()

329

In [328]:
df = pd.merge(left = df, right = citing_paper[['core_id', 'citing_oa']], on = 'core_id', how = 'left')

In [330]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 0 to 3999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   unique_id                 4000 non-null   object 
 1   core_id                   4000 non-null   int64  
 2   citation_offset           3995 non-null   float64
 3   total_doc_length          4000 non-null   int64  
 4   section_info              4000 non-null   object 
 5   citing_title              4000 non-null   object 
 6   citing_author             4000 non-null   object 
 7   citing_publication_info   3705 non-null   object 
 8   citing_abstract           3992 non-null   object 
 9   cited_title               4000 non-null   object 
 10  cited_author              4000 non-null   object 
 11  cited_doi                 3646 non-null   object 
 12  cited_publication_date    3888 non-null   object 
 13  cited_publication_info    3928 non-null   object 
 14  citation

In [339]:
df.loc[df.citing_abstract.isna() == True].core_id.value_counts()

20652885    8
Name: core_id, dtype: int64

In [341]:
df['citing_concepts'] = df['citing_oa'].apply(lambda x: x.get('concepts', []) if isinstance(x, dict) else [])

In [343]:
df['citing_concepts'] = df['citing_concepts'].apply(extract_display_names)

In [350]:
original.isna().sum()

unique_id                     0
core_id                       0
citation_offset               5
total_doc_length              0
section_info                  0
citing_title                  0
citing_author                 0
citing_publication_info     295
citing_abstract               8
cited_title                   0
cited_author                  0
cited_abstract              960
cited_doi                   354
cited_publication_date      112
cited_publication_info       72
citation_context              0
self_citation               422
direct_citations             68
co_mentions                  68
citation_class_label          0
citation_influence_label      0
dtype: int64

In [348]:
df.isna().sum()

unique_id                     0
core_id                       0
citation_offset               5
total_doc_length              0
section_info                  0
citing_title                  0
citing_author                 0
citing_publication_info     295
citing_abstract               8
cited_title                   0
cited_author                  0
cited_doi                   354
cited_publication_date      112
cited_publication_info       72
citation_context              0
self_citation               422
direct_citations             68
co_mentions                  68
citation_class_label          0
citation_influence_label      0
cited_abstract              329
cited_oa                    192
cited_concepts              192
citing_oa                     0
citing_concepts               0
dtype: int64

In [349]:
original = pd.read_csv('/home/roland/Projects/JP_citation_classification/feature_scraping/data/ACT2_dataset.tsv', sep='\t')

In [351]:
df.to_csv('open_alex_enrichment.csv', index = False)