In [33]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import json

In [34]:
# In-memory cache mapping an OpenAlex concept id to its display name.
concept_id_name_json = dict()

In [35]:
# Read the comma-separated works table, then keep only its first 3000 rows.
works_df = pd.read_csv('works.csv', sep=',')
works_3000 = works_df.iloc[:3000]

In [36]:
# Empty two-column frame (title and OpenAlex concepts); show it to confirm the schema.
columns = [
    'Title',
    'OpenAlexConcepts',
]
df = pd.DataFrame(columns=columns)
print(df.head())

Empty DataFrame
Columns: [Title, OpenAlexConcepts]
Index: []


In [37]:
# Empty frame that collects, per title, the OpenAlex concepts, the OpenAIRE
# subjects and the textual matches between the two; printed to confirm the schema.
columns = [
    'Title',
    'OpenAlexConcepts',
    'OpenAIRESubjects_fromMag',
    'Matching',
]
df1 = pd.DataFrame(columns=columns)
print(df1.head())

Empty DataFrame
Columns: [Title, OpenAlexConcepts, OpenAIRESubjects_fromMag, Matching]
Index: []


In [38]:

# Prefix for OpenAlex concept lookups; the concept id is appended directly.
concept_base_url = "https://api.openalex.org/concepts/"
# OpenAIRE publication search limited to acceptance dates within 1990;
# the title query text is appended after the trailing "title=".
openaire_base_url = (
    "https://api.openaire.eu/search/publications"
    "?fromDateAccepted=1990-01-01"
    "&toDateAccepted=1990-12-31"
    "&title="
)

In [39]:
def get_concept_name_from_contept_id(concept_id: str):
    """Resolve a concept id to its display name, caching successful lookups.

    Only the text after the last '/' of ``concept_id`` is used as the id.
    The module-level ``concept_id_name_json`` dict is consulted first; on a
    miss, ``concept_base_url + id`` is fetched with ``requests.get``.

    Returns:
        The ``display_name`` field of the JSON response (also stored in the
        cache) when the HTTP status is 200, otherwise ``None``.
    """
    short_id = concept_id.rsplit('/', 1)[-1]

    # Cache hit: no network round trip needed.
    if short_id in concept_id_name_json:
        return concept_id_name_json[short_id]

    reply = requests.get(concept_base_url + short_id)
    if reply.status_code != 200:
        # Failed lookups are not cached, so a later call will retry.
        return None

    display_name = reply.json()['display_name']
    concept_id_name_json[short_id] = display_name
    return display_name

In [40]:
def get_openaire_subjects(response_text: str, orig_title: str):
    """Extract the subject strings of the first result in an OpenAIRE XML response.

    The path followed is
    ``results/result/metadata/{oaf}entity/{oaf}result/subject``, always taking
    the first element at each level above ``subject``.

    Args:
        response_text: XML body returned by the OpenAIRE search endpoint.
        orig_title: Title that was searched; only used in the diagnostic
            message printed when nothing can be extracted.

    Returns:
        A list with the text of every ``subject`` element that has text.
        An empty list is returned (after printing ``Not found: <title>``)
        when the body is not well-formed XML or any level of the path is
        missing.
    """
    oaf = '{http://namespace.openaire.eu/oaf}'
    try:
        tree = ET.fromstring(response_text)
        metadata = tree.findall('results')[0].findall('result')[0].findall('metadata')[0]
        oaf_result = metadata.findall(f'{oaf}entity')[0].findall(f'{oaf}result')[0]
    except (ET.ParseError, IndexError):
        # ParseError: body is not valid XML. IndexError: a findall() along the
        # path came back empty. Anything else (e.g. KeyboardInterrupt) is
        # deliberately allowed to propagate.
        print(f'Not found: {orig_title}')
        return []

    # Elements without text have .text == None; skip them so callers can
    # safely sort and lower-case the result.
    return [
        subject.text
        for subject in oaf_result.findall('subject')
        if subject.text is not None
    ]

In [41]:
def _title_search_string(title: str) -> str:
    """Build the OpenAIRE title query text from a work title.

    Every space-separated word is reduced to its alphabetic characters,
    lower-cased and followed by one space, so the result ends with a space.
    """
    return ''.join(
        ''.join(ch for ch in word if ch.isalpha()).lower() + ' '
        for word in title.split(' ')
    )


def _match_concepts_to_subjects(concepts, subjects):
    """Pair every concept with every subject that shares at least one word.

    Comparison is case-insensitive and on whitespace-separated words.
    Returns the distinct ``'Openaire: <subject> - Openalex: <concept>'``
    strings (both lower-cased) in first-seen order.
    """
    matches = []
    for concept in concepts:
        concept_lower = concept.lower()
        # split() without an argument drops empty tokens, so runs of spaces
        # can never produce a spurious '' == '' match.
        concept_words = set(concept_lower.split())
        for subject in subjects:
            subject_lower = subject.lower()
            if concept_words.isdisjoint(subject_lower.split()):
                continue
            entry = f'Openaire: {subject_lower} - Openalex: {concept_lower}'
            if entry not in matches:
                matches.append(entry)
    return matches


# For each work: query OpenAIRE by title, resolve the OpenAlex concept names,
# compute word-level matches, and record the result both in data_json (keyed
# by title) and as a new row of df1.
data_json = {}
for idx, row in works_3000.iterrows():
    title = row['display_name']
    if pd.isna(title) or str(title) == "":
        continue
    title = str(title)

    # A missing concept_ids cell is treated as "no concepts" instead of
    # failing on float.split(); empty id fragments are ignored.
    raw_concept_ids = row['concept_ids']
    if pd.isna(raw_concept_ids):
        concept_ids_row = []
    else:
        concept_ids_row = [cid for cid in str(raw_concept_ids).split('|') if cid]

    # Start from an empty list on every row so a non-200 response cannot
    # silently reuse the subjects fetched for the previous title.
    openaire_subjects = []
    url = openaire_base_url + _title_search_string(title)
    response = requests.get(url)
    if response.status_code == 200:
        openaire_subjects = get_openaire_subjects(response.text, title)
    # Drop missing subject texts so sorting and lower-casing cannot fail.
    openaire_subjects = sorted(s for s in openaire_subjects if s is not None)

    # The lookup returns None when the concept could not be resolved; such
    # entries are skipped rather than crashing sort() below.
    concepts = sorted(
        name
        for name in (get_concept_name_from_contept_id(cid) for cid in concept_ids_row)
        if name is not None
    )

    matching = _match_concepts_to_subjects(concepts, openaire_subjects)

    concepts = " | ".join(concepts)
    openaire_subjects = " | ".join(openaire_subjects)
    matching = " | ".join(matching)
    data_json[title] = {
        'openaire': openaire_subjects,
        'openalex': concepts,
        'matching': matching,
    }
    df1.loc[len(df1.index)] = [title, concepts, openaire_subjects, matching]

Not found: Scale-space and edge detection using anisotropic diffusion
Not found: The R*-tree: an efficient and robust access method for points and rectangles
Not found: A logic of authentication
Not found: Advanced mammalian gene transfer: high titre retroviral vectors with multiple drug selection markers and a complementary helper-free packaging cell line
Not found: Multiquadrics—A scattered data approximation scheme with applications to computational fluid-dynamics—II solutions to parabolic, hyperbolic and elliptic partial differential equations
Not found: Matrix pencil method for estimating parameters of exponentially damped/undamped sinusoids in noise
Not found: Elephants don't play chess
Not found: <i>Tracking and Data Association</i>
Not found: Knowledge and common knowledge in a distributed environment
Not found: Knowing what to think by knowing who you are: Self‐categorization and the nature of norm formation, conformity and group polarization*
Not found: Scale-space for discre

In [42]:
import json
# Persist the per-title results as pretty-printed JSON (4-space indent).
with open('data_3000.json', mode='w') as out_file:
    out_file.write(json.dumps(data_json, indent=4))

In [43]:
# Preview the collected rows, then drop exact duplicate rows and export the
# table as a ';'-separated CSV without the index column.
print(df1.head())
df1 = df1.drop_duplicates()
df1.to_csv('title_concept_openalex_openaire.csv', sep=';', index=False)

                                               Title  \
0  MAXIMUM LIKELIHOOD ESTIMATION AND INFERENCE ON...   
1               Indexing by latent semantic analysis   
2  Scale-space and edge detection using anisotrop...   
3  Backpropagation through time: what it does and...   
4                           Neural network ensembles   

                                    OpenAlexConcepts  \
0  Artificial intelligence | Cointegration | Comp...   
1  Algorithm | Artificial intelligence | Basis (l...   
2  Algorithm | Anisotropic diffusion | Artificial...   
3  Algorithm | Artificial intelligence | Artifici...   
4  Algorithm | Artificial intelligence | Artifici...   

                            OpenAIRESubjects_fromMag  \
0  05 social sciences | 0502 economics and busine...   
1  Automatic indexing | Document retrieval | Docu...   
2                                                      
3  Artificial intelligence | Artificial neural ne...   
4  Applied Mathematics | Artificial Intelligen

In [44]:
# Export `df` as a ';'-separated CSV without the index column.
# NOTE(review): no visible cell adds rows to `df`, so this appears to write
# only the header line — confirm whether `df1` columns were intended here.
df.to_csv(
    'title_concept_openalex.csv',
    sep=';',
    index=False,
)