In [103]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import json
import spacy

In [104]:
# OpenAlex works query: first page of works tagged with concept C154945302,
# most-cited first, 200 works per page.
openalex_works_api = (
    "https://api.openalex.org/works"
    "?page=1"
    "&filter=concepts.id:C154945302"
    "&sort=cited_by_count:desc"
    "&per_page=200"
)
# Prefix to which an OpenAlex concept id is appended to look up a single concept.
concept_base_url = "https://api.openalex.org/concepts/"
# Prefix to which a title search string is appended to query OpenAIRE publications.
openaire_base_url = "https://api.openaire.eu/search/publications?title="

In [105]:
# In-memory cache mapping OpenAlex concept ids to their display names. It is filled
# by get_concept_name_from_contept_id so that a concept which was already resolved
# is not requested from the API again.
concept_id_name_json = {}
# spaCy pipeline used to score the similarity between OpenAIRE and OpenAlex terms.
nlp = spacy.load("en_core_web_md")

In [106]:
def get_concept_name_from_contept_id(concept_id: str):
    """Resolve an OpenAlex concept id (or concept URL) to its display name.

    Only the part after the last ``/`` of ``concept_id`` is used as the id.
    Names that were resolved before are served from ``concept_id_name_json``;
    otherwise the concept is requested from ``concept_base_url`` and, on an
    HTTP 200 answer, stored in that cache.

    Returns:
        The concept's ``display_name``, or ``None`` when the API does not
        answer with status 200.
    """
    short_id = concept_id.rsplit('/', 1)[-1]

    # Cache hit: no network round trip needed.
    if short_id in concept_id_name_json:
        return concept_id_name_json[short_id]

    response = requests.get(concept_base_url + short_id)
    if response.status_code != 200:
        return None

    display_name = response.json()['display_name']
    concept_id_name_json[short_id] = display_name
    return display_name

In [107]:
def get_openaire_subjects(response_text: str, orig_title: str):
    """Extract the subject labels of the first result in an OpenAIRE search response.

    Args:
        response_text: XML body of an OpenAIRE publications search response.
        orig_title: Title that was searched for; only used in the
            "Not found" message.

    Returns:
        A list with the text of every ``subject`` element found under
        ``results/result/metadata/oaf:entity/oaf:result`` of the first result.
        ``subject`` elements without text are skipped, because a ``None`` entry
        cannot be joined into a string by callers. Returns ``None`` (after
        printing a "Not found" message) when any element of that path is missing.

    Raises:
        xml.etree.ElementTree.ParseError: if ``response_text`` is not
            well-formed XML.
    """
    oaf = '{http://namespace.openaire.eu/oaf}'
    tree = ET.fromstring(response_text)
    try:
        metadata = tree.findall('results')[0].findall('result')[0].findall('metadata')[0]
        oaf_result = metadata.findall(f'{oaf}entity')[0].findall(f'{oaf}result')[0]
    except IndexError:
        # Only a missing element means "no result"; a bare `except` would also
        # swallow KeyboardInterrupt and hide genuine programming errors.
        print(f'Not found: {orig_title}')
        return None
    return [subject.text for subject in oaf_result.findall('subject') if subject.text]

In [108]:
def get_openaire_subjects_from_title(title):
    """Search OpenAIRE for a publication title and return its subject labels.

    The title is split on single spaces; every word is reduced to its
    alphabetic characters and lower-cased, and the words are appended
    (each followed by a space) to ``openaire_base_url`` to build the request.

    Args:
        title: Publication title to search for.

    Returns:
        Whatever ``get_openaire_subjects`` extracts from the response body,
        or ``None`` when OpenAIRE does not answer with HTTP status 200.
    """
    cleaned_words = (
        ''.join(char for char in word if char.isalpha()).lower()
        for word in title.split(' ')
    )
    # Each word is followed by a space, so the query keeps its trailing space.
    title_search_string = ''.join(word + ' ' for word in cleaned_words)

    response = requests.get(openaire_base_url + title_search_string)
    if response.status_code != 200:
        # Without this guard the result variable was never bound and the
        # function raised UnboundLocalError on any failed request.
        return None
    return get_openaire_subjects(response.text, title)

In [109]:
def preprocess_term(term):
    """Normalise a subject term for comparison.

    The term is split on single spaces, words consisting only of numeric
    characters are dropped, the remaining words are lower-cased and joined
    back together with single spaces.
    """
    kept_words = []
    for token in term.split(' '):
        if token.isnumeric():
            # Purely numeric words are discarded.
            continue
        kept_words.append(token.lower())
    return ' '.join(kept_words)

In [110]:
def _page_url(base_url, page):
    """Return ``base_url`` with its ``page`` query parameter set to ``page``."""
    import re  # local import: `re` is not among the notebook's top-level imports

    # Anchor on '?' or '&' so that a 'per_page=...' parameter is never rewritten.
    paged_url, replaced = re.subn(r'(?<=[?&])page=\d+', f'page={page}', base_url, count=1)
    if not replaced:
        separator = '&' if '?' in base_url else '?'
        paged_url = f'{base_url}{separator}page={page}'
    return paged_url


def _compare_terms(openaire_terms, openalex_terms, threshold=0.8):
    """Bucket every (OpenAIRE, OpenAlex) term pair by similarity.

    Both term lists are normalised with ``preprocess_term`` and scored with
    the ``similarity`` of their ``nlp`` documents. Returns three lists of
    ``{'openaire', 'openalex', 'similarity'}`` dicts: pairs scoring >= 1,
    pairs scoring >= ``threshold``, and all remaining pairs.
    """
    # Build each document once instead of once per pair.
    openaire_docs = [(term, nlp(term)) for term in map(preprocess_term, openaire_terms)]
    openalex_docs = [(term, nlp(term)) for term in map(preprocess_term, openalex_terms)]

    exact, similar, mismatched = [], [], []
    for openaire_term, openaire_doc in openaire_docs:
        for openalex_term, openalex_doc in openalex_docs:
            similarity = openaire_doc.similarity(openalex_doc)
            if similarity >= 1:
                bucket = exact
            elif similarity >= threshold:
                bucket = similar
            else:
                bucket = mismatched
            bucket.append({
                'openaire': openaire_term,
                'openalex': openalex_term,
                'similarity': similarity
            })
    return exact, similar, mismatched


def get_works_from_openalex(url, start_page=1, pages=100):
    """Fetch OpenAlex works page by page and compare their concepts with OpenAIRE subjects.

    For every page from ``start_page`` to ``pages`` (inclusive):

    * the raw ``results`` list is written to
      ``data/raw_openalex_api_outputs/page_<page>.json`` (overwriting a
      previous dump, so the file always holds one valid JSON document);
    * for every work with a title, the OpenAIRE subjects are looked up by
      title and compared with the work's OpenAlex concept names;
    * one JSON line mapping title -> comparison is appended to
      ``data/output_<page>``.

    Works without a title or without OpenAIRE subjects are skipped. The loop
    stops at the first page that does not answer with HTTP status 200.
    Both output directories must already exist.

    Args:
        url: OpenAlex works query; its ``page`` parameter is replaced per page.
        start_page: First page to fetch.
        pages: Last page to fetch (inclusive).
    """
    for page in range(start_page, pages + 1):
        print(f'Fetching page: {page}')

        # Always derive the page URL from the pristine `url`; rewriting `url`
        # in place left no 'page=1' to replace from the third page on.
        response = requests.get(_page_url(url, page))
        if response.status_code != 200:
            print(f"Failed to fetch data: {response.status_code}")
            break

        page_results = response.json()['results']
        # 'w' rather than append: a second JSON array in the same file would
        # make the dump unparseable after a re-run.
        with open(f'data/raw_openalex_api_outputs/page_{page}.json', 'w') as f:
            json.dump(page_results, f, indent=4)

        out_json = {}
        cnt = 1  # number of the work being attempted; advances only on success
        for work in page_results:
            title = work['title']
            if not title:
                continue
            print(f'Work count: {cnt}')

            openalex_concepts = [concept['display_name'] for concept in work['concepts']]
            # Drop empty/None subjects so joining and preprocessing cannot fail.
            openaire_subjects = [
                subject
                for subject in (get_openaire_subjects_from_title(title) or [])
                if subject
            ]
            if not openaire_subjects:
                continue

            exact, similar, mismatched = _compare_terms(openaire_subjects, openalex_concepts)
            out_json[title] = {
                'openalex': ' | '.join(openalex_concepts),
                'openaire': ' | '.join(openaire_subjects),
                'exact_matches': exact,
                'similarity': similar,
                'missmatches': mismatched,
            }
            cnt += 1

        # One JSON document per line; appended so repeated runs accumulate.
        with open(f'data/output_{page}', 'a+') as f:
            f.write(json.dumps(out_json) + '\n')

In [111]:
# Run the OpenAlex/OpenAIRE comparison for the works query stored in `openalex_works_api`.
get_works_from_openalex(openalex_works_api)

Fetching page: 1
Deep Residual Learning for Image Recognition
Work count: 1
https://api.openaire.eu/search/publications?title=deep residual learning for image recognition 
FOS: Computer and information sciences | Computer science | Computer Vision and Pattern Recognition (cs.CV) | Computer Science - Computer Vision and Pattern Recognition | 02 engineering and technology | Machine learning | computer.software_genre | Residual | One-shot learning | Convolutional neural network | Residual neural network | 030218 nuclear medicine & medical imaging | 03 medical and health sciences | 0302 clinical medicine | 0202 electrical engineering, electronic engineering, information engineering | Computer vision | 03021804 Radiology/Image segmentation | Transformer (machine learning model) | Vanishing gradient problem | 02020101 Applications of computer vision/Computer vision/Image processing | Artificial neural network | business.industry | Deep learning | Pattern recognition | Test set | Softmax func

  similarity = doc1.similarity(doc2)


Basic local alignment search tool
Work count: 2
https://api.openaire.eu/search/publications?title=basic local alignment search tool 
Information retrieval | Basic Local Alignment Search Tool | Biology
Gapped BLAST and PSI-BLAST: a new generation of protein database search programs
Work count: 3
https://api.openaire.eu/search/publications?title=gapped blast and psiblast a new generation of protein database search programs 
Not found: Gapped BLAST and PSI-BLAST: a new generation of protein database search programs
Long Short-Term Memory
Work count: 3
https://api.openaire.eu/search/publications?title=long shortterm memory 
Long short term memory | Mean squared error | Statistics | Mathematics
The theory of planned behavior
Work count: 4
https://api.openaire.eu/search/publications?title=the theory of planned behavior 
05010908 Group processes/Collective identity | Management science | 05 social sciences | Theory of planned behavior | 050109 social psychology | 0501 psychology and cognitive

KeyboardInterrupt: 