In [1]:
import getpass
import glob
import json
import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
import redis
import requests
from tqdm.notebook import tqdm

In [2]:
class MyQuery:
    """Download Scopus abstracts year by year and store each one as a JSON file.

    For every year in ``range(start_year, end_year)`` the Scopus search API is
    paged through (most-cited first) and the full abstract of every hit that is
    not already on disk is fetched and written to ``<dir>/<scopus_id>``.

    Attributes:
        apiKey: Elsevier API key, asked for interactively on construction.
        start_year: First publication year to query (``2018 + offsetYear``).
        end_year: Exclusive upper bound of the queried years (2024).
        total_count: Number of search hits to walk through per year.
        dir: Directory the abstracts are written to and resumed from.
        cache: Scopus ids that already have a file in ``dir``.
    """

    SEARCH_URL = "https://api.elsevier.com/content/search/scopus"
    ABSTRACT_URL = "https://api.elsevier.com/content/abstract/scopus_id/{scopus_id}"
    PAGE_SIZE = 5            # hits requested per search page
    PAGE_DELAY_SECONDS = 1   # pause between two search pages
    REQUEST_TIMEOUT = 30     # seconds; without it a stalled connection hangs forever

    def __init__(self,dir = "output",dataPerYear = 1600,offsetYear = 0):
        """Ask for the API key and index the abstracts already downloaded.

        Args:
            dir: Output directory for the abstract files.
            dataPerYear: How many search hits to process for each year.
            offsetYear: Added to 2018 to obtain the first year to query.
        """
        # getpass keeps the key out of the notebook's saved output.
        self.apiKey = getpass.getpass("Input your API key")
        self.start_year = 2018 + offsetYear
        self.end_year = 2024
        self.total_count = dataPerYear
        self.dir = dir
        self.cache = self._load_cache()

    def _load_cache(self):
        """Return the names of the files currently stored in ``self.dir``.

        ``os.path.basename`` strips the directory part on every platform; the
        previous regex only removed a Windows-style ``dir\\`` prefix.
        """
        return {os.path.basename(path) for path in glob.glob(os.path.join(self.dir, "*"))}

    def _headers(self):
        """Return the HTTP headers shared by all Elsevier API calls."""
        return {"Accept" : "application/json","X-ELS-APIKey" : self.apiKey}

    def _search_loop(self,year):
        """Download the abstracts of the most-cited hits for one year.

        Args:
            year: Publication year used in the ``PUBYEAR`` search clause.

        Raises:
            requests.HTTPError: If a search or abstract request is answered
                with an HTTP error status.
        """
        search = f'PUBYEAR = {year} AND NOT AFFIL (Chulalongkorn University) '
        cursor = "*"  # "*" asks the API for the first page
        for _ in tqdm(range(0,self.total_count,self.PAGE_SIZE)):
            res = requests.get(
                self.SEARCH_URL,
                headers=self._headers(),
                params={"query" : search,"cursor" : cursor,"count" : str(self.PAGE_SIZE),"view" : "STANDARD","sort" : "-citedby-count"},
                timeout=self.REQUEST_TIMEOUT,
            )
            res.raise_for_status()
            results = res.json().get("search-results") or {}
            for entry in results.get("entry") or []:
                identifier = entry.get("dc:identifier")
                if not identifier:
                    # Entries without an identifier (e.g. an error placeholder) carry no paper.
                    continue
                scopus_id = self.get_scopusId(identifier)
                if scopus_id in self.cache:
                    continue
                json_abstract = self.query_abstract(scopus_id)
                self.save_abstract(json_abstract,scopus_id)
                self.cache.add(scopus_id)
            cursor = (results.get("cursor") or {}).get("@next")
            if not cursor:
                # No further page available for this query.
                break
            time.sleep(self.PAGE_DELAY_SECONDS)

    def search_loop(self):
        """Run ``_search_loop`` for every year from ``start_year`` up to, not including, ``end_year``."""
        for year in tqdm(range(self.start_year,self.end_year)):
            self._search_loop(year)
        return

    def get_scopusId(self,identifier):
        """Strip the ``SCOPUS_ID:`` marker from a ``dc:identifier`` value."""
        return re.sub("SCOPUS_ID:","",identifier)

    def query_abstract(self,scopus_id):
        """Fetch the FULL-view abstract record for ``scopus_id`` and return the decoded JSON.

        Raises:
            requests.HTTPError: If the API answers with an HTTP error status,
                so that error payloads are never stored as abstracts.
        """
        res = requests.get(
            self.ABSTRACT_URL.format(scopus_id=scopus_id),
            headers=self._headers(),
            params={"view" : "FULL"},
            timeout=self.REQUEST_TIMEOUT,
        )
        res.raise_for_status()
        return res.json()

    def save_abstract(self,json_abstract,scopus_id):
        """Write ``json_abstract`` to ``<dir>/<scopus_id>`` as indented JSON."""
        os.makedirs(self.dir, exist_ok=True)  # first run: the directory may not exist yet
        with open(os.path.join(self.dir, scopus_id),"w",encoding="utf-8") as file:
            json.dump(json_abstract, file, indent = 6)
        return
    

In [55]:
# Create the downloader. Construction prompts for the API key; abstracts are
# written to (and already-present files are detected in) the "output2" directory.
query_tools = MyQuery(offsetYear = 0,dir = "output2")

Input your API key[REDACTED]


In [None]:
# Download abstracts for every configured year (one inner progress bar per year).
# Long-running: the loop pauses one second after each page of five search hits.
query_tools.search_loop()

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/320 [00:00<?, ?it/s]

  0%|          | 0/320 [00:00<?, ?it/s]

  0%|          | 0/320 [00:00<?, ?it/s]

  0%|          | 0/320 [00:00<?, ?it/s]

  0%|          | 0/320 [00:00<?, ?it/s]

  0%|          | 0/320 [00:00<?, ?it/s]

In [72]:
# Directory whose files are parsed in the cells below.
# NOTE(review): this differs from the "output2" directory passed to MyQuery above,
# and the name shadows the built-in dir() for the rest of the kernel session.
dir = "output"

In [73]:
def get_author(res):
    """Return the indexed names of a record's authors joined by ``;``.

    Args:
        res: The ``abstracts-retrieval-response`` dict of one abstract.

    Returns:
        The ``preferred-name`` / ``ce:indexed-name`` of every author in source
        order, separated by ``;``. An empty string is returned when the record
        has no author information.
    """
    authors = (res.get("authors") or {}).get("author") or []
    if isinstance(authors, dict):
        # A lone author may be delivered as an object rather than a one-item list
        # (the same list-or-object shape is handled for keywords elsewhere in this file).
        authors = [authors]

    names = []
    for auth in authors:
        preferred = auth.get("preferred-name") if isinstance(auth, dict) else None
        name = preferred.get("ce:indexed-name") if isinstance(preferred, dict) else None
        if isinstance(name, str) and name:
            names.append(name)
    return ";".join(names)

def get_ref_title(res):
    """Return the titles of a record's cited references joined by ``;``.

    Args:
        res: The ``abstracts-retrieval-response`` dict of one abstract. It must
            contain ``res["item"]["bibrecord"]``.

    Returns:
        Every ``ref-info`` / ``ref-title`` / ``ref-titletext`` string found under
        ``tail`` / ``bibliography`` / ``reference``, in source order and
        separated by ``;``. References without a title are skipped; an empty
        string is returned when there is no bibliography.
    """
    tail = res["item"]["bibrecord"].get("tail")
    bibliography = tail.get("bibliography") if isinstance(tail, dict) else None
    references = bibliography.get("reference") if isinstance(bibliography, dict) else None
    if isinstance(references, dict):
        # A bibliography with one entry may hold an object instead of a list;
        # iterating it directly would walk its keys and lose the reference.
        references = [references]
    elif not isinstance(references, list):
        references = []

    titles = []
    for ref in references:
        info = ref.get("ref-info") if isinstance(ref, dict) else None
        ref_title = info.get("ref-title") if isinstance(info, dict) else None
        text = ref_title.get("ref-titletext") if isinstance(ref_title, dict) else None
        # Only plain strings can be joined; other shapes are skipped instead of
        # crashing the join below.
        if isinstance(text, str):
            titles.append(text)
    return ";".join(titles)

def get_ref_author(res):
    """Return the distinct author names of a record's cited references.

    For each reference the ``ref-authors`` / ``author`` entries are used; when
    a reference has no ``author`` key its ``collaboration`` entry is used
    instead. Both may be a list or a single object.

    Args:
        res: The ``abstracts-retrieval-response`` dict of one abstract. It must
            contain ``res["item"]["bibrecord"]``.

    Returns:
        The non-empty ``ce:indexed-name`` values, de-duplicated in order of
        first appearance and joined by ``;``. An empty string is returned when
        there is no bibliography.
    """
    tail = res["item"]["bibrecord"].get("tail")
    bibliography = tail.get("bibliography") if isinstance(tail, dict) else None
    references = bibliography.get("reference") if isinstance(bibliography, dict) else None
    if isinstance(references, dict):
        # One reference delivered as an object instead of a one-item list.
        references = [references]
    elif not isinstance(references, list):
        references = []

    names = []
    for ref in references:
        info = ref.get("ref-info") if isinstance(ref, dict) else None
        ref_authors = info.get("ref-authors") if isinstance(info, dict) else None
        if not isinstance(ref_authors, dict):
            continue
        # Individual authors take precedence; collaborations are the fallback.
        source_key = "author" if "author" in ref_authors else "collaboration"
        people = ref_authors.get(source_key) or []
        if isinstance(people, dict):
            people = [people]
        for person in people:
            name = person.get("ce:indexed-name") if isinstance(person, dict) else None
            if isinstance(name, str) and name:
                names.append(name)

    # dict.fromkeys removes duplicates while keeping a reproducible order
    # (a plain set would reorder the names differently from run to run).
    return ";".join(dict.fromkeys(names))
                

def extract_result(json_dict):
    """Flatten one downloaded abstract into a dict of scalar fields.

    Args:
        json_dict: Decoded abstract file; the record is read from its
            ``abstracts-retrieval-response`` key.

    Returns:
        A dict with the keys ``scopus_id``, ``author_name``, ``title``,
        ``abstracts``, ``related_field``, ``auth_keywords``, ``publish_day``,
        ``publish_month``, ``publish_year``, ``ref_count``, ``ref_authors``,
        ``ref_titles`` and ``cited_count``. Multi-valued fields are joined with
        ``;``. Missing text is ``""``, a missing day or month is ``-1`` and
        missing counts are ``0``.

    Raises:
        KeyError: If the record lacks ``coredata`` / ``dc:identifier`` or the
            publication date (``item.bibrecord.head.source.publicationdate``
            with its ``year``).
    """
    res = json_dict['abstracts-retrieval-response']
    coredata = res["coredata"]
    bibrecord = res["item"]["bibrecord"]

    result = {}
    result["scopus_id"] = re.sub("SCOPUS_ID:","",coredata["dc:identifier"])
    result["author_name"] = get_author(res)
    # A missing or null title/abstract becomes "" so every text column holds strings.
    result["title"] = coredata.get("dc:title") or ""
    result["abstracts"] = coredata.get("dc:description") or ""
    # Placeholders keep the key order of the result stable; filled in below.
    result["related_field"] = []
    result["auth_keywords"] = []

    date = bibrecord["head"]["source"]["publicationdate"]
    result["publish_day"] = date.get("day", -1)
    result["publish_month"] = date.get("month", -1)
    result["publish_year"] = date["year"]

    # A tail without bibliography/@refcount counts as zero references instead of
    # raising, matching how the reference helpers treat such records.
    tail = bibrecord.get("tail")
    bibliography = tail.get("bibliography") if isinstance(tail, dict) else None
    refcount = bibliography.get("@refcount") if isinstance(bibliography, dict) else None
    try:
        result["ref_count"] = int(refcount or 0)
    except (TypeError, ValueError):
        result["ref_count"] = 0

    result["ref_authors"] = get_ref_author(res)
    result["ref_titles"] = get_ref_title(res)
    result["cited_count"] = int(coredata.get("citedby-count") or 0)

    areas = (res.get("subject-areas") or {}).get("subject-area") or []
    if isinstance(areas, dict):
        # Single subject area delivered as an object rather than a list.
        areas = [areas]
    for area in areas:
        try:
            result["related_field"].append(area["@abbrev"])
        except (KeyError, TypeError):
            print("@abbrev key not found")
    result["related_field"] = ";".join(result["related_field"])

    keywords = (res.get("authkeywords") or {}).get("author-keyword") or []
    if isinstance(keywords, dict):
        # Single keyword delivered as an object rather than a list.
        keywords = [keywords]
    for keyword in keywords:
        try:
            result["auth_keywords"].append(keyword["$"])
        except (KeyError, TypeError):
            print(res["authkeywords"]["author-keyword"])
            print("$ key not found")
    result["auth_keywords"] = ";".join(result["auth_keywords"])
    return result

In [74]:
# Paths of all entries directly inside the directory named by `dir`
# (presumably the abstract files saved by MyQuery.save_abstract - verify the directory).
# The bare name on the last line displays the full list.
data_point = glob.glob(f"{dir}/*")
data_point

['output\\85061637194',
 'output\\85061637476',
 'output\\85061637615',
 'output\\85061637915',
 'output\\85061638067',
 'output\\85061638076',
 'output\\85061638234',
 'output\\85061638252',
 'output\\85061638289',
 'output\\85061638325',
 'output\\85061638386',
 'output\\85061639059',
 'output\\85061639720',
 'output\\85061639879',
 'output\\85061639954',
 'output\\85061640160',
 'output\\85061640230',
 'output\\85061640377',
 'output\\85061640475',
 'output\\85061641061',
 'output\\85061641136',
 'output\\85061641140',
 'output\\85061641189',
 'output\\85061641583',
 'output\\85061641626',
 'output\\85061641891',
 'output\\85061642159',
 'output\\85061642682',
 'output\\85061642971',
 'output\\85061643857',
 'output\\85061644514',
 'output\\85061644644',
 'output\\85061644841',
 'output\\85061645075',
 'output\\85061645318',
 'output\\85061645341',
 'output\\85061645643',
 'output\\85061645658',
 'output\\85061645691',
 'output\\85061646095',
 'output\\85061646113',
 'output\\850616

In [75]:
# Parse every downloaded abstract and keep one tuple of fields per paper.
# The field order below is the order later paired with the column names.
my_zipper = []
for path in tqdm(data_point):
    with open(path,'r') as handle:
        row = extract_result(json.load(handle))
    my_zipper.append(tuple(
        row[field]
        for field in (
            "scopus_id",
            "author_name",
            "title",
            "abstracts",
            "related_field",
            "auth_keywords",
            "publish_day",
            "publish_month",
            "publish_year",
            "ref_count",
            "ref_authors",
            "ref_titles",
            "cited_count",
        )
    ))
                

  0%|          | 0/9600 [00:00<?, ?it/s]

In [76]:
# Column names, in the same order as the fields of each tuple in my_zipper.
col = [
    "scopus_id",
    "author_name",
    "title",
    "abstracts",
    "related_field",
    "auth_keywords",
    "publish_day",
    "publish_month",
    "publish_year",
    "ref_count",
    "ref_authors",
    "ref_titles",
    "cited_count",
]
# Transpose the per-paper tuples into one tuple per column.
arr = list(zip(*my_zipper))
# Map each column name to the list of its values.
datapoint = dict(zip(col, map(list, arr)))

In [77]:
# Build the table of parsed papers, keyed by the Scopus id as a string.
scrap_df = (
    pd.DataFrame(datapoint)
    .astype({"scopus_id": str})
    .set_index("scopus_id")
)
scrap_df

Unnamed: 0_level_0,author_name,title,abstracts,related_field,auth_keywords,publish_day,publish_month,publish_year,ref_count,ref_authors,ref_titles,cited_count
scopus_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
85061637194,Saito M.,Periodically Variable Antenna Pattern for Maxi...,We propose a periodically variable antenna pat...,COMP;COMP;ENGI;MATE;PHYS,ESPAR antenna;MIMO;Path diversity;Periodically...,31,12,2018,15,Kakinuma Y.;Idoguchi Y.;Bains R.;Ishizu M.;Gyo...,Electronically steerable passive array radiato...,2
85061637476,Yadava N.;Chauhan R.,Impact of Asymmetricity in Gate Oxide on the R...,This paper presents the impact of asymmetricit...,COMP;COMP;ENGI;MATE;PHYS,FDSOI MOSFET;High-k;intrinsic capacitances and...,31,12,2018,13,Pradhan K.-P.;Kilchytska V.;Chen C.-L.;Lee M.-...,Multiple-gate soi-mosfets: Device design guide...,0
85061637615,Nazari Z.;Kang D.,A New Hierarchical Clustering Algorithm with I...,This paper builds upon our previous paper that...,COMP;COMP;ENGI;MATE;PHYS,cluster analysis;clustering algorithm;Data min...,31,12,2018,17,Sung Y.;Romesburg H.C.;Everitt B.S.;Richard M....,New hierarchical clustering algorithm;How many...,3
85061637915,Thatere A.;Zade P.;Dadarao Sontakke P.,Rectangular Shaped Defected Ground structure M...,In the Digital world for communication purpose...,COMP;COMP;ENGI;MATE;PHYS,Defected Ground Structure (DGS);Microstrip Pat...,31,12,2018,5,Khandelwal M.K.;Aggarwal M.;Kanaujia B.K.;Nand...,"Defected ground structure: Fundamentals, analy...",3
85061638067,Noor N.;Farooq O.;Shahnawaz S.,Detection of Event Related Potential for a Cog...,Event related potential (ERPs) are the small v...,COMP;COMP;ENGI;MATE;PHYS,age group;auditory stimulus;EEG;Event Related ...,31,12,2018,12,Starr A.;Polich J.;Kolarova M.;Oppitz S.J.;Ste...,Time is of the essence: A review of electroenc...,0
...,...,...,...,...,...,...,...,...,...,...,...,...
85191180582,Uršič E.D.;Čede P.;Steinicke E.;Jelen I.,The mountainous areas of Friuli-Venezia Giulia...,This article aims to demonstrate that migratio...,SOCI;EART,amenity migration;Friuli-Venezia Giulia;ghost ...,31,12,2023,50,Fassio G.;Jacob J. C.;Glorioso R. S.;Kristense...,Neue Pioniere in ostalpinen Peripherräumen: di...,0
85191192368,Smrekar A.;Gašperič P.;Tičar J.;Horvat K.P.,Active involvement of stakeholders in the mana...,This article focuses on introducing the concep...,SOCI;EART,agreement;geography;memorandum;participatory p...,31,12,2023,51,Chazee L.;Bravard J. P.;Jones C. E.;Charpentie...,"China’s natural wetlands: Past problems, curre...",0
85191196153,Sergeyeva A.;Omirzakova M.;Saparov K.,DEVELOPMENT OF GEOTOURISM AND RURAL TOURISM FO...,This article deals with the sustainable develo...,SOCI;EART,Aktobe Oblast;Delphi method;geotourism;Kazakhs...,31,12,2023,27,Adamov T.;Saputro K. E. A.;Gajic T.;Galka E.;M...,Pathways toward the transformation of sustaina...,0
85191230110,Lihus M.;Branco P.C.,PERSPECTIVES ON UKRAINIAN CINEMA: CONSTRUCTING...,,SOCI;ARTS,,31,12,2023,0,,,0


In [78]:
# Persist the parsed table (scopus_id is written as the index column) to
# scrap_df.csv in the current working directory.
scrap_df.to_csv("scrap_df.csv")

In [79]:
# Connect to the Redis server that receives the parsed records.
# Connection settings come from the environment so that credentials are not
# stored in the notebook; the password is asked for interactively when the
# REDIS_PASSWORD variable is not set.
REDIS_HOST = os.environ.get("REDIS_HOST", "107.22.67.15")
REDIS_PORT = int(os.environ.get("REDIS_PORT", "6379"))
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD") or getpass.getpass("Redis password: ")

# `encoding` replaces the `charset` argument, which redis-py deprecated;
# decode_responses=True makes the client return str instead of bytes.
rd = redis.Redis(
    host=REDIS_HOST,
    port=REDIS_PORT,
    password=REDIS_PASSWORD,
    encoding="utf-8",
    decode_responses=True,
)
# ping() returns True when the server answers; it raises if the connection fails.
rd.ping()

True

In [80]:
# Scopus ids already stored in Redis: every key matching "scopus:*" with the
# "scopus:" marker removed. The bare name on the last line displays the set.
cache = {re.sub("scopus:","",key) for key in rd.keys("scopus:*")}
cache

{'85040604215',
 '85118982145',
 '85126682482',
 '85170818511',
 '85115276784',
 '85179222118',
 '85070224476',
 '85061124164',
 '85067374171',
 '85084518477',
 '85040971180',
 '85065535628',
 '85062877976',
 '85054210184',
 '85059555270',
 '85083187947',
 '85072768207',
 '85164504623',
 '85050016962',
 '85128245790',
 '85149648558',
 '85141239401',
 '85046672343',
 '85092648010',
 '85128818309',
 '85128187894',
 '85044867718',
 '85150042634',
 '85121413226',
 '85179268406',
 '85107066971',
 '85131712878',
 '85059645593',
 '85044190749',
 '85141891236',
 '85140054478',
 '85084859626',
 '85071168537',
 '85079496239',
 '85082864249',
 '85085937954',
 '85081690530',
 '85077654325',
 '85078303338',
 '85118285890',
 '85042540245',
 '85082441674',
 '85154580930',
 '85107805071',
 '85143846301',
 '85118459520',
 '85083058614',
 '85074455045',
 '85145652754',
 '85116873282',
 '85102739473',
 '85146131322',
 '85124447517',
 '85064114011',
 '85101973990',
 '85121697778',
 '85117513220',
 '850849

In [81]:
# Column names of the parsed table; multiThreadSender below reads this global
# and uploads every column after the first (the id) as a Redis hash field.
# NOTE(review): identical to the `col` list assigned in an earlier cell.
col = ["scopus_id","author_name","title","abstracts","related_field","auth_keywords","publish_day","publish_month","publish_year","ref_count","ref_authors","ref_titles","cited_count"]

In [82]:
def multiThreadSender(thread_size = 8):
    """Upload every row of ``scrap_df`` that is not yet in Redis, using a thread pool.

    Each missing paper is stored as a hash under ``scopus:<scopus_id>`` whose
    fields are the columns ``col[1:]``; ``ref_count`` and ``cited_count`` are
    converted to ``int`` first. Reads the notebook globals ``rd``,
    ``scrap_df`` and ``col``.

    Args:
        thread_size: Maximum number of worker threads.

    Raises:
        RuntimeError: After all uploads were attempted, if at least one of them
            failed. The first underlying exception is attached as the cause.
            Previously such failures were dropped silently.
    """
    cache = set([re.sub("scopus:","",key) for key in rd.keys("scopus:*")])
    scrape_ids = list(set(scrap_df.index) - cache)

    def send(key):
        """Write the row identified by ``key`` to Redis."""
        row = scrap_df.loc[key]
        my_map = {c : int(row[c]) if (c == "ref_count" or c == "cited_count") else row[c] for c in col[1:]}
        rd.hset(f"scopus:{key}",mapping = my_map)

    failures = []
    with tqdm(total = len(scrape_ids)) as pbar:
        with ThreadPoolExecutor(max_workers=thread_size) as executor:
            pending = {executor.submit(send,key) : key for key in scrape_ids}
            for done in as_completed(pending):
                # An exception raised in a worker is stored on its future; it
                # has to be asked for explicitly or it is lost.
                error = done.exception()
                if error is not None:
                    failures.append((pending[done], error))
                pbar.update(1)

    if failures:
        first_key, first_error = failures[0]
        raise RuntimeError(
            f"{len(failures)} of {len(scrape_ids)} records could not be written to Redis "
            f"(first failure: scopus:{first_key})"
        ) from first_error
    return

In [83]:
# Upload the rows missing from Redis using up to 512 worker threads.
multiThreadSender(512)

  0%|          | 0/9600 [00:00<?, ?it/s]

In [84]:
# Count the keys under the "scopus:" prefix after the upload.
len(rd.keys("scopus:*"))

19200