In [10]:
import os
import glob
import json
from datetime import datetime
import pandas as pd
from tqdm.notebook import tqdm
import redis
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Inspect which platform-specific module `os.path` resolves to on this machine.
os.path

<module 'ntpath' (frozen)>

In [3]:
# List the entries under the 2018 data directory to check the on-disk layout.
glob.glob("Data 2018-2023/Project/2018/*")

['Data 2018-2023/Project/2018\\201800000',
 'Data 2018-2023/Project/2018\\201800001',
 'Data 2018-2023/Project/2018\\201800002',
 'Data 2018-2023/Project/2018\\201800003',
 'Data 2018-2023/Project/2018\\201800004',
 'Data 2018-2023/Project/2018\\201800005',
 'Data 2018-2023/Project/2018\\201800006',
 'Data 2018-2023/Project/2018\\201800007',
 'Data 2018-2023/Project/2018\\201800008',
 'Data 2018-2023/Project/2018\\201800009',
 'Data 2018-2023/Project/2018\\201800010',
 'Data 2018-2023/Project/2018\\201800011',
 'Data 2018-2023/Project/2018\\201800012',
 'Data 2018-2023/Project/2018\\201800013',
 'Data 2018-2023/Project/2018\\201800014',
 'Data 2018-2023/Project/2018\\201800015',
 'Data 2018-2023/Project/2018\\201800016',
 'Data 2018-2023/Project/2018\\201800017',
 'Data 2018-2023/Project/2018\\201800018',
 'Data 2018-2023/Project/2018\\201800019',
 'Data 2018-2023/Project/2018\\201800020',
 'Data 2018-2023/Project/2018\\201800021',
 'Data 2018-2023/Project/2018\\201800022',
 'Data 2018

In [5]:
def get_author(res):
    """Return the indexed names of a record's authors as one ``;``-separated string.

    ``res`` is the ``abstracts-retrieval-response`` mapping of a record. Each
    entry of ``res["authors"]["author"]`` contributes the ``ce:indexed-name``
    found under its ``preferred-name``. When ``res["authors"]`` is ``None``
    the empty string is returned.
    """
    author_block = res["authors"]
    if author_block is None:
        return ""
    return ";".join(
        entry["preferred-name"]["ce:indexed-name"]
        for entry in author_block["author"]
    )

def get_ref_title(res):
    """Collect the titles of a record's cited references as one string.

    Parameters
    ----------
    res : dict
        The ``abstracts-retrieval-response`` mapping of one record.

    Returns
    -------
    str
        The ``ref-titletext`` of every reference that has one, in document
        order, joined with ``;``. Returns ``""`` when the record has no
        bibliography (``tail`` is ``None``) or no reference carries a title.

    Notes
    -----
    Extraction is best effort: a reference without a title, or one whose
    structure is not the expected nesting of mappings, is skipped.
    """
    tail = res["item"]["bibrecord"]["tail"]
    if tail is None:
        return ""

    references = tail["bibliography"]["reference"]
    if references is None:
        return ""
    # This file already anticipates list-or-single-mapping values (see the
    # `collaboration` handling in get_ref_author). A lone reference mapping
    # must be wrapped, otherwise the loop would walk its keys and the title
    # would be dropped without any warning.
    if isinstance(references, dict):
        references = [references]

    titles = []
    for ref in references:
        try:
            text = ref["ref-info"]["ref-title"]["ref-titletext"]
        except (KeyError, TypeError):
            # No title on this reference, or an unexpected shape: skip it.
            continue
        # Only strings can be joined; anything else would abort the whole record.
        if isinstance(text, str):
            titles.append(text)
    return ";".join(titles)

def get_ref_author(res):
    """Collect the distinct author names of a record's cited references.

    Parameters
    ----------
    res : dict
        The ``abstracts-retrieval-response`` mapping of one record.

    Returns
    -------
    str
        Unique, non-empty ``ce:indexed-name`` values joined with ``;`` in
        first-seen order. For each reference the names come from
        ``ref-authors.author`` when present, otherwise from
        ``ref-authors.collaboration``. Returns ``""`` when the record has no
        bibliography (``tail`` is ``None``).

    Notes
    -----
    Extraction is best effort: references without author information, or
    with an unexpected structure, are skipped.
    """
    def _as_list(value):
        """Wrap a lone item in a list and map ``None`` to an empty list."""
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    tail = res["item"]["bibrecord"]["tail"]
    if tail is None:
        return ""

    names = []
    # `reference`, `author` and `collaboration` are all normalised to lists so
    # that a single mapping is processed instead of being iterated key by key.
    for ref in _as_list(tail["bibliography"]["reference"]):
        try:
            ref_authors = ref["ref-info"].get("ref-authors")
            if not ref_authors:
                continue
            source = "author" if "author" in ref_authors else "collaboration"
            for entry in _as_list(ref_authors[source]):
                names.append(entry["ce:indexed-name"])
        except (KeyError, TypeError, AttributeError):
            # Malformed reference: keep what was gathered so far and move on.
            continue

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is identical from run to run (a set's iteration order is not).
    unique_names = dict.fromkeys(n for n in names if isinstance(n, str) and n)
    return ";".join(unique_names)
                

def extract_result(json_dict):
    """Flatten one parsed abstract-retrieval JSON document into a flat record.

    Parameters
    ----------
    json_dict : dict
        Parsed JSON whose ``"abstracts-retrieval-response"`` entry holds the
        record.

    Returns
    -------
    dict
        Keys, in insertion order: ``scopus_id``, ``author_name``, ``title``,
        ``abstracts``, ``related_field``, ``auth_keywords``, ``publish_day``,
        ``publish_month``, ``publish_year``, ``ref_count``, ``ref_authors``,
        ``ref_titles``, ``cited_count``.

        * ``title`` and ``abstracts`` fall back to ``""`` when absent.
        * ``publish_day`` / ``publish_month`` are ``-1`` when the publication
          date does not carry them; otherwise the raw value is passed through.
        * ``ref_count`` and ``cited_count`` are ``int`` and default to ``0``.
        * ``related_field`` and ``auth_keywords`` are ``;``-joined strings.

    Raises
    ------
    KeyError
        If a mandatory part of the document (``coredata``, ``dc:identifier``,
        the publication date or its ``year``) is missing.
    """
    def _as_list(value):
        """Wrap a lone item in a list and map ``None`` to an empty list."""
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    res = json_dict['abstracts-retrieval-response']
    coredata = res["coredata"]
    tail = res["item"]["bibrecord"]["tail"]
    date = res["item"]["bibrecord"]["head"]["source"]["publicationdate"]

    result = {}
    result["scopus_id"] = re.sub("SCOPUS_ID:", "", coredata["dc:identifier"])
    result["author_name"] = get_author(res)
    result["title"] = coredata.get("dc:title", "")
    abstract = coredata.get("dc:description")
    result["abstracts"] = "" if abstract is None else abstract

    # Placeholders reserve the position of these keys; they are filled below.
    result["related_field"] = ""
    result["auth_keywords"] = ""

    # -1 marks a publication date that has no day / month component.
    result["publish_day"] = date.get("day", -1)
    result["publish_month"] = date.get("month", -1)
    result["publish_year"] = date["year"]

    if tail is not None:
        result["ref_count"] = int(tail["bibliography"]["@refcount"])
    else:
        result["ref_count"] = 0

    result["ref_authors"] = get_ref_author(res)
    result["ref_titles"] = get_ref_title(res)

    cited = coredata.get("citedby-count")
    result["cited_count"] = int(cited) if cited is not None else 0

    # `subject-area` gets the same list-or-single-mapping treatment as
    # `author-keyword`, so a lone entry is recorded instead of being iterated
    # key by key.
    fields = []
    subject_areas = res.get("subject-areas")
    if subject_areas is not None:
        for area in _as_list(subject_areas["subject-area"]):
            try:
                fields.append(area["@abbrev"])
            except (KeyError, TypeError):
                print("@abbrev key not found")
    result["related_field"] = ";".join(fields)

    keywords = []
    authkeywords = res.get("authkeywords")
    if authkeywords is not None:
        raw_keywords = authkeywords["author-keyword"]
        for keyword in _as_list(raw_keywords):
            try:
                keywords.append(keyword["$"])
            except (KeyError, TypeError):
                print(raw_keywords)
                print("$ key not found")
    result["auth_keywords"] = ";".join(keywords)
    return result

In [6]:
# Parse every per-year JSON export (2018-2023) and keep one tuple per record.
# Field order of each tuple appended to `result_zipper`.
result_fields = (
    "scopus_id", "author_name", "title", "abstracts", "related_field",
    "auth_keywords", "publish_day", "publish_month", "publish_year",
    "ref_count", "ref_authors", "ref_titles", "cited_count",
)

result_zipper = []
for year in range(2018, 2024):
    # glob makes no ordering promise; sorting keeps the row order reproducible.
    data_link = sorted(glob.glob(f"Data 2018-2023/Project/{year}/*"))
    for path in tqdm(data_link):
        with open(path, "r", encoding='utf-8') as file:
            jfile = json.load(file)
        # The file is already closed here; extraction works on the parsed data.
        result = extract_result(jfile)
        result_zipper.append(tuple(result[field] for field in result_fields))
        

  0%|          | 0/2792 [00:00<?, ?it/s]

  0%|          | 0/3082 [00:00<?, ?it/s]

  0%|          | 0/3393 [00:00<?, ?it/s]

  0%|          | 0/3815 [00:00<?, ?it/s]

  0%|          | 0/4244 [00:00<?, ?it/s]

  0%|          | 0/2890 [00:00<?, ?it/s]

In [12]:
# Column names, listed in the same order as the fields of each record tuple.
col = [
    "scopus_id",
    "author_name",
    "title",
    "abstracts",
    "related_field",
    "auth_keywords",
    "publish_day",
    "publish_month",
    "publish_year",
    "ref_count",
    "ref_authors",
    "ref_titles",
    "cited_count",
]
# Transpose the per-record tuples into one tuple of values per column.
arr = [*zip(*result_zipper)]
# Map every column name to its values, stored as a list.
datapoint = dict(zip(col, map(list, arr)))

In [18]:
# Build the table indexed by Scopus id, then write it to data.csv in the
# current working directory.
my_df = pd.DataFrame(datapoint).set_index("scopus_id")
my_df.to_csv("data.csv")

In [19]:
# Display the assembled frame; the rendered output elides the middle rows.
my_df

Unnamed: 0_level_0,author_name,title,abstracts,related_field,auth_keywords,publish_day,publish_month,publish_year,ref_count,ref_authors,ref_titles,cited_count
scopus_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
85077976956,Pongpirul K.;Lungren M.P.,Public health and international epidemiology f...,,MEDI,,31,12,2018,76,Quah S.R.;Morens D.M.;Young E.;Blumberg H.M.;C...,The untilled fields of public health;Committee...,1
85060936020,Pratumsiri T.;Janpugdee P.,Flexible Printed Active Antenna for Digital Te...,This paper presents the development of a flexi...,ENGI;MATE,,31,12,2018,4,Janpugdee P.;Pratumsiri T.,Development of built-in low-profile antenna fo...,1
85052201238,Phuakpunk K.;Chalermsinsuwan B.;Putivisutisak ...,Parametric study of hydrogen production via so...,Computational fluid dynamics was applied for s...,CHEM;CENG;ENGI,Circulating fluidized bed;Computational fluid ...,31,12,2018,42,Alvarez D.;Johnsen K.;Blom R.;Cortes C.;Alonso...,Capture of CO2from combustion gases in a fluid...,21
85051498032,Saengkaew J.;Le D.;Samart C.;Sawada H.;Nishida...,Superhydrophobic coating from fluoroalkylsilan...,A superhydrophobic/superoleophilic mesh was su...,CHEM;PHYS;PHYS;PHYS;MATE,Encapsulation;Fluoroalkylsilane;Natural rubber...,31,12,2018,45,Gauri S.S.;Liu S.;Mates J.E.;Lin T.;Li L.;Chan...,Ceramic membrane performance in microfiltratio...,37
85050678366,Teengam P.;Siangproh W.;Tuantranont A.;Vilaiva...,Electrochemical impedance-based DNA sensor usi...,A label-free electrochemical DNA sensor based ...,CHEM;BIOC;ENVI;CHEM,acpcPNA;Electrochemical impedance spectroscopy...,31,12,2018,55,Niu L.;Liu S.;Nutiu R.;Lin C.B.;Rattanarat P.;...,The diagnosis and misdiagnosis of tuberculosis...,68
...,...,...,...,...,...,...,...,...,...,...,...,...
85111945558,Le D.;Chaidherasuwet N.;Rueangthaweep A.;Kulsi...,Long-chain bio-olefins production via oxidativ...,Long-chain α-olefins (≥ C10) are normally appl...,CENG;CHEM,Long-chain olefins;Mesoporous KIT-6;Oleic acid...,01,01,2023,63,Bineesh K.V.;Liu Z.;Kubickova I.;Zhang B.;Li L...,The chemistry and kinetics of polyethylene pyr...,3
85111408415,Alahmad W.;Varanusupakul P.;Varanusupakul P.,Recent Developments and Applications of Microf...,"Nowadays, food safety has become a major conce...",CHEM,Biological hazards;chemical hazards;food conta...,-1,-1,2023,115,Kaur N.;Meka A.;Fabiano-Tixier A.S.;Kou X.;Pai...,"Food Safety, Food Fraud, and Food Defense: A F...",11
85110903700,Pherali T.,"Social justice, education and peacebuilding: c...",Education is increasingly becoming central to ...,SOCI,conflict;Education;peacebuilding;social justic...,-1,-1,2023,76,Dulyakasem U.;Dryden-Peterson S.;Shah R.A.;Rob...,The Rehabilitation of Jemaah Islamiyah Detaine...,5
85106740832,Mapanao R.;Jiwyam W.;Nithikulworawong N.;Weepl...,Effects of black soldier fly (Hermetia illucen...,The effects of replacing fish meal protein wit...,ENVI;AGRI,Anabas testudineus;Black soldier fly;fish meal...,-1,-1,2023,44,Roth I.;Zhang B.;Waagbo R.;Gasco L.;Lussiana C...,Effect of dietary carbohydrate to lipid ratios...,6


In [20]:
# Connection settings are read from the environment so that the password is
# never stored in the notebook. REDIS_PASSWORD is required; REDIS_HOST and
# REDIS_PORT fall back to the server this notebook was written against.
# NOTE(review): the password that used to be hardcoded here should be rotated.
rd = redis.Redis(
    host=os.environ.get("REDIS_HOST", "107.22.67.15"),
    port=int(os.environ.get("REDIS_PORT", "6379")),
    password=os.environ["REDIS_PASSWORD"],
    encoding="utf-8",  # replaces the deprecated `charset` keyword
    decode_responses=True,
)
rd.ping()

True

In [21]:
# Snapshot of every key currently stored on the server. SCAN walks the keyspace
# in batches instead of blocking the server with one KEYS call; collecting into
# a set also absorbs any key SCAN reports more than once.
cache = set(rd.scan_iter(count=1000))
len(cache)

58522

In [23]:
def send(idx, mapping):
    """Store one record as the Redis hash ``static:scopus:<idx>``.

    Parameters
    ----------
    idx
        Identifier appended to the ``static:scopus:`` key prefix.
    mapping : dict
        Field/value pairs written to the hash.
    """
    rd.hset(f"static:scopus:{idx}", mapping=mapping)


# Upload every row concurrently. Each future is inspected so that a failed
# write is reported instead of disappearing silently.
failed = {}
with tqdm(total=len(my_df)) as pbar:
    with ThreadPoolExecutor(max_workers=256) as executor:
        future_to_idx = {
            executor.submit(send, idx, row.to_dict()): idx
            for idx, row in my_df.iterrows()
        }
        for f in as_completed(future_to_idx):
            error = f.exception()
            if error is not None:
                failed[future_to_idx[f]] = error
            pbar.update(1)

if failed:
    first_error = next(iter(failed.values()))
    raise RuntimeError(
        f"{len(failed)} of {len(my_df)} records were not written to Redis; "
        f"first error: {first_error!r}"
    )

  0%|          | 0/20216 [00:00<?, ?it/s]

In [24]:
# Spot-check one record by reading its hash back from Redis.
rd.hgetall("static:scopus:85061529820")

{'ref_titles': 'Predicting judicial decisions of the european court of human rights: A natural language processing perspective;A general approach for predicting the behavior of the supreme court of the nited states;A Two-stage classifier that identifies charge and punishment under criminal law of civil law system;An elements-based multi-stage charges identification model for textual criminal cases;Thai criminal code;An information extraction framework for legal documents: A case study of Thai Supreme Court verdicts;Element of crime structure;Deep learning;Hierarchical question-image co-Attention for visual question answering;Neural machine translation by jointly learning to align and translate;Learning phrase representations using RNN encoderdecoder for statistical machine translation;Long Short-Term Memory;Empirical evaluation of gated recurrent neural networks on sequence modeling;Dynamic coattention networks for question answering;Thai lexeme tokenizer: Lexto;Thai royal society;A St