In [17]:
import os
import glob
import json
from datetime import datetime
import pandas as pd
from tqdm.notebook import tqdm
import redis
import re

In [2]:
!pip install polars



In [3]:
import polars as pl

In [4]:
os.path

<module 'ntpath' (frozen)>

In [5]:
glob.glob("Data 2018-2023/Project/2018/*")

['Data 2018-2023/Project/2018\\201800000',
 'Data 2018-2023/Project/2018\\201800001',
 'Data 2018-2023/Project/2018\\201800002',
 'Data 2018-2023/Project/2018\\201800003',
 'Data 2018-2023/Project/2018\\201800004',
 'Data 2018-2023/Project/2018\\201800005',
 'Data 2018-2023/Project/2018\\201800006',
 'Data 2018-2023/Project/2018\\201800007',
 'Data 2018-2023/Project/2018\\201800008',
 'Data 2018-2023/Project/2018\\201800009',
 'Data 2018-2023/Project/2018\\201800010',
 'Data 2018-2023/Project/2018\\201800011',
 'Data 2018-2023/Project/2018\\201800012',
 'Data 2018-2023/Project/2018\\201800013',
 'Data 2018-2023/Project/2018\\201800014',
 'Data 2018-2023/Project/2018\\201800015',
 'Data 2018-2023/Project/2018\\201800016',
 'Data 2018-2023/Project/2018\\201800017',
 'Data 2018-2023/Project/2018\\201800018',
 'Data 2018-2023/Project/2018\\201800019',
 'Data 2018-2023/Project/2018\\201800020',
 'Data 2018-2023/Project/2018\\201800021',
 'Data 2018-2023/Project/2018\\201800022',
 'Data 2018

In [6]:
df = pl.DataFrame()
df

In [59]:
list(set(filter(None,["","a"])))

['a']

In [159]:
def get_author(res):
    names = []
    for auth in res["authors"]["author"]:
        auth = auth["preferred-name"]
        names.append(auth["ce:indexed-name"])
    return ";".join(names)

def get_ref_title(res):
    title = []
    if res["item"]["bibrecord"]["tail"] != None:
         for ref in res["item"]["bibrecord"]["tail"]["bibliography"]["reference"]:
                try:
                    if "ref-title" not in ref["ref-info"]:
                        continue
                    title.append(ref["ref-info"]["ref-title"]["ref-titletext"])
                except:
                    pass
    return ";".join(title)

def get_ref_author(res):
    names = []
    if res["item"]["bibrecord"]["tail"] != None:
        for ref in res["item"]["bibrecord"]["tail"]["bibliography"]["reference"]:
            try:
                if "ref-authors" not in ref["ref-info"]:
                    continue
                if "author" not in ref["ref-info"]["ref-authors"]:
                    selection = ref["ref-info"]["ref-authors"]["collaboration"]
                    if type(selection) == list:
                        for sel in selection:
                            names.append(sel["ce:indexed-name"])
                    else:
                        names.append(selection["ce:indexed-name"])
                else:
                    for auth in ref["ref-info"]["ref-authors"]["author"]:
                        names.append(auth["ce:indexed-name"])
            except:
                pass
            
    return ";".join(list(set(filter(None,names))))
                

def extract_result(json_dict):
    result = {}
    res = json_dict['abstracts-retrieval-response']
    result["scopus_id"]= re.sub("SCOPUS_ID:","",res["coredata"]["dc:identifier"])
    result["author_name"] = get_author(res)
    if "dc:title" not in res["coredata"]:
        result["title"] = ""
    else:
        result["title"] = res["coredata"]["dc:title"]
    result["abstracts"] = res["coredata"]["dc:description"] if "dc:description" in res["coredata"] else None
    result["abstracts"] = "" if result["abstracts"]  == None else result["abstracts"]
    result["related_field"] = []
    result["auth_keywords"] = []
    date = res["item"]["bibrecord"]["head"]["source"]["publicationdate"]
    if "day" in date:
        day = date["day"]
    else:
        day = -1
    if "month" in date:
        month = date["month"]
    else:
        month = -1
    year = date["year"]
    result["publish_day"] =day
    result["publish_month"] =month
    result["publish_year"] =year
    if res["item"]["bibrecord"]["tail"] != None:
        result["ref_count"] = int(res["item"]["bibrecord"]["tail"]["bibliography"]["@refcount"])
    else:
        #print(res)
        result["ref_count"] = 0
        
    result["ref_authors"] = get_ref_author(res)
    result["ref_titles"] = get_ref_title(res)
    if "citedby-count" in res["coredata"]:
        if res["coredata"]["citedby-count"] != None:
            result["cited_count"] = int(res["coredata"]["citedby-count"])
        else:
            result["cited_count"] = 0
    else:
        result["cited_count"] = 0
    if (res["subject-areas"] != None):
        for area in res["subject-areas"]["subject-area"]:
            try:
                result["related_field"].append(area["@abbrev"])
            except Exception as e:
                print("@abbrev key not found")
    result["related_field"] = ";".join(result["related_field"])
    if (res["authkeywords"] != None):
        if type(res["authkeywords"]["author-keyword"]) == list:
            for keyword in res["authkeywords"]["author-keyword"]:
                try:
                    result["auth_keywords"].append(keyword["$"])
                except Exception as e:
                    print(res["authkeywords"]["author-keyword"])
                    print("$ key not found")
        else:
            result["auth_keywords"].append(res["authkeywords"]["author-keyword"]["$"])
    result["auth_keywords"] = ";".join(result["auth_keywords"])
    return result

In [160]:
result_zipper = []
for year in range(2018,2024):
    data_link = glob.glob(f"Data 2018-2023/Project/{year}/*")
    for i,path in enumerate(tqdm(data_link)):
        
        with open(path,"r",encoding='utf-8') as file:
            jfile = json.load(file)
            #print(jfile["abstracts-retrieval-response"]["item"]["bibrecord"]["head"]["source"]["publicationdate"])
            #print(datetime.strptime(jfile["abstracts-retrieval-response"]["item"]["bibrecord"]["head"]["source"]["publicationdate"]["date-text"]["$"],"%d %B %Y"))
            result = extract_result(jfile)
            result_zipper.append((result["scopus_id"],result['author_name'],result['title'],result['abstracts'],result['related_field'],result['auth_keywords'],result['publish_day'],result['publish_month'],result['publish_year'],result['ref_count'],result["ref_authors"],result["ref_titles"],result['cited_count']))
                
        #if (i == 100): 
            #print("---------------------")
            #break
        

  0%|          | 0/2792 [00:00<?, ?it/s]

  0%|          | 0/3082 [00:00<?, ?it/s]

  0%|          | 0/3393 [00:00<?, ?it/s]

  0%|          | 0/3815 [00:00<?, ?it/s]

  0%|          | 0/4244 [00:00<?, ?it/s]

  0%|          | 0/2890 [00:00<?, ?it/s]

In [161]:
col = ["scopus_id","author_name","title","abstracts","related_field","auth_keywords","publish_day","publish_month","publish_year","ref_count","ref_authors","ref_titles","cited_count"]
arr = list(zip(*result_zipper))
datapoint = {key : list(value) for key, value in zip(col,zip(*result_zipper))}

In [162]:
my_df = pd.DataFrame(datapoint)
my_df.to_csv("data.csv")

In [163]:
my_df

Unnamed: 0,scopus_id,author_name,title,abstracts,related_field,auth_keywords,publish_day,publish_month,publish_year,ref_count,ref_authors,ref_titles,cited_count
0,85077976956,Pongpirul K.;Lungren M.P.,Public health and international epidemiology f...,,MEDI,,31,12,2018,76,Dreyer K.;De Wijkerslooth T.R.;Olszewski J.;Al...,The untilled fields of public health;Committee...,1
1,85060936020,Pratumsiri T.;Janpugdee P.,Flexible Printed Active Antenna for Digital Te...,This paper presents the development of a flexi...,ENGI;MATE,,31,12,2018,4,Pratumsiri T.;Janpugdee P.,Development of built-in low-profile antenna fo...,1
2,85052201238,Phuakpunk K.;Chalermsinsuwan B.;Putivisutisak ...,Parametric study of hydrogen production via so...,Computational fluid dynamics was applied for s...,CHEM;CENG;ENGI,Circulating fluidized bed;Computational fluid ...,31,12,2018,42,Cunha A.F.;Samruamphianskun T.;Alonso M.;Harri...,Capture of CO2from combustion gases in a fluid...,21
3,85051498032,Saengkaew J.;Le D.;Samart C.;Sawada H.;Nishida...,Superhydrophobic coating from fluoroalkylsilan...,A superhydrophobic/superoleophilic mesh was su...,CHEM;PHYS;PHYS;PHYS;MATE,Encapsulation;Fluoroalkylsilane;Natural rubber...,31,12,2018,45,Saito T.;Schondelmaier D.;Mabry J.M.;Lin T.;Wa...,Ceramic membrane performance in microfiltratio...,37
4,85050678366,Teengam P.;Siangproh W.;Tuantranont A.;Vilaiva...,Electrochemical impedance-based DNA sensor usi...,A label-free electrochemical DNA sensor based ...,CHEM;BIOC;ENVI;CHEM,acpcPNA;Electrochemical impedance spectroscopy...,31,12,2018,55,Nuanyai T.;Boey F.;Lawrence M.F.;Svenson S.B.;...,The diagnosis and misdiagnosis of tuberculosis...,68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20211,85111945558,Le D.;Chaidherasuwet N.;Rueangthaweep A.;Kulsi...,Long-chain bio-olefins production via oxidativ...,Long-chain α-olefins (≥ C10) are normally appl...,CENG;CHEM,Long-chain olefins;Mesoporous KIT-6;Oleic acid...,01,01,2023,63,Ahmed R.;Trokourey A.;He H.-Y.;Gerard H.;Kowal...,The chemistry and kinetics of polyethylene pyr...,3
20212,85111408415,Alahmad W.;Varanusupakul P.;Varanusupakul P.,Recent Developments and Applications of Microf...,"Nowadays, food safety has become a major conce...",CHEM,Biological hazards;chemical hazards;food conta...,-1,-1,2023,115,Goncalves M.P.;Shrikrishna N.S.;Wang X.;De Mae...,"Food Safety, Food Fraud, and Food Defense: A F...",11
20213,85110903700,Pherali T.,"Social justice, education and peacebuilding: c...",Education is increasingly becoming central to ...,SOCI,conflict;Education;peacebuilding;social justic...,-1,-1,2023,76,Dulyakasem U.;Buckner E.;Abuza Z.;Chopra V.;IC...,The Rehabilitation of Jemaah Islamiyah Detaine...,5
20214,85106740832,Mapanao R.;Jiwyam W.;Nithikulworawong N.;Weepl...,Effects of black soldier fly (Hermetia illucen...,The effects of replacing fish meal protein wit...,ENVI;AGRI,Anabas testudineus;Black soldier fly;fish meal...,-1,-1,2023,44,Jin P.;Stamer A.;Arsiwalla T.;Uddin K.B.;Ray A...,Effect of dietary carbohydrate to lipid ratios...,6


In [183]:
rd = redis.Redis(host = "redis-lb-static-6417c34dfdf10c08.elb.us-east-1.amazonaws.com",port = 6379,password = "cryogen",charset="utf-8", decode_responses=True)
rd.ping()

True

In [184]:
cache = set(rd.keys())
cache

{'scopus:85077050811',
 'scopus:85051775907',
 'scopus:85031933188',
 'scopus:85008354493',
 'scopus:85033219868',
 'scopus:85038120430',
 'scopus:85041930503',
 'scopus:85050234834',
 'scopus:85055317027',
 'scopus:85060091387',
 'scopus:85107179799',
 'scopus:85074874488',
 'scopus:85076090578',
 'scopus:85063645926',
 'scopus:85040527632',
 'scopus:85068725245',
 'scopus:85081970480',
 'scopus:85072155713',
 'scopus:85050515266',
 'scopus:85054192749',
 'scopus:85044660689',
 'scopus:85057736514',
 'scopus:85067663945',
 'scopus:85048749584',
 'scopus:85045043144',
 'scopus:85074718229',
 'scopus:85049399922',
 'scopus:85041953513',
 'scopus:85063429603',
 'scopus:85108462702',
 'scopus:85083381937',
 'scopus:85045634294',
 'scopus:85081665844',
 'scopus:85021412297',
 'scopus:85058637932',
 'scopus:85046998431',
 'scopus:85039722861',
 'scopus:85066155098',
 'scopus:85061306112',
 'scopus:85044354013',
 'scopus:85066880482',
 'scopus:85055710234',
 'scopus:85060496074',
 'scopus:85

In [188]:
len(rd.hgetall( 'scopus:85055205421').keys())

12

In [None]:
for idx,row in tqdm(my_df.iterrows(),total = len(my_df)):
    #if (f"scopus:{row['scopus_id']}" not in cache):
    rd.hset(f"scopus:{row['scopus_id']}",mapping = {c : row[c] for c in col[1:]})

  0%|          | 0/20216 [00:00<?, ?it/s]

In [None]:
rd.hgetall("scopus:85061529820")

In [None]:
rd.keys()

In [87]:
rd.hget("scopus:85061529820","title")

'Predicting judicial decisions of criminal cases from Thai supreme court using bi-directional gru with attention mechanism'

In [224]:
(my_df["cited_count"] == 0).sum()

5339

In [127]:
person = {'name': 'Alice', 'age': 30, 'city': 'New York'}
name,age,city = **person
print(name)

SyntaxError: invalid syntax (2178170749.py, line 2)

In [123]:
type([]) == list

True

In [212]:
if(-1):
    print("a")

a
