In [1]:
from metadata_extract import ko_main,extract_PDF_elements


In [27]:
# Spellings of the "references" heading that mark the start of the citation list.
_KO_REFERENCE_HEADINGS = ("참  고  문  헌", "참 고 문 헌", "참고문헌")


def _has_digit(text):
    """Return True when ``text`` contains at least one decimal digit.

    NOTE(review): stands in for the ``has_numbers`` helper the original code
    called; that name is not defined in the cells visible here, so "contains
    a digit" is an assumption -- confirm against the intended helper.
    """
    return any(ch.isdigit() for ch in text)


def _bracket_label(text):
    """Return the text between the first "[" and the following "]" ("" if no "[")."""
    return text.partition("[")[2].partition("]")[0]


def _collect_citation_runs(extracted_data, heading_idx):
    """Walk the runs after the heading and gather the ones that look like citations.

    Returns ``(texts, consumed)``: the collected run texts and the number of
    non-blank runs that were counted as belonging to the citation section.
    """
    texts = []
    consumed = 0
    total = len(extracted_data)
    offset = 1
    while heading_idx + offset < total:
        pos = heading_idx + offset
        raw = extracted_data[pos][2]
        val = raw.strip()
        offset += 1

        if not val:
            # Blank run: skipped and deliberately not counted.
            # NOTE(review): because blanks are not counted, the slice returned
            # by ko_citations starts one run early per skipped blank -- confirm
            # whether that is intended.
            continue

        if "[" in val:
            # Only "[<digits>]" counts as a citation index; anything else ends the list.
            if _bracket_label(val).isdigit():
                texts.append(val)
                consumed += 1
                continue
            break

        first_run = pos == heading_idx + 1
        # Explicit bounds check replaces the IndexError the original relied on.
        next_has_bracket = pos + 1 < total and "[" in extracted_data[pos + 1][2]

        if next_has_bracket:
            # A bracket-less run directly before an indexed one: either leftover
            # text right after the heading (skipped) or a continuation line (kept).
            if not first_run:
                texts.append(raw)
            consumed += 1
            continue

        if _has_digit(val) and val.endswith(".") and "[" in extracted_data[pos - 1][2]:
            # Tail of the last citation (e.g. "..., 2010."): keep it and stop.
            texts.append(val)
            consumed += 1
            break

        if first_run:
            # Tolerate one non-citation run immediately after the heading.
            consumed += 1
            continue

        break
    return texts, consumed


def ko_citations(extracted_data):
    """
    - input: list of all runs; the text of each run is read from index 2
    - output: tuple (runs after the citations, list of citation lines),
      or (None, None) when no citation section can be located

    Extracts citations from Korean articles.

    Citations are a set of continuous runs that usually begin with "["
    after the occurrence of "참고문헌", but they do not strictly obey those
    rules and are frequently embedded in other runs. The total number of
    citations is unknown beforehand, hence the runs after the heading are
    walked one by one until a run no longer looks like a citation.

    Other metadata fields (e.g. abstract) tend to occur after the
    citations, so the runs following the citations are returned as well
    to limit false calls & maximize efficiency of backtrack search.

    (None, None) is returned when no run contains "[", when no heading is
    found, or when fewer than two runs follow the heading. The result is
    always a 2-tuple so that ``data, citations = ko_citations(...)`` never
    fails on unpacking.
    """
    # precheck for efficiency: without any "[" there is nothing to extract
    if all("[" not in run[2] for run in extracted_data):
        return None, None

    # first run that contains one of the heading spellings
    cidx = next(
        (
            idx
            for idx, run in enumerate(extracted_data)
            if any(heading in run[2] for heading in _KO_REFERENCE_HEADINGS)
        ),
        None,
    )
    if cidx is None or len(extracted_data) - cidx <= 2:
        return None, None

    texts, consumed = _collect_citation_runs(extracted_data, cidx)

    # one run may hold several citations separated by newlines
    citations = [line for text in texts for line in text.split("\n")]

    remaining = extracted_data[cidx + consumed + 1:]
    if remaining or len(citations) <= 3:
        return remaining, citations

    # Nothing is left after the citations, so later content was probably
    # swallowed into them. A bracket label that occurs more than once
    # (e.g. "[4] ..." and "Calix[4]arene") marks where the real list ends.
    # The whole label is compared so that "[1]" and "[10]" do not collide,
    # and positions refer to `citations` itself so the cut is not shifted
    # by lines that carry no label.
    labelled = [
        (pos, _bracket_label(line))
        for pos, line in enumerate(citations)
        if "[" in line
    ]
    labelled = [(pos, label) for pos, label in labelled if label]
    labels = [label for _, label in labelled]
    repeated = [pos for pos, label in labelled if labels.count(label) > 1]
    if repeated:
        # NOTE(review): the +2 offset is kept from the original code.
        return extracted_data[cidx + 2:], citations[:repeated[0] + 1]

    # No repeated label: keep every citation instead of discarding the result.
    return remaining, citations
def substring_after(s, delim):
    """Return the part of ``s`` that follows the first ``delim`` ("" when absent)."""
    _before, _separator, tail = s.partition(delim)
    return tail
# Extract the runs from the sample PDF (the same file is reused by later cells).
e = extract_PDF_elements("275d7fb2fd45098ad5c3ece2ed4a2824.pdf")
# Split the runs into (runs after the citation list, citation lines).
data, citations = ko_citations(e)

In [28]:
# Show the (remaining runs, citation lines) pair produced by ko_citations.
data, citations

([[10.501900000000035, 'XDSATU+TimesNewRoman', '주체106(2017)년 6월 5일 원고접수'],
  [12.0,
   'TimesNewRomanPS-BoldMT',
   'Binding Structure and Stability of Complexes between Calix[4]arene  \nDerivatives and Zoledronic Acid Molecule  \nby First Principles Method'],
  [10.501899999999978,
   'XDSATU+TimesNewRoman',
   'Yu Chol Jun, Ri Jin Hyok and Jang Yong Man'],
  [10.5019,
   'XDSATU+TimesNewRoman',
   'We  made  simulations  of  complexes  between  calix[4]arene  molecule,  its  phosphonate  and \nsulphonate  derivatives  and  zoledronic  acid  molecule,  and  made  estimations  of  stability  of \nbinding  structure  through  binding  energy  calculations.  It  was  concluded  that  direct  binding \nbetween  calixarene  and  zoledronic  acid  molecules  induces  positive  binding  energy  and  thus \nthese  complexes  are  instable.  When  zoledronic  acid  binding  to  phosphonate  or  sulphonate \ncalixarene derivatives, the binding energies become negative, indicating the stable fom

In [43]:

# Exploratory check: when no runs are left after the citations, the trailing
# metadata has ended up inside `citations`. Split it back out at the first
# citation whose bracket label occurs again later (e.g. "[4] ..." and
# "Calix[4]arene").
if len(citations) > 3 and not data:
    # (position in `citations`, text between "[" and "]") for every labelled line;
    # the whole label is compared so "[1]" and "[10]" are not confused, and the
    # positions index `citations` directly so unlabelled lines cannot shift the cut
    labelled = [
        (pos, substring_after(cit, "[").partition("]")[0])
        for pos, cit in enumerate(citations)
    ]
    labelled = [(pos, label) for pos, label in labelled if label]
    labels = [label for _, label in labelled]
    duplicated = [pos for pos, label in labelled if labels.count(label) > 1]
    # guard: without a repeated label there is no cut point (avoids IndexError)
    if duplicated:
        citations2 = citations[:duplicated[0] + 1]
        extracted_data2 = citations[duplicated[0] + 1:]
        print (citations2)
        print (extracted_data2)
    

['[1] M. H. Ri et al.; J. Mat. Sci., 51, 3125, 2016. ', '[2] C. Hoskins et al.; J. Nanomed. Res., 2, 00028, 2015. ', '[3] R. Galindo-Murillo et al.; Comput. Theor. Chem., 1035, 84, 2014. ', '[4] O. Danylyuk et al.; J. Mol. Struct., 965, 116, 2010.']
['주체106(2017)년 6월 5일 원고접수', 'Binding Structure and Stability of Complexes between Calix[4]arene  ', 'Derivatives and Zoledronic Acid Molecule  ', 'by First Principles Method', 'Yu Chol Jun, Ri Jin Hyok and Jang Yong Man', 'We  made  simulations  of  complexes  between  calix[4]arene  molecule,  its  phosphonate  and ', 'sulphonate  derivatives  and  zoledronic  acid  molecule,  and  made  estimations  of  stability  of ', 'binding  structure  through  binding  energy  calculations.  It  was  concluded  that  direct  binding ', 'between  calixarene  and  zoledronic  acid  molecules  induces  positive  binding  energy  and  thus ', 'these  complexes  are  instable.  When  zoledronic  acid  binding  to  phosphonate  or  sulphonate ', 'calixare

In [2]:
# Re-extract the runs from the sample PDF and display them as returned.
e = extract_PDF_elements("275d7fb2fd45098ad5c3ece2ed4a2824.pdf")
e

[[9.0, 'XDSATU+TimesNewRoman', '김일성종합대학학보'],
 [7.9799999999999045, 'XDSATU+TimesNewRoman', '(자연과학)'],
 [9.0, 'XDSATU+TimesNewRoman', 'JOURNAL OF KIM IL SUNG UNIVERSITY'],
 [7.9799999999999045, 'XDSATU+TimesNewRoman', '(NATURAL SCIENCE)'],
 [7.9799999999999045, 'XDSATU+TimesNewRoman', '주체106(2017)년  제63권  제10호'],
 [7.9799999999999045,
  'XDSATU+TimesNewRoman',
  'Vol. 63  No. 10   JUCHE106(2017).'],
 [10.501899999999978,
  'XDSATU+TimesNewRoman',
  '제1원리적방법에 의한 칼릭스[4]아렌유도체와  \n졸레드론산분자의 결합구조와 안정성'],
 [9.480000000000018,
  'NEFBII+PIC-KP-CheonRiMa-Medium-KP-97KPS',
  '유철준, 리진혁, 장영만'],
 [10.501900000000035,
  'XDSATU+TimesNewRoman',
  '일반적으로  수용액에서  용해도가  낮은  약물분자들은  생체반응성과  약리효과가  낮은 \n결함을 가지고있다. 만일 이러한 약물분자가 용해도가 높은 다른 화합물과 호스트－게스트\n형의 복합물을 형성한다면 소수성약물의 용해성을 높일수 있다. 이러한 복합물은 보통 호\n스트분자와  약물게스트분자사이의  비공유성호상작용에  의하여  형성된다.  소수성기능단과 \n친수성기능단을 둘 다 가지며 고리저중합체화합물로 특징지어지는 칼릭스[n]아렌(n은 분자\n를 이루는 탄소6각형고리의 개수)은 많은 종류의 이온 및 분자들과  호스트－게스트형의 복\n합물을 쉽게 형성할수 있으며 따라서 대표적인 약물전달 및 가용화제로 널리 리용되고있\n다.[2－4] 졸

In [3]:
# Run ko_main on the same sample PDF; the output below is a dict of metadata fields.
ko_main("275d7fb2fd45098ad5c3ece2ed4a2824.pdf")

{'Title (en)': 'Binding Structure and Stability of Complexes between Calix[4]arene Derivatives and Zoledronic Acid Molecule by First Principles Method',
 'Author (en)': ['Yu Chol Jun', 'Ri Jin Hyok', 'Jang Yong Man'],
 'Start Page': 73,
 'End Page': 74,
 'Keywords': ['first principles', 'calix[4]arene'],
 'Submission Date': '주체106(2017)년 6월 5일 ',
 'Abstract': 'We made simulations of complexes between calix[4]arene molecule, its phosphonate and sulphonate derivatives and zoledronic acid molecule, and made estimations of stability of binding structure through binding energy calculations. It was concluded that direct binding between calixarene and zoledronic acid molecules induces positive binding energy and thus these complexes are instable. When zoledronic acid binding to phosphonate or sulphonate calixarene derivatives, the binding energies become negative, indicating the stable fomation of complexes.  ',
 'Citations': ['[1] M. H. Ri et al.; J. Mat. Sci., 51, 3125, 2016.',
  '[2] C. Ho