In [1]:
import io
import os
import glob
import re
import tqdm

import pdfminer.converter
import pdfminer.layout
import pdfminer.pdfinterp
import pdfminer.pdfpage
import pdfminer.pdfparser

import nltk.corpus
nltk.download('stopwords')

def load_pdf(filepath):
    output_string = io.StringIO()
    with open(filepath, 'rb') as fin:
        parser = pdfminer.pdfparser.PDFParser(fin)
        doc = pdfminer.pdfdocument.PDFDocument(parser)
        rsrcmgr = pdfminer.pdfinterp.PDFResourceManager()
        device = pdfminer.converter.TextConverter(rsrcmgr, output_string, laparams=pdfminer.layout.LAParams())
        interpreter = pdfminer.pdfinterp.PDFPageInterpreter(rsrcmgr, device)
        for page in pdfminer.pdfpage.PDFPage.create_pages(doc):
            interpreter.process_page(page)
    return output_string.getvalue().split('\n')

def combine_spaces(sentence):
    """
    >>> combine_spaces('  hoge    foo bar  ')
    ' hoge foo bar '
    """
    return re.sub(r'\s+', r' ', sentence)

def connect_string(string1, string2):
    """
    >>> connect_string('aaa', 'bbb')
    'aaa bbb'
    >>> connect_string('a-', 'bbb')
    'abbb'
    """
    if re.match(r'.*-$', string1):
        return re.sub(r'-$', '', string1) + string2
    else:
        return string1 + " " + string2
                  
def combine_sentences(sentences: list,  threshold=5):
    """
    文章を結合する。一行の文字列がthreshold未満の場合は、次の行を結合しない。
    """
    result = []
    combine_next = False
    for sentence in sentences:
        if not combine_next:
            result.append(sentence)
            if len(sentence) >= threshold:
                combine_next = True
        else:
            result[-1] = connect_string(result[-1] , sentence)
            if len(sentence) < threshold:
                combine_next = False
    return result

def drop_brackets(strings):
    drop_result = []
    for line in strings:
        drop_result.append(re.sub(r'[\(\)\[\]\{\}]', ' ', line))
    return drop_result
    
def drop_symbols(strings):
    drop_result = []
    for line in strings:
        drop_result.append(re.sub(r'[,;:?*~""%/.|\.]', ' ', line))
    return drop_result

def drop_empty_line(strings):
    drop_result = []
    for line in strings:
        if re.match(r'^\s*$', line):
            continue
        else:
            drop_result.append(line)
    return drop_result

def drop_digits_line(strings):
    drop_result = []
    for line in strings:
        if re.match(r'^[^a-zA-Z]*$', line):
            continue
        else:
            drop_result.append(line)
    return drop_result

def drop_short_line(strings, threshold=5):
    drop_result = []
    for line in strings:
        if len(line) < threshold:
            continue
        else:
            drop_result.append(line)
    return drop_result
            

def drop_digits(strings):
    drop_result = []
    for line in strings:
        drop_result.append(re.sub(r'\+*\-*\d+\.*\d*', 'number', line))
    return drop_result

def preprocess(pdf):
    pdf = combine_sentences(pdf)
    pdf = drop_brackets(pdf)
    pdf = drop_symbols(pdf)
    pdf = drop_digits_line(pdf)
    pdf = drop_short_line(pdf)
    pdf = drop_digits(pdf)
    pdf = drop_empty_line(pdf) 
    return pdf


def load_pdfs(dirpath):
    pdfs = []
    for filepath in tqdm.tqdm_notebook(glob.glob(os.path.abspath(os.path.join(dirpath, '*.pdf')))):
        pdfs.append(load_pdf(filepath))
    return pdfs


    
pdfs = load_pdfs('paper')

for index, pdf in enumerate(pdfs):
    pdfs[index] = ' '.join(preprocess(pdf))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kuboshu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filepath in tqdm.tqdm_notebook(glob.glob(os.path.abspath(os.path.join(dirpath, '*.pdf')))):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=77.0), HTML(value='')))




In [2]:
print(pdfs[0])

Estimation of atrial fibrillation from lead-I ECGs   Comparison with cardiologists and machine learning  model  CurAlive   a clinical validation study   numberst  Necmettin Korucuk   Department of Cardiology  Memorial Antalya Hospital  Antalya  Turkey  necmettinmd@gmail com    numberth Onur Karaman  Vocational School of Health Services  Department of Medical Imaging  Akdeniz University   Antalya  Turkey  onurkaraman@akdeniz edu tr   numberth Nezaket Yıldırım  Faculty of Nursing  Department of  Nursing Management    Akdeniz University   Antalya  Turkey  ozturknezaket@akdeniz edu tr   numbernd  Çağın Polat  Chief Technical Officer  Notrino Research  Ankara  Turkey  caginpolat@notrino com    numberth Veysel Tosun  Department of Cardiology  Şanlıurfa Education And Research  Hospital  Şanlıurfa  Turkey  veyseltosunnumber@gmail com    numberth Yıldıray Çete  Department of Emergency Medicine  Akdeniz University  Antalya  Turkey  ycete@akdeniz edu tr   numberrd  Emine Selda Gündüz   Vocational

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize(line):
    line = line.split()
    return [word for word in line if word not in nltk.corpus.stopwords.words('english') + ['number']]


vectorizer = TfidfVectorizer(tokenizer=tokenize)
vectorizer.fit(pdfs)
tfidf = vectorizer.transform(pdfs)



In [4]:
print(len(vectorizer.get_feature_names()))
print(tfidf.toarray().shape)

37506
(77, 37506)


In [5]:
for num, value in enumerate(tfidf.toarray()[0]):
    if value > 0.05:
        print(f'{vectorizer.get_feature_names()[num]} : {value}  *** ', end='')
print("")

afib : 0.41301351193018504  *** al : 0.05579473221332051  *** artificial : 0.09070023712718021  *** atrial : 0.21296009208900166  *** cardiologist : 0.13552005860209196  *** cardiologists : 0.2839467894520022  *** curalive : 0.23232010046072907  *** curalive's : 0.05162668899127313  *** detection : 0.05321241506945952  *** diagnosis : 0.09007553378346883  *** diagnostic : 0.07709520155810319  *** doi : 0.06632506852870457  *** ecg : 0.46464020092145814  *** ecgs : 0.2839467894520022  *** et : 0.05437015169262903  *** fibrillation : 0.20005341984118338  *** fnumber-score : 0.06593008788160891  *** heart : 0.06906340153669563  *** lead-i : 0.21296009208900166  *** nsr : 0.1355220316013979  *** number-lead : 0.18069341146945597  *** rhythm : 0.12261338635427368  *** 


In [6]:
import numpy as np

def cossim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

result = []
for i in range(77):
    result_tmp = []
    for j in range(77):
        result_tmp.append(cossim(tfidf.toarray()[i], tfidf.toarray()[j]))
    result.append(result_tmp)
result = np.array(result)

print(result)

[[1.         0.02896348 0.04299925 ... 0.05517863 0.03627486 0.02285336]
 [0.02896348 1.         0.08367731 ... 0.04275835 0.07300085 0.01718585]
 [0.04299925 0.08367731 1.         ... 0.07310741 0.03866101 0.04549564]
 ...
 [0.05517863 0.04275835 0.07310741 ... 1.         0.05548307 0.03159992]
 [0.03627486 0.07300085 0.03866101 ... 0.05548307 1.         0.19121091]
 [0.02285336 0.01718585 0.04549564 ... 0.03159992 0.19121091 1.        ]]


In [11]:
# from IPython.display import display
# np.set_printoptions(threshold=np.inf)
for i in range(result.shape[0]):
    for j in range(result.shape[1]):
        print(f'{result[i, j]:5.2f} ', end='')
    print("")


 1.00  0.03  0.04  0.02  0.04  0.04  0.04  0.03  0.04  0.05  0.02  0.03  0.04  0.04  0.02  0.04  0.04  0.02  0.04  0.01  0.04  0.05  0.04  0.03  0.02  0.05  0.03  0.02  0.03  0.03  0.04  0.02  0.03  0.01  0.05  0.04  0.01  0.04  0.04  0.03  0.03  0.01  0.03  0.04  0.04  0.05  0.02  0.02  0.03  0.02  0.04  0.01  0.02  0.02  0.03  0.03  0.03  0.03  0.03  0.04  0.04  0.04  0.04  0.02  0.05  0.03  0.04  0.03  0.03  0.03  0.03  0.02  0.04  0.05  0.06  0.04  0.02 
 0.03  1.00  0.08  0.03  0.08  0.06  0.08  0.02  0.12  0.06  0.05  0.06  0.02  0.08  0.04  0.10  0.15  0.09  0.03  0.14  0.05  0.03  0.05  0.07  0.04  0.06  0.09  0.05  0.06  0.10  0.07  0.03  0.05  0.07  0.07  0.18  0.02  0.03  0.03  0.03  0.12  0.04  0.08  0.05  0.05  0.12  0.05  0.08  0.08  0.04  0.05  0.12  0.05  0.12  0.06  0.04  0.05  0.05  0.09  0.03  0.10  0.04  0.11  0.04  0.04  0.03  0.06  0.04  0.03  0.06  0.05  0.09  0.03  0.06  0.04  0.07  0.02 
 0.04  0.08  1.00  0.02  0.09  0.07  0.09  0.02  0.13  0.09  0.04  0.07  0