In [1]:
import os
import pickle
import time
from collections import namedtuple
import textwrap

import sqlalchemy as sa
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.schema import MetaData
from python_utils.data_access import check_connection as pg_check_connection
import numpy as np
from pymongo import MongoClient
import pandas as pd

from esc_identifier.author import Author, RealAuthor
from esc_identifier.author.distance import author_distance
from esc_identifier.author.utils import to_real_author
from esc_identifier.utils.string import normalize_author

%load_ext autoreload
%autoreload 2

In [21]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield index, record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [3]:
pd.options.display.max_rows = 999

In [4]:
ESC_DB_URI = 'postgresql://root:secret@localhost:5432/esc'
KDD_DB_URI = 'postgresql://root:secret@localhost:5432/Kdd2013AuthorPaperIdentification'

In [5]:
pg_check_connection(ESC_DB_URI)
pg_check_connection(KDD_DB_URI)

In [6]:
MONGO_URI = 'mongodb://localhost:27017/'

mongo_client = MongoClient(MONGO_URI)
esc_db = mongo_client.esc
real_author_collection = esc_db.real_author

In [7]:
sa_url = sa.engine.url.make_url(KDD_DB_URI)

engine = create_engine(sa_url)

meta = MetaData()
meta.reflect(bind=engine)

author_table = meta.tables['author']

DBSession = sessionmaker(bind=engine)
session = DBSession()

In [8]:
models_dir = 'models'
model_path = os.path.join(models_dir, f'svm-poly.pickle')
with open(model_path, 'rb') as model_file:
    classifier = pickle.load(model_file)
    print(f'Loaded model "{model_path}"')

Loaded model "models/svm-poly.pickle"


In [9]:
author_dict = RealAuthor(
    kdd_author_ids=[1,2],
    name='pidr',
    affiliation='gnidos'
)._asdict()
author_dict['_id'] = 1
result = real_author_collection.insert_one(author_dict)
# inserted_docs_count = len(result.inserted_ids)
# print(f'Inserted {inserted_docs_count} documents')

In [68]:
same_authors_threshold = 0.98
update_every = 100
limit = 1000
name_distance_threshold = 0.33

authors_count = session.query(sa.func.count(author_table.c.id)).one()[0]

query = (
    session.query(author_table)
    .order_by(author_table.c.name)
)

real_authors = []

total_tic = time.time()
tic = time.time()

for index, author_row in log_progress(query[:limit], every=1):
    author = Author(
        kdd_id=author_row.id,
        name=author_row.name,
        affiliation=author_row.affiliation
    )
    normalized_author = normalize_author(author)

    # find faster way: predict_proba for each author or for batch of them
    distances_with_real_authors = np.empty(shape=(0,2))
    real_authors_indices = []
    for real_author_idx, real_author in enumerate(real_authors):
        distance = author_distance(normalized_author, real_author)
        if distance[0] > name_distance_threshold:
            continue
            
        distances_with_real_authors = np.vstack((
            distances_with_real_authors,
            np.array(distance)
        ))
        real_authors_indices.append(
            real_author_idx
        )
    try:
        are_same_authors_probabilities = classifier.predict_proba(
            distances_with_real_authors
        )[:, 1]
    except(ValueError):
        real_authors.append(to_real_author(author))
        continue

    if max(are_same_authors_probabilities) < same_authors_threshold:
        real_authors.append(to_real_author(author))
    else: 
        real_author_idx = int(np.argmax(are_same_authors_probabilities))
        real_authors[real_author_idx].kdd_author_ids.append(author.kdd_id)
    
    if index % update_every == 0:
        print(textwrap.dedent(f'''
            Progress: {index}/{authors_count} authors
            Total time: {time.time() - tic} s    
        ''').lstrip())
        
        with open('benchmark.tmp', 'wb+') as tmp_file:
            pickle.dump({
                'real_authors': real_authors,
                'last_author_idx': index - 2,
            }, tmp_file)        
            
        tic = time.time()

print(f'Total time: {time.time() - total_tic} s')

Progress: 200/247121 authors
Total time: 3.306046962738037 s    

Progress: 300/247121 authors
Total time: 3.2792670726776123 s    

Progress: 400/247121 authors
Total time: 5.034348726272583 s    

Progress: 500/247121 authors
Total time: 2.570261001586914 s    

Progress: 600/247121 authors
Total time: 3.6742000579833984 s    

Progress: 800/247121 authors
Total time: 9.93295407295227 s    

Progress: 900/247121 authors
Total time: 6.200567007064819 s    

Progress: 1000/247121 authors
Total time: 5.806123733520508 s    

Total time: 39.82888078689575 s


In [39]:
a = np.array(
    [[ 0.6,   1.  ],
     [ 0.61,  1.  ],
     [ 0.67,  1.  ],
     [ 0.53,  1.  ],
     [ 0.63,  1.  ]]
)
a[a[:,0] < 0.6]

array([[ 0.53,  1.  ]])

In [47]:
b = np.array([])
np.concatenate((b, np.array([1,2])))

array([ 1.,  2.])