In [1]:
import os
import pickle
import time
from collections import namedtuple
import textwrap

import sqlalchemy as sa
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.schema import MetaData
from python_utils.data_access import check_connection as pg_check_connection
import numpy as np
from pymongo import MongoClient
import pandas as pd

from esc_identifier.author import Author, RealAuthor
from esc_identifier.author.distance import author_distance
from esc_identifier.author.utils import to_real_author
from esc_identifier.utils.string import normalize_author

%load_ext autoreload
%autoreload 2

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Iterate over ``sequence`` while rendering a notebook progress widget.

    Yields ``(index, record)`` pairs where ``index`` is 1-based.

    Args:
        sequence: Any iterable. If it has no ``len()`` and ``size`` is not
            given, the total is unknown and ``every`` must be supplied.
        every: Refresh the widget every ``every`` records (the first record
            always triggers a refresh). Defaults to 1 for up to 200 records,
            otherwise to ``int(size / 200)`` (about every 0.5%).
        size: Total number of records, if known by the caller.
        name: Label prefix shown next to the counter.

    Raises:
        AssertionError: if the total is unknown and ``every`` is ``None``.
        Any exception raised by ``sequence`` is re-raised after the bar has
        been switched to the ``'danger'`` style.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Work out the total; fall back to "unknown length" mode for iterators.
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unknown_length:
        # No total available: show a full "info" bar instead of a fraction.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if unknown_length:
                    caption.value = f'{name}: {index} / ?'
                else:
                    bar.value = index
                    caption.value = f'{name}: {index} / {size}'
            yield index, record
    except BaseException:
        # Anything that interrupts the iteration turns the bar red, then propagates.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = index
        caption.value = f"{name}: {index or '?'}"

In [3]:
# pd.options.display.max_rows = 999

In [4]:
# PostgreSQL connection strings.
# Credentials should not live in the notebook: set ESC_DB_URI / KDD_DB_URI in the
# environment to override. The literals below are only local-development defaults,
# kept so existing setups keep working unchanged.
ESC_DB_URI = os.environ.get(
    'ESC_DB_URI',
    'postgresql://root:secret@localhost:5432/esc'
)
KDD_DB_URI = os.environ.get(
    'KDD_DB_URI',
    'postgresql://root:secret@localhost:5432/Kdd2013AuthorPaperIdentification'
)

In [5]:
# Run the project's connection check (python_utils.data_access.check_connection)
# against both PostgreSQL URIs before any real work starts.
# NOTE(review): how a failed check is reported (exception vs. return value) is not
# visible from this notebook — confirm against the helper's implementation.
pg_check_connection(ESC_DB_URI)
pg_check_connection(KDD_DB_URI)

In [6]:
# MongoDB handles: client -> `esc` database -> `real_author` collection.
MONGO_URI = 'mongodb://localhost:27017/'

mongo_client = MongoClient(MONGO_URI)
esc_db = mongo_client['esc']
real_author_collection = esc_db['real_author']

In [7]:
# SQLAlchemy setup for the KDD database: engine, reflected schema and a session.

# Parse the connection string into a SQLAlchemy URL object.
sa_url = sa.engine.url.make_url(KDD_DB_URI)

engine = create_engine(sa_url)

# Reflect the existing schema from the database instead of declaring models here.
meta = MetaData()
meta.reflect(bind=engine)

# Reflected `author` table; later cells read its `id`, `name` and `affiliation` columns.
author_table = meta.tables['author']

# Session factory bound to the engine, plus the single session shared by later cells.
DBSession = sessionmaker(bind=engine)
session = DBSession()

In [8]:
# Load the pickled classifier used by the matching cells below.
# NOTE: unpickling can execute arbitrary code — only load model files you trust.
models_dir = 'models'
model_path = os.path.join(models_dir, 'svm-poly.pickle')
with open(model_path, 'rb') as model_file:
    classifier = pickle.load(model_file)
print(f'Loaded model "{model_path}"')

Loaded model "models/svm-poly.pickle"


In [9]:
# author_dict = RealAuthor(
#     kdd_author_ids=[1,2],
#     name='pidr',
#     affiliation='gnidos'
# )._asdict()
# author_dict['_id'] = 1
# result = real_author_collection.insert_one(author_dict)
# inserted_docs_count = len(result.inserted_ids)
# print(f'Inserted {inserted_docs_count} documents')

Classify bunch: 39.82888078689575 s for 1000 authors

In [72]:
%%prun

# Sequential author de-duplication.
#
# Authors are read in name order. Each one is compared against the "real" authors
# collected so far and is either merged into the best match (its KDD id is appended
# to that real author's `kdd_author_ids`) or becomes a new real author.

same_authors_threshold = 0.98    # minimum classifier probability to merge two authors
limit = 1500                     # number of authors to process in this run
name_distance_threshold = 0.33   # maximum name distance for a candidate to be classified

query = (
    session.query(author_table)
    .order_by(author_table.c.name)
)
real_authors = []
total_tic = time.time()
tic = time.time()    # lap timer for the (currently disabled) periodic progress report

for index, author_row in log_progress(query[:limit], every=10):
    author = Author(
        kdd_id=author_row.id,
        name=author_row.name,
        affiliation=author_row.affiliation
    )
    normalized_author = normalize_author(author)

    # Cheap pre-filter: only real authors whose name starts with the same first
    # character are compared. `[:1]` (instead of `[0]`) keeps an empty name from
    # raising IndexError; an empty prefix simply matches every real author.
    # The *global* positions in `real_authors` are kept so that the winning
    # candidate can be mapped back to the right entry at the end.
    first_letter = author.name[:1]
    real_author_indices = np.array([
        idx
        for idx, real_author in enumerate(real_authors)
        if real_author.name.startswith(first_letter)
    ], dtype=int)

    if not len(real_author_indices):
        real_authors.append(to_real_author(author))
        continue

    distances_with_real_authors = np.array([
        author_distance(normalized_author, real_authors[idx])
        for idx in real_author_indices
    ])

    # Column 0 of the distance vector is compared against the name threshold;
    # only sufficiently close candidates are passed to the classifier.
    close_enough = distances_with_real_authors[:, 0] < name_distance_threshold
    real_author_indices = real_author_indices[close_enough]

    if not len(real_author_indices):
        real_authors.append(to_real_author(author))
        continue

    are_same_authors_probabilities = classifier.predict_proba(
        distances_with_real_authors[close_enough]
    )[:, 1]

    best_candidate = int(np.argmax(are_same_authors_probabilities))
    if are_same_authors_probabilities[best_candidate] < same_authors_threshold:
        real_authors.append(to_real_author(author))
    else:
        # `best_candidate` is a position inside the filtered candidate set, not in
        # `real_authors`; translate it back before merging.
        real_author_idx = int(real_author_indices[best_candidate])
        real_authors[real_author_idx].kdd_author_ids.append(author.kdd_id)

    # TODO: periodic progress report / checkpointing of `real_authors` to disk.

print(f'Total time: {time.time() - total_tic} s')

Total time: 63.48866605758667 s
 

In [52]:
from esc_identifier.cluster import get_clusters, group_by_index
from esc_identifier.author.distance import human_name_distance

In [71]:
%%prun

# Clustered author de-duplication.
#
# Authors are first grouped by name similarity (`get_clusters` with
# `human_name_distance`). Within each group every author is then either merged
# into the best-matching real author of that group or becomes a new real author.
# Results of all groups are accumulated in `real_authors`.

same_authors_threshold = 0.98    # minimum classifier probability to merge two authors
limit = 1500                     # number of authors to process in this run
name_distance_threshold = 0.33   # `eps` passed to the name clustering

query = (
    session.query(author_table)
    .order_by(author_table.c.name)
)
# todo: normalize authors
authors = query[:limit]

clusters_indices = get_clusters(
    authors,
    human_name_distance,
    key=lambda author: author.name,
    eps=name_distance_threshold
)
authors_groups = group_by_index(authors, clusters_indices)
print(f'Similar authors groups: {len(authors_groups)}')

# Initialised before the loop so the result is defined even with zero groups and
# is not overwritten by each new group.
real_authors = []

for similar_authors in authors_groups:
    # Real authors found inside this group only; candidates never cross groups.
    group_real_authors = []

    for author_row in similar_authors:
        author = Author(
            kdd_id=author_row.id,
            name=author_row.name,
            affiliation=author_row.affiliation
        )
        normalized_author = normalize_author(author)

        distances_with_real_authors = np.array([
            author_distance(normalized_author, real_author)
            for real_author
            in group_real_authors
        ])

        if not len(distances_with_real_authors):
            group_real_authors.append(to_real_author(author))
            continue

        # No extra name-distance filter here: the clustering step above already
        # restricted the group using `name_distance_threshold`.
        are_same_authors_probabilities = classifier.predict_proba(
            distances_with_real_authors
        )[:, 1]

        real_author_idx = int(np.argmax(are_same_authors_probabilities))
        if are_same_authors_probabilities[real_author_idx] < same_authors_threshold:
            group_real_authors.append(to_real_author(author))
        else:
            group_real_authors[real_author_idx].kdd_author_ids.append(author.kdd_id)

    real_authors.extend(group_real_authors)


Similar authors groups: 228
 