In [79]:
import os
import pickle
import time
from collections import namedtuple
import textwrap

import sqlalchemy as sa
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.schema import MetaData
from python_utils.data_access import check_connection as pg_check_connection
import numpy as np
from pymongo import MongoClient
import pandas as pd

from esc_identifier.author import (
    Author, RealAuthor,
    author_distance
)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [73]:
pd.options.display.max_rows = 999

In [8]:
ESC_DB_URI = 'postgresql://root:secret@localhost:5432/esc'
KDD_DB_URI = 'postgresql://root:secret@localhost:5432/Kdd2013AuthorPaperIdentification'

In [4]:
pg_check_connection(ESC_DB_URI)
pg_check_connection(KDD_DB_URI)

In [11]:
MONGO_URI = 'mongodb://localhost:27017/'

mongo_client = MongoClient(MONGO_URI)
esc_db = mongo_client.esc
real_author_collection = esc_db.real_author

In [5]:
sa_url = sa.engine.url.make_url(KDD_DB_URI)

engine = create_engine(sa_url)

meta = MetaData()
meta.reflect(bind=engine)

author_table = meta.tables['author']

DBSession = sessionmaker(bind=engine)
session = DBSession()

In [7]:
models_dir = 'models'
model_path = os.path.join(models_dir, f'svm-poly.pickle')
with open(model_path, 'rb') as model_file:
    classifier = pickle.load(model_file)
    print(f'Loaded model "{model_path}"')

Loaded model "models/svm-poly.pickle"


In [86]:
same_authors_threshold = 0.98
period = 100

authors_count = session.query(sa.func.count(author_table.c.id)).one()[0]

query = (
    session.query(author_table)
    .order_by(author_table.c.name)
)
author_row = query[0]
real_author = RealAuthor(
    kdd_author_ids=[author_row.id],
    name=author_row.name,
    affiliation=author_row.affiliation
)
real_authors = [real_author]

total_tic = time.time()
tic = time.time()
for i, author_row in enumerate(query[1:1000], 2):
    author = Author(
        kdd_id=author_row.id,
        name=author_row.name,
        affiliation=author_row.affiliation
    )
    
#     real_authors = real_author_collection.find()
    distances_with_real_authors = np.array([
        author_distance(author, real_author)
        for real_author
        in real_authors
    ])
    
    are_same_authors_probabilities = classifier.predict_proba(distances_with_real_authors)[:, 1]
    if max(are_same_authors_probabilities) < same_authors_threshold:
        new_real_author = RealAuthor(
            kdd_author_ids=[author.kdd_id],
            name=author.name,
            affiliation=author.affiliation
        )
        real_authors.append(new_real_author)
    else:
#         print('=' * 30)
#         print(author)
#         print(are_same_authors_probabilities[are_same_authors_probabilities >= same_authors_threshold])
#         real_author_indices = np.where(are_same_authors_probabilities >= same_authors_threshold)[0]
#         print(real_author_indices)

#       What if two real authors with same score?  
        real_author_idx = int(np.argmax(are_same_authors_probabilities))
        real_authors[real_author_idx].kdd_author_ids.append(author.kdd_id)
    
    if not i % period:
        print(textwrap.dedent(f'''
            Progress: {i}/{authors_count} authors
            Total time: {time.time() - tic} sec    
        '''))
        
        with open('benchmark.tmp', 'wb+') as tmp_file:
            pickle.dump({
                'real_authors': real_authors,
                'last_author_idx': i - 2,
            }, tmp_file)        
            
        tic = time.time()

print(f'Total time: {time.time() - total_tic} sec')


Progress: 100/247121 authors
Total time: 1.1202430725097656 sec    


Progress: 200/247121 authors
Total time: 2.795146942138672 sec    


Progress: 300/247121 authors
Total time: 4.663457870483398 sec    


Progress: 400/247121 authors
Total time: 6.477314710617065 sec    


Progress: 500/247121 authors
Total time: 7.791975259780884 sec    


Progress: 600/247121 authors
Total time: 9.912145137786865 sec    


Progress: 700/247121 authors
Total time: 10.952278137207031 sec    


Progress: 800/247121 authors
Total time: 13.479083061218262 sec    


Progress: 900/247121 authors
Total time: 15.39861512184143 sec    



KeyboardInterrupt: 

In [21]:


RealAuthor = namedtuple('RealAuthor', ['kdd_author_ids', 'name', 'affiliation'])

a._asdict()

OrderedDict([('kdd_author_ids', [1, 2]),
             ('name', 'pidr'),
             ('affiliation', 'gnidos')])

In [18]:
?namedtuple