In [2]:
import itertools
import pprint
import os
import random

from pymongo import MongoClient
import numpy as np
import pandas as pd

from esc_identifier.cluster import distance_matrix, dbscan, get_clusters
from esc_identifier.distance import token_set_distance, levenshtein
from esc_identifier.utils.string import (normalize_human_name,
                                         normalize_affiliation)
from esc_identifier.author.distance import (human_name_distance,
                                            affiliation_distance)
from jupyter_utils import log_progress

In [3]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [7]:
MONGO_URI = 'mongodb://localhost:27017/'

mongo_client = MongoClient(MONGO_URI)
kdd_db = mongo_client.kdd2013
author_raw_collection = kdd_db.author_raw
author_collection = kdd_db.author

In [11]:
print('Example from colelction "author_raw"')
pprint.pprint(author_raw_collection.find()[1])

Example from colelction "author_raw"
{'_id': ObjectId('590467cf77b80a00013cd29c'),
 'affiliations': ['Department of Psychology, Lancaster University, Lancaster, '
                  'U.K.',
                  'Department of Psychology, Lancaster University, U.K.',
                  'Department of Psychology, University of Durham, Science '
                  'Laboratories, Durham, UK',
                  'Department of Psychology, University of Lancaster',
                  'Department of Psychology, University of Lancaster, '
                  'Lancaster, U. K',
                  'Department of Psychology, University of Lancaster, '
                  'Lancaster, U.K.',
                  'Department of Psychology, University of New York, UK',
                  'Department of Psychology|University ofYork',
                  'Department of Psychology|University of York',
                  'Department of Psychology, University of York',
                  'Department of Psychology, University 

## Cleaning raw metadata
Clean authors metadata from `author_raw` collection and save to `author` collection

In [13]:
def clean_list(lst, distance_function, dbscan_eps):
    if not lst:
        return []

    indices_clusters = (
        get_clusters(
            lst,
            distance_function=distance_function,
            eps=dbscan_eps
        )
    )
    clusters = [
        [lst[i] for i in cluster]
        for cluster
        in indices_clusters
    ]

    biggest_cluster = max(indices_clusters, key=len)
    
    true_cluster = [lst[idx] for idx in biggest_cluster]
    
    return true_cluster

In [14]:
name_eps = 0.3
affiliation_eps = 0.1

def process_author(author_raw, distance_function):
    author_processed = author_raw.copy()

    affiliations_raw = author_raw['affiliations']
    normalized_affiliations = [
        normalize_affiliation(affiliation)
        for affiliation
        in affiliations_raw
    ]
    affiliations_not_empty = list(filter(len, normalized_affiliations))
    affiliations_processed = clean_list(
            lst=affiliations_not_empty,
            distance_function=distance_function,
            dbscan_eps=affiliation_eps
    )
    author_processed['affiliations'] = list(set(affiliations_processed))
    assert author_processed['affiliations'] and all(author_processed['affiliations'])
    
    names_raw = author_raw['names']
    normalized_names = [
        normalize_human_name(name)
        for name
        in names_raw
    ]
    names_processed = clean_list(
        lst=normalized_names,
        distance_function=distance_function,
        dbscan_eps=name_eps
    )
    author_processed['names'] = list(set(names_processed))
    assert author_processed['names'] and all(author_processed['names'])
    
    return author_processed

In [17]:
distance_function = token_set_distance
authors_raw = author_raw_collection.find()
authors_raw_count = authors_raw.count()
process_author(authors_raw[1], distance_function)

{'_id': ObjectId('590467cf77b80a00013cd29c'),
 'affiliations': ['york usa',
  'psychology york england',
  'psychology lancaster bailrigg united kingdom',
  'psychology lancaster lai 4yf england',
  'lancaster england',
  'psychology york uk',
  'psychology new york uk',
  'psychology york united kingdom',
  'york uk',
  'psychology york heslington yo1 5dd k ',
  'lancaster uk',
  'psychology ofyork',
  'psychology york',
  'psychology lancaster k',
  'york',
  'psychology york yo10 5dd uk',
  'psychology lancaster',
  'psychology lancaster k ',
  'psychology york heslington yo1 5dd uk',
  'psychology york awy united kingdom',
  'psychology york heslington yo10 5dd england uk',
  'psychology york heslington united kingdom',
  'psychology york heslington uk',
  'lancaster k',
  'psychology york heslington yo10 5dd uk',
  'lancaster k ',
  'york neuroimaging centre biocentre science park heslington yo10 5dg uk',
  'lancaster',
  'psychology york heslington england',
  'york psychology ne

In [38]:
distance_function = token_set_distance
authors_raw = author_raw_collection.find()
authors_raw_count = authors_raw.count()

processed_authors = []
for index, author in log_progress(authors_raw, every=1, size=authors_raw_count):
    if not author['affiliations']:
        continue
    
    processed_authors.append(
        process_author(author, distance_function)
    )

AssertionError: 

In [7]:
author_collection.delete_many({})
result = author_collection.insert_many(processed_authors)
n_inserted_doc = len(result.inserted_ids)
print(f'Inserted {n_inserted_doc} documents')

Inserted 3567 documents


## Dataset generating

In [8]:
def uniform_select(iterable, k):
    iterable = list(iterable)
    selection_indices = np.linspace(0, len(iterable) - 1, k, dtype=np.int8)
    selection = [iterable[idx] for idx in selection_indices]
    
    return selection

In [52]:
def generate_dataset(metadata):
    X_positive = []
    X_negative = []

    for author_idx, author_metadata in enumerate(metadata):
        kdd_id = author_metadata['kdd_id']
        names = author_metadata['names']
        affiliations = author_metadata['affiliations']

        selection_size = min(5, max(len(names), len(affiliations)))
        if selection_size == 1:
            continue

        sorted_names = sorted(names, key=len, reverse=True)
        selected_names = uniform_select(sorted_names, selection_size)

        sorted_affiliations = sorted(affiliations, key=len, reverse=True)
        selected_affiliations = uniform_select(sorted_affiliations, selection_size)

        # positive samples
        metadata_vectors = list(zip(selected_names, selected_affiliations))
        for (a, b) in itertools.combinations(metadata_vectors, 2):
            distance = [
                human_name_distance(a[0], b[0]),
                affiliation_distance(a[1], b[1])
            ]
            X_positive.append(distance)

        # negative samples
        other_authors = random.sample(metadata[:author_idx] + metadata[author_idx + 1:],
                                      k=selection_size)
        try:
            other_authors_metadata_vectors = [
                [random.choice(author['names']), random.choice(author['affiliations'])]
                for author
                in other_authors 
            ]
        except IndexError:
            pprint.pprint(other_authors)
            raise
        for (a, b) in itertools.product(metadata_vectors, other_authors_metadata_vectors):
#             distance = metadata_vector_distance(a, b)
            distance = [
                human_name_distance(a[0], b[0]),
                affiliation_distance(a[1], b[1])
            ]
            print(a, b)
            print(distance)
            raise
            if all(d < 0.3 for d in distance):
#                 pprint.pprint(['False-negative sample', a, b, distance])
                continue
            X_negative.append(distance)
            
        for (name_1, name_2, other_author) in zip(selected_names,
                                                  selected_names[1:] + [selected_names[0]],
                                                  other_authors):
            a = (name_1, random.choice(selected_affiliations))
            b = (name_2, random.choice(other_author['affiliations']))
            distance = [
                human_name_distance(a[0], b[0]),
                affiliation_distance(a[1], b[1])
            ]
            print(a, b)
            print(distance)
            raise
            if all(d < 0.3 for d in distance):
#                 pprint.pprint(['False-negative sample', a, b, distance])
                continue
            X_negative.append(distance)
#             print(a)
#             print(b)
#             print(distance)
#             print()
        
    print(f'positive samples: {len(X_positive)}')
    print(f'negative samples: {len(X_negative)}')
    X = X_positive + X_negative
    y = [1] * len(X_positive) + [0] * len(X_negative)
    
    return X, y

In [53]:
# train_size = 3000
train_size = int(len(authors_cached) / 5)

authors_cached = list(author_collection.find())
train_metadata = authors_cached[:train_size]
test_metadata = authors_cached[train_size:]
# print(train_metadata[:2])
# raise

X_train, y_train = generate_dataset(train_metadata)
X_test, y_test = generate_dataset(test_metadata)

('bart vandereycken', 'katholieke leuven computer science celestijnenlaan 200a 3001 belgium') ['john kim', 'computer science']
[0.84, 0.0]


RuntimeError: No active exception to reraise

In [42]:
datasets_dir = 'datasets'
np.save(os.path.join(datasets_dir, 'train-authors'), X_train)
np.save(os.path.join(datasets_dir, 'train-authors-labels'), y_train)
np.save(os.path.join(datasets_dir, 'test-authors'), X_test)
np.save(os.path.join(datasets_dir, 'test-authors-labels'), y_test)

## Dataset evaluation

In [45]:
random.seed(42)
pd.DataFrame({
    'distance': pd.Series(random.sample(X_train, 100)),
    'label': pd.Series(random.sample(y_train, 100))
}).sort_values('distance', ascending=False)

Unnamed: 0,distance,label
62,"[0.92, 0.72]",0
97,"[0.88, 0.6]",0
48,"[0.88, 0.5700000000000001]",0
81,"[0.87, 0.69]",0
87,"[0.87, 0.5800000000000001]",1
82,"[0.85, 0.7]",0
92,"[0.83, 0.69]",0
21,"[0.8200000000000001, 0.88]",0
10,"[0.8200000000000001, 0.73]",0
69,"[0.8200000000000001, 0.56]",0


In [None]:
a = ['Olga Demurin', 'Novosibirsk']
b = ['Oleg Demurin', 'Novosibirsk']
metadata_vector_distance(a,b)

In [36]:
all([])

True