In [None]:
import itertools
import pprint
import os
import random

from pymongo import MongoClient
import numpy as np
import pandas as pd

from esc_identifier.cluster import distance_matrix, dbscan, get_clusters
from esc_identifier.distance import token_set_distance, levenshtein
from esc_identifier.utils.string import (
    normalize_human_name, normalize_affiliation)
from jupyter_utils import log_progress

In [None]:
# Load the autoreload extension and set it to mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Connection settings for the MongoDB instance that holds the KDD 2013 data.
MONGO_URI = 'mongodb://localhost:27017/'

# One client for the whole notebook; database and collections are looked up
# by name with subscript access.
mongo_client = MongoClient(MONGO_URI)
kdd_db = mongo_client['kdd2013']
author_raw_collection = kdd_db['author_raw']  # source: raw author metadata
author_collection = kdd_db['author']  # target: cleaned author metadata

## Cleaning raw metadata
Clean the author metadata from the `author_raw` collection and save it to the `author` collection.

In [3]:
def clean_list(lst, distance_function, dbscan_eps):
    """Keep only the items of ``lst`` that belong to its largest cluster.

    The items are clustered with ``get_clusters`` and everything outside the
    biggest cluster is dropped.

    Args:
        lst: Indexable sequence of items to clean.
        distance_function: Pairwise distance callable, forwarded to
            ``get_clusters`` as ``distance_function``.
        dbscan_eps: Forwarded to ``get_clusters`` as ``eps``.

    Returns:
        list: The members of the largest cluster, in the order the cluster
        lists them. Empty when ``lst`` is empty or when no cluster is
        reported.
    """
    if not lst:
        return []

    # NOTE(review): each cluster is consumed below as a collection of indices
    # into ``lst`` -- confirm against get_clusters.
    indices_clusters = get_clusters(
        lst,
        distance_function=distance_function,
        eps=dbscan_eps
    )

    # ``default`` turns "no clusters at all" into an empty result instead of
    # the opaque ValueError that max() raises on an empty sequence.
    biggest_cluster = max(indices_clusters, key=len, default=())

    return [lst[idx] for idx in biggest_cluster]

In [None]:
# ``eps`` values handed to clean_list for names and affiliations respectively.
name_eps = 0.3
affiliation_eps = 0.1


def _unique_in_order(items):
    """Drop duplicates from ``items`` while keeping first-seen order."""
    return list(dict.fromkeys(items))


def process_author(author_raw, distance_function):
    """Build a cleaned copy of one raw author document.

    Affiliations and names are normalized, reduced to their largest cluster
    via ``clean_list`` and de-duplicated. Every other key of ``author_raw`` is
    carried over unchanged (shallow copy); ``author_raw`` itself is not
    modified.

    Args:
        author_raw: Mapping with at least the keys ``'affiliations'`` and
            ``'names'``, each an iterable of strings.
        distance_function: Pairwise distance callable forwarded to
            ``clean_list`` for both affiliations and names.

    Returns:
        dict: Copy of ``author_raw`` whose ``'affiliations'`` and ``'names'``
        hold the cleaned, de-duplicated values.

    Raises:
        AssertionError: If a cleaned affiliation or name is falsy (e.g. an
            empty string).
    """
    author_processed = author_raw.copy()

    normalized_affiliations = [
        normalize_affiliation(affiliation)
        for affiliation
        in author_raw['affiliations']
    ]
    # Affiliations that normalize to an empty value are dropped before clustering.
    affiliations_not_empty = [
        affiliation
        for affiliation
        in normalized_affiliations
        if len(affiliation) > 0
    ]
    affiliations_processed = clean_list(
        lst=affiliations_not_empty,
        distance_function=distance_function,
        dbscan_eps=affiliation_eps
    )
    # Order-preserving de-duplication: list(set(...)) would order strings by
    # their randomized hashes, making the stored documents differ per run.
    author_processed['affiliations'] = _unique_in_order(affiliations_processed)
    assert all(author_processed['affiliations'])

    normalized_names = [
        normalize_human_name(name)
        for name
        in author_raw['names']
    ]
    names_processed = clean_list(
        lst=normalized_names,
        distance_function=distance_function,
        dbscan_eps=name_eps
    )
    author_processed['names'] = _unique_in_order(names_processed)
    assert all(author_processed['names'])

    return author_processed

In [None]:
# Clean every raw author document and collect the results in memory.
distance_function = token_set_distance

# Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# count_documents is the supported way to get the size for the progress bar.
authors_raw_count = author_raw_collection.count_documents({})
authors_raw = author_raw_collection.find()

processed_authors = []
# The items yielded by log_progress are unpacked as (index, author) pairs.
for index, author in log_progress(authors_raw, every=1, size=authors_raw_count):
    # Authors without any raw affiliation are left out of the cleaned set.
    if not author['affiliations']:
        continue

    processed_authors.append(
        process_author(author, distance_function)
    )

In [None]:
# Replace the contents of the `author` collection with the cleaned documents.
# The emptiness check comes first: insert_many rejects an empty list, and
# failing only after delete_many would leave the collection wiped.
if not processed_authors:
    raise ValueError('No processed authors to insert; author collection left untouched')

author_collection.delete_many({})
result = author_collection.insert_many(processed_authors)
n_inserted_doc = len(result.inserted_ids)
print(f'Inserted {n_inserted_doc} documents')

## Dataset generation

In [None]:
def uniform_select(iterable, k):
    """Pick ``k`` items evenly spread over ``iterable``, first to last.

    Positions are ``k`` evenly spaced values between ``0`` and
    ``len(items) - 1``, converted to integers, so items repeat when ``k``
    exceeds the number of items.

    Args:
        iterable: Any iterable; it is materialized into a list first.
        k: Number of items to select.

    Returns:
        list: The selected items, in input order. Empty when ``iterable`` is
        empty or ``k`` is 0.
    """
    items = list(iterable)
    if not items:
        # Nothing to select from; indexing position 0 would raise IndexError.
        return []

    # np.intp is wide enough for any list index. A narrow dtype such as int8
    # wraps around above 127 and silently selects the wrong elements.
    selection_indices = np.linspace(0, len(items) - 1, k, dtype=np.intp)

    return [items[idx] for idx in selection_indices]

In [None]:
def _add_negative_sample(X_negative, a, b, false_negative_threshold):
    """Append the distance between ``a`` and ``b`` unless it looks like a match."""
    distance = metadata_vector_distance(a, b)
    if all(d < false_negative_threshold for d in distance):
        # Every component is below the threshold: the pair is too similar to
        # be a trustworthy negative, so it is reported and skipped.
        pprint.pprint(['False-negative sample', a, b, distance])
        return
    X_negative.append(distance)


def generate_dataset(metadata, max_selection_size=5, false_negative_threshold=0.3):
    """Build labelled distance samples from cleaned author documents.

    Positive samples (label 1) are distances between (name, affiliation)
    pairs of the same author. Negative samples (label 0) are distances
    between an author's pairs and pairs drawn from randomly chosen other
    authors, plus pairs that combine two of the author's names with an own
    and a foreign affiliation.

    Sampling uses the global ``random`` module; seed it beforehand for
    repeatable output.

    NOTE(review): relies on ``metadata_vector_distance`` and
    ``uniform_select`` being defined in the notebook namespace.
    ``metadata_vector_distance(a, b)`` is consumed as an iterable of
    per-field distances -- confirm against its definition.

    Args:
        metadata: List of author mappings with ``'names'`` and
            ``'affiliations'`` lists.
        max_selection_size: Upper bound on the (name, affiliation) pairs
            taken per author and on the other authors sampled for negatives.
        false_negative_threshold: A candidate negative is discarded (and
            printed) when all of its distance components are below this.

    Returns:
        tuple: ``(X, y)`` where ``X`` lists the positive distances followed
        by the negative ones and ``y`` holds the matching 1/0 labels.
    """
    X_positive = []
    X_negative = []
    n_others = len(metadata) - 1

    for author_idx, author_metadata in enumerate(metadata):
        names = author_metadata['names']
        affiliations = author_metadata['affiliations']

        selection_size = min(max_selection_size, max(len(names), len(affiliations)))
        # At least two pairs are needed for a positive sample, and an author
        # lacking names or affiliations cannot form any pair at all.
        if selection_size < 2 or not names or not affiliations:
            continue

        sorted_names = sorted(names, key=len, reverse=True)
        selected_names = uniform_select(sorted_names, selection_size)

        sorted_affiliations = sorted(affiliations, key=len, reverse=True)
        selected_affiliations = uniform_select(sorted_affiliations, selection_size)

        # positive samples
        metadata_vectors = list(zip(selected_names, selected_affiliations))
        for (a, b) in itertools.combinations(metadata_vectors, 2):
            X_positive.append(metadata_vector_distance(a, b))

        # negative samples
        # Sample positions among the other authors and shift those at or past
        # the current author by one, instead of copying the whole list for
        # every author. The sample size is capped so small inputs do not make
        # random.sample raise.
        sampled_positions = random.sample(range(n_others),
                                          k=min(selection_size, n_others))
        other_authors = [
            metadata[position if position < author_idx else position + 1]
            for position
            in sampled_positions
        ]
        # Other authors without names or affiliations cannot supply a pair.
        other_authors = [
            author
            for author
            in other_authors
            if author['names'] and author['affiliations']
        ]
        other_authors_metadata_vectors = [
            [random.choice(author['names']), random.choice(author['affiliations'])]
            for author
            in other_authors
        ]
        for (a, b) in itertools.product(metadata_vectors, other_authors_metadata_vectors):
            _add_negative_sample(X_negative, a, b, false_negative_threshold)

        # Pair each selected name with the next one (wrapping around), giving
        # one side an own affiliation and the other a foreign one.
        rotated_names = selected_names[1:] + selected_names[:1]
        for (name_1, name_2, other_author) in zip(selected_names,
                                                  rotated_names,
                                                  other_authors):
            a = (name_1, random.choice(selected_affiliations))
            b = (name_2, random.choice(other_author['affiliations']))
            _add_negative_sample(X_negative, a, b, false_negative_threshold)

    print(f'positive samples: {len(X_positive)}')
    print(f'negative samples: {len(X_negative)}')
    X = X_positive + X_negative
    y = [1] * len(X_positive) + [0] * len(X_negative)

    return X, y

In [None]:
# Pull every document of the cleaned `author` collection into memory once so
# the following cells work on a plain list instead of a database cursor.
authors_cached = list(author_collection.find())
print(f'{len(authors_cached)} authors')

In [None]:
# The first `train_size` cached authors form the training split and the
# remainder the test split, so the two splits never share an author.
train_size = 3000
train_metadata = authors_cached[:train_size]
test_metadata = authors_cached[train_size:]

# generate_dataset draws its negative samples with the global `random`
# module; seeding here makes a re-run on the same cached authors draw the
# same samples.
random.seed(42)
X_train, y_train = generate_dataset(train_metadata)
X_test, y_test = generate_dataset(test_metadata)

In [None]:
# Persist features and labels of both splits as .npy files.
datasets_dir = 'datasets'
# np.save does not create missing directories; make sure the target exists.
os.makedirs(datasets_dir, exist_ok=True)
np.save(os.path.join(datasets_dir, 'train-authors'), X_train)
np.save(os.path.join(datasets_dir, 'train-authors-labels'), y_train)
np.save(os.path.join(datasets_dir, 'test-authors'), X_test)
np.save(os.path.join(datasets_dir, 'test-authors-labels'), y_test)

## Dataset evaluation

In [None]:
# Preview up to 100 random training samples together with their labels.
random.seed(42)
# Draw the row positions once and use them for both columns. Sampling X_train
# and y_train separately would pair each distance with an unrelated label.
preview_size = min(100, len(X_train))
preview_indices = random.sample(range(len(X_train)), preview_size)
pd.DataFrame({
    'distance': pd.Series([X_train[i] for i in preview_indices]),
    'label': pd.Series([y_train[i] for i in preview_indices])
}).sort_values('distance', ascending=False)

In [None]:
# Spot check: distance between two [name, affiliation] vectors whose names
# differ only slightly and whose affiliations are identical.
# NOTE(review): metadata_vector_distance is neither imported nor defined in
# the visible cells -- confirm where it comes from, otherwise this cell fails
# on a fresh kernel.
a = ['Olga Demurin', 'Novosibirsk']
b = ['Oleg Demurin', 'Novosibirsk']
metadata_vector_distance(a,b)