In [123]:
import pickle
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from collections import defaultdict

In [294]:
# Load the raw museum data. Later cells index this object by city name
# ('Москва', 'Санкт-Петербург') and pass each value through preproc_museums,
# so it is presumably a mapping from city to an iterable of raw name strings.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# this file if it comes from a trusted source.
with open('data/museums.pickle', 'rb') as f:
    museums_dict = pickle.load(f)

In [225]:
def preproc_museums(string):
    """Normalize a raw museum name so that spelling variants compare equal.

    The name is cleaned in three steps:

    1. zero-width spaces (U+200B) and non-breaking spaces (U+00A0) are removed;
    2. quotation marks and parentheses are removed;
    3. leading/trailing spaces, dots and hyphens are stripped and the result
       is lower-cased.

    Stripping happens *after* the removals. If it ran first, edge characters
    hidden behind a quote or an invisible character would survive
    (``'"Галерея "'`` would become ``'галерея '`` with a trailing space) and
    the same museum would end up under two different keys.

    Parameters
    ----------
    string : str
        Raw museum name.

    Returns
    -------
    str
        The normalized name; may be empty if nothing but punctuation was given.
    """
    # NOTE(review): U+00A0 is deleted rather than turned into a regular space,
    # which glues together words separated only by a non-breaking space.
    # Kept as in the original; confirm this is intended.
    cleaned = string.replace('\u200b', '').replace('\xa0', '')
    for char in '”»“«"()':
        cleaned = cleaned.replace(char, '')
    return cleaned.strip(' .-').lower()

In [280]:
# Unique, normalized, non-empty Moscow museum names.
# The names are sorted because iteration order of a set of strings is not
# stable between interpreter runs; an unsorted array would make the clustering
# (which breaks ties between equally short names by position) irreproducible.
# Empty names are filtered before the array is built so that an empty result
# still yields a well-formed string array.
museums = np.array(
    sorted({name for name in map(preproc_museums, museums_dict['Москва']) if name}),
    dtype=str,
)
all_museums = museums

In [283]:
def gen_params():
    """Yield ``(eps, min_samples)`` pairs in the order they should be tried.

    ``eps`` walks from 0.5 to 1.5 in steps of 0.1; for every ``eps`` the
    ``min_samples`` values 6, 5, 4, 4, 3, 3, 2, 2 are produced in that order
    (repeated values included), giving 88 pairs in total.
    """
    min_samples_schedule = (6, 5, 4, 4, 3, 3, 2, 2)
    for tenths in range(5, 16):
        radius = tenths / 10
        yield from ((radius, count) for count in min_samples_schedule)

In [320]:
def clustering(all_museums, debug=True):
    """Group near-duplicate museum names with repeated DBSCAN passes over TF-IDF.

    Each pass vectorizes the still-unclustered names with TF-IDF (uni- and
    bigrams, ``max_df=0.9``) and runs DBSCAN on them. Every cluster found is
    stored under its shortest member name, and its members are removed from
    the pool. The first pass uses ``eps=0.5, min_samples=2``; later passes
    take their parameters from ``gen_params()``. Passes stop once at most 20%
    of the input names remain unclustered, or when ``gen_params()`` runs out.

    Fixes relative to the previous version:

    * clusters are enumerated from the labels actually returned, so no cluster
      is dropped when a pass produces no noise points (label ``-1``);
    * clustered names are removed right after each pass, so clusters from the
      final pass are no longer overwritten by the "leftover" loop, and the
      vectorizer is never fitted on an empty pool;
    * leftover names are stored as one-element lists, so every value in the
      result is a list (a bare string would be iterated character by
      character by ``make_reverse_dict``);
    * exhausting ``gen_params()`` ends the loop instead of raising
      ``StopIteration``.

    Parameters
    ----------
    all_museums : array-like of str
        Normalized museum names (1-D).
    debug : bool, default True
        If true, print every cluster as it is found.

    Returns
    -------
    dict
        Maps a representative name to the list of names in its cluster;
        unclustered names map to a one-element list containing themselves.
    """
    all_museums = np.asarray(all_museums)
    cluster_dict = dict()
    if all_museums.size == 0:
        return cluster_dict

    stop_size = len(all_museums) * 2 / 10
    remaining = all_museums
    params = gen_params()
    eps, min_samples = 0.5, 2  # parameters of the first pass
    while len(remaining) > stop_size:
        db_pipeline = Pipeline(steps=[
            ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_df=0.9)),
            ('dbscan', DBSCAN(eps=eps, min_samples=min_samples)),
        ])
        db_pred = db_pipeline.fit_predict(remaining)
        for label in np.unique(db_pred):
            if label == -1:  # DBSCAN marks noise points with -1
                continue
            members = remaining[db_pred == label]
            if debug:
                print(label, ' __________________________\n', sep='')
                print(members)
            # The shortest name (first one on ties) represents the cluster.
            representative = members[np.argmin([len(elem) for elem in members])]
            cluster_dict[representative] = list(members)
        # Only the noise points go on to the next, more permissive pass.
        remaining = remaining[db_pred == -1]
        try:
            eps, min_samples = next(params)
        except StopIteration:
            break

    for museum in remaining:
        cluster_dict[museum] = [museum]
    return cluster_dict

In [324]:
def make_reverse_dict(cluster_dict, city):
    """Invert a cluster mapping into a ``(name, city) -> canonical name`` lookup.

    Parameters
    ----------
    cluster_dict : dict
        Maps a canonical name to the names belonging to its cluster. A value
        may also be a single string, which is treated as a one-element
        cluster instead of being iterated character by character.
    city : str
        City the clusters belong to; it becomes the second element of every
        tuple key.

    Returns
    -------
    dict
        ``(member, city)`` maps to the cluster's canonical name. For clusters
        whose canonical name contains 'винзавод', the canonical name is
        forced to 'винзавод'. That override is now also stored under the
        ``(member, city)`` key; previously it was written only under the bare
        ``member`` key, which tuple lookups never hit. The bare key is still
        written for backward compatibility.
    """
    reverse_dict = dict()
    for key, members in cluster_dict.items():
        if isinstance(members, str):
            members = [members]
        is_winzavod = 'винзавод' in key
        for elem in members:
            if is_winzavod:
                reverse_dict[(elem, city)] = 'винзавод'
                reverse_dict[elem] = 'винзавод'
            else:
                reverse_dict[(elem, city)] = key
    return reverse_dict

In [332]:
def unique_museum_names(raw_names):
    """Return the unique, normalized, non-empty names as a sorted string array.

    Sorting makes the result reproducible: iteration order of a set of strings
    is not stable between interpreter runs, and the clustering breaks ties
    between equally short names by position.
    """
    names = sorted({name for name in map(preproc_museums, raw_names) if name})
    return np.array(names, dtype=str)


museums_msk = unique_museum_names(museums_dict['Москва'])
museums_spb = unique_museum_names(museums_dict['Санкт-Петербург'])

# Cluster each city separately, then merge the per-city lookups. Tuple keys
# carry the city, so they cannot collide between the two dictionaries.
reverse_dict_msk = make_reverse_dict(clustering(museums_msk, debug=False), 'Москва')
reverse_dict_spb = make_reverse_dict(clustering(museums_spb, debug=False), 'Санкт-Петербург')
reverse_dict = {**reverse_dict_msk, **reverse_dict_spb}

In [306]:
# Persist the museum-name lookup (reverse_dict) for reuse outside this notebook.
# NOTE(review): this cell's execution count (306) is lower than that of the
# cell that builds reverse_dict (332), so the file on disk may predate the
# current reverse_dict -- re-run this cell after rebuilding it.
with open('data/reverse_museums_dict.pickle', 'wb') as f:
    pickle.dump(reverse_dict, f)

In [307]:
# Load the artist records. The next cell iterates over this object and reads
# each record's 'collective_exh' and 'personal_exh' entries.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# this file if it comes from a trusted source.
with open('data/artists_data.pickle', 'rb') as f:
    artists_data = pickle.load(f)

In [334]:
# Replace every exhibition's museum with its canonical name from reverse_dict.
# Exhibitions without a 'city' key or with a None museum are left untouched;
# a museum that has no entry in reverse_dict is blanked out ('').
# The records in artists_data are modified in place.
for artist in artists_data:
    for exh_field in ('collective_exh', 'personal_exh'):
        exhibitions = artist[exh_field]
        if exhibitions is None:
            continue
        for exhibition in exhibitions:
            if 'city' not in exhibition or exhibition['museum'] is None:
                continue
            lookup_key = (preproc_museums(exhibition['museum']), exhibition['city'])
            exhibition['museum'] = reverse_dict.get(lookup_key, '')