In [1]:
import pickle
import pandas as pd
import re

In [2]:
# Load the artist records. Later cells treat this as a sequence of dicts that
# carry 'personal_exh' and 'collective_exh' entries.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('data/data.pickle', 'rb') as artists_file:
    artists_data = pickle.load(artists_file)

In [3]:
# Read the city table, convert the population column to integers (after
# removing any square brackets from the raw text) and order the rows by
# population, largest first.
cities_df = (
    pd.read_csv('data/cities.csv', sep=',', encoding='utf-8')
    .assign(**{
        'Население': lambda frame: (
            frame['Население']
            .map(lambda raw: raw.replace('[', '').replace(']', ''))
            .astype(int)
        ),
    })
    .sort_values(by='Население', ascending=False)
)
# Rows that have no city name fall back to the region name.
cities_df['Город'] = cities_df['Город'].fillna(cities_df['Регион'])

In [4]:
# City names in table order (cities_df is sorted by population, descending).
cities = [city_name for city_name in cities_df['Город']]

In [5]:
# Load the lookup used by find_rus_city: keys are searched for in the place
# text and the matching value is returned as the city name.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('data/translit_dict.pickle', 'rb') as translit_file:
    translit_dict = pickle.load(translit_file)

In [6]:
def parse_exhibitions(exh):
    """Split a raw multi-line exhibition listing into per-exhibition records.

    The text is split into lines (``\\r\\n\\t`` and ``\\r\\n`` are treated as
    line breaks). Each stripped line is then handled as follows:

    * a line that is exactly four decimal digits is a year heading and becomes
      the current year for the entries that follow;
    * a line longer than 10 characters is an exhibition entry. If it starts
      with a four-digit number greater than 1950, that number becomes the
      current year and is removed from the entry together with any leading
      spaces, hyphens and en/em dashes;
    * every other line is ignored.

    Args:
        exh: Raw listing text.

    Returns:
        A list of ``{'place': <entry text>, 'year': <year string or None>}``
        dicts in input order. ``year`` is ``None`` until a year has been seen.
    """
    lines = exh.replace('\r\n\t', '\n').replace('\r\n', '\n').split('\n')
    res = []
    year = None
    for elem in lines:
        elem = elem.strip()
        # Only a purely numeric 4-character line is a year heading; a short
        # non-numeric line must not overwrite the current year.
        if len(elem) == 4 and elem.isdecimal():
            year = elem
        elif len(elem) > 10:
            # isdecimal() (unlike isdigit()) guarantees that int() succeeds.
            if elem[:4].isdecimal() and int(elem[:4]) > 1950:
                year = elem[:4]
                # Drop the separator between the year and the entry text;
                # en and em dashes are accepted alongside the plain hyphen.
                elem = elem[4:].lstrip(' -–—')
            res.append({'place': elem, 'year': year})
    return res

In [7]:
# These counters are never updated in this cell; they are kept only in case
# other cells read them.
col_cnt = 0
pers_cnt = 0
ok_cnt = 0
# Replace each artist's raw exhibition text with the parsed list of records.
for artist in artists_data:
    for field in ('personal_exh', 'collective_exh'):
        raw_listing = artist[field]
        # Parse only raw text. None (no listing) is left untouched, and so is
        # an already-parsed list, which makes re-running this cell safe
        # instead of failing inside parse_exhibitions.
        if isinstance(raw_listing, str):
            artist[field] = parse_exhibitions(raw_listing)

In [8]:
# Number of artist records at this point in the notebook. artists_data is
# filtered and reassigned further down, so this value can become stale.
n_vertex = len(artists_data)

In [9]:
def make_exh_graph(artists_data):
    """Build an adjacency matrix linking artists that share a collective exhibition.

    Two different artists are connected when at least one element of the
    first artist's ``'collective_exh'`` list is equal to an element of the
    second artist's list. Artists whose ``'collective_exh'`` is ``None`` get
    no connections.

    Args:
        artists_data: Sequence of dicts, each with a ``'collective_exh'`` key
            holding a list (or ``None``).

    Returns:
        ``(exhibition_graph, n_edges)`` where ``exhibition_graph`` is a square
        list-of-lists of 0/1 with one row and column per artist, and
        ``n_edges`` is the number of cells set to 1. Each connected pair is
        counted twice, once per direction.
    """
    # Size the matrix from the argument itself rather than from a module-level
    # variable, so the function works for any input list.
    size = len(artists_data)
    exhibition_graph = [[0] * size for _ in range(size)]
    n_edges = 0
    for i, artist in enumerate(artists_data):
        own_exhibitions = artist['collective_exh']
        if own_exhibitions is None:
            continue
        for j, artist_2 in enumerate(artists_data):
            if i == j:
                continue
            other_exhibitions = artist_2['collective_exh']
            if other_exhibitions is None:
                continue
            # One shared exhibition is enough to connect the two artists.
            if any(exh in other_exhibitions for exh in own_exhibitions):
                exhibition_graph[i][j] = 1
                n_edges += 1
    return exhibition_graph, n_edges

In [10]:
def _text_before_city(string, city):
    """Return the cleaned-up text that precedes the last occurrence of ``city``."""
    prefix = string[:string.rfind(city)]
    # Remove every 'г.' and ';' from the prefix. str.replace() takes a
    # *count* as its third argument, not a start offset, so no positional
    # argument is passed here.
    prefix = prefix.replace('г.', '').replace(';', '')
    return prefix.strip('/, ')


def find_rus_city(string, cities, translit_dict):
    """Locate a known city name inside ``string`` and split off the text before it.

    ``cities`` is searched first, in the given order; the first name found as
    a substring wins. If none matches, the keys of ``translit_dict`` are
    tried the same way and the corresponding value is reported as the city.

    Args:
        string: Text of one exhibition entry.
        cities: Iterable of city names to look for.
        translit_dict: Mapping from an alternative spelling to the city name
            to report.

    Returns:
        ``(1, prefix, city, 'Россия')`` on success, where ``prefix`` is the
        text before the last occurrence of the matched name with every
        ``'г.'`` and ``';'`` removed and surrounding ``'/'``, ``','`` and
        spaces stripped; ``(0, '', '', '')`` when nothing matches.
    """
    for city in cities:
        if city in string:
            return 1, _text_before_city(string, city), city.strip(), 'Россия'
    for alias in translit_dict:
        if alias in string:
            return 1, _text_before_city(string, alias), translit_dict[alias], 'Россия'
    return 0, '', '', ''

In [11]:
# Lower-case keywords that mark a comma-separated fragment as a venue
# description. find_names() embeds each entry between r'\b' anchors in a
# regular expression, so the entries act as regex fragments (a '.' inside an
# entry matches any character). The order of the entries is kept unchanged.
gallery_seq = [
    # Latin-script keywords
    'artplay', 'centre', 'center', 'dome', 'electromuseum', 'foundation',
    'gallery', 'galerie', 'hall', 'land-art', 'l’institut', 'mmoma',
    'open-air', 'stella', 'vglaz', 'vladey space',
    # Cyrillic keywords
    'академия', 'армянский', 'арт-проект', 'агенство',
    'библиотека', 'бульвар', 'бутик',
    'варочный цех', 'вднх', 'винзавод', 'выставка достижений',
    'газгалерея', 'галерея', 'гараж', 'гмии', 'гостиница', 'граунд',
    'гцси', 'грузинская', 'гуслица',
    'даче', 'двор', 'дворец', 'деревня', 'дк', 'дом', 'дома гоголя',
    'зал', 'замок', 'зоопарк',
    'издательство', 'институт',
    'кадетский корпус', 'калистово', 'кафе', 'клуб', 'коллегия',
    'красный октябрь', 'крепость', 'крокус', 'кузнецкий мост',
    'культпроект', 'комплекс', 'кинотеатр',
    'лаборатория', 'лес', 'лофт', 'ул.',
    'магазин', 'малая грузинская', 'мамм', 'манеж', 'мастерская', 'мгу',
    'министерство', 'ммома', 'ммси', 'мси', 'музей', 'музеи',
    'новая голландия', 'новый архив',
    'огород', 'особняк', 'отделение', 'отель',
    'павильон', 'палаты', 'пассаж', 'парк', 'планетарий', 'площадь',
    'покровка', 'посольство', 'поварская', 'паибни', 'пустырь',
    'ресторан', 'рюмочная', 'росфото',
    'сандуновских бань', 'село', 'сколково', 'сокольники',
    'союз художников', 'столовая', 'студия',
    'театр', 'территория',
    'ударник', 'университет', 'училище',
    'фабрика', 'филармония', 'филиал', 'фонд', 'фотопроект', 'фотоцентр',
    'форпост',
    'цдх', 'центр', 'цех', 'цси',
    'школа', 'шувалово',
    'экспоцентр', 'эрмитаж',
    'х.л.а.м',
    # The numbers 5..49, each surrounded by single spaces
    *(' {} '.format(number) for number in range(5, 50)),
]

In [12]:
def find_names(pref, gallery_seq):
    """Split a comma-separated description into exhibition title and venue.

    Each comma-separated fragment of ``pref`` is tested, lower-cased, against
    every entry of ``gallery_seq`` wrapped in ``\\b`` word boundaries. A
    fragment that matches some entry and does not contain the word
    'экспозиция' is appended to the venue. A fragment that is not appended to
    the venue goes to the title only while the venue is still empty; after
    the first venue fragment, non-matching fragments are dropped.

    Args:
        pref: Text to split.
        gallery_seq: Iterable of keyword strings, used as regex fragments.

    Returns:
        ``(name_exh, museum)``: both are the collected fragments joined with
        ``', '`` and stripped of surrounding spaces and commas.
    """
    title_parts = []
    venue_parts = []
    for fragment in pref.split(','):
        lowered = fragment.lower()
        looks_like_venue = any(
            re.search(r'\b' + keyword + r'\b', lowered) and 'экспозиция' not in lowered
            for keyword in gallery_seq
        )
        if looks_like_venue:
            venue_parts.append(fragment)
        if not venue_parts:
            title_parts.append(fragment)
    name_exh = ', '.join(title_parts).strip(' ,')
    museum = ', '.join(venue_parts).strip(' ,')
    return name_exh, museum

In [13]:
def exh_parser(place):
    """Parse an exhibition entry whose text contains a known city name.

    The city is located with ``find_rus_city`` using the module-level
    ``cities`` and ``translit_dict``. The text before the city is then split
    into an exhibition title and a venue:

    * curly double quotes are first replaced by guillemets;
    * if the first ``»`` lies in the first two thirds of the text and is
      followed by something other than a comma, ``', '`` is inserted after it;
    * with exactly one comma, the two sides are the title and the venue;
    * with no comma, full stops are turned into commas and ``find_names``
      (with the module-level ``gallery_seq``) decides;
    * otherwise ``find_names`` decides directly.

    Args:
        place: Text of one exhibition entry.

    Returns:
        ``(1, name_exh, museum, city, country)`` on success, or
        ``(0, '', '', '', '')`` when no known city is found.
    """
    status, pref, city, country = find_rus_city(place, cities, translit_dict)
    if status == 0:
        return 0, '', '', '', ''

    pref = pref.replace('”', '»').replace('“', '«')

    first_quote = pref.find('»')
    if first_quote != -1 and first_quote < len(pref) * 2 / 3:
        # Slicing, unlike indexing, cannot raise when '»' is the last
        # character of a very short prefix.
        after_quote = pref[first_quote + 1:]
        if after_quote and not after_quote.startswith(','):
            pref = pref[:first_quote + 1] + ', ' + after_quote.strip()

    n_commas = pref.count(',')
    if n_commas == 1:
        # Strip both parts so this branch yields values in the same form as
        # find_names(), which strips surrounding spaces and commas.
        name_exh, museum = (part.strip() for part in pref.split(','))
    else:
        if n_commas == 0:
            pref = pref.replace('.', ',')
        name_exh, museum = find_names(pref, gallery_seq)
    return 1, name_exh, museum, city, country

In [14]:
def parse_foreign(place):
    """Split an entry into title, venue, city and country by comma position.

    Recognised layouts:

    * three commas: ``title, venue, city, country``;
    * two commas: ``title, city, country`` (venue is empty);
    * otherwise, exactly one ``'»,'``: the title is everything up to and
      including ``»`` and the remainder must be ``venue, city, country`` or
      ``city, country``.

    Args:
        place: Text of one exhibition entry.

    Returns:
        ``(status, name_exh, museum, city, country)`` with every string
        stripped of surrounding whitespace. ``status`` is 1 when a layout
        matched and 0 otherwise. On failure all strings are empty, except
        that the title is still returned when a single ``'»,'`` was found but
        the remainder had an unexpected number of commas.
    """
    def _result(ok, title='', venue='', town='', land=''):
        return ok, title.strip(), venue.strip(), town.strip(), land.strip()

    pieces = place.split(',')
    if len(pieces) == 4:
        return _result(1, *pieces)
    if len(pieces) == 3:
        title, town, land = pieces
        return _result(1, title, '', town, land)
    if place.count('»,') != 1:
        return _result(0)

    title, location = place.split('»,')
    title += '»'
    where = location.split(',')
    if len(where) == 3:
        return _result(1, title, *where)
    if len(where) == 2:
        return _result(1, title, '', *where)
    return _result(0, title)

In [15]:
from collections import defaultdict

cnt = 0
# City name -> set of venue names seen in that city.
museums_dict = defaultdict(set)
quotes_beg = r'[”»]'
quotes_fin = r'[“]'
for artist in artists_data:
    # Missing exhibition lists become empty lists so they can be concatenated.
    for key in ('collective_exh', 'personal_exh'):
        if artist[key] is None:
            artist[key] = []
    for exh in artist['collective_exh'] + artist['personal_exh']:
        place = exh['place'].replace('\r\n', ' ')
        exh['place'] = place
        # Try the parser that relies on known city names first, then the
        # comma-layout parser; entries neither can handle get no extra fields.
        parsed = exh_parser(place)
        if not parsed[0]:
            parsed = parse_foreign(place)
        status, name_exh, museum, city, country = parsed
        if not status:
            continue
        if museum != '':
            museums_dict[city].add(museum.strip())
        exh.update(museum=museum, name=name_exh, city=city, country=country)

In [16]:
# Persist the city -> venues mapping built above.
with open('data/museums.pickle', 'wb') as museums_file:
    pickle.dump(museums_dict, museums_file)

In [17]:
# Keep only artists that have at least one exhibition of either kind, then save.
artists_data = [
    artist for artist in artists_data
    if len(artist['collective_exh'] + artist['personal_exh']) > 0
]
with open('data/all_artists_data.pickle', 'wb') as all_artists_file:
    pickle.dump(artists_data, all_artists_file)

In [18]:
from copy import deepcopy

# Build an independent copy of every artist in which both exhibition lists
# keep only the entries whose 'country' field equals 'Россия'. Entries that
# have no 'country' field are dropped.
rus_artists_data = []
for artist in artists_data:
    rus_artist = deepcopy(artist)
    for key in ('collective_exh', 'personal_exh'):
        rus_artist[key] = [
            entry for entry in rus_artist[key]
            if 'country' in entry and entry['country'] == 'Россия'
        ]
    rus_artists_data.append(rus_artist)

In [19]:
# Drop artists left without any exhibition after the country filter, then save.
rus_artists_data = [
    artist for artist in rus_artists_data
    if len(artist['collective_exh'] + artist['personal_exh']) > 0
]
with open('data/rus_artists_data.pickle', 'wb') as rus_artists_file:
    pickle.dump(rus_artists_data, rus_artists_file)