In [2]:
import xml.etree.ElementTree as ET

# Parse the SDMX export once and keep its root element; the later cells query this tree.
xml_data = ET.parse("data.xml").getroot()

# Prefix -> namespace URI map handed to find()/findall() so the path expressions
# below can use short prefixes such as "structure:" and "generic:".
# The "xmlns" key is the prefix used for elements in the SDMX message namespace.
ns_dict = {"xmlns":"http://www.SDMX.org/resources/SDMXML/schemas/v1_0/message",
           "common":"http://www.SDMX.org/resources/SDMXML/schemas/v1_0/common",
           "compact":"http://www.SDMX.org/resources/SDMXML/schemas/v1_0/compact",
           "cross":"http://www.SDMX.org/resources/SDMXML/schemas/v1_0/cross",
           "generic":"http://www.SDMX.org/resources/SDMXML/schemas/v1_0/generic",
           "query":"http://www.SDMX.org/resources/SDMXML/schemas/v1_0/query",
           "structure":"http://www.SDMX.org/resources/SDMXML/schemas/v1_0/structure",
           "utility":"http://www.SDMX.org/resources/SDMXML/schemas/v1_0/utility",
           "xsi":"http://www.w3.org/2001/XMLSchema-instance"}

In [3]:
# Build the OKATO code -> region name lookup from the 'mОКАТО' code list.
okato_xml = xml_data.find(".//structure:CodeList[@id='mОКАТО']", ns_dict)
if okato_xml is None:
    # Fail with a readable message instead of an AttributeError on None below.
    raise LookupError("CodeList 'mОКАТО' not found in the parsed XML")

okato_dict = {}
for region in okato_xml.findall("structure:Code", ns_dict):
    # Read the label from the code's own Description children instead of
    # overwriting the entry once per descendant and keeping whatever came last.
    descriptions = region.findall("structure:Description", ns_dict)
    # Fallback keeps the previous result (text of the last element in the
    # subtree) for codes that carry no structure:Description child.
    label_node = descriptions[-1] if descriptions else list(region.iter())[-1]
    okato_dict[region.attrib["value"]] = label_node.text
print(okato_dict)

{'643': 'Российская Федерация', '30': 'Центральный федеральный округ', '140000000001': 'Белгородская область', '150000000001': 'Брянская область', '170000000001': 'Владимирская область', '200000000001': 'Воронежская область', '240000000001': 'Ивановская область', '290000000001': 'Калужская область', '340000000001': 'Костромская область', '380000000001': 'Курская область', '420000000001': 'Липецкая область', '460000000001': 'Московская область', '540000000001': 'Орловская область', '610000000001': 'Рязанская область', '660000000001': 'Смоленская область', '680000000001': 'Тамбовская область', '280000000001': 'Тверская область', '700000000001': 'Тульская область', '780000000001': 'Ярославская область', '450000000001': 'Город Москва столица Российской Федерации город федерального значения', '31': 'Северо-Западный Федеральный округ', '860000000001': 'Республика Карелия', '870000000001': 'Республика Коми', '110000000001': 'Архангельская область', '111000000001': 'Ненецкий автономный округ (

In [4]:
# Build the illness code -> illness name lookup from the 'ill_15' code list.
ill_xml = xml_data.find(".//structure:CodeList[@id='ill_15']", ns_dict)
if ill_xml is None:
    # Fail with a readable message instead of an AttributeError on None below.
    raise LookupError("CodeList 'ill_15' not found in the parsed XML")

ill_dict = {}
for ill in ill_xml.findall("structure:Code", ns_dict):
    # Read the label from the code's own Description children instead of
    # overwriting the entry once per descendant and keeping whatever came last.
    descriptions = ill.findall("structure:Description", ns_dict)
    # Fallback keeps the previous result (text of the last element in the
    # subtree) for codes that carry no structure:Description child.
    label_node = descriptions[-1] if descriptions else list(ill.iter())[-1]
    ill_dict[ill.attrib["value"]] = label_node.text
print(ill_dict)

{'O00-O99': 'Беременность, роды и послеродовой период', 'H00-H59': 'Болезни глаза и его придаточного аппарата', 'M00-M99': 'Болезни костно-мышечной системы и соединительной ткани', 'N00-N99': 'Болезни мочеполовой системы', 'G00-G99': 'Болезни нервной системы', 'J00-J99': 'Болезни органов дыхания', 'K00-K93': 'Болезни органов пищеварения', 'I00-I99': 'Болезни системы кровообращения', 'H60-H95': 'Болезни уха и сосцевидного отростка', 'E00-E90': 'Болезни эндокринной системы, расстройства питания и нарушения обмена веществ', 'Q00-Q99': 'Врожденные аномалии (пороки развития), деформации и хромосомные нарушения', 'A00-T98': 'Все заболевания', 'C00-C97': 'Злокачественные новообразования', 'A00-B99': 'Некоторые инфекционные и паразитарные болезни', 'C00-D48': 'Новообразования', 'P00-P96': 'Отдельные состояния, возникающие в перинатальном периоде', 'V00-V99': 'Прочие болезни', 'F00-F99': 'Психические расстройства и расстройства поведения', 'S00-T98': 'Травмы, отравления и другие последствия воз

In [5]:
# Build the age-group code -> age-group label lookup from the 'old_15' code list.
age_xml = xml_data.find(".//structure:CodeList[@id='old_15']", ns_dict)
if age_xml is None:
    # Fail with a readable message instead of an AttributeError on None below.
    raise LookupError("CodeList 'old_15' not found in the parsed XML")

age_dict = {}
for age in age_xml.findall("structure:Code", ns_dict):
    # Read the label from the code's own Description children instead of
    # overwriting the entry once per descendant and keeping whatever came last.
    descriptions = age.findall("structure:Description", ns_dict)
    # Fallback keeps the previous result (text of the last element in the
    # subtree) for codes that carry no structure:Description child.
    label_node = descriptions[-1] if descriptions else list(age.iter())[-1]
    age_dict[age.attrib["value"]] = label_node.text
print(age_dict)

{'2': '0-14 лет', '3': '15-17 лет', '4': '18 лет и старше', '1': 'Всего'}


In [6]:
# Flatten every generic:Series of the data set into one plain dict per series.
data_xml = xml_data.find(".//xmlns:DataSet", ns_dict)
data_list = []
for series in data_xml.findall("generic:Series", ns_dict):
    # The series key contributes one "concept" -> "value" pair per child element.
    key_node = series.find("generic:SeriesKey", ns_dict)
    row = {item.attrib["concept"]: item.attrib["value"] for item in key_node}
    # find() returns the first match, so only the first Obs of a series is read.
    obs = series.find("generic:Obs", ns_dict)
    row["Time"] = obs.find("generic:Time", ns_dict).text
    row["Value"] = obs.find("generic:ObsValue", ns_dict).attrib["value"]
    data_list.append(row)
    

In [7]:
import pandas as pd

# One row per parsed series; the columns come from the keys of the dicts in data_list.
health_data = pd.DataFrame(data_list)
# Preview the first rows to check the parsed columns.
health_data.head()

Unnamed: 0,Time,Value,ill_15,mОКАТО,old_15
0,2005,235,O00-O99,643,2
1,2006,192,O00-O99,643,2
2,2007,195,O00-O99,643,2
3,2008,243,O00-O99,643,2
4,2009,195,O00-O99,643,2


In [8]:
# Add human-readable columns by looking each raw code up in the code-list dicts.
# A code missing from a dict raises KeyError rather than silently producing NaN.
health_data["Illness"] = health_data["ill_15"].apply(lambda code: ill_dict[code])
health_data["Age"] = health_data["old_15"].apply(lambda code: age_dict[code])
health_data["Region"] = health_data["mОКАТО"].apply(lambda code: okato_dict[code])

# Values arrive as strings with a decimal comma. Going through str() first keeps
# this cell re-runnable: on a second run the column already holds floats, which
# have no .replace() method.
health_data["Value"] = health_data["Value"].apply(lambda raw: float(str(raw).replace(",", ".")))
health_data["Time"] = health_data["Time"].apply(int)
# Optional export of the decoded table, left disabled.
#health_data.to_csv("health_data.csv")

In [9]:
def get_best_five_regions(data, illness, age, time):
    """Return the five regions with the lowest non-zero value for one selection.

    Parameters:
        data: DataFrame with the columns "mОКАТО", "ill_15", "old_15", "Time"
            and "Value".
        illness: code compared against the "ill_15" column.
        age: code compared against the "old_15" column.
        time: value compared against the "Time" column.

    Returns:
        Up to five rows sorted by "Value" ascending. The index is reset, so the
        original row labels appear in an "index" column.
    """
    okato = data["mОКАТО"]
    # Build the mask from `data` itself; it used to come from the global
    # `health_data`, which breaks for any other frame passed in.
    # Only codes longer than four characters count as regions here.
    is_region = okato.str.len() > 4
    selected = (
        is_region
        & (data["ill_15"] == illness)
        & (data["old_15"] == age)
        & (data["Time"] == time)
        & (okato != "643")       # "643" is the country-level row
        & (data["Value"] != 0)   # rows with a zero value are left out of the ranking
    )
    return data.loc[selected].sort_values(by="Value", ascending=True).head().reset_index()

def get_worst_five_regions(data, illness, age, time):
    """Return the five regions with the highest non-zero value for one selection.

    Parameters:
        data: DataFrame with the columns "mОКАТО", "ill_15", "old_15", "Time"
            and "Value".
        illness: code compared against the "ill_15" column.
        age: code compared against the "old_15" column.
        time: value compared against the "Time" column.

    Returns:
        Up to five rows sorted by "Value" ascending, so the largest value is
        last. The index is reset, so the original row labels appear in an
        "index" column.
    """
    okato = data["mОКАТО"]
    # Build the mask from `data` itself; it used to come from the global
    # `health_data`, which breaks for any other frame passed in.
    # NOTE(review): get_best_five_regions uses "> 4" here; the "> 2" threshold is
    # kept unchanged, confirm which of the two is intended.
    is_region = okato.str.len() > 2
    selected = (
        is_region
        & (data["ill_15"] == illness)
        & (data["old_15"] == age)
        & (data["Time"] == time)
        & (okato != "643")       # "643" is the country-level row
        & (data["Value"] != 0)   # rows with a zero value are left out of the ranking
    )
    return data.loc[selected].sort_values(by="Value", ascending=True).tail().reset_index()

In [10]:
# Sanity check: illness code "J00-J99", age code "1", year 2005.
print(get_worst_five_regions(health_data, "J00-J99", "1", 2005))

   index  Time    Value   ill_15        mОКАТО old_15  \
0  21666  2005  47943.9  J00-J99  940000000001      1   
1   9246  2005  48349.6  J00-J99  870000000001      1   
2  27876  2005  52948.9  J00-J99  711400000001      1   
3  10074  2005  63538.4  J00-J99  111000000001      1   
4  37812  2005  64957.1  J00-J99  770000000001      1   

                   Illness    Age  \
0  Болезни органов дыхания  Всего   
1  Болезни органов дыхания  Всего   
2  Болезни органов дыхания  Всего   
3  Болезни органов дыхания  Всего   
4  Болезни органов дыхания  Всего   

                                              Region  
0                              Удмуртская Республика  
1                                    Республика Коми  
2  Ямало-Ненецкий автономный округ (Тюменская обл...  
3  Ненецкий автономный округ (Архангельская область)  
4                         Чукотский автономный округ  


In [11]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

# Tell Bokeh to render plots inline in the notebook when show() is called.
output_notebook()

In [12]:
def get_russia_value(data, illness, age, time):
    """Return the largest "Value" among the country-level rows of one selection.

    Country-level rows are those whose "mОКАТО" code equals "643"; the rows are
    further restricted to the given illness code, age code and time.
    """
    matches_selection = (
        (data["ill_15"] == illness)
        & (data["old_15"] == age)
        & (data["Time"] == time)
    )
    is_russia = data["mОКАТО"] == "643"
    national_values = data.loc[matches_selection & is_russia, "Value"]
    return national_values.max()

In [13]:
from ipywidgets import interact
def generate_divergent_plot(illness, age, time, data=health_data):
    """Plot the best and worst regions as bars diverging from the national value.

    Parameters:
        illness: illness code, compared against the "ill_15" column.
        age: age-group code, compared against the "old_15" column.
        time: value compared against the "Time" column.
        data: frame to read from; defaults to the `health_data` object that
            exists when this function is defined.

    Returns:
        The Bokeh figure, after it has been shown.

    Raises:
        ValueError: if no regional rows match the selection.
    """
    russia_value = get_russia_value(data, illness, age, time)
    test_good = get_best_five_regions(data, illness, age, time)
    test_bad = get_worst_five_regions(data, illness, age, time)
    test_good["Russia"] = russia_value
    test_bad["Russia"] = russia_value
    # DataFrame.append was removed in pandas 2.0; pd.concat builds the same
    # frame on both old and new pandas versions.
    test_data = pd.concat([test_bad, test_good], ignore_index=True)
    if test_data.empty:
        # Previously this surfaced as an opaque "KeyError: 0" from the title lookup.
        raise ValueError(
            "no regional data for illness=%r, age=%r, time=%r" % (illness, age, time)
        )
    title_str = "%s: %s" % (test_data["Illness"].iloc[0], test_data["Time"].iloc[0])
    # A region can be in both groups when fewer than ten regions match; the
    # categorical axis must not list the same name twice.
    region_axis = test_data["Region"].drop_duplicates()
    plot = figure(y_range = region_axis, title = title_str, plot_width = 800,
                 x_axis_label = "Заболеваемость", y_axis_label = "Регион")
    # Best regions: bar from the region's value up to the national value.
    plot.hbar(y = test_good["Region"], height = 0.9,
          right = test_good["Russia"], left = test_good["Value"], fill_color = "forestgreen")
    # Worst regions: bar from the national value up to the region's value.
    plot.hbar(y = test_bad["Region"], height = 0.9,
          right = test_bad["Value"], left = test_bad["Russia"], fill_color = "firebrick")
    show(plot)
    return plot

@interact(
    illness_str=health_data["Illness"].unique(),
    age_str=health_data["Age"].unique(),
    time=health_data["Time"].unique(),
)
def generate_divergent_plot_interactive(illness_str, age_str, time):
    """Widget callback: map the chosen labels back to codes and draw the plot.

    The first code whose label equals the selected string is used; a label that
    is not present raises ValueError from list.index.
    """
    def code_for(lookup, label):
        # Reverse lookup: position of the label among the values gives the key.
        codes = list(lookup.keys())
        labels = list(lookup.values())
        return codes[labels.index(label)]

    illness = code_for(ill_dict, illness_str)
    age = code_for(age_dict, age_str)
    return generate_divergent_plot(illness, age, time, data=health_data)

interactive(children=(Dropdown(description='illness_str', options=('Беременность, роды и послеродовой период',…

In [14]:
def generate_health_dynamic_plot(illness, age, region, data=health_data):
    """Draw "Value" over "Time" for one illness, age group and region.

    Rows are selected by exact match on the "ill_15", "old_15" and "mОКАТО"
    columns and plotted in the order they appear in `data`. The title is built
    from the first "Illness" and "Region" labels of the selected rows.

    Returns the Bokeh figure, after it has been shown.
    """
    row_mask = (
        (data["ill_15"] == illness)
        & (data["old_15"] == age)
        & (data["mОКАТО"] == region)
    )
    series = data.loc[row_mask]
    illness_name = series["Illness"].unique()[0]
    region_name = series["Region"].unique()[0]
    plot = figure(
        title="Динамика %s в %s" % (illness_name, region_name),
        x_axis_label="Год",
        y_axis_label="Заболеваемость",
    )
    plot.line(x=series["Time"], y=series["Value"], line_color="darkorange", line_width=5.0)
    show(plot)
    return plot

@interact(
    illness_str=health_data["Illness"].unique(),
    age_str=health_data["Age"].unique(),
    region_str=health_data["Region"].unique(),
)
def generate_health_dynamic_plot_interactive(illness_str, age_str, region_str):
    """Widget callback: map the chosen labels back to codes and draw the time series.

    The first code whose label equals the selected string is used; a label that
    is not present raises ValueError from list.index.
    """
    def code_for(lookup, label):
        # Reverse lookup: position of the label among the values gives the key.
        codes = list(lookup.keys())
        labels = list(lookup.values())
        return codes[labels.index(label)]

    illness = code_for(ill_dict, illness_str)
    age = code_for(age_dict, age_str)
    region = code_for(okato_dict, region_str)
    return generate_health_dynamic_plot(illness, age, region, data=health_data)

interactive(children=(Dropdown(description='illness_str', options=('Беременность, роды и послеродовой период',…

In [17]:
import json

def convert_coordinates(region):
    """Shift negative first coordinates of a GeoJSON feature by +360, in place.

    Walks region["geometry"]["coordinates"] to whatever depth it is nested and,
    for every position (a list starting with a number), adds 360.0 to the first
    component when it is negative. The original three-level loop handled only
    MultiPolygon-style nesting and raised TypeError on shallower geometries such
    as Polygon; results for MultiPolygon input are unchanged.

    Parameters:
        region: a feature dict with a "geometry" mapping holding "coordinates".

    Returns:
        None; the coordinate lists are modified in place.
    """
    def shift(node):
        # A position is a non-empty list whose first item is a number; anything
        # else is a container of deeper nodes.
        if node and isinstance(node[0], (int, float)):
            if node[0] < 0:
                node[0] += 360.0
            return
        for child in node:
            shift(child)

    shift(region["geometry"]["coordinates"])

def merge_okato_codes(obj):
    """Attach an "okato" property to every feature of a GeoJSON object, in place.

    Reads "okato-avtocod.csv" (no header row). For a feature whose properties
    contain "ref", the first CSV row whose column 4 equals that ref supplies the
    code: the first two characters of its column 3. Features without "ref", or
    without a matching row, get the placeholder "ZZ".
    """
    regions_df = pd.read_csv("okato-avtocod.csv", header = None)
    for feature in obj["features"]:
        props = feature["properties"]
        okato_prefix = "ZZ"
        if "ref" in props.keys():
            matches = regions_df.loc[regions_df[4] == props["ref"], 3]
            if not matches.empty:
                okato_prefix = matches.iloc[0][0:2]
        props["okato"] = okato_prefix

def update_geojson_macro(obj, illness, time, age,data=health_data):
    """Write an "Ill_Rate" property into every feature of a GeoJSON object, in place.

    Rows of `data` are selected by illness code, time and age code. A feature is
    matched to the first selected row whose lower-cased "Region" contains the
    feature's lower-cased top-level "name"; features without a match get
    Ill_Rate = None. Every feature's coordinates are then passed through
    convert_coordinates.

    Parameters:
        obj: GeoJSON-like dict with a "features" list; each feature needs
            "name", "properties" and "geometry".
        illness, time, age: values compared against "ill_15", "Time" and "old_15".
        data: frame to read from; defaults to the `health_data` object that
            exists when this function is defined.
    """
    selection = data.loc[(data["ill_15"] == illness) & (data["Time"] == time) & (data["old_15"] == age)].reset_index()
    region_names = selection["Region"].str.lower()
    for json_region in obj["features"]:
        needle = json_region["name"].lower()
        # regex=False: the name is plain text, so characters such as "(" or "."
        # must be matched literally. na=False: rows with a missing region name
        # never match.
        is_match = region_names.str.contains(needle, regex=False, na=False)
        value = selection.loc[is_match, "Value"]
        if not value.empty:
            json_region["properties"]["Ill_Rate"] = value.iloc[0]
        else:
            json_region["properties"]["Ill_Rate"] = None
        convert_coordinates(json_region)

def update_geojson_region(obj, illness, time, age, data=health_data):
    """Write "Ill_Rate" (and "name" on a match) into every feature of `obj`, in place.

    Rows of `data` are selected by illness code, time and age code, keeping only
    "mОКАТО" codes longer than four characters. merge_okato_codes first attaches
    an "okato" prefix to each feature. A feature then takes the value and the
    lower-cased region name of the first selected row whose code starts with
    that prefix, or Ill_Rate = None if there is none. Every feature's
    coordinates are passed through convert_coordinates.
    """
    wanted = (
        (data["ill_15"] == illness)
        & (data["Time"] == time)
        & (data["old_15"] == age)
        & (data["mОКАТО"].str.len() > 4)
    )
    selection = data.loc[wanted].reset_index()
    selection["Region"] = selection["Region"].apply(lambda name: name.lower())
    merge_okato_codes(obj)
    for feature in obj["features"]:
        props = feature["properties"]
        rows = selection.loc[selection["mОКАТО"].str.startswith(props["okato"])]
        if rows.empty:
            props["Ill_Rate"] = None
        else:
            props["Ill_Rate"] = rows["Value"].iloc[0]
            props["name"] = rows["Region"].iloc[0]
        convert_coordinates(feature)

def get_geojson_range(obj):
    """Return (min, max) of the truthy "Ill_Rate" values across all features.

    Features whose rate is None or zero are skipped, as before. When no feature
    has a usable rate the result is (None, None) instead of the ValueError that
    min()/max() raise on an empty list.

    Parameters:
        obj: GeoJSON-like dict whose "features" each carry
            properties["Ill_Rate"].

    Returns:
        A (lowest, highest) tuple, or (None, None) if there is nothing to rank.
    """
    rates = [
        feature["properties"]["Ill_Rate"]
        for feature in obj["features"]
        if feature["properties"]["Ill_Rate"]
    ]
    if not rates:
        return None, None
    return min(rates), max(rates)


In [18]:
from bokeh.models import GeoJSONDataSource, ColorBar, BasicTicker, HoverTool
from bokeh.models.mappers import ColorMapper, LinearColorMapper
from bokeh.palettes import Plasma256

def generate_illness_map(geojson_data, illness, time, age, data=health_data, title=""):
    """Show a map of the features in `geojson_data`, coloured by "Ill_Rate".

    The colour scale spans the range reported by get_geojson_range. The
    `illness`, `time`, `age` and `data` arguments are accepted but not used in
    the body. The figure is shown and nothing is returned.
    """
    low_value, high_value = get_geojson_range(geojson_data)
    source = GeoJSONDataSource(geojson=json.dumps(geojson_data))
    mapper = LinearColorMapper(palette=Plasma256, low=low_value, high=high_value)

    plot = figure(title=title, plot_width=800, tools="pan, box_zoom, reset, hover")
    plot.patches(
        "xs",
        "ys",
        source=source,
        fill_color={"field": "Ill_Rate", "transform": mapper},
        line_color="black",
        line_width=2.0,
    )

    # Colour legend placed to the right of the map.
    legend = ColorBar(
        color_mapper=mapper,
        ticker=BasicTicker(),
        label_standoff=12,
        border_line_color=None,
        location=(0, 0),
    )
    plot.add_layout(legend, "right")

    # Configure the hover tool that the "hover" entry in `tools` created.
    hover_tool = plot.select_one(HoverTool)
    hover_tool.point_policy = "snap_to_data"
    hover_tool.tooltips = [("Region:", "@name"), ("Value", "@Ill_Rate")]
    show(plot)

@interact(
    map_type=["macro", "region"],
    illness_str=health_data["Illness"].unique(),
    age_str=health_data["Age"].unique(),
    time=health_data["Time"].unique(),
)
def generate_illness_map_interactive(map_type, illness_str, age_str, time):
    """Widget callback: load the GeoJSON for the chosen map type, fill it, draw it.

    "macro" reads admin_level_3.geojson and fills it with update_geojson_macro;
    "region" reads admin_level_4.geojson and fills it with update_geojson_region.
    Any other map type returns -1 without drawing.
    """
    def code_for(lookup, label):
        # Reverse lookup: position of the label among the values gives the key.
        codes = list(lookup.keys())
        labels = list(lookup.values())
        return codes[labels.index(label)]

    illness = code_for(ill_dict, illness_str)
    age = code_for(age_dict, age_str)
    title = "Карта %s в возрасте %s в %s году" % (illness_str, age_str, time)

    if map_type == "macro":
        filename, fill_rates = "admin_level_3.geojson", update_geojson_macro
    elif map_type == "region":
        filename, fill_rates = "admin_level_4.geojson", update_geojson_region
    else:
        return -1

    with open(filename, encoding="utf-8") as file:
        geojson_data = json.load(file)
    fill_rates(geojson_data, illness, time, age)
    generate_illness_map(geojson_data, illness, time, age,data=health_data, title=title)

interactive(children=(Dropdown(description='map_type', options=('macro', 'region'), value='macro'), Dropdown(d…