In [202]:
# Imports
import codecs
import io
import json
import os

import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the raw, comma-separated data file into a DataFrame
df = pd.read_csv("DRc5a2011-2014.txt", sep=",")

In [4]:
# Initial filtering: keep only the rows that satisfy all of
#   Reg == 1100  - whole Russia
#   CCl == 10    - ICD-10 codes
#   Group == "T" - urban and rural population together
# and drop the three filter columns, which are constant after the selection.
# NOTE: this rebinds `df`, so the cell cannot be re-run without first
# re-reading the raw data.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument (`.drop(labels, 1)`).
df = df[(df.Reg == 1100) & (df.CCl == 10) & (df.Group == "T")].drop(
    columns=['Reg', 'Group', 'CCl']
)

## Prepare data for gender/age plot

In [54]:
# Remove info about cause of death: keep only the Cause == 0 rows
# (presumably the all-causes totals - confirm against the data description)
# and drop the now-constant Cause column.
# `columns=` replaces the positional axis argument rejected by pandas 2.0.
by_gender_and_age = df[df.Cause == 0].drop(columns=['Cause'])

In [55]:
# Convert group column names to integers: 'Drac15' -> 15.
# Every other label is kept unchanged. Labels without a `startswith` method
# (e.g. the integers left behind by a previous run of this cell) are skipped
# as well, so re-running the cell no longer raises a TypeError.
new_columns = [
    int(column[4:])
    if hasattr(column, 'startswith') and column.startswith('Drac')
    else column
    for column in by_gender_and_age.columns
]

by_gender_and_age.columns = new_columns

In [56]:
# Reshape the wide table (one column per age group) into long format:
# one record per (Year, Sex, Age Group) holding the matching Deaths value.
gae_columns = ['Year', 'Sex',  'Age Group', 'Deaths']

# Age-group labels 0, 1, 5, 10, ..., 85. `list(range(...))` is required on
# Python 3, where a range object cannot be concatenated to a list.
age_groups = [0, 1] + list(range(5, 90, 5))

# Collect plain records and build the frame once. DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is quadratic anyway.
gae_records = []
for _, row in by_gender_and_age.iterrows():
    sex = "Male" if (row['Sex'] == "M") else "Female"
    for i in age_groups:
        gae_records.append([row['Year'], sex, i, row[i]])

# Record order is unchanged (row by row, age groups ascending). The index is
# now 0..n-1 instead of all zeros, which does not affect a 'records' export.
gae = pd.DataFrame(gae_records, columns=gae_columns)

In [57]:
# Export to JSON file as a list with one object per row.
# `orient` is passed by keyword: pandas has deprecated positional arguments
# after the path in DataFrame.to_json.
gae.to_json('../data/by_gender_and_age.json', orient='records')

## Prepare data for cause of death plot

In [95]:
# Load the ';'-separated ICD code group table and turn it into a
# {Code: Description} lookup dictionary.
idc_groups = (
    pd.read_csv("idc_codes.txt", sep=";")
    .set_index('Code')['Description']
    .to_dict()
)

In [142]:
# Read cause data
# causes.txt is decoded as UTF-8 and consumed in records of four consecutive
# lines: the cause key, then the values stored under 'ru', 'en' and 'idc10'.
# 'group' is the first character of the 'idc10' value.
#
# io.open behaves the same on Python 2 and 3 and replaces codecs.open, which
# is deprecated in recent Python versions.
causes = {}
with io.open('causes.txt', encoding='utf-8') as f:
    while True:
        line = f.readline()
        if line == '':
            # readline() returns an empty string only at end of file
            break
        key = line.strip()
        if not key:
            # Skip blank lines where a key is expected (e.g. a trailing empty
            # line), instead of storing a bogus '' entry.
            continue
        entry = {}
        # The three reads must happen in exactly this order.
        entry['ru'] = f.readline().strip()
        entry['en'] = f.readline().strip()
        entry['idc10'] = f.readline().strip()
        # [:1] rather than [0]: an empty or missing code line yields '' instead
        # of raising IndexError.
        entry['group'] = entry['idc10'][:1]
        causes[key] = entry

In [189]:
# Per-row total: add up all the per-age-group columns
# (Drac0, Drac1, Drac5, Drac10, ..., Drac85) into a new "Count" column.
df["Count"] = df[
    ["Drac%d" % age for age in [0, 1] + list(range(5, 90, 5))]
].sum(axis=1)

In [212]:
# Remove unused columns and rows: keep Sex, Cause and Count for every row
# with a non-zero Cause.
by_cause = df[df.Cause != 0][[u'Sex', u'Cause', u'Count']]
# Aggregate over years: one total Count per (Sex, Cause)
by_cause = by_cause.groupby(['Sex', 'Cause']).sum().reset_index()
# Add the sum over both sexes. Only Count is summed, so the result does not
# depend on how pandas treats the string Sex column in a group-wise sum.
by_cause_sum = by_cause.groupby('Cause')['Count'].sum().reset_index()
by_cause_sum["Sex"] = 'F+M'
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# The columns are put in by_cause's order first so the result keeps it, and
# the original row labels are preserved as append() did.
by_cause = pd.concat([by_cause, by_cause_sum[by_cause.columns]])

In [213]:
# Annotate every row with data from `causes`, looked up by the string form of
# its Cause value. A function (not the dict itself) is mapped so that an
# unknown cause still raises KeyError instead of silently becoming NaN.
cause_info = by_cause["Cause"].astype(str).map(lambda key: causes[key])
by_cause["Description"] = cause_info.map(lambda info: info['en'])
# Group is the first character of the code list
by_cause["Group"] = cause_info.map(lambda info: info['idc10'][0])
by_cause["Codes"] = cause_info.map(lambda info: info['idc10'])
# `columns=` replaces the positional axis argument rejected by pandas 2.0.
# NOTE: this rebinds `by_cause` without its Cause column, so the cell cannot
# be re-run without first rebuilding `by_cause`.
by_cause = by_cause.drop(columns=['Cause'])

In [221]:
# Export Data to JSON files
def export_data(df, filename, groups=None, out_dir='../data'):
    """Write a two-level name/children hierarchy to a JSON file and return it.

    The result has the form::

        {"name": "Deaths",
         "children": [{"name": <group description>,
                       "children": [{"name": ..., "codes": ..., "count": ...},
                                    ...]},
                      ...]}

    with one second-level node per entry of ``groups`` (in its iteration
    order) and one leaf per row of ``df`` whose "Group" equals that entry's
    key. A group without matching rows still gets a node with no children.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "Group", "Description", "Codes" and "Count".
    filename : str
        Output file name without the ".json" extension.
    groups : dict, optional
        Mapping of group code to group description. Defaults to the
        notebook-level ``idc_groups`` dictionary.
    out_dir : str, optional
        Directory the file is written to (must already exist).

    Returns
    -------
    dict
        The hierarchy that was written to the file.
    """
    if groups is None:
        groups = idc_groups

    data = {"name": 'Deaths', "children": []}

    # .items() works on both Python 2 and 3; .iteritems() exists only on 2.
    for code, group in groups.items():
        rows = df[df["Group"] == code]
        leaves = [
            {"name": row["Description"], "codes": row["Codes"], "count": row["Count"]}
            for _, row in rows.iterrows()
        ]
        data["children"].append({"name": group, "children": leaves})

    with open(os.path.join(out_dir, filename + '.json'), 'w') as outfile:
        json.dump(data, outfile)
    return data
        
# Write one JSON file per Sex value ("F", "M" and the combined "F+M"), each
# without the Sex column. The return value of the last call is the cell output.
# `columns=` replaces the positional axis argument rejected by pandas 2.0.
export_data(by_cause[by_cause["Sex"] == "F"].drop(columns=['Sex']), 'by_cause_female')
export_data(by_cause[by_cause["Sex"] == "M"].drop(columns=['Sex']), 'by_cause_male')
export_data(by_cause[by_cause["Sex"] == "F+M"].drop(columns=['Sex']), 'by_cause_all')

{'children': [{'children': [{'codes': u'A00', 'count': 1, 'name': u'Cholera'},
    {'codes': u'A010', 'count': 0, 'name': u'Typhoid fever'},
    {'codes': u'A011,A012,A013,A014',
     'count': 0,
     'name': u'Paratyphoid fever'},
    {'codes': u'A02', 'count': 47, 'name': u'Other salmonella infections'},
    {'codes': u'A03', 'count': 12, 'name': u'Shigellosis'},
    {'codes': u'A05',
     'count': 48,
     'name': u'Other bacterial foodborne intoxications excluding salmonellosis'},
    {'codes': u'A04,A06-A08',
     'count': 503,
     'name': u'Other bacterial intestinal infections including amoebiasis and other protozoal intestinal diseases'},
    {'codes': u'A09',
     'count': 223,
     'name': u'Diarrhoea and gastroenteritis of presumed infectious origin'},
    {'codes': u'A15',
     'count': 10711,
     'name': u'Respiratory tuberculosis, bacteriologically and histologically confirmed'},
    {'codes': u'A16',
     'count': 3355,
     'name': u'Respiratory tuberculosis, not conf