# Setup 
## Update Data Directory

In [1]:
# Refresh the Data/ directory by executing the DownloadData.py script that
# sits next to this notebook (IPython `%run` magic).
%run ./DownloadData.py

## Imports

In [2]:
import os
import numpy as np
import pandas as pd

## Load DataFrames

In [3]:
# Load every CSV file found in Data/ into the `df` dict, keyed by the file
# name with its '.csv' extension removed (e.g. 'Data/caste.csv' -> df['caste']).
df = {}
for data in os.listdir('Data/'):
    if data.endswith('.csv'):
        df[data[:-len('.csv')]] = pd.read_csv('Data/' + data)

# Show which tables were loaded.  The parenthesized form behaves the same
# under the Python 2 print statement and the Python 3 print function.
print(sorted(df))

['caste', 'caste_percent', 'communicabledisease1', 'communicabledisease2', 'hdi2011', 'mentalhealth', 'mentalhealthT', 'poverty1', 'poverty1T', 'poverty2', 'poverty2T', 'sexratio', 'vaccinations', 'vaccinationsT', 'watersanitaion', 'watersanitationT']


# Create DataFrames
## caste_percent
This DataFrame gives, for each district, the percentage of the population belonging to each caste/ethnicity, together with the district's total population (`Tot_Pop`).

In [37]:
# Build one row per district holding the percentage of the district's counted
# population in each caste/ethnicity group, plus the district's total count.
# Two district names are respelled before being used as row labels.
district_aliases = {'Darchula': 'Darchaula', 'Kavrepalanchok': 'Kavre'}

series = []
totalpop = {}
for district in df['caste']['District'].unique():
    rows = df['caste'][df['caste']['District'] == district]
    # Pick the count column before aggregating so that only 'In number' is
    # summed and no other (possibly non-numeric) column is ever aggregated.
    caste = rows.groupby('Caste/Ethnicity')['In number'].sum()
    total = caste.sum()
    district = district_aliases.get(district, district)
    series.append((100.0 * (caste / total)).rename(district))
    totalpop[district] = total

# A caste that does not occur in a district is missing from that district's
# row; fillna(0) records it as 0 percent.
caste_percent = pd.DataFrame(series).fillna(0)
# Column assignment aligns the totals with the rows by district label.
caste_percent['Tot_Pop'] = pd.Series(totalpop)
df['caste_percent'] = caste_percent
caste_percent.to_csv('Data/caste_percent.csv')

## Poverty

In [5]:
# Pivot the 2011 rows of poverty1 into one row per district with one column
# per indicator.
poverty = df['poverty1'][df['poverty1']['Year AD'] == 2011]
series = []
for district in poverty['District'].unique():
    rows = poverty[poverty['District'] == district]
    # Select 'Value' before summing so that no other column (for example
    # 'Year AD') is aggregated along the way.
    indicators = rows.groupby('Indicators')['Value'].sum()
    series.append(indicators.rename(district))

df['poverty1T'] = pd.DataFrame(series)
df['poverty1T'].to_csv('Data/poverty1T.csv')

# Pivot poverty2 into one row per district with one column per sub group.
series = []
poverty = df['poverty2']

for district in poverty['District'].unique():
    rows = poverty[poverty['District'] == district]
    # Select 'Value' before summing so that no other column is aggregated.
    subgroups = rows.groupby('Sub Group')['Value'].sum()
    series.append(subgroups.rename(district))

df['poverty2T'] = pd.DataFrame(series)
df['poverty2T'].to_csv('Data/poverty2T.csv')

## vaccinationsT

In [6]:
# Pivot the 2011/12 vaccination rows into one row per district with one
# column per vaccine.
vacc = df['vaccinations'][df['vaccinations']['Year AD'] == '2011/12']
series = []

for district in vacc['District'].unique():
    vaccines = vacc[vacc['District'] == district].groupby('Vaccines').sum()
    # Naming the columns with a one-element list raises unless sum() left
    # exactly one value column.
    # NOTE(review): pandas >= 2.0 no longer drops non-numeric columns in
    # sum(); confirm the installed version still yields a single column here.
    vaccines.columns = [district]
    # Positional .iloc replaces .ix, which was removed in pandas 1.0.
    series.append(vaccines.iloc[:, 0])

df['vaccinationsT'] = pd.DataFrame(series)
df['vaccinationsT'].to_csv('Data/vaccinationsT.csv')

## watersanitationT

In [7]:
# Pivot the water/sanitation table into one row per district with one column
# per 'Sub group' value.  The source key keeps the spelling of the loaded
# file ('watersanitaion').
sanitation = df['watersanitaion']
series = []

for district in sanitation['Districts'].unique():
    sani = sanitation[sanitation['Districts'] == district].groupby('Sub group').sum()
    # Naming the columns with a one-element list raises unless sum() left
    # exactly one value column.
    # NOTE(review): pandas >= 2.0 no longer drops non-numeric columns in
    # sum(); confirm the installed version still yields a single column here.
    sani.columns = [district]
    # Positional .iloc replaces .ix, which was removed in pandas 1.0.
    series.append(sani.iloc[:, 0])

df['watersanitationT'] = pd.DataFrame(series)
df['watersanitationT'].to_csv('Data/watersanitationT.csv')

## mentalhealthT

In [8]:
# Pivot the mental-health table into one row per district with one column
# per 'Mental problem' value.
mh = df['mentalhealth']
series = []

for district in mh['District'].unique():
    mental = mh[mh['District'] == district].groupby('Mental problem').sum()
    # Naming the columns with a one-element list raises unless sum() left
    # exactly one value column.
    # NOTE(review): pandas >= 2.0 no longer drops non-numeric columns in
    # sum(); confirm the installed version still yields a single column here.
    mental.columns = [district]
    # Positional .iloc replaces .ix, which was removed in pandas 1.0.
    series.append(mental.iloc[:, 0])

df['mentalhealthT'] = pd.DataFrame(series)
df['mentalhealthT'].to_csv('Data/mentalhealthT.csv')

## Communicable Disease

In [36]:
# Pivot the 2011/12 rows of communicabledisease1 into one row per district
# with one column per communicable disease.
cd1 = df['communicabledisease1'][df['communicabledisease1']['Year AD'] == '2011/12']
series = []

for district in cd1['District'].unique():
    disease1 = cd1[cd1['District'] == district].groupby('Communicable Diseases').sum()
    # Naming the columns with a one-element list raises unless sum() left
    # exactly one value column.
    # NOTE(review): pandas >= 2.0 no longer drops non-numeric columns in
    # sum(); confirm the installed version still yields a single column here.
    disease1.columns = [district]
    # Positional .iloc replaces .ix, which was removed in pandas 1.0.
    series.append(disease1.iloc[:, 0])

df['communicabledisease1T'] = pd.DataFrame(series)
df['communicabledisease1T'].to_csv('Data/communicabledisease1T.csv')

# Pivot the 2011/12 rows of communicabledisease2 into one row per district
# with one column per disease.
cd2 = df['communicabledisease2'][df['communicabledisease2']['Year AD'] == '2011/12']
series = []

for district in cd2['District'].unique():
    disease2 = cd2[cd2['District'] == district].groupby('Diseases').sum()
    # Naming the columns with a one-element list raises unless sum() left
    # exactly one value column.
    # NOTE(review): pandas >= 2.0 no longer drops non-numeric columns in
    # sum(); confirm the installed version still yields a single column here.
    disease2.columns = [district]
    # Positional .iloc replaces .ix, which was removed in pandas 1.0.
    series.append(disease2.iloc[:, 0])

df['communicabledisease2T'] = pd.DataFrame(series)
df['communicabledisease2T'].to_csv('Data/communicabledisease2T.csv')

In [10]:
# List every table currently held in `df`.  The parenthesized form behaves the
# same under the Python 2 print statement and the Python 3 print function.
print(sorted(df))

['caste', 'caste_percent', 'communicabledisease1', 'communicabledisease1T', 'communicabledisease2', 'communicabledisease2T', 'hdi2011', 'mentalhealth', 'mentalhealthT', 'poverty1', 'poverty1T', 'poverty2', 'poverty2T', 'sexratio', 'vaccinations', 'vaccinationsT', 'watersanitaion', 'watersanitationT']


In [39]:
# Join every per-district table side by side and write the result to
# Data/master.csv.
#
# hdi2011 and sexratio still carry the district in a column ('District ',
# with a trailing space in the header), so move it into the index first.
# The index-name check keeps this cell re-runnable: once the column has become
# the index, calling set_index('District ') again would raise KeyError.
for table in ('hdi2011', 'sexratio'):
    if df[table].index.name != 'District ':
        df[table] = df[table].set_index('District ')

master_tables = [
    df['caste_percent'],
    df['communicabledisease1T'],
    df['communicabledisease2T'],
    df['mentalhealthT'],
    df['poverty1T'],
    df['poverty2T'],
    df['vaccinationsT'],
    df['watersanitationT'],
    df['hdi2011'],
    df['sexratio'],
]
# axis=1 places the tables next to each other, aligned on their index labels.
pd.concat(master_tables, axis=1).to_csv('Data/master.csv')