In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import humanize
import os

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import ConfusionMatrixDisplay

In [3]:
# Load the 1991-2020 CRU precipitation climatology (annual + monthly columns).
# header=1 takes the column names from the file's second line; the unnamed
# first column is relabelled 'Region'.
mon_avg_rain30 = (
    pd.read_csv('./data/pr_climatology_annual-monthly_cru_1991-2020_TZA.csv', header=1)
    .rename(columns={'Unnamed: 0': 'Region'})
)
mon_avg_rain30.head()

Unnamed: 0,Region,Annual,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,"Tanzania, United Republic of",1003.75,154.87,136.42,178.41,154.31,61.35,12.79,5.74,7.42,16.33,34.34,90.84,150.94
1,Mtwara,1212.44,211.33,206.65,256.37,182.24,59.76,10.03,6.44,3.45,12.23,23.71,67.0,173.23
2,Morogoro,1242.33,178.71,172.91,253.21,243.72,79.0,16.77,6.98,7.04,12.07,28.06,77.96,165.89
3,Mbeya,814.66,186.13,163.99,158.22,75.17,9.33,0.52,0.14,0.05,0.47,7.38,49.27,163.98
4,Tabora,902.09,166.64,133.7,155.13,105.23,29.94,0.23,0.26,0.5,4.51,25.63,107.52,172.8


In [5]:
# Raw indicator table; every later census frame in this notebook is derived from it.
tz_census = pd.read_csv(filepath_or_buffer='./data/Tanzania_2012_en.csv')

In [6]:
# Region names come from the first column of the rainfall table; the first row
# (the country-wide aggregate) is skipped.
regions_of_interest = mon_avg_rain30.iloc[1:, 0]

# Keep only census rows whose Area is one of those regions, i.e. indicators
# reported per region rather than for the whole country.
is_region_row = tz_census['Area'].isin(regions_of_interest)
tz_census_regions = tz_census.loc[is_region_row]

In [7]:
# Indicators that have more than 250 region-level rows, most frequent first.
indicator_counts = tz_census_regions["Indicator"].value_counts()
indicator_counts[indicator_counts > 250]

Primary school net enrolment ratio                               1110
Primary school gross enrolment ratio                             1026
Operators engage in crops                                         897
Population size                                                   894
Yield in a crops                                                  870
Quantity sold in a crops                                          870
Quantity harvested in crops                                       870
Planted area in a crops                                           870
Area harvested in crops                                           870
Pupils who sat for the primary school leaver's examination        685
Enrolment in all primary schools                                  636
Death cause by road accidents                                     620
Quantity of electricity sold by TANESCO                           524
GDP per capita (at current prices)                                505
Operators using irri

In [8]:
# Indicator names of interest, written as an explicit list (one name per entry).
region_cols = [
    'Yield in a crops',
    'Water produced in urban water supply authority',
    'Water demand',
    'Water coverage',
    'Urine protein?',
    'Source of improved drinking water by households',
    'Source of non improved drinking water by households',
    'Quantity sold in a crops',
    'Quantity harvested in crops',
    'Projected infant mortality rate',
    'Projected life expectancy at birth',
    'Population with disability',
    'Post neonatal mortality rate',
    'Population with access to piped or protected water as their main source in regional centres',
    'Population using an improved drinking water source',
    'Population 7 years and above, with difficulty in cognition',
    'Population 7 years and above, with difficulty in communication',
    'Population 7 years and above, with difficulty in hearing',
    'Population 7 years and above, with difficulty in mobility',
    'Population 7 years and above, with difficulty in seeing',
    'Planted area',
    'Planted area in a crops',
    'Improved water source',
    'Income inequality',
    'Human development index',
    'Households within 1 km of drinking water in dry seasons',
    'Households within 15 minutes to water supply in dry seasons',
    'Households with access to safe drinking water during the dry season',
    'Households with access to safe drinking water during the rainy season',
    'Households with basic sanitation facilities',
    'Households with a piped water',
    'Health insurance coverage age 15-49',
    'Health worker density per 10,000 population- (Clinicians=MO,AMO and CO)',
    'Health worker density per 10,000 population-nurse and midwife',
    'GINI coefficient',
    'GDP per capita (at current prices)',
    'Food crop production - bambaranut',
    'Food crop production - bananas',
    'Food crop production - barley',
    'Food crop production - beans',
    'Food crop production - bulrush millet',
    'Food crop production - cassava',
    'Food crop production - chickpeas',
    'Food crop production - cowpeas',
    'Food crop production - finger millet',
    'Food crop production - irish potatos',
    'Food crop production - maize',
    'Food crop production - peagionpeas',
    'Food crop production - rice',
    'Food crop production - sorghum',
    'Food crop production - sweetpotato',
    'Food crop production - wheat',
    'Food crop production -paddy',
    'Distribution of children of age 5-17 years in child labour in agriculture, forestry and fishing industry',
    'Distribution of children of age 5-17 years in hazardous work in agriculture, forestry and fishing industry',
    'Distribution of children of age 5-17 years working in agriculture, forestry and fishing economic activities',
    'Deworming coverage',
    'Children under-5 years with diarrhoea',
    'Children under-5 years with diarrhoea taken to a health facility/provider',
    'Children under-5 years who drink more fluids during diarrhoea',
    'Children under-5 years who given ORS and zinc during diarrhoea',
    'Children under-5 years who receive any ORT for diarrhoea',
    'Children under-5 years with fever',
]

In [9]:
# The full indicator list was a line-for-line copy of `region_cols` (defined in
# the previous cell). Derive it from that list instead of repeating 63 names,
# so the two cannot silently drift apart. list() makes an independent copy, so
# mutating one list later does not affect the other.
all_indicator_cols = list(region_cols)

In [10]:
# Health indicators kept for the regional analysis.
#
# An earlier draft of this list put '#'-prefixed lines inside a triple-quoted
# string. Inside a string those are not comments, so names such as
# '# Deworming coverage' became list entries. That draft was overwritten
# straight away by the assignment below, so it has been removed. The candidates
# that were considered but not selected are recorded here as real comments:
#   Urine protein?
#   Projected infant mortality rate
#   Projected life expectancy at birth
#   Post neonatal mortality rate
#   Population 7 years and above, with difficulty in cognition
#   Population 7 years and above, with difficulty in communication
#   Population 7 years and above, with difficulty in hearing
#   Population 7 years and above, with difficulty in mobility
#   Population 7 years and above, with difficulty in seeing
#   Deworming coverage
#   Children under-5 years with diarrhoea taken to a health facility/provider
#   Children under-5 years who drink more fluids during diarrhoea
#   Children under-5 years who given ORS and zinc during diarrhoea
#   Children under-5 years who receive any ORT for diarrhoea
health_cols = [
    'Population with disability',
    'Children under-5 years with diarrhoea',
    'Children under-5 years with fever',
]

# Water supply / drinking-water access indicators.
water_cols = [
    'Water produced in urban water supply authority',
    'Water demand',
    'Water coverage',
    'Source of improved drinking water by households',
    'Source of non improved drinking water by households',
    'Population with access to piped or protected water as their main source in regional centres',
    'Population using an improved drinking water source',
    'Improved water source',
    'Households within 1 km of drinking water in dry seasons',
    'Households within 15 minutes to water supply in dry seasons',
    'Households with access to safe drinking water during the dry season',
    'Households with access to safe drinking water during the rainy season',
    'Households with a piped water',
]

# Socio-economic indicators (inequality, development, sanitation, health cover, GDP).
socioecon_cols = [
    'Income inequality',
    'Human development index',
    'Households with basic sanitation facilities',
    'Health insurance coverage age 15-49',
    'Health worker density per 10,000 population- (Clinicians=MO,AMO and CO)',
    'Health worker density per 10,000 population-nurse and midwife',
    'GDP per capita (at current prices)',
]

# Agriculture indicators: crop yield/area/quantities, per-crop food production,
# and child-labour-in-agriculture distributions.
agriculture_cols = [
    'Yield in a crops',
    'Planted area',
    'Planted area in a crops',
    'Food crop production - bambaranut',
    'Food crop production - bananas',
    'Food crop production - barley',
    'Food crop production - beans',
    'Food crop production - bulrush millet',
    'Food crop production - cassava',
    'Food crop production - chickpeas',
    'Food crop production - cowpeas',
    'Food crop production - finger millet',
    'Food crop production - irish potatos',
    'Food crop production - maize',
    'Food crop production - peagionpeas',
    'Food crop production - rice',
    'Food crop production - sorghum',
    'Food crop production - sweetpotato',
    'Food crop production - wheat',
    'Food crop production -paddy',
    'Quantity sold in a crops',
    'Quantity harvested in crops',
    'Distribution of children of age 5-17 years in child labour in agriculture, forestry and fishing industry',
    'Distribution of children of age 5-17 years in hazardous work in agriculture, forestry and fishing industry',
    'Distribution of children of age 5-17 years working in agriculture, forestry and fishing economic activities',
]

# Reduced agriculture set: only the aggregate crop measures, without the
# per-crop production and child-labour indicators.
agriculture_cols_smaller = [
    'Yield in a crops',
    'Planted area',
    'Planted area in a crops',
    'Quantity sold in a crops',
    'Quantity harvested in crops',
]

In [11]:
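# One-off export, kept commented out for provenance: it writes one CSV per
# indicator group to ./exported_data/ (presumably the source of the
# exported_data/tz_regions_*.csv files read elsewhere in this notebook -- confirm).
# Uncomment to regenerate those files.
# cols_tuples = [('health', health_cols), ('water', water_cols), ('socioecon', socioecon_cols), ('agriculture', agriculture_cols_smaller)]

# for tup in cols_tuples:
#     tz_census_regions[tz_census_regions['Indicator'].isin(tup[1])].to_csv('./exported_data/tz_regions_' + tup[0] + '.csv')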

In [12]:
# Read the per-topic region extracts (health first, then socio-economic).
tz_regions_health, tz_regions_socioecon = map(
    pd.read_csv,
    (
        'exported_data/tz_regions_health.csv',
        'exported_data/tz_regions_socioecon.csv',
    ),
)

In [13]:
# Health rows for 2010 that come from the 2010 TDHS source.
# 'Time Period' is compared against the string '2010', not an integer.
in_2010 = tz_regions_health['Time Period'].eq('2010')
from_tdhs_2010 = tz_regions_health['Source'].eq('NBS_ TDHS (2010)_2011')
tz_regions_health[in_2010 & from_tdhs_2010]

Unnamed: 0.1,Unnamed: 0,Indicator,Unit,Subgroup,Area,Area ID,Time Period,Source,Data Value,Footnotes
25,6128,Children under-5 years with diarrhoea,Percent,Total,Mtwara,TZA001009,2010,NBS_ TDHS (2010)_2011,13.7,"TZA_ Demographic & Health Survey, 2010, Nation..."
26,6129,Children under-5 years with diarrhoea,Percent,Total,Ruvuma,TZA001010,2010,NBS_ TDHS (2010)_2011,9.7,"TZA_ Demographic & Health Survey, 2010, Nation..."
27,6130,Children under-5 years with diarrhoea,Percent,Total,Iringa,TZA001011,2010,NBS_ TDHS (2010)_2011,15.3,"TZA_ Demographic & Health Survey, 2010, Nation..."
28,6131,Children under-5 years with diarrhoea,Percent,Total,Mbeya,TZA001012,2010,NBS_ TDHS (2010)_2011,17.6,"TZA_ Demographic & Health Survey, 2010, Nation..."
29,6132,Children under-5 years with diarrhoea,Percent,Total,Singida,TZA001013,2010,NBS_ TDHS (2010)_2011,15.9,"TZA_ Demographic & Health Survey, 2010, Nation..."
30,6133,Children under-5 years with diarrhoea,Percent,Total,Arusha,TZA001002,2010,NBS_ TDHS (2010)_2011,12.6,"TZA_ Demographic & Health Survey, 2010, Nation..."
31,6134,Children under-5 years with diarrhoea,Percent,Total,Tabora,TZA001014,2010,NBS_ TDHS (2010)_2011,10.0,"TZA_ Demographic & Health Survey, 2010, Nation..."
32,6135,Children under-5 years with diarrhoea,Percent,Total,Rukwa,TZA001015,2010,NBS_ TDHS (2010)_2011,10.4,"TZA_ Demographic & Health Survey, 2010, Nation..."
33,6136,Children under-5 years with diarrhoea,Percent,Total,Kigoma,TZA001016,2010,NBS_ TDHS (2010)_2011,28.8,"TZA_ Demographic & Health Survey, 2010, Nation..."
34,6138,Children under-5 years with diarrhoea,Percent,Total,Kagera,TZA001018,2010,NBS_ TDHS (2010)_2011,24.0,"TZA_ Demographic & Health Survey, 2010, Nation..."


<hr>

Other exploratory checks on the regional census data

In [14]:
# Grab the first (indicator name, sub-DataFrame) pair produced by the groupby,
# or None when there are no groups. next() with a default replaces the
# loop-and-break, which also carried an unused enumerate() index.
indicator = next(iter(tz_census_regions.groupby('Indicator')), None)

In [15]:
# Indicator groups to keep when building a per-region frame.
cols_list = [health_cols, water_cols, socioecon_cols, agriculture_cols_smaller]

# Flatten the groups once so a single isin() keeps rows from every group.
# The previous inner loop ignored its loop variable and re-filtered on
# agriculture_cols_smaller on each pass, so region_df ended up agriculture-only.
wanted_indicators = [name for cols in cols_list for name in cols]

# Build the groupby once; it does not depend on the loop variable.
rows_by_area = tz_census_regions.groupby('Area')

# NOTE(review): regions_of_interest already omits the first rainfall row, so
# the extra [1:] skips one more region -- confirm that is intended.
for r in regions_of_interest[1:]:
    group = rows_by_area.get_group(r)  # raises KeyError if the region has no census rows
    region_df = group[group['Indicator'].isin(wanted_indicators)]

    break  # only the first region is processed for now

In [16]:
# Row count of the least-represented area. value_counts() sorts descending, so
# the last entry is the smallest. Use .iloc[-1] for positional access: plain
# [-1] on this label-indexed Series relies on a positional fallback that newer
# pandas deprecates. No isin() filter is needed because tz_census_regions was
# already built from rows whose Area is in regions_of_interest.
tz_census_regions['Area'].value_counts().iloc[-1]

477

In [17]:
# Print every indicator recorded for Kaskazini Unguja, one per line
# (duplicates included, in frame order).
kaskazini_unguja_indicators = tz_census_regions.loc[
    tz_census_regions['Area'] == 'Kaskazini Unguja', 'Indicator'
]
for indicator_name in kaskazini_unguja_indicators:
    print(indicator_name)

Primary school gross enrolment ratio
Children under-5 years with birth certificates
Children under-5 years with birth certificates
Pregnant women attending antenatal care
Pregnant women who received at least 2 doses of tetanus toxoid immunization
Children under-5 years sleeping under ITN nets
Children under-5 years with ARI
Children under-5 years with diarrhoea
Contraceptive prevalence rate
Children not weighed at birth
Births attended by skilled health personnel
Children 12-23 months who have had DPT 3 immunization
Births taking place through caesarean section
Births taking place in health facilities
Children 12-23 months who are fully immunized
Children 12-23 months who have had measles immunization
Children 12-23 months who have had polio immunization
Children under-18 years living with both parents
Children under-18 years who are orphans
Children under-18 years living with both parents
Children under-18 years who are orphans
Children under-2 years ever breastfed
Children 6-59 month

In [18]:
# How many socio-economic rows exist per time period (top 30 periods by count).
is_socioecon = tz_census_regions['Indicator'].isin(socioecon_cols)
socioecon_periods = tz_census_regions.loc[is_socioecon, 'Time Period']
socioecon_periods.value_counts().head(30)

2012         126
2011          60
2015-2016     48
2013          48
2007          20
2009          20
2006          20
2005          20
2004          20
2003          20
2002          20
2010          20
2008          20
1988          19
1992          19
1996          19
1997          19
1998          19
1999          19
2000          19
2001          19
1995          19
1994          19
1991          19
1990          19
1989          19
1987          19
1993          19
Name: Time Period, dtype: int64

In [19]:
# Number of region-level rows contributed by each data source, largest first.
tz_census_regions.loc[:, 'Source'].value_counts()

MoE&VT_BEST_Yearly                                                                                    6789
NBS_Annual Agricultural Sample Survey_2014-2015                                                       6105
MoHA_ Annual Crime Reports_Yearly                                                                     4186
MoAFSC_Annual basic agriculture reports_Yearly                                                        4122
NBS_Population and Housing Census, 2002_2003-2007                                                     1625
NBS_Population and Housing Census, 2012_2013-2018                                                     1430
MoFEA_Economic Survey (ES)_Yearly                                                                     1218
NBS_ TDHS (2015-2016)_2016                                                                            1125
NBS_Tanzania Service Provision Assessment (TSPA)_2014-2015                                             870
TFNC_Tanzania National Nutrition Surv

In [20]:
# Show the region-level indicator frame as the cell output (a bare last
# expression is rendered by Jupyter; long frames are abbreviated by pandas).
tz_census_regions

Unnamed: 0,Indicator,Unit,Subgroup,Area,Area ID,Time Period,Source,Data Value,Footnotes
9,Primary school net enrolment ratio,Percent,Total,Kagera,TZA001018,2016,MoE&VT_BEST_Yearly,7.612323e+01,
10,Primary school net enrolment ratio,Percent,Total,Katavi,TZA001023,2016,MoE&VT_BEST_Yearly,8.020340e+01,
11,Primary school net enrolment ratio,Percent,Total,Kilimanjaro,TZA001003,2016,MoE&VT_BEST_Yearly,7.747532e+01,
12,Primary school net enrolment ratio,Percent,Total,Lindi,TZA001008,2016,MoE&VT_BEST_Yearly,9.173979e+01,
13,Primary school net enrolment ratio,Percent,Total,Manyara,TZA001021,2016,MoE&VT_BEST_Yearly,7.454933e+01,
...,...,...,...,...,...,...,...,...,...
60778,GDP per capita (at current prices),Tshs,Total,Shinyanga,TZA001017,2012,MoFEA_Economic Survey (ES)_Yearly,8.563330e+05,2012TZA001017856333Total
60779,GDP per capita (at current prices),Tshs,Total,Kagera,TZA001018,2012,MoFEA_Economic Survey (ES)_Yearly,7.162090e+05,2012TZA001018716209Total
60780,GDP per capita (at current prices),Tshs,Total,Mwanza,TZA001019,2012,MoFEA_Economic Survey (ES)_Yearly,9.108240e+05,2012TZA001019910824Total
60781,GDP per capita (at current prices),Tshs,Total,Mara,TZA001020,2012,MoFEA_Economic Survey (ES)_Yearly,1.048437e+06,2012TZA0010201048437Total


In [26]:
# Row counts per area, smallest first; show the first 29 entries.
rows_per_area = tz_census_regions['Area'].value_counts(ascending=True)
rows_per_area.iloc[:29]

Kaskazini Unguja     477
Mjini Magharibi      630
Kusini Unguja        630
Kaskazini Pemba      632
Kusini Pemba         632
Simiyu               673
Katavi               677
Njombe               682
Geita                716
Shinyanga           1357
Manyara             1383
Dodoma              1557
Pwani               1557
Mara                1570
Lindi               1574
Mtwara              1575
Rukwa               1579
Tabora              1581
Kigoma              1583
Ruvuma              1585
Kilimanjaro         1589
Kagera              1595
Mwanza              1596
Tanga               1603
Iringa              1607
Mbeya               1607
Singida             1615
Morogoro            1628
Arusha              1634
Name: Area, dtype: int64