In [1]:
import os
import pandas as pd 
import matplotlib.pyplot as plt
import utils

* download countrywide indonesia education data

In [2]:
os.chdir("../data")

In [3]:
utils.get_education_data()

('indonesia_education-country.csv', 'already exists')


* pre-processing data

In [4]:
education_columns = ["province", "province_name", "city_code", "city_name",
                     "education_level", "male", "female", "latitude", "longitude"]

In [5]:
# alternative way to import file without pre-setting column names
#indonesia_education = pd.read_csv("../data/indonesia_education-country.csv")
#indonesia_education.columns = education_columns

In [6]:
indonesia_education = pd.read_csv("indonesia_education-country.csv", names = education_columns, 
                                 header = 0, index_col = 0)

In [7]:
english_education_levels = {'Tidak/Belum Tamat SD':'less than elementary',
                            'Tamat SD':'elementary',
                            'Tamat SLTP': 'junior high school',
                            'Tamat SLTA': 'high school',
                            'Tamat PT': 'university'
                           }

In [8]:
indonesia_education["education_level"].replace(english_education_levels, inplace=True)
indonesia_education["education_level"].unique()

array(['less than elementary', 'elementary', 'junior high school',
       'high school', 'university'], dtype=object)

In [9]:
indonesia_education.head()

Unnamed: 0_level_0,province_name,city_code,city_name,education_level,male,female,latitude,longitude
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11,Prov. Nanggroe Aceh Darussalam,1101,Kab. Simeulue,less than elementary,1209,1971,2.62818,96.0898
11,Prov. Nanggroe Aceh Darussalam,1101,Kab. Simeulue,elementary,9545,10545,2.62818,96.0898
11,Prov. Nanggroe Aceh Darussalam,1101,Kab. Simeulue,junior high school,6733,6100,2.62818,96.0898
11,Prov. Nanggroe Aceh Darussalam,1101,Kab. Simeulue,high school,6945,4448,2.62818,96.0898
11,Prov. Nanggroe Aceh Darussalam,1101,Kab. Simeulue,university,2043,1777,2.62818,96.0898


## Compare education levels by various demographic groups

goals:
1. combine counts of male and female 
2. group combinded counts by education level
3. group combined counts by province name and education level
4. group combined counts by city name and education level

* make a new variable ("total") that is a sum of males and females in a education category (by row)
    1. find the unique values of education 
    2. create dataframes that subset by level of education 

In [10]:
indonesia_education["education_level"].unique()

array(['less than elementary', 'elementary', 'junior high school',
       'high school', 'university'], dtype=object)

In [11]:
indonesia_education["total"] = indonesia_education["male"]+indonesia_education["female"]

In [12]:
indonesia_education.head()

Unnamed: 0_level_0,province_name,city_code,city_name,education_level,male,female,latitude,longitude,total
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
11,Prov. Nanggroe Aceh Darussalam,1101,Kab. Simeulue,less than elementary,1209,1971,2.62818,96.0898,3180
11,Prov. Nanggroe Aceh Darussalam,1101,Kab. Simeulue,elementary,9545,10545,2.62818,96.0898,20090
11,Prov. Nanggroe Aceh Darussalam,1101,Kab. Simeulue,junior high school,6733,6100,2.62818,96.0898,12833
11,Prov. Nanggroe Aceh Darussalam,1101,Kab. Simeulue,high school,6945,4448,2.62818,96.0898,11393
11,Prov. Nanggroe Aceh Darussalam,1101,Kab. Simeulue,university,2043,1777,2.62818,96.0898,3820


* we can use pandas group by to aggregate sums

In [13]:
education_groups = indonesia_education.groupby(["education_level"])["total"].sum()
#education_groups = indonesia_education.groupby(indonesia_education["education_level"])["total"].sum()

In [14]:
education_groups

education_level
elementary              56113017
high school             40310023
junior high school      35006186
less than elementary    12107698
university              11194347
Name: total, dtype: int64

* or we can use .loc to subset the dataframe to only include the value of education we are concerned with

In [15]:
less_than_elementary_df = indonesia_education.loc[indonesia_education["education_level"]=="less than elementary"]
elementary_df = indonesia_education.loc[indonesia_education["education_level"]=="elementary"]
junior_high_df = indonesia_education.loc[indonesia_education["education_level"]=="junior high school"]
high_school_df = indonesia_education.loc[indonesia_education["education_level"]=="high school"]
university_df = indonesia_education.loc[indonesia_education["education_level"]=="university"]

In [16]:
less_than_elementary_df.head()

Unnamed: 0_level_0,province_name,city_code,city_name,education_level,male,female,latitude,longitude,total
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
11,Prov. Nanggroe Aceh Darussalam,1101,Kab. Simeulue,less than elementary,1209,1971,2.62818,96.0898,3180
11,Prov. Nanggroe Aceh Darussalam,1102,Kab. Aceh Singkil,less than elementary,3407,4251,2.439,97.9244,7658
11,Prov. Nanggroe Aceh Darussalam,1103,Kab. Aceh Selatan,less than elementary,4722,5974,3.25638,97.213,10696
11,Prov. Nanggroe Aceh Darussalam,1104,Kab. Aceh Tenggara,less than elementary,1925,3215,3.59968,97.6619,5140
11,Prov. Nanggroe Aceh Darussalam,1105,Kab. Aceh Timur,less than elementary,5746,7243,4.56983,97.7723,12989


In [17]:
print("the total population with less than elementary education %d" % less_than_elementary_df["total"].sum())
print("the total population with elementary education %d" % elementary_df["total"].sum())
print("the total population with junior high school education %d" % junior_high_df["total"].sum())
print("the total population with high school education %d" % high_school_df["total"].sum())
print("the total population with university education %d" % university_df["total"].sum())

the total population with less than elementary education 12107698
the total population with elementary education 56113017
the total population with junior high school education 35006186
the total population with high school education 40310023
the total population with university education 11194347


In [18]:
print("the total men with less than elementary education %d" % less_than_elementary_df["male"].sum())
print("the total men with elementary education %d" % elementary_df["male"].sum())
print("the total men with junior high school education %d" % junior_high_df["male"].sum())
print("the total men with high school education %d" % high_school_df["male"].sum())
print("the total men with university education %d" % university_df["male"].sum())

the total men with less than elementary education 5525608
the total men with elementary education 27430862
the total men with junior high school education 18017048
the total men with high school education 22275190
the total men with university education 5735592


In [19]:
print("the total women with less than elementary education %d" % less_than_elementary_df["female"].sum())
print("the total women with elementary education %d" % elementary_df["female"].sum())
print("the total women with junior high school education %d" % junior_high_df["female"].sum())
print("the total women with high school education %d" % high_school_df["female"].sum())
print("the total women with university education %d" % university_df["female"].sum())

the total women with less than elementary education 6582090
the total women with elementary education 28682155
the total women with junior high school education 16989138
the total women with high school education 18034833
the total women with university education 5458755


* Chris Albon's [guide](http://chrisalbon.com/python/pandas_apply_operations_to_groups.html) for apply operations with Pandas groups

In [20]:
indonesia_education_city = indonesia_education.groupby(["city_name",
                                                     "education_level"])["total"].sum()
#indonesia_education_city = indonesia_education.groupby([indonesia_education.city_name,
#                                                     indonesia_education.education_level])["total"].sum()

In [21]:
indonesia_education_city.head(10)

city_name             education_level     
Kab. Aceh Barat       elementary              35798
                      high school             33821
                      junior high school      27952
                      less than elementary     6399
                      university               9984
Kab. Aceh Barat Daya  elementary              29120
                      high school             17604
                      junior high school      19079
                      less than elementary     8463
                      university               6653
Name: total, dtype: int64

In [22]:
indonesia_education_city["Kab. Badung"]

education_level
elementary               84459
high school             161860
junior high school       69893
less than elementary     13167
university               47451
Name: total, dtype: int64

In [23]:
indonesia_education_city.to_csv("../data/indonesia_education-city.csv")

In [24]:
indonesia_education_city = pd.read_csv("../data/indonesia_education-city.csv",
                             names = ["city", "education_level", "population"],header = 0)
indonesia_education_city.head()

Unnamed: 0,city,education_level,population
0,Kab. Aceh Barat,high school,33821
1,Kab. Aceh Barat,junior high school,27952
2,Kab. Aceh Barat,less than elementary,6399
3,Kab. Aceh Barat,university,9984
4,Kab. Aceh Barat Daya,elementary,29120


In [25]:
indonesia_education.columns

Index([u'province_name', u'city_code', u'city_name', u'education_level',
       u'male', u'female', u'latitude', u'longitude', u'total'],
      dtype='object')

In [26]:
indonesia_education_city = indonesia_education.groupby(["province_name", "city_name",
                                                        "education_level"])["male","female","total"].sum()
indonesia_education_province = indonesia_education.groupby(["province_name",
                                                            "education_level"])["male","female","total"].sum()


In [27]:
indonesia_education_city.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,male,female,total
province_name,city_name,education_level,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Prov. Bali,Kab. Badung,elementary,38735,45724,84459
Prov. Bali,Kab. Badung,high school,90397,71463,161860
Prov. Bali,Kab. Badung,junior high school,35058,34835,69893
Prov. Bali,Kab. Badung,less than elementary,5361,7806,13167
Prov. Bali,Kab. Badung,university,26903,20548,47451
Prov. Bali,Kab. Bangli,elementary,30909,30490,61399
Prov. Bali,Kab. Bangli,high school,15215,9481,24696
Prov. Bali,Kab. Bangli,junior high school,13141,11015,24156
Prov. Bali,Kab. Bangli,less than elementary,4641,5166,9807
Prov. Bali,Kab. Bangli,university,4249,2645,6894


In [28]:
indonesia_education_city["male"]["Prov. Bali"]["Kab. Badung"]

education_level
elementary              38735
high school             90397
junior high school      35058
less than elementary     5361
university              26903
Name: male, dtype: int64

In [29]:
indonesia_education_province.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,male,female,total
province_name,education_level,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Prov. Bali,elementary,376185,426710,802895
Prov. Bali,high school,481544,362027,843571
Prov. Bali,junior high school,256494,237036,493530
Prov. Bali,less than elementary,64880,81399,146279
Prov. Bali,university,148974,110100,259074
Prov. Banten,elementary,1181632,1240746,2422378
Prov. Banten,high school,1163237,892995,2056232
Prov. Banten,junior high school,838419,787927,1626346
Prov. Banten,less than elementary,175505,220456,395961
Prov. Banten,university,297987,256903,554890


In [30]:
indonesia_education_province["male"]['Prov. Bali']

education_level
elementary              376185
high school             481544
junior high school      256494
less than elementary     64880
university              148974
Name: male, dtype: int64

## Map results

* import map from:

In [31]:
import time
print("last updated: {}".format(time.strftime("%a, %d %b %Y %H:%M", time.localtime())))

last updated: Sun, 20 Aug 2017 18:28
