<a href="https://colab.research.google.com/github/jaya-shankar/education-impact/blob/master/data_extract_code/female_edu_attainment_extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Delete any existing ./education-impact directory so the clone in the next cell
# does not fail on a pre-existing, non-empty target.
!rm -rf education-impact

In [6]:
# Clone the project repository into ./education-impact.
# NOTE(review): later cells read and write '../raw_data/' and '../datasets/' relative
# to the current working directory, and no directory change into the clone is
# visible here - confirm the intended working directory before running.
!git clone https://github.com/jaya-shankar/education-impact.git

Cloning into 'education-impact'...
remote: Enumerating objects: 349, done.[K
remote: Counting objects: 100% (349/349), done.[K
remote: Compressing objects: 100% (292/292), done.[K
remote: Total 349 (delta 176), reused 172 (delta 54), pack-reused 0[K
Receiving objects: 100% (349/349), 2.87 MiB | 10.85 MiB/s, done.
Resolving deltas: 100% (176/176), done.


In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
# Read the raw attainment table from ../raw_data and display it.
i_file_name = "education-attainment.csv"
data = pd.read_csv(f"../raw_data/{i_file_name}")
df = pd.DataFrame(data)
df

Unnamed: 0,Area,Year,Age,Education,Distribution
0,Afghanistan,1960,20--24,Under 15,0.0
1,Afghanistan,1965,20--24,Under 15,0.0
2,Afghanistan,1970,20--24,Under 15,0.0
3,Afghanistan,1975,20--24,Under 15,0.0
4,Afghanistan,1980,20--24,Under 15,0.0
...,...,...,...,...,...
17499,United Kingdom of Great Britain and Northern I...,2020,20--24,Master and higher,2.8
17500,United States of America,2015,20--24,Master and higher,2.4
17501,United States of America,2020,20--24,Master and higher,2.9
17502,Uruguay,2015,20--24,Master and higher,0.1


In [3]:
# Display the column labels of the loaded frame.
df.columns

Index(['Area', 'Year', 'Age', 'Education', 'Distribution'], dtype='object')

In [4]:
# Keep every row whose Education value is anything other than 'Under 15',
# then display the filtered frame.
is_not_under_15 = df['Education'] != 'Under 15'
df = df.loc[is_not_under_15]
df

Unnamed: 0,Area,Year,Age,Education,Distribution
2450,Afghanistan,1960,20--24,No Education,92.5
2451,Afghanistan,1965,20--24,No Education,90.6
2452,Afghanistan,1970,20--24,No Education,89.1
2453,Afghanistan,1975,20--24,No Education,84.2
2454,Afghanistan,1980,20--24,No Education,82.4
...,...,...,...,...,...
17499,United Kingdom of Great Britain and Northern I...,2020,20--24,Master and higher,2.8
17500,United States of America,2015,20--24,Master and higher,2.4
17501,United States of America,2020,20--24,Master and higher,2.9
17502,Uruguay,2015,20--24,Master and higher,0.1


In [5]:
# List the distinct Education categories remaining after the filter.
df['Education'].unique()

array(['No Education', 'Incomplete Primary', 'Primary', 'Lower Secondary',
       'Upper Secondary', 'Post Secondary', 'Short Post Secondary',
       'Bachelor', 'Master and higher'], dtype=object)

In [6]:
# Map each output dataset name to the Education categories that are summed into it:
# the dataset's first category plus every category listed after it below.
_ordered_levels = ['Primary', 'Lower Secondary', 'Upper Secondary', 'Post Secondary',
                   'Short Post Secondary', 'Bachelor', 'Master and higher']
_first_level = {'20-24-Primary_fin'          : 'Primary',
                '20-24-Lower_Secondary_fin'  : 'Lower Secondary',
                '20-24-Higher_Secondary_fin' : 'Upper Secondary',
                '20-24-College_comp'         : 'Post Secondary',
               }
edu_level = {name: _ordered_levels[_ordered_levels.index(first):]
             for name, first in _first_level.items()}

In [7]:
# For every target dataset, keep only the rows whose Education category belongs
# to that dataset and save them (still one row per category) under ../datasets.
for dataset_name, levels in edu_level.items():
  level_rows = df[df['Education'].isin(levels)]
  level_rows.to_csv('../datasets/' + dataset_name + '.csv', encoding='utf-8', index=False)

In [8]:
# Distinct Area values, in order of first appearance in df.
countries = df['Area'].unique().tolist()

In [9]:
def init_timeline(country_names=None, start_year=1960, end_year=2020, step=5):
  """Build an empty year-by-year accumulator for every country.

  Args:
    country_names: iterable of country labels. When omitted, the notebook-level
      `countries` list is used (the original behaviour).
    start_year: first year of the timeline (inclusive).
    end_year: last year of the timeline (inclusive).
    step: spacing of the years that are initialised to 0.

  Returns:
    dict mapping each country to its own dict {year: value}, where value is 0
    for years divisible by `step` and NaN for every other year.
  """
  if country_names is None:
    # Fall back to the global list so existing zero-argument calls keep working.
    country_names = countries
  timeline = range(start_year, end_year + 1)
  # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
  timeline_dic = {c: {t: 0 if t % step == 0 else np.nan for t in timeline}
                  for c in country_names}
  return timeline_dic

In [10]:
def convert_to_df(timeline=None):
  """Turn a {country: {year: value}} mapping into a wide DataFrame.

  Args:
    timeline: mapping of country -> {year: value}. When omitted, the
      notebook-level `timeline_dic` is used (the original behaviour).

  Returns:
    DataFrame with one row per country (indexed by country), a leading
    "Country" column, and one column per year key.
  """
  if timeline is None:
    # Fall back to the global so existing zero-argument calls keep working.
    timeline = timeline_dic
  dft = pd.DataFrame.from_dict(timeline, orient='index')
  # Take the labels from the frame's own index rather than a separate list, so
  # each row is always labelled with the key it was built from.
  dft.insert(0, "Country", list(dft.index), allow_duplicates=True)
  return dft

In [11]:
# Collapse each long-format dataset (one row per Area/Year/Education) into a wide
# table with one row per country and one column per year, summing Distribution
# over all Education categories that belong to the dataset.
# NOTE: the wide table overwrites the long-format file it was read from, so this
# cell can only be re-run after the long-format files have been regenerated.
for e in edu_level:
  timeline_dic = init_timeline()
  dfs = pd.read_csv("../datasets/"+e+".csv")
  # One pass over the rows instead of three positional lookups per row.
  for row in dfs.itertuples(index=False):
    per_year = timeline_dic[row.Area]
    # Store the rounded total: the original computed round(...) but discarded the
    # result, so float artefacts such as 7.300000000000001 reached the CSV.
    per_year[row.Year] = round(per_year[row.Year] + row.Distribution, 2)
  dft = convert_to_df()
  dft.to_csv('../datasets/'+e+'.csv', encoding='utf-8', index=False)

## Interpolation of missing years

In [12]:
# Consecutive (start, end) year pairs five years apart: (1960, 1965) ... (2015, 2020).
years = [(y,y+5) for y in range(1960,2016,5)]
datasets = list(edu_level)

# Fill the four years inside every five-year gap by linear interpolation between
# the two surrounding columns, working on whole columns instead of cell by cell.
# NOTE: `df` is rebound to the wide table here and no longer refers to the
# long-format frame used by earlier cells.
for name in datasets:
  df = pd.read_csv('../datasets/'+name+'.csv')
  for s,e in years:
    start = df[str(s)].round(2)
    end   = df[str(e)].round(2)
    df[str(s)] = start
    df[str(e)] = end
    inc = (end - start)/5  # per-year increment for every country at once
    for i,y in enumerate(range(s+1,e), start=1):
      df[str(y)] = (start + inc*i).round(2)
  df.to_csv('../datasets/'+name+'.csv', encoding='utf-8', index=False)