# Data Transformation

In [2]:
import numpy as np
import pandas as pd
import json

## Market for Higher Education

In the following we will extract information about higher education programs throughout Chile for a series of years


In [19]:
data = pd.read_csv('../data/cned/cned.csv')

In [20]:
# look at info about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95698 entries, 0 to 95697
Data columns (total 10 columns):
region        95698 non-null int64
idc           95698 non-null int64
puntaje       20187 non-null float64
puntajenem    9663 non-null float64
ingreso2      25207 non-null float64
vacantes      73661 non-null float64
ingreso       95698 non-null int64
year          95698 non-null int64
dp            95698 non-null int64
area          95698 non-null object
dtypes: float64(4), int64(5), object(1)
memory usage: 7.3+ MB


In [21]:
data.head()

Unnamed: 0,region,idc,puntaje,puntajenem,ingreso2,vacantes,ingreso,year,dp,area
0,7,7444,683.0,685.0,133.0,161.0,692,2019,0,Business
1,7,7420,719.0,716.0,384.0,528.0,2464,2019,0,Business
2,7,7419,,,,,5,2019,1,Business
3,7,27287,675.0,673.0,161.0,160.0,689,2019,0,Business
4,7,7404,713.0,645.0,26.0,38.0,138,2019,0,Art/Humanities


In [22]:
data['year'].value_counts()

2016    7750
2018    7544
2017    7536
2015    7532
2014    7471
2019    7240
2013    7056
2012    6618
2011    6405
2010    5860
2009    5508
2008    5023
2007    4930
2006    4815
2005    4410
Name: year, dtype: int64

## Visualizing Data

- ingreso2 - mean higher education major enrollment size (Y dependent variable)
- puntaje -  mean standardized score of university majors (X independent variable)
- area - Type of major (e.g. Business, STEM, etc.)
- year - Time (in calendar year)

In [23]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)

### Data exploration

First, we explore our data using pandas

In [24]:
# describe each of our relevant variables by year

# first store a grouped dataframe
dataByYear = data.groupby(by=['year', 'area'], sort=True)

dataByYear['ingreso2'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
year,area,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005,Art/Humanities,119.0,40.025210,43.392662,1.0,7.50,31.0,54.50,305.0
2005,Business,90.0,40.944444,65.179676,1.0,4.25,12.0,55.25,373.0
2005,Education,303.0,36.303630,71.396015,1.0,12.00,26.0,40.00,1032.0
2005,Health,171.0,60.175439,88.634966,1.0,30.00,48.0,65.50,927.0
2005,Law,82.0,45.304878,63.735311,1.0,6.00,16.5,57.25,342.0
...,...,...,...,...,...,...,...,...,...
2019,Education,301.0,34.478405,21.673571,1.0,18.00,31.0,47.00,124.0
2019,Health,425.0,57.334118,33.135035,1.0,34.00,53.0,73.00,247.0
2019,Law,86.0,79.034884,74.585198,3.0,32.00,60.0,103.75,519.0
2019,STEM,487.0,47.552361,40.657670,1.0,19.00,39.0,64.00,393.0


In [25]:
dataByYear.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,region,idc,puntaje,puntajenem,ingreso2,vacantes,ingreso,dp
year,area,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005,Art/Humanities,2202,1563199,0.0,0.0,4763.0,10363.0,31556,299
2005,Business,4435,4004362,515.0,0.0,3685.0,17917.0,43511,591
2005,Education,5572,3958585,1076.0,0.0,11000.0,25697.0,78339,720
2005,Health,3074,2168528,2537.0,0.0,10290.0,20810.0,61049,385
2005,Law,2729,2292779,0.0,0.0,3715.0,17186.0,45211,348
...,...,...,...,...,...,...,...,...,...
2019,Education,8956,25475479,149869.0,156955.0,10378.0,37523.0,110288,847
2019,Health,8975,23799652,227328.0,222814.0,24367.0,58854.0,205420,728
2019,Law,1253,2195442,42870.0,41170.0,6797.0,10006.0,40492,90
2019,STEM,22780,68209132,259692.0,269971.0,23158.0,91522.0,268280,2510


In [26]:
data = data.sort_values(by='year')
data.dropna(inplace=True)
data

Unnamed: 0,region,idc,puntaje,puntajenem,ingreso2,vacantes,ingreso,year,dp,area
48146,7,8861,556.0,506.0,102.0,100.0,178,2013,0,Art/Humanities
48143,7,14954,579.0,527.0,79.0,80.0,264,2013,0,Education
48142,7,22085,589.0,551.0,51.0,50.0,104,2013,0,Education
48141,7,8868,573.0,549.0,93.0,100.0,357,2013,0,Education
48140,7,8867,606.0,534.0,55.0,55.0,174,2013,0,Education
...,...,...,...,...,...,...,...,...,...,...
1210,7,19440,519.0,487.0,34.0,35.0,130,2019,0,SocSci
1211,7,1359,574.0,547.0,106.0,90.0,514,2019,0,SocSci
1213,12,1273,536.0,518.0,58.0,60.0,293,2019,0,SocSci
1187,7,2867,532.0,529.0,18.0,30.0,71,2019,0,Education


In [27]:
# print(data.groupby(by='year').apply(lambda x: x.to_dict('records')))

jsonData = data[['idc','ingreso2', 'puntaje', 'area', 'year']].groupby(by='year').apply(lambda x: x.to_dict('records'))

In [28]:
jsonData.iloc[1]
# jsonData.describe()

[{'idc': 14733,
  'ingreso2': 23.0,
  'puntaje': 481.0,
  'area': 'Health',
  'year': 2014},
 {'idc': 14720,
  'ingreso2': 25.0,
  'puntaje': 469.0,
  'area': 'Health',
  'year': 2014},
 {'idc': 14737,
  'ingreso2': 17.0,
  'puntaje': 621.0,
  'area': 'Health',
  'year': 2014},
 {'idc': 14721,
  'ingreso2': 26.0,
  'puntaje': 646.0,
  'area': 'Health',
  'year': 2014},
 {'idc': 16119,
  'ingreso2': 45.0,
  'puntaje': 475.0,
  'area': 'Health',
  'year': 2014},
 {'idc': 17592,
  'ingreso2': 18.0,
  'puntaje': 448.0,
  'area': 'Health',
  'year': 2014},
 {'idc': 7382,
  'ingreso2': 17.0,
  'puntaje': 531.0,
  'area': 'Business',
  'year': 2014},
 {'idc': 7387,
  'ingreso2': 7.0,
  'puntaje': 536.0,
  'area': 'Business',
  'year': 2014},
 {'idc': 7383,
  'ingreso2': 10.0,
  'puntaje': 512.0,
  'area': 'SocSci',
  'year': 2014},
 {'idc': 7384,
  'ingreso2': 9.0,
  'puntaje': 476.0,
  'area': 'SocSci',
  'year': 2014},
 {'idc': 7380,
  'ingreso2': 35.0,
  'puntaje': 532.0,
  'area': 'Law',


In [29]:
dataYearsList = data['year'].unique()

In [30]:
# create an output json variable
output = []
i = 0
while i<len(dataYearsList):
#     print(i, dataYearsList[i])
    outputObject = {}
    outputObject['year'] = dataYearsList[i]
    outputObject['areas'] = jsonData.iloc[i]
    output.append(outputObject)
    i = i+1

In [31]:
output
# print(output.to_json())
jsonDump = json.dumps(output, cls=NpEncoder)
# print(json.loads(jsonDump))
# print(json.loads(jsonOutput))
jsonOutput = json.loads(jsonDump)

In [32]:
with open('../data/viz/output.json', 'w') as outfile:
    json.dump(jsonOutput, outfile)

## School Performance and Nutrition Status

The following includes anthropometric and school performance data from Chilean freshman students. These are pulled from distinct datasets. 

### Variables to use
- obesityPrevalence - prevalence of obesity per institution in the sample
- idrbd - Unique identifier of schools
- efectivr - School performance score



In [63]:
# import csv files from /data/sned and /data/mn

In [64]:
sned = pd.read_csv('../data/sned/sned.csv')
mn = pd.read_csv('../data/mn/1m2017.csv')

In [65]:
sned.head()

Unnamed: 0,idrbd,nom_rbd,cod_reg_rbd,cod_pro_rbd,cod_com_rbd,nom_com_rbd,cod_deprov_rbd,nom_deprov_rbd,rural_rbd,efectivr,superar,iniciar,mejorar,integrar,igualdr,cluster,indicer,sel2016_25,sel2016_35,sel
0,1,LICEO POLITECNICO ARICA,15,151,15101,ARICA,151,ARICA,1,41.494973,51.521082,87.836364,92.345679,76.083333,92.953156,1532,61.149999,2,2,3
1,2,PARVULARIO LAS ESPIGUITAS,15,151,15101,ARICA,151,ARICA,1,56.500516,49.402534,91.754545,92.592593,70.7,96.895561,1512,66.947049,2,2,3
2,3,ESC. PEDRO VICENTE GUTIERREZ TORRES,15,151,15101,ARICA,151,ARICA,2,47.259024,49.135619,0.0,92.592593,0.0,95.273599,1521,54.055856,2,2,3
3,4,LICEO OCTAVIO PALMA PEREZ,15,151,15101,ARICA,151,ARICA,1,67.427625,44.229581,100.0,92.592593,96.666667,97.728461,1532,71.51795,2,2,3
4,5,JOVINA NARANJO FERNANDEZ,15,151,15101,ARICA,151,ARICA,1,70.269825,53.751541,93.25,74.074074,62.5,96.676192,1532,72.520511,2,1,2


In [77]:
mn.dropna(inplace=True)

In [78]:
mn['obesityPrevalence'] = ((mn['onino'] + mn['onina']) / mn['total']) * 100

In [79]:
mn[['onino', 'onina', 'total','obesityPrevalence']]

Unnamed: 0,onino,onina,total,obesityPrevalence
0,10,6,107,14.953271
1,4,0,32,12.500000
2,2,0,44,4.545455
3,10,9,133,14.285714
4,7,7,100,14.000000
...,...,...,...,...
2475,4,4,34,23.529412
2476,10,0,79,12.658228
2477,4,5,45,20.000000
2478,0,0,1,0.000000


In [71]:
mn.set_index('idrbd')

Unnamed: 0_level_0,dnino,dnina,bpnino,bpnina,nnino,nnina,snino,snina,onino,onina,rtnino,rtnina,tnino,tnina,total,obesityPrevalence
idrbd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
97.0,1,1,2,3,21,31,13,19,10,6,3,6,47,60,107,14.953271
103.0,0,0,3,1,6,8,6,4,4,0,0,0,19,13,32,12.500000
106.0,2,0,1,0,14,5,12,8,2,0,3,0,31,13,44,4.545455
107.0,2,0,8,4,36,31,13,20,10,9,3,6,69,64,133,14.285714
108.0,2,1,6,4,23,29,8,13,7,7,4,2,46,54,100,14.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30003.0,0,1,4,1,17,20,12,14,10,0,1,0,43,36,79,12.658228
40230.0,1,0,2,0,16,4,9,4,4,5,2,0,32,13,45,20.000000
72.0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0.000000
78.0,0,0,1,0,6,5,4,1,1,1,1,1,12,7,19,10.526316


In [81]:
mn['idrbd'] = mn['idrbd'].astype(np.int64)

mn

In [82]:
mn

Unnamed: 0,idrbd,dnino,dnina,bpnino,bpnina,nnino,nnina,snino,snina,onino,onina,rtnino,rtnina,tnino,tnina,total,obesityPrevalence
0,97,1,1,2,3,21,31,13,19,10,6,3,6,47,60,107,14.953271
1,103,0,0,3,1,6,8,6,4,4,0,0,0,19,13,32,12.500000
2,106,2,0,1,0,14,5,12,8,2,0,3,0,31,13,44,4.545455
3,107,2,0,8,4,36,31,13,20,10,9,3,6,69,64,133,14.285714
4,108,2,1,6,4,23,29,8,13,7,7,4,2,46,54,100,14.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2475,30001,0,0,1,0,11,8,2,4,4,4,0,1,18,16,34,23.529412
2476,30003,0,1,4,1,17,20,12,14,10,0,1,0,43,36,79,12.658228
2477,40230,1,0,2,0,16,4,9,4,4,5,2,0,32,13,45,20.000000
2478,72,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0.000000


In [83]:
mergedData = pd.merge(mn, sned, on='idrbd', how='inner')

In [85]:
mergedData['efectivr']

0       49.974329
1       47.818693
2       52.072257
3       53.773238
4       38.095751
          ...    
2458    64.672959
2459    52.630439
2460    46.550346
2461    43.516118
2462    53.140512
Name: efectivr, Length: 2463, dtype: float64

In [87]:
mergedData[['idrbd' ,'onino', 'onina', 'total','obesityPrevalence']]

Unnamed: 0,idrbd,onino,onina,total,obesityPrevalence
0,97,10,6,107,14.953271
1,103,4,0,32,12.500000
2,106,2,0,44,4.545455
3,107,10,9,133,14.285714
4,108,7,7,100,14.000000
...,...,...,...,...,...
2458,30001,4,4,34,23.529412
2459,30003,10,0,79,12.658228
2460,40230,4,5,45,20.000000
2461,72,0,0,1,0.000000
