In [1]:
#Importing Libraries
#Data Handling
import pandas as pd

import sys
sys.path.append('../../')

#Pipelines
import source.pipes as op

#Sampling
from sklearn.model_selection import train_test_split

#Storing trains
import pickle

In [2]:
# Load the modelling dataset; the first CSV column becomes the row index.
ds = pd.read_csv(
    '../../data/to-ml-dropout/final_dataset.csv',
    sep=',',
    index_col=0,
)

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded dataset.
ds.shape

(4706, 43)

In [19]:
# List every column with its position, dtype and percentage of missing values.
for i, x in enumerate(ds.columns):
    nan_pct = 100 * ds[x].isna().sum() / ds.shape[0]
    # '.2f' prints two decimals (a bare '2f' would only set a minimum field width).
    print("{0}-{1}: {2}, Nan: {3:.2f}".format(i, x, ds[x].dtype, nan_pct))

0-Anio_Ingreso: int64, Nan: 0.000000
1-Genero: int64, Nan: 0.000000
2-Grupo_Dependencia_Colegio_F: object, Nan: 0.000000
3-Rama_Educacional_Colegio_Grupo: object, Nan: 2.209945
4-Ing_Bruto_Familiar_F: float64, Nan: 2.401190
5-Preferencia_Carrera_F: float64, Nan: 1.338717
6-Colegio_Promedio_PSU: float64, Nan: 16.192095
7-Colegio_Total_Alumnos: float64, Nan: 16.192095
8-Prom_Alumn_Dif: float64, Nan: 16.192095
9-PSU_Matematica_F: float64, Nan: 3.399915
10-PSU_Lenguaje_F: float64, Nan: 3.399915
11-PSU_Ciencias: float64, Nan: 51.891203
12-PSU_Historia: float64, Nan: 45.048874
13-PSU_Hist_Ciencias: float64, Nan: 22.843179
14-PSU_Ranking_F: float64, Nan: 2.911177
15-Notas_Enseñanza_Media: float64, Nan: 21.121972
16-Tipo_Ingreso: object, Nan: 17.594560
17-Region_Residencia_Eq: object, Nan: 0.000000
18-Participa_EDT: int64, Nan: 0.000000
19-Participa_EDV: int64, Nan: 0.000000
20-Participa_PAA_1Sem: int64, Nan: 0.000000
21-Participa_PAA_2Sem: int64, Nan: 0.000000
22-Prom_1er_Sem: float64, Nan: 0

In [14]:
#Selecting useful variables
Y = ds.iloc[:,[42]]
A = ds.iloc[:,[1,2]]
X = ds.iloc[:,range(3,42)]

In [15]:
# Encode the sensitive attributes and keep the first two resulting columns.
# A holds the integer Genero column and the object-typed school dependency
# column, so get_dummies only expands the latter.
# NOTE(review): the second kept column is assumed to be the dummy for the
# public-school category -- confirm against the category ordering.
# .copy() makes A_dummy an independent frame, so the assignments below do not
# write into a slice of the get_dummies result (SettingWithCopyWarning).
A_dummy = pd.get_dummies(A).iloc[:, 0:2].copy()
A_dummy.columns = ['gender', 'public_school']
#So now, gender=1 means female
A_dummy['gender'] = 1 - A_dummy['gender']

In [20]:
# Build `selected`, a boolean mask aligned with X.columns that keeps only the
# features whose share of missing values does not exceed the threshold.
max_nan_pct = 20
selected = []
for i, x in enumerate(X.columns):
    nan_pct = 100 * ds[x].isna().sum() / ds.shape[0]
    selected.append(bool(nan_pct <= max_nan_pct))
    # '.2f' prints two decimals (a bare '2f' would only set a minimum field width).
    print("{0}-{1}: {2}, Nan: {3:.2f}".format(i, x, ds[x].dtype, nan_pct))

0-Rama_Educacional_Colegio_Grupo: object, Nan: 2.209945
1-Ing_Bruto_Familiar_F: float64, Nan: 2.401190
2-Preferencia_Carrera_F: float64, Nan: 1.338717
3-Colegio_Promedio_PSU: float64, Nan: 16.192095
4-Colegio_Total_Alumnos: float64, Nan: 16.192095
5-Prom_Alumn_Dif: float64, Nan: 16.192095
6-PSU_Matematica_F: float64, Nan: 3.399915
7-PSU_Lenguaje_F: float64, Nan: 3.399915
8-PSU_Ciencias: float64, Nan: 51.891203
9-PSU_Historia: float64, Nan: 45.048874
10-PSU_Hist_Ciencias: float64, Nan: 22.843179
11-PSU_Ranking_F: float64, Nan: 2.911177
12-Notas_Enseñanza_Media: float64, Nan: 21.121972
13-Tipo_Ingreso: object, Nan: 17.594560
14-Region_Residencia_Eq: object, Nan: 0.000000
15-Participa_EDT: int64, Nan: 0.000000
16-Participa_EDV: int64, Nan: 0.000000
17-Participa_PAA_1Sem: int64, Nan: 0.000000
18-Participa_PAA_2Sem: int64, Nan: 0.000000
19-Prom_1er_Sem: float64, Nan: 0.956226
20-Primer_Semestre_MEM: float64, Nan: 5.992350
21-Primer_Semestre_ECO: float64, Nan: 4.717382
22-Primer_Semestre_NEG

# Split Dataset

Split the dataset into train and test sets using a 70/30 ratio. The training set is used to fit the estimators (imputation, normalization, and predictive models), which are then applied to transform the data.

In [22]:
# Pair the target with the design matrix: the retained features (columns of X
# flagged in `selected`) followed by the encoded sensitive attributes.
features_with_sensitive = pd.concat(
    [X.loc[:, selected], A_dummy],
    axis=1,
    ignore_index=False,
)
causal_context = (Y, features_with_sensitive)

In [28]:
analysis_scenarios = ['dropout', 'fairness']
sensitive_columns = ['gender', 'public_school']
enc = 'label'


def _feature_views(frame):
    """Return the four feature sets derived from ``frame``, keyed by view name.

    Mind the naming: the 'gender' and 'public_school' views are the ones
    WITHOUT that column, 'unaware' has neither sensitive column, and 'aware'
    is ``frame`` itself with both columns kept.
    """
    cols = frame.columns
    return {
        'aware': frame,
        'unaware': frame.loc[:, ~cols.isin(sensitive_columns)],
        'gender': frame.loc[:, cols != 'gender'],
        'public_school': frame.loc[:, cols != 'public_school'],
    }


def _dump_pickle(obj, path):
    """Serialize ``obj`` to ``path`` with pickle, overwriting any existing file."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


# For every scenario: split the data, fit the preprocessing pipelines on the
# train views, apply them to the matching test views, and store everything
# under ../../pipes/chile-dropout/<scenario>/.
# Both scenarios read the same `causal_context`, so they only differ in the
# output directory.
# Loop-local names are chosen so that the notebook-level `X`, `ds` and `A`
# are not rebound here; earlier cells read them and must stay re-runnable.
for key in analysis_scenarios:
    print('Considering {} context'.format(key))
    out_dir = '../../pipes/chile-dropout/' + key + '/'
    Y_final, X_final = causal_context

    # 70/30 split, stratified on the target, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X_final, Y_final, test_size=0.3, stratify=Y_final, random_state=1)
    A_train = X_train.loc[:, sensitive_columns]
    A_test = X_test.loc[:, sensitive_columns]

    #Train feature sets, one per awareness setting
    train_datasets = _feature_views(X_train)

    #Dictionaries of (transformed train data, nominal pipe, numerical pipe)
    preprocessing_pickles = {}
    preprocessing_pickles_norm = {}

    #Fitting the pipelines for each case, without and with normalization
    for td, train_view in train_datasets.items():
        print('Training prepro for {}'.format(td))
        X_prep, pipe_nom, pipe_num, numerical_features, nominal_features = op.preprocessing(
            train_view, idnumerical=None, imputation=True, encode=enc, normalization=False)
        preprocessing_pickles[td] = (X_prep, pipe_nom, pipe_num)

        X_prep_norm, pipe_nom, pipe_num, numerical_features, nominal_features = op.preprocessing(
            train_view, idnumerical=None, imputation=True, encode=enc, normalization=True)
        preprocessing_pickles_norm[td] = (X_prep_norm, pipe_nom, pipe_num)

        # Stored feature list: numerical names followed by nominal names,
        # as returned by the normalized run.
        numerical_features.extend(nominal_features)
        _dump_pickle(numerical_features, out_dir + 'preprocessing_features_' + td + '.pickle')

        print('Total of variables: {0}'.format(len(numerical_features)))

    #Storing the fitted pipelines
    _dump_pickle(preprocessing_pickles, out_dir + 'preprocessing.pickle')
    _dump_pickle(preprocessing_pickles_norm, out_dir + 'preprocessing_norm.pickle')

    #Test feature sets, built the same way as the train ones
    test_datasets = _feature_views(X_test)

    #Applying Preprocessing Pipelines
    data_test_prepro = {}
    data_test_prepro_norm = {}

    for d in preprocessing_pickles.keys():
        print("Transforming test dataset {}".format(d))
        test_view = test_datasets[d]

        # Each entry is (train data, nominal pipe, numerical pipe); only the pipes are needed.
        pnom, pnum = preprocessing_pickles[d][1:]
        data_test_prepro[d] = op.applypreprocessing(test_view, pnom, pnum)

        pnom, pnum = preprocessing_pickles_norm[d][1:]
        data_test_prepro_norm[d] = op.applypreprocessing(test_view, pnom, pnum)

    _dump_pickle(data_test_prepro, out_dir + 'preprocessing_test.pickle')
    _dump_pickle(data_test_prepro_norm, out_dir + 'preprocessing_test_norm.pickle')

    #Targets and sensitive attributes for both splits
    y = {'train': y_train,
         'test': y_test}
    _dump_pickle(y, out_dir + 'y.pickle')

    A_by_split = {'train': A_train,
                  'test': A_test}
    _dump_pickle(A_by_split, out_dir + 'A.pickle')

    print()

Considering dropout context
Training prepro for aware
Total of variables: 34
Training prepro for unaware
Total of variables: 32
Training prepro for gender
Total of variables: 33
Training prepro for public_school
Total of variables: 33
Transforming test dataset aware
Transforming test dataset unaware
Transforming test dataset gender
Transforming test dataset public_school

Considering fairness context
Training prepro for aware
Total of variables: 34
Training prepro for unaware
Total of variables: 32
Training prepro for gender
Total of variables: 33
Training prepro for public_school
Total of variables: 33
Transforming test dataset aware
Transforming test dataset unaware
Transforming test dataset gender
Transforming test dataset public_school

