In [2]:
#Importing Libraries
#Data Handling
import pandas as pd
import numpy as np
import sklearn
import math

#plotting
import matplotlib.pyplot as plt
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import Markdown, display
import ipywidgets as widgets
import seaborn as sns
sns.set()

#Pipelines
from source import *
import source.fairness as fm
import source.pipes as op

#Sampling
from sklearn.model_selection import train_test_split

#Storing trains
import pickle

In [3]:
# Load the final dataset; the first CSV column becomes the row index.
ds = pd.read_csv(
    'dataset/final_dataset.csv',
    sep=',',
    index_col=0,
)

In [4]:
# Cast this column to float64 so it shares the dtype of the other numeric features.
failed_units_col = 'UD_Reprobadas_Primer_Semestre'
ds[failed_units_col] = ds[failed_units_col].astype(np.float64)

In [5]:
# Split the raw frame by column position into three groups.
# NOTE(review): positions are hard-coded and assume the column order of
# final_dataset.csv does not change -- confirm before updating the CSV.
attribute_positions = [0, 1]               # presumably the sensitive attributes
feature_positions = list(range(2, 39))     # predictor columns
outcome_positions = list(range(39, 45))    # outcome columns (e.g. 'CausalSem-*')

A = ds.iloc[:, attribute_positions]
X = ds.iloc[:, feature_positions]
Y = ds.iloc[:, outcome_positions]

In [6]:
# One-hot encode A and keep only the first two resulting columns.
# NOTE(review): which columns come first depends on the dtypes/categories of A
# -- confirm they really are the gender and school-type indicators.
A_dummy = pd.get_dummies(A).iloc[:, 0:2]
A_dummy.columns = ['gender', 'public_school']

# Flip the gender indicator (0 <-> 1).
A_dummy['gender'] = 1 - A_dummy['gender']

# 'elite' is 0 only when both 'gender' and 'public_school' are 0, otherwise 1.
gender_is_zero = 1 - A_dummy['gender']
public_is_zero = 1 - A_dummy['public_school']
A_dummy['elite'] = 1 - gender_is_zero * public_is_zero

In [24]:
# Report, for every feature column: its position in X, its name, its dtype and
# the percentage of missing values.
# The percentage is computed from the data itself (mean of the NaN mask)
# instead of dividing by a hard-coded row count, so it stays correct if the
# dataset changes. The format spec is '.2f' (two decimals); the previous '2f'
# only set a minimum width and printed six decimals.
for i, x in enumerate(X.columns):
    nan_pct = 100 * ds[x].isna().mean()
    print("{0}-{1}: {2}, Nan: {3:.2f}".format(i, x, ds[x].dtype, nan_pct))

0-Rama_Educacional_Colegio_Grupo: object, Nan: 2.087096
1-Ing_Bruto_Familiar_F: float64, Nan: 2.267710
2-Preferencia_Carrera_F: float64, Nan: 1.264299
3-Colegio_Promedio_PSU: float64, Nan: 15.291993
4-Colegio_Total_Alumnos: float64, Nan: 15.291993
5-Prom_Alumn_Dif: float64, Nan: 15.291993
6-PSU_Matematica_F: float64, Nan: 3.210917
7-PSU_Lenguaje_F: float64, Nan: 3.210917
8-PSU_Ciencias: float64, Nan: 49.006623
9-PSU_Historia: float64, Nan: 42.544652
10-PSU_Hist_Ciencias: float64, Nan: 21.573349
11-PSU_Ranking_F: float64, Nan: 2.749348
12-Notas_Enseñanza_Media: float64, Nan: 19.947823
13-Tipo_Ingreso: object, Nan: 16.616496
14-Region_Residencia_Eq: object, Nan: 0.000000
15-Participa_EDT: int64, Nan: 0.000000
16-Participa_EDV: int64, Nan: 0.000000
17-Participa_PAA_1Sem: int64, Nan: 0.000000
18-Participa_PAA_2Sem: int64, Nan: 0.000000
19-Prom_1er_Sem: float64, Nan: 0.903070
20-Primer_Semestre_MEM: float64, Nan: 5.659241
21-Primer_Semestre_ECO: float64, Nan: 4.455148
22-Primer_Semestre_NEG

# Split Dataset

Split the dataset into train and test sets. The train set is used to fit the estimators (imputation, normalization, and predictive models) and is then transformed by them. Note that the test set is never used for fitting: the estimators fitted on the train set are only applied (transform) to it.

For this case, 70% and 30% will be used for train and test respectively.

In [25]:
# Each context maps a name to a (target, features) pair.
# Both contexts append the first two A_dummy columns to the features.
attribute_cols = A_dummy.iloc[:, [0, 1]]

# Second-semester context: X columns 0-17 and 19-27 (column 18 is skipped),
# plus the 'CausalSem-11' outcome as an extra predictor.
positions_2sem = [pos for pos in range(28) if pos != 18]
features_2sem = pd.concat(
    [X.iloc[:, positions_2sem], Y['CausalSem-11'], attribute_cols],
    axis=1,
    ignore_index=False,
)

# Third-semester context: every X column plus the first three outcome columns.
features_3sem = pd.concat(
    [X, Y.iloc[:, [0, 1, 2]], attribute_cols],
    axis=1,
    ignore_index=False,
)

causal_context = {
    'causal_2sem': (Y['CausalSem-12'], features_2sem),
    'causal_3sem': (Y['CausalSem-21'], features_3sem),
}

In [27]:
# For every causal context: split into train/test, fit the preprocessing
# pipelines on the train views, apply them to the matching test views, and
# store everything as pickles under 'pipes/<context>/'.
#
# Loop-local names are chosen so that the notebook-level frames `X`, `ds` and
# `A` (defined in earlier cells) are NOT overwritten; earlier cells therefore
# remain re-runnable after this cell has been executed.
for key in causal_context:
    print('Considering {} context'.format(key))
    Y_final, X_final = causal_context[key]
    out_dir = 'pipes/' + key + '/'

    # Stratified 70/30 split with a fixed seed so the split is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X_final, Y_final, test_size=0.3, stratify=Y_final, random_state=1
    )
    A_train = X_train.loc[:, ['gender', 'public_school']]
    A_test = X_test.loc[:, ['gender', 'public_school']]

    # Train views: 'aware' keeps every column; the other two drop one
    # attribute column each.
    train_datasets = {
        'aware': X_train,
        'gender': X_train.drop(columns='gender'),
        'public_school': X_train.drop(columns='public_school'),
    }

    # Fit preprocessing (i = imputation, e = encode, n = normalization) on
    # each train view and keep the transformed data and both pipes.
    preprocessing_pickles = {}
    for td, train_frame in train_datasets.items():
        print('Training prepro for {}'.format(td))
        X_prepro, pipe_nom, pipe_num, numerical_features, nominal_features = op.preprocessing(
            train_frame, imputation=True, encode=True, normalization=True
        )
        preprocessing_pickles[td] = (X_prepro, pipe_nom, pipe_num)

        # NOTE(review): extends the list returned by op.preprocessing in
        # place; kept as-is in case the pipes share that list -- confirm.
        numerical_features.extend(nominal_features)
        with open(out_dir + 'preprocessing_features_' + td + '.pickle', 'wb') as f:
            pickle.dump(numerical_features, f)

        print('Total of variables: {0}'.format(len(numerical_features)))

    # Store the transformed train data and fitted pipes for all views.
    with open(out_dir + 'preprocessing.pickle', 'wb') as f:
        pickle.dump(preprocessing_pickles, f)

    # Test views, built the same way as the train views.
    test_datasets = {
        'aware': X_test,
        'gender': X_test.drop(columns='gender'),
        'public_school': X_test.drop(columns='public_school'),
    }

    # Apply the pipes obtained from the train views to the matching test views.
    data_test_prepro = {}
    for d in preprocessing_pickles.keys():
        print("Transforming test dataset {}".format(d))
        # Only the two pipes are needed here; the transformed train data
        # (first tuple element) is skipped without rebinding IPython's `_`.
        pnom, pnum = preprocessing_pickles[d][1:]
        data_test_prepro[d] = op.applypreprocessing(test_datasets[d], pnom, pnum)

    with open(out_dir + 'preprocessing_test.pickle', 'wb') as f:
        pickle.dump(data_test_prepro, f)

    # Targets for both splits.
    y = {'train': y_train,
         'test': y_test}
    with open(out_dir + 'y.pickle', 'wb') as f:
        pickle.dump(y, f)

    # Attribute columns for both splits (named A_split so the notebook-level
    # frame `A` is not replaced by a dict).
    A_split = {'train': A_train,
               'test': A_test}
    with open(out_dir + 'A.pickle', 'wb') as f:
        pickle.dump(A_split, f)

    print()

Considering causal_2sem context
Training prepro for aware
Total of variables: 44
Training prepro for gender
Total of variables: 43
Training prepro for public_school
Total of variables: 43
Transforming test dataset aware
Transforming test dataset gender
Transforming test dataset public_school

Considering causal_3sem context
Training prepro for aware
Total of variables: 56
Training prepro for gender
Total of variables: 55
Training prepro for public_school
Total of variables: 55
Transforming test dataset aware
Transforming test dataset gender
Transforming test dataset public_school

