# Train data Preprocessing

In [1]:
import pandas as pd
from preprocessing import minimal_preprocessing, drop_nans_threshold, rename_columns

In [2]:
# Obtain X and y from the project's preprocessing helper
# (presumably the feature frame and the target — confirm in preprocessing.py).
X, y = minimal_preprocessing()
# Run the NaN-threshold step (threshold=0.7) and then the column renaming as one
# nested call. NOTE(review): the exact meaning of the 0.7 threshold is defined in
# drop_nans_threshold and is not visible from this notebook — confirm.
X = rename_columns(drop_nans_threshold(X, threshold=0.7))

In [3]:
# Recode the three flag columns as booleans: True exactly where the raw value equals 1,
# False for every other value.
for flag_col in ('read', 'write', 'attended_school'):
    X[flag_col] = X[flag_col].eq(1)

In [4]:
# Display the frame as this cell's output (inspection only; nothing is modified here).
X

Unnamed: 0,uid,read,write,attended_school,highest_school_lvl,highest_school_lvl_grade,highest_diploma,preschool,now_enrolled,now_not_enroll_reason,...,present_in_past_year,lives_with_mother,mother_education,mother_alive,mother_death_age,age_mother,lives_with_father,father_education,father_alive,father_death_age
0,441_2_3,True,True,True,2.0,3.0,3.0,0.0,1.0,,...,1,1,,,,,1,,,
1,647_7_1,True,True,True,2.0,2.0,2.0,0.0,1.0,,...,1,1,,,,,2,3.0,2.0,63.0
2,756_4_1,True,True,True,2.0,3.0,3.0,0.0,1.0,,...,1,2,3.0,1.0,,44.0,2,3.0,1.0,
3,25_4_3,True,True,True,6.0,1.0,3.0,2.0,1.0,,...,1,1,,,,,2,3.0,2.0,51.0
4,132_6_3,True,True,True,2.0,3.0,3.0,0.0,1.0,,...,1,1,,,,,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5329,574_5_1,True,True,True,1.0,4.0,1.0,1.0,2.0,13.0,...,1,2,3.0,1.0,,43.0,2,7.0,1.0,
5330,618_2_1,False,False,True,1.0,4.0,1.0,1.0,2.0,13.0,...,1,2,1.0,1.0,,38.0,2,1.0,1.0,
5331,155_4_1,True,True,True,1.0,8.0,2.0,0.0,2.0,13.0,...,1,2,2.0,1.0,,51.0,2,2.0,1.0,
5332,475_5_1,True,True,True,1.0,8.0,2.0,2.0,2.0,13.0,...,1,2,2.0,1.0,,60.0,2,2.0,1.0,


In [5]:
# Collect the names of all columns that contain one or more NaN values.
has_missing = X.isna().any()  # one boolean per column
nan_cols = X.columns[has_missing].tolist()

In [6]:
# Display the list of column names that contain NaN values.
nan_cols

['highest_school_lvl',
 'highest_school_lvl_grade',
 'highest_diploma',
 'preschool',
 'now_enrolled',
 'now_not_enroll_reason',
 'past_enrolled',
 'past_not_enroll_reason',
 'finish_school_age',
 'less_than_19',
 'lives_with_partner',
 'partner_id_code',
 'mother_education',
 'mother_alive',
 'mother_death_age',
 'age_mother',
 'father_education',
 'father_alive',
 'father_death_age']

# Now we impute NaNs

The following columns have NaN values exclusively for people who did not attend school:

* highest_school_lvl
* highest_school_lvl_grade
* highest_diploma
* preschool

so we fill the NaNs with 0 in all of them.

In [7]:
# Columns whose NaNs are expected to occur only for people who did not attend school.
school_history_cols = [
    'highest_school_lvl',
    'highest_school_lvl_grade',
    'highest_diploma',
    'preschool',
]

# Proof: no row may combine attended_school == True with a missing value.
# Every column is checked before anything is filled, so a failed check leaves X untouched.
attended = X['attended_school'].eq(True)
for col in school_history_cols:
    n_bad = int((X[col].isna() & attended).sum())
    assert n_bad == 0, f"{col}: {n_bad} rows attended school but have a NaN value"

# Fill NaNs with 0
# NOTE(review): 0 already appears as a value of `preschool` in the frame preview, so the
# filled rows become indistinguishable from that existing value — confirm this is intended.
for col in school_history_cols:
    X[col] = X[col].fillna(0)


Now focus on imputing these:
* now_enrolled
* now_not_enroll_reason
* past_enrolled
* past_not_enroll_reason

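We have the following dependencies (if the person is/was enrolled, no reason for not enrolling is recorded):
* now_enrolled -> now_not_enroll_reason is null
* past_enrolled -> past_not_enroll_reason is null

So we can merge each pair (now_enrolled with now_not_enroll_reason, and past_enrolled with past_not_enroll_reason) into a single categorical variable, using an extra category for "enrolled".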





In [None]:
# Summary statistics of the now_not_enroll_reason column (inspection only).
X['now_not_enroll_reason'].describe()

In [None]:
# Add 'enrolled' as an additional category.
# The new category gets the code 15; the check below guards against that code
# colliding with a reason code that is already present in the column.
NOW_ENROLLED_CODE = 15
assert not X['now_not_enroll_reason'].eq(NOW_ENROLLED_CODE).any(), (
    f"code {NOW_ENROLLED_CODE} is already used in now_not_enroll_reason"
)
# NOTE(review): every NaN reason is labelled 'enrolled' here. `now_enrolled` is itself
# among the columns that contain NaNs, so rows with an unknown enrolment status may
# also end up in this category — confirm that this is intended.
X['now_not_enroll_reason'] = X['now_not_enroll_reason'].fillna(NOW_ENROLLED_CODE)
# Drop the flag column now that the merged reason column carries the extra category.
# This cell is not re-runnable: a second run fails because the column is already gone.
X = X.drop(columns=['now_enrolled'])

In [13]:
# Summary statistics of the past_not_enroll_reason column (inspection only).
X['past_not_enroll_reason'].describe()

count    5138.000000
mean        9.955430
std         4.831899
min         1.000000
25%         4.000000
50%        13.000000
75%        13.000000
max        14.000000
Name: past_not_enroll_reason, dtype: float64

In [14]:
# Add 'past_enrolled' as an additional category
X['past_not_enroll_reason'] = X['past_not_enroll_reason'].fillna(15)
X = X.drop(columns=['past_enrolled'])


Now focus on these:
* finish_school_age
* less_than_19
* lives_with_partner
* partner_id_code


Now focus on these:
* mother_education
* mother_alive
* mother_death_age
* age_mother
* father_education
* father_alive
* father_death_age