# DataDrive2030 Early Learning Predictors Challenge - Zindi

https://zindi.africa/competitions/datadrive2030-early-learning-predictors-challenge

## Importing modules

In [168]:
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
# encoding
from sklearn.preprocessing import OrdinalEncoder
# scaling
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler
# splitting
from sklearn.model_selection import train_test_split
# models
from catboost import CatBoostRegressor, Pool
# metrics
from sklearn.metrics import mean_squared_error
# model interpretability library
import shap
# utils
import torch
import random
from tqdm import tqdm
from datetime import datetime as dt
import joblib

import warnings
warnings.filterwarnings('ignore')

In [80]:
# Set seed for reproducability
SEED = 12
random.seed(SEED)
np.random.seed(SEED)

## Data loading

In [3]:
DATA_PATH = 'data/'
Train = pd.read_csv(DATA_PATH + 'Train.csv', low_memory=False)
Test = pd.read_csv(DATA_PATH + 'Test.csv', low_memory=False)
SampleSubmission = pd.read_csv(DATA_PATH + 'SampleSubmission.csv', low_memory=False)
VariableDescription = pd.read_csv(DATA_PATH + 'VariableDescription.csv', low_memory=False)

## EDA

### Dataset description

Dataset sample

In [160]:
Train.head(3)

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7,target
0,ID_SYSJ2FM0D,2022.0,2022-02-03,59.0,,,,,,Sometimes,...,,,,,,,,,,51.5
1,ID_J5BTFOZR3,2019.0,,60.163933,,,,1st year in the programme,103.0,Sometimes,...,,,,,,,,,,55.869999
2,ID_R00SN7AUD,2022.0,2022-03-11,69.0,,,,,108.400002,Often,...,,,,,,,,,,47.52


Datasets shape

In [161]:
print('Train set shape:', Train.shape)
print('Test set shape:', Test.shape)

Train set shape: (8585, 679)
Test set shape: (3680, 678)


Train and test set ratio

In [163]:
Test.shape[0] / Train.shape[0] * 100

42.865463016889926

### Variables description

In [None]:
VariableDescription

Unnamed: 0,Variable Name,Variable Label,Answer Label
0,child_id,Unique child ID,Open ended
1,data_year,Year data was collected,Open ended
2,child_date,ELOM date,Open ended
3,child_age,Child age in months,Open ended
4,child_enrolment_date,Date enrolled in ELP,Open ended
...,...,...,...
676,child_attends,Does child attend an ECD programme?,
677,child_attendance,How many days per week does the child attend t...,
678,child_languages,Child home language(s),
679,pri_fees_amount_pv,Facility monthly fee (present value),


Categorical variables analysis

In [135]:
Train_object_describe = Train.select_dtypes('object').describe().T.sort_values('unique', axis=0, ascending=False)
Train_object_describe

Unnamed: 0,count,unique,top,freq
child_id,8585,8585,ID_SYSJ2FM0D,1
obs_materials,5040,1107,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,365
gps,2412,1064,-27.0850639 23.9166474,39
child_dob,7129,1018,2016-11-11,36
child_enrolment_date,2621,535,2021-02-15,212
...,...,...,...,...
pri_funding_2,1630,2,No,1525
pri_funding_5,1628,2,No,1339
other_practitioner,4,1,THE ONLY AVAILABLE PRACTITIONER NOW.,4
pri_toys,5167,1,No,5167


In [136]:
Test_object_describe = Test.select_dtypes('object').describe().T.sort_values('unique', axis=0, ascending=False)
Test_object_describe

Unnamed: 0,count,unique,top,freq
child_id,3680,3680,ID_0I0999N6S,1
obs_materials,2167,870,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,165
child_dob,3080,808,2016-11-11,18
gps,1006,568,-25.461367 28.118457,14
child_enrolment_date,1112,350,2021-02-15,99
...,...,...,...,...
positionother,3,1,Acting deputy principal,3
pri_locationother,1,1,PRIVATE BUSINESS PROPERTY,1
practitioner,716,1,Yes,716
pri_toys,2207,1,No,2207


Numerical variables analysis

In [128]:
Train_number_describe = Train.select_dtypes('number').describe().T
Train_number_describe

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
data_year,8585.0,2020.983692,0.920267,2019.000000,2021.000000,2021.000000,2022.000000,2022.000000
child_age,8585.0,58.116873,5.318927,49.475410,54.000000,57.000000,62.032787,69.683777
child_months_enrolment,2621.0,20.476917,14.096259,0.000000,8.000000,20.000000,33.000000,57.000000
child_height,6969.0,106.949312,6.430768,64.199997,103.000000,106.599998,110.500000,159.000000
child_observe_total,7883.0,7.381454,3.256987,0.000000,5.000000,8.000000,10.000000,12.000000
...,...,...,...,...,...,...,...,...
obs_heating_4,2261.0,0.072092,0.258697,0.000000,0.000000,0.000000,0.000000,1.000000
obs_heating_5,2261.0,0.050420,0.218859,0.000000,0.000000,0.000000,0.000000,1.000000
obs_heating_6,2261.0,0.026095,0.159452,0.000000,0.000000,0.000000,0.000000,1.000000
obs_heating_7,2261.0,0.011942,0.108647,0.000000,0.000000,0.000000,0.000000,1.000000


In [162]:
Test_number_describe = Test.select_dtypes('number').describe().T
Test_number_describe

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
data_year,3680.0,2020.983152,0.920746,2019.000000,2021.0,2021.000000,2022.0,2022.000000
child_age,3680.0,58.153888,5.256672,49.741272,54.0,58.000000,62.0,69.683777
child_months_enrolment,1112.0,19.697842,13.768122,0.000000,8.0,19.000000,32.0,57.000000
child_height,3019.0,107.137044,6.373249,64.199997,103.0,106.900002,110.9,156.000000
child_observe_total,3410.0,7.324927,3.262990,0.000000,5.0,8.000000,10.0,12.000000
...,...,...,...,...,...,...,...,...
obs_heating_3,968.0,0.002066,0.045431,0.000000,0.0,0.000000,0.0,1.000000
obs_heating_4,968.0,0.064050,0.244968,0.000000,0.0,0.000000,0.0,1.000000
obs_heating_5,968.0,0.053719,0.225579,0.000000,0.0,0.000000,0.0,1.000000
obs_heating_6,968.0,0.018595,0.135160,0.000000,0.0,0.000000,0.0,1.000000


### Missing values analysis

In [101]:
missing_values_train = Train.isnull().sum() / Train.shape[0] * 100
missing_values_train.sort_values()

child_id                   0.000000
pre_covid                  0.000000
sef_ind                    0.000000
language_assessment        0.000000
hle_ind                    0.000000
                            ...    
pra_class_space_small     99.895166
pri_founderother          99.906814
pri_clinic_travelother    99.941759
pri_locationother         99.941759
other_practitioner        99.953407
Length: 679, dtype: float64

In [102]:
missing_values_test = Test.isnull().sum() / Test.shape[0] * 100
missing_values_test.sort_values()

child_id                    0.000000
pre_covid                   0.000000
child_age_group             0.000000
sef_ind                     0.000000
hle_ind                     0.000000
                             ...    
pri_clinic_travelother     99.918478
positionotherreason        99.918478
positionother              99.918478
pri_locationother          99.972826
other_practitioner        100.000000
Length: 678, dtype: float64

Unique ID and target

In [83]:
ID = 'child_id'
TARGET = 'target'

Sample submission visualization

In [103]:
SampleSubmission.head()

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
1,ID_GQ6ONJ4FP,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
2,ID_YZ76CVRW3,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
3,ID_BNINCRXH8,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
4,ID_1U7GDTLRI,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature


## Preprocessing

Useful features research 

In [187]:
features = []
cat_features = []
not_features = []
# missing_rate = Train.isna().sum() / Train.shape[0]
for column in Train.columns[1:]:
    if Train[column].isnull().sum() < 6000:
        features.append(column)
        if Train[column].dtype == 'O':
            cat_features.append(column)
    else:
        not_features.append(column)

print(f'We have {len(features)} features')
print(f'We have {len(cat_features)} categorical features')
print(
    f'We have {len(not_features)} features that have more than 6000 of missing values')


We have 277 features
We have 199 categorical features
We have 401 features that have more than 6000 of missing values


In [116]:
missing_rate = Train.isna().sum() / Train.shape[0]
features_na = Train.columns[(missing_rate < 0.7)]
features_na

Index(['child_id', 'data_year', 'child_date', 'child_age',
       'child_enrolment_date', 'child_months_enrolment', 'child_grant',
       'child_years_in_programme', 'child_height', 'child_observe_attentive',
       ...
       'pre_covid', 'ses_proxy', 'quintile_used', 'id_facility_n', 'id_ward_n',
       'id_mn_n', 'id_dc_n', 'id_prov_n', 'ses_cat', 'target'],
      dtype='object', length=278)

In [117]:
missing_rate = Test.isna().sum() / Test.shape[0]
Test_na = Test.columns[(missing_rate < 0.7)]
Test_na

Index(['child_id', 'data_year', 'child_date', 'child_age',
       'child_enrolment_date', 'child_months_enrolment', 'child_grant',
       'child_years_in_programme', 'child_height', 'child_observe_attentive',
       ...
       'gps_ind', 'pre_covid', 'ses_proxy', 'quintile_used', 'id_facility_n',
       'id_ward_n', 'id_mn_n', 'id_dc_n', 'id_prov_n', 'ses_cat'],
      dtype='object', length=277)

In [118]:
features = features_na.to_list()

In [169]:
Train_pp = Train[features]
Test_pp  = Test[features[:-1]]

Train_pp.fillna('', inplace=True)
Test_pp.fillna('', inplace=True)

max(Train_pp[TARGET]), min(Train_pp[TARGET])

(96.80999755859376, 6.369999885559082)

In [120]:
Train_pp.isnull().sum().sort_values()

child_id                    0
obs_toilet                  0
obs_toilet_1                0
obs_toilet_2                0
obs_toilet_3                0
                           ..
pri_registered_npo          0
pri_registered_programme    0
pri_registered_partial      0
pri_capacity                0
target                      0
Length: 278, dtype: int64

### Catagorization and Normalization

In [175]:
encoder = OrdinalEncoder()
MMS = MinMaxScaler()

columns = Train.columns.to_list()
Train_norm = Train[columns]
Test_norm  = Test[columns[:-1]]

In [176]:
Train_norm.head(3)

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7,target
0,ID_SYSJ2FM0D,2022.0,2022-02-03,59.0,,,,,,Sometimes,...,,,,,,,,,,51.5
1,ID_J5BTFOZR3,2019.0,,60.163933,,,,1st year in the programme,103.0,Sometimes,...,,,,,,,,,,55.869999
2,ID_R00SN7AUD,2022.0,2022-03-11,69.0,,,,,108.400002,Often,...,,,,,,,,,,47.52


In [179]:
def preprocessing_data(data: pd.DataFrame) -> pd.DataFrame:
    ''' dataset preprocessing
    '''
    # dataset copy
    data1 = data.copy()
    # don't threat the ID column
    data1.drop([ID], axis=1, inplace=True)
    # don't threat the target column for train set
    try:
        data2 = data1.drop(TARGET, axis=1)
    except:
        data2 = data1
    # nan value gestion
    # Train_norm.fillna('', inplace=True)
    # categorization
    object_columns = list(data2.select_dtypes(include='object').columns)
    data2[object_columns] = encoder.fit_transform(data2[object_columns])
    # normalization
    scaler = MMS
    data2[data2.columns] = scaler.fit_transform(data2)
    data1[data2.columns] = data2
    return data1


In [180]:
Train_num = preprocessing_data(Train_norm)
Test_num = preprocessing_data(Test_norm)
Train_num.head(3)

Unnamed: 0,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,child_observe_concentrated,...,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7,target
0,1.0,0.533597,0.471319,,,,,,1.0,1.0,...,,,,,,,,,,51.5
1,0.0,,0.528916,,,,0.0,0.409283,1.0,0.333333,...,,,,,,,,,,55.869999
2,1.0,0.644269,0.966164,,,,,0.466245,0.666667,0.666667,...,,,,,,,,,,47.52


## Modeling

### Splitting

In [188]:
# xtrain_num, xvalid_num, ytrain_num, yvalid_num = train_test_split(
#     Train_num[Train_num.columns[:-1]],
#     Train_num[Train_num.columns[-1]],
#     test_size=0.15,
#     random_state=SEED,
#     shuffle=True
# )

xtrain_num, xvalid_num, ytrain_num, yvalid_num = train_test_split(
    Train_num[features[:-1]],
    Train_num[features[-1]],
    test_size=0.15,
    random_state=SEED,
    shuffle=True
)

xtrain_num.shape, xvalid_num.shape, ytrain_num.shape, yvalid_num.shape

((7297, 276), (1288, 276), (7297,), (1288,))

In [185]:
xtrain, xvalid, ytrain, yvalid = train_test_split(
    Train_pp[Train_pp.columns[:-1]],
    Train_pp[Train_pp.columns[-1]],
    test_size=0.15,
    random_state=SEED,
    shuffle=True
)

xtrain.shape, xvalid.shape, ytrain.shape, yvalid.shape

((7297, 276), (1288, 276), (7297,), (1288,))

### Metric

In [150]:
def rmse(y_test, y_pred) -> float:
    ''' calculates the root mean squared error of a model 
    '''
    return mean_squared_error(y_test, y_pred, squared=False)

### Parameters

In [151]:
params = {
    'iterations': 3000,
    'learning_rate': 0.1,
    'objective': 'RMSE',
    'random_seed': SEED,
    'early_stopping_rounds': 300,
    'use_best_model': True,
    'task_type': 'GPU' if torch.cuda.is_available() else 'CPU'
}

### Fitting

In [183]:
def catboost_fitting(Test: pd.DataFrame, xtrain, ytrain, xvalid, yvalid, params: dict = params, cat_features=None) -> tuple[CatBoostRegressor, pd.Series]:
    ''' modelling with catboost
    '''
    xtest = Test[features[:-1]]
    train_dataset = Pool(data=xtrain, label=ytrain, cat_features=cat_features)
    val_dataset = Pool(data=xvalid, label=yvalid, cat_features=cat_features)
    model = CatBoostRegressor(**params)
    model.fit(train_dataset, eval_set=val_dataset, verbose=100)
    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    print(rmse(yvalid, preds_valid))
    return model, preds_test

Modelling without normalization

In [None]:
model_cbr, preds_test = catboost_fitting(
    Test_pp, xtrain, ytrain, xvalid, yvalid, params=params, cat_features=cat_features)

9.719490517234895

Modelling with normalization

In [189]:
model_num, preds_test_num = catboost_fitting(Test_num, xtrain_num, ytrain_num, xvalid_num, yvalid_num)

0:	learn: 14.6702524	test: 14.6848195	best: 14.6848195 (0)	total: 648ms	remaining: 32m 22s
100:	learn: 9.7078667	test: 10.7962513	best: 10.7962513 (100)	total: 13.2s	remaining: 6m 19s
200:	learn: 8.9611402	test: 10.3913464	best: 10.3913248 (199)	total: 24.2s	remaining: 5m 36s
300:	learn: 8.6252193	test: 10.2596436	best: 10.2567504 (291)	total: 35s	remaining: 5m 13s
400:	learn: 8.3738784	test: 10.1867837	best: 10.1796073 (388)	total: 45.7s	remaining: 4m 56s
500:	learn: 8.1902375	test: 10.1251018	best: 10.1251018 (500)	total: 56.3s	remaining: 4m 40s
600:	learn: 8.0777154	test: 10.0889129	best: 10.0889129 (600)	total: 1m 6s	remaining: 4m 24s
700:	learn: 7.9842908	test: 10.0835805	best: 10.0758286 (669)	total: 1m 16s	remaining: 4m 11s
800:	learn: 7.9268202	test: 10.0785745	best: 10.0758286 (669)	total: 1m 26s	remaining: 3m 57s
900:	learn: 7.8430187	test: 10.0820525	best: 10.0748259 (837)	total: 1m 36s	remaining: 3m 45s
1000:	learn: 7.7993830	test: 10.0819707	best: 10.0748259 (837)	total: 1

## Model interpretability

In [195]:
def interpreting_model(Test: pd.DataFrame, features, model) -> dict:
    ''' doing model interpretability with shap
    '''
    xtest: pd.DataFrame = Test[features[:-1]]
    features = xtest.columns

    f = {}
    for i in range(1, 16):
        f[f'f{i}'] = []

    expaliner = shap.TreeExplainer(model)
    shap_values = expaliner.shap_values(xtest)

    for shap_value in shap_values:
        arr = np.argsort(shap_value)[::-1][:15]
        for ind, a in enumerate(arr):
            name_f = f'f{ind+1}'
            f[name_f].append(features[a])
    return f


Interpretability for unnormalized dataset

In [None]:
f = interpreting_model(Test, model_cbr)

Interpretability for normalized dataset

In [196]:
f_num = interpreting_model(Test_num, features, model_num)

## Submission

In [206]:
def submission(preds_test, f) -> pd.DataFrame:
	''' create a submission file
	'''
	submission = SampleSubmission.copy()
	# predictions
	submission[TARGET] = preds_test
	for i in range(1,16):
		submission[f'feature_{i}'] = f[f'f{i}']
	# submission file
	today = dt.now().strftime(format="%Y-%m-%d_%Hh%M")
	submission.to_csv(f'submissions/submission-{today}.csv', index=False)
	return submission

In [None]:
submission(preds_test, f)

In [207]:
submission(preds_test_num, f_num)

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,60.177488,child_observe_total,teacher_emotional_total,child_observe_diligent,child_height,child_observe_attentive,child_observe_interested,pri_fees_amount,ses_proxy,language_match,child_gender,id_team,pri_fees_amount_pv,obs_materials_19,teacher_social_total,count_children_present
1,ID_GQ6ONJ4FP,49.671598,child_observe_total,teacher_emotional_total,child_observe_attentive,child_observe_concentrated,child_observe_interested,language_match,pri_fees_amount,child_gender,pri_fees_amount_pv,count_children_present,teacher_emotional_met,child_date,teacher_emotional_understand,id_facility,language_assessment
2,ID_YZ76CVRW3,47.818455,child_observe_total,id_team,id_enumerator,pri_fees_amount,id_ward,language_match,language_child,longitude,child_dob,teacher_social_total,pri_fees_amount_pv,teacher_selfcare_total,obs_handwashing_1,pri_year,pri_separate
3,ID_BNINCRXH8,69.991703,child_observe_total,id_enumerator,child_age,child_date,child_observe_diligent,child_observe_attentive,child_observe_concentrated,id_team,child_years_in_programme,id_facility,child_gender,teacher_emotional_total,pri_funding_subsidy,language_match,teacher_social_assistance
4,ID_1U7GDTLRI,40.438829,child_observe_total,pri_meal,id_team,id_ward,prov_best,latitude,language_match,child_observe_interested,pri_registered_dsd,longitude,teacher_emotional_understand,obs_materials_8,count_register_gender_female,teacher_social_initiative,count_register_year_2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3675,ID_LBPQ2VMQZ,55.406548,child_age,child_observe_total,ses_proxy,child_date,child_observe_interested,id_team,child_age_group,id_ward,child_years_in_programme,pri_funding_subsidy,teacher_social_total,pra_free_play_outdoor,language_match,pra_agency_choice,count_register_year_2020
3676,ID_H2RKKMMKK,56.936051,child_age,id_enumerator,child_date,child_height,id_team,pri_fees_amount,id_ward,child_gender,obs_handwashing_1,latitude,child_age_group,pri_fees_amount_pv,count_register_year_2020,count_children_present,teacher_selfcare_total
3677,ID_VY8KX7YTZ,41.828885,child_observe_total,child_height,child_observe_diligent,child_observe_attentive,language_match,child_observe_interested,child_observe_concentrated,child_age,child_gender,child_date,longitude,teacher_selfcare_total,teacher_social_cooperate,teacher_emotional_understand,count_register_gender_female
3678,ID_EO2MYZ4M7,38.293714,id_enumerator,obs_materials,pri_fees_amount,id_facility,pri_separate,child_gender,pri_capacity,child_zha,pri_fees_amount_pv,obs_handwashing_1,language_match,language_assessment,obs_handwashing,count_staff_salary_unpaid,pra_agency_choice


## Saving model

In [35]:
# Save the model as a pickle in a file
# model = model_cbr
# file_name = 'catboost_10.05605957'
# joblib.dump(model, f'models/{file_name}.pkl')

['models/catboost_10.05605957.pkl']