# DataDrive2030 Early Learning Predictors Baseline CHALLENGE AT ZIndi

https://zindi.africa/competitions/datadrive2030-early-learning-predictors-challenge

## Importing modules

In [98]:
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
# encoding
from sklearn.preprocessing import OrdinalEncoder
# scaling
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler
# splitting
from sklearn.model_selection import train_test_split
# models
from catboost import CatBoostRegressor, Pool
# metrics
from sklearn.metrics import mean_squared_error
# model interpretability library
import shap
# utils
import torch
import random
from tqdm import tqdm
from datetime import datetime as dt
import joblib

In [80]:
# Set seed for reproducability
SEED = 12
random.seed(SEED)
np.random.seed(SEED)

## Data loading

In [3]:
DATA_PATH = 'data/'
Train = pd.read_csv(DATA_PATH + 'Train.csv', low_memory=False)
Test = pd.read_csv(DATA_PATH + 'Test.csv', low_memory=False)
SampleSubmission = pd.read_csv(DATA_PATH + 'SampleSubmission.csv', low_memory=False)
VariableDescription = pd.read_csv(DATA_PATH + 'VariableDescription.csv', low_memory=False)

## EDA

In [99]:
Train.sample(3)

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7,target
3712,ID_BF6S3NLQU,2022.0,2022-03-09,67.0,,,,,104.0,Sometimes,...,,,,,,,,,,52.349998
7304,ID_K8AL6R9AL,2021.0,2021-11-10,53.0,2021-02-15,8.0,Yes,1st year in the programme,101.0,Often,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,41.59
4946,ID_XHQR2W3RF,2022.0,2022-03-14,67.0,,,No,,110.0,Often,...,,,,,,,,,,64.5


Datasets shape

In [100]:
print('Train set shape:', Train.shape)
print('Test set shape:', Test.shape)

Train set shape: (8585, 679)
Test set shape: (3680, 678)


Missing values analysis

In [101]:
missing_values_train = Train.isnull().sum() / Train.shape[0] * 100
missing_values_train.sort_values()

child_id                   0.000000
pre_covid                  0.000000
sef_ind                    0.000000
language_assessment        0.000000
hle_ind                    0.000000
                            ...    
pra_class_space_small     99.895166
pri_founderother          99.906814
pri_clinic_travelother    99.941759
pri_locationother         99.941759
other_practitioner        99.953407
Length: 679, dtype: float64

In [102]:
missing_values_test = Test.isnull().sum() / Test.shape[0] * 100
missing_values_test.sort_values()

child_id                    0.000000
pre_covid                   0.000000
child_age_group             0.000000
sef_ind                     0.000000
hle_ind                     0.000000
                             ...    
pri_clinic_travelother     99.918478
positionotherreason        99.918478
positionother              99.918478
pri_locationother          99.972826
other_practitioner        100.000000
Length: 678, dtype: float64

Sample submission visualization

In [103]:
SampleSubmission.head()

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
1,ID_GQ6ONJ4FP,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
2,ID_YZ76CVRW3,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
3,ID_BNINCRXH8,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
4,ID_1U7GDTLRI,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature


Variables description

In [104]:
VariableDescription

Unnamed: 0,Variable Name,Variable Label,Answer Label
0,child_id,Unique child ID,Open ended
1,data_year,Year data was collected,Open ended
2,child_date,ELOM date,Open ended
3,child_age,Child age in months,Open ended
4,child_enrolment_date,Date enrolled in ELP,Open ended
...,...,...,...
676,child_attends,Does child attend an ECD programme?,
677,child_attendance,How many days per week does the child attend t...,
678,child_languages,Child home language(s),
679,pri_fees_amount_pv,Facility monthly fee (present value),


Unique ID and target

In [83]:
ID = 'child_id'
TARGET = 'target'

## Preprocessing

Useful features research 

In [84]:
features = []
cat_features = []
not_features = []
for column in Train.columns[1:]:
    if Train[column].isnull().sum() < 6000:
        features.append(column)
        if Train[column].dtype == 'O':
            cat_features.append(column)
            print(
                f'There are {len(Train[column].value_counts())} classes in: {column}')
    else:
        not_features.append(column)

print('----------------------------------')
print(f'We have {len(features)} features')
print(f'We have {len(cat_features)} categorical features')
print(
    f'We have {len(not_features)} features that have more than 6000 of missing values')


There are 254 classes in: child_date
There are 535 classes in: child_enrolment_date
There are 4 classes in: child_grant
There are 4 classes in: child_years_in_programme
There are 4 classes in: child_observe_attentive
There are 4 classes in: child_observe_concentrated
There are 4 classes in: child_observe_diligent
There are 4 classes in: child_observe_interested
There are 2 classes in: child_gender
There are 1018 classes in: child_dob
There are 3 classes in: child_stunted
There are 4 classes in: child_age_group
There are 153 classes in: id_mn_best
There are 10 classes in: prov_best
There are 50 classes in: id_dc_best
There are 50 classes in: dc_best
There are 153 classes in: mn_best
There are 5 classes in: pra_free_play
There are 4 classes in: pra_free_play_outdoor
There are 31 classes in: pra_groupings
There are 2 classes in: pra_groupings_1
There are 2 classes in: pra_groupings_2
There are 2 classes in: pra_groupings_3
There are 2 classes in: pra_groupings_4
There are 2 classes in: pr

In [85]:
Train_pp = Train[features]
Test_pp  = Test[features[:-1]]

Train_pp.fillna('', inplace=True)
Test_pp.fillna('', inplace=True)

max(Train_pp[TARGET]), min(Train_pp[TARGET])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(96.80999755859376, 6.369999885559082)

In [105]:
Train_pp.isnull().sum().sort_values()

data_year                   0
obs_toilet                  0
obs_toilet_1                0
obs_toilet_2                0
obs_toilet_3                0
                           ..
pri_registered_npo          0
pri_registered_programme    0
pri_registered_partial      0
pri_capacity                0
target                      0
Length: 277, dtype: int64

## Modeling

### Normalization

In [88]:
Train_norm = Train[features]
Test_norm  = Test[features[:-1]]

In [None]:
# Train_norm.fillna('', inplace=True)
# Test_norm.fillna('', inplace=True)

In [90]:
encoder = OrdinalEncoder()
MMS = MinMaxScaler()
object_columns_train = list(Train_norm.select_dtypes(include='object').columns)
object_columns_test = list(Test_norm.select_dtypes(include='object').columns)

In [91]:
# Train_norm[cat_features] = encoder.fit_transform(Train_norm[cat_features])
Train_norm[object_columns_train] = encoder.fit_transform(Train_norm[object_columns_train])
Test_norm[object_columns_test] = encoder.fit_transform(Test_norm[object_columns_test])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [94]:
Test_norm.sample(3)

Unnamed: 0,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,child_observe_concentrated,...,gps_ind,pre_covid,ses_proxy,quintile_used,id_facility_n,id_ward_n,id_mn_n,id_dc_n,id_prov_n,ses_cat
71,0.666667,0.116883,0.263694,,,1.0,0.0,,1.0,1.0,...,1.0,0.0,0.25,0.0,0.190476,0.367521,0.110035,0.200555,0.384568,0.25
2896,0.666667,0.125541,0.21355,,,,0.666667,,0.333333,1.0,...,1.0,0.0,0.25,0.0,0.079365,0.17094,0.110035,0.200555,0.384568,0.25
2643,0.666667,0.402597,0.464271,0.065903,0.789474,1.0,0.666667,0.391068,0.0,0.0,...,1.0,0.0,0.5,0.0,0.031746,0.25641,0.080277,0.431645,1.0,0.75


In [93]:
scaler = MMS
Train_norm[Train_norm.columns] = scaler.fit_transform(Train_norm)
Test_norm[Test_norm.columns] = scaler.fit_transform(Test_norm)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Splitting

In [95]:
xtrain_norm, xvalid_norm, ytrain_norm, yvalid_norm = train_test_split(
    Train_norm[Train_norm.columns[:-1]],
    Train_norm[Train_norm.columns[-1]],
    test_size=0.15,
    random_state=42,
    shuffle=True
)

In [87]:
xtrain, xvalid, ytrain, yvalid = train_test_split(
    Train_pp[Train_pp.columns[:-1]],
    Train_pp[Train_pp.columns[-1]],
    test_size=0.15,
    random_state=42,
    shuffle=True
)

xtrain.shape, xvalid.shape, ytrain.shape, yvalid.shape

((7297, 276), (1288, 276), (7297,), (1288,))

### Metric

In [89]:
def rmse(y_test, y_pred) -> float:
    ''' calculates the root mean squared error of a model 
    '''
    return mean_squared_error(y_test, y_pred, squared=False)

### CatBoost

In [88]:
xtest = Test[features[:-1]]
train_dataset = Pool(data=xtrain, label=ytrain, cat_features=cat_features)
val_dataset = Pool(data=xvalid, label=yvalid, cat_features=cat_features)
model = CatBoostRegressor(
    iterations=30000, learning_rate=0.1, random_seed=123, verbose=300)
model.fit(train_dataset, eval_set=val_dataset,
          use_best_model=True, early_stopping_rounds=300)
preds_valid = model.predict(xvalid)
preds_test = model.predict(xtest)
print(np.sqrt(mean_squared_error(yvalid, preds_valid)))

0:	learn: 14.6142698	test: 15.0964155	best: 15.0964155 (0)	total: 1.06s	remaining: 8h 48m 53s
300:	learn: 8.9822053	test: 10.2164290	best: 10.2159916 (296)	total: 1m 28s	remaining: 2h 26m 16s
600:	learn: 8.1682010	test: 9.9856494	best: 9.9855140 (599)	total: 3m 5s	remaining: 2h 31m 36s
900:	learn: 7.5958007	test: 9.9134383	best: 9.9132350 (899)	total: 4m 37s	remaining: 2h 29m 30s
1200:	learn: 7.1426209	test: 9.8500511	best: 9.8500511 (1200)	total: 6m 7s	remaining: 2h 26m 56s
1500:	learn: 6.7524077	test: 9.8216418	best: 9.8187530 (1458)	total: 7m 34s	remaining: 2h 23m 40s
1800:	learn: 6.3895611	test: 9.7910355	best: 9.7905310 (1795)	total: 9m	remaining: 2h 21m 5s
2100:	learn: 6.0534060	test: 9.7750525	best: 9.7727602 (2091)	total: 10m 26s	remaining: 2h 18m 44s
2400:	learn: 5.7706819	test: 9.7689792	best: 9.7613182 (2165)	total: 11m 52s	remaining: 2h 16m 29s
2700:	learn: 5.5013285	test: 9.7449188	best: 9.7449188 (2700)	total: 13m 16s	remaining: 2h 14m 5s
3000:	learn: 5.2151345	test: 9.73

Traceback (most recent call last):
  File "_catboost.pyx", line 2349, in _catboost.get_cat_factor_bytes_representation
  File "_catboost.pyx", line 1869, in _catboost.get_id_object_bytes_string_representation
_catboost.CatBoostError: bad object for id: nan

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Juste KOUASSI\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3460, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Juste KOUASSI\AppData\Local\Temp\ipykernel_24696\3770761726.py", line 9, in <module>
    preds_test = model.predict(xtest)
  File "c:\Users\Juste KOUASSI\AppData\Local\Programs\Python\Python310\lib\site-packages\catboost\core.py", line 5775, in predict
    return self._predict(data, prediction_type, ntree_start, ntree_end, thread_count, verbose, 'predict', task_type)
  File "c:\Users\Juste KOUASSI\AppData\Local\Programs\Python\

In [106]:
params = {
    'iterations': 3000,
    'learning_rate': 0.1,
    'objective': 'RMSE',
    'random_seed': SEED,
    'early_stopping_rounds': 300,
    'use_best_model': True,
    'task_type': 'GPU' if torch.cuda.is_available() else 'CPU'
}

### CatBoostRegressor sans normalisation

In [91]:
xtest = Test_pp[features[:-1]]
train_dataset = Pool(data=xtrain, label=ytrain, cat_features=cat_features)
val_dataset = Pool(data=xvalid, label=yvalid, cat_features=cat_features)
model_cbr = CatBoostRegressor(**params)
model_cbr.fit(train_dataset, eval_set=val_dataset, verbose=100)
preds_valid = model_cbr.predict(xvalid)
preds_test = model_cbr.predict(xtest)
print(rmse(yvalid, preds_valid))

0:	learn: 14.5821071	test: 15.0719725	best: 15.0719725 (0)	total: 1.82s	remaining: 15h 11m 1s
300:	learn: 9.2555310	test: 10.3404002	best: 10.3404002 (300)	total: 5m 35s	remaining: 9h 11m 2s
600:	learn: 8.6849024	test: 10.1156885	best: 10.1153198 (599)	total: 11m 3s	remaining: 9h 51s
900:	learn: 8.3503653	test: 10.0347344	best: 10.0338171 (897)	total: 16m 39s	remaining: 8h 57m 47s


KeyboardInterrupt: 

9.719490517234895

### CatBoostRegressor avec normalisation

In [92]:
xtest_norm = Test_norm[features[:-1]]
train_dataset_norm = Pool(data=xtrain_norm, label=ytrain_norm)
val_dataset_norm = Pool(data=xvalid_norm, label=yvalid_norm)
model_norm = CatBoostRegressor(**params)
model_norm.fit(train_dataset_norm, eval_set=val_dataset_norm, verbose=300)
preds_valid_norm = model_norm.predict(xvalid_norm)
preds_test_norm = model_norm.predict(xtest_norm)
print(rmse(yvalid_norm, preds_valid_norm))

0:	learn: 0.1507104	test: 0.1558392	best: 0.1558392 (0)	total: 301ms	remaining: 2h 30m 21s
300:	learn: 0.0897837	test: 0.1043035	best: 0.1043035 (300)	total: 38.5s	remaining: 1h 3m 13s
600:	learn: 0.0838117	test: 0.1028227	best: 0.1028137 (588)	total: 1m 14s	remaining: 1h 54s
900:	learn: 0.0807352	test: 0.1019851	best: 0.1019850 (898)	total: 1m 46s	remaining: 57m 14s
1200:	learn: 0.0788090	test: 0.1020099	best: 0.1018555 (973)	total: 2m 15s	remaining: 54m 6s
1500:	learn: 0.0768224	test: 0.1019524	best: 0.1018555 (973)	total: 2m 43s	remaining: 51m 53s
1800:	learn: 0.0752947	test: 0.1018778	best: 0.1018555 (973)	total: 3m 15s	remaining: 51m
2100:	learn: 0.0741582	test: 0.1017236	best: 0.1017209 (2094)	total: 3m 49s	remaining: 50m 49s
2400:	learn: 0.0729669	test: 0.1017071	best: 0.1016957 (2265)	total: 4m 21s	remaining: 50m 9s
2700:	learn: 0.0721383	test: 0.1017646	best: 0.1016809 (2420)	total: 4m 53s	remaining: 49m 29s
3000:	learn: 0.0713280	test: 0.1017212	best: 0.1016809 (2420)	total: 

KeyboardInterrupt: 

## Model interpretability

### Interpretability for unnormalized dataset

In [94]:
new_features = xtest.columns

f = {}
for i in range(1, 16):
    f[f'f{i}'] = []

In [None]:
expaliner = shap.TreeExplainer(model_cbr)
shap_values = expaliner.shap_values(xtest)

In [31]:
for shap_value in shap_values:
    arr = np.argsort(shap_value)[::-1][:15]
    for ind, a in enumerate(arr):
        name_f = f'f{ind+1}'
        f[name_f].append(new_features[a])

### Interpretability for normalized dataset

In [95]:
features_norm = xtest_norm.columns

f_norm = {}
for i in range(1, 16):
    f_norm[f'f{i}'] = []

In [None]:
expaliner_norm = shap.TreeExplainer(model_norm)
shap_values_norm = expaliner_norm.shap_values(xtest_norm)

In [102]:
for shap_value in shap_values_norm:
    arr = np.argsort(shap_value)[::-1][:15]
    for ind, a in enumerate(arr):
        name_f = f'f{ind+1}'
        f_norm[name_f].append(features_norm[a])

## Submission

In [32]:
SampleSubmission[TARGET] = preds_test
for i in range(1,16):
    SampleSubmission[f'feature_{i}'] = f[f'f{i}']
SampleSubmission

In [104]:
SS_norm = SampleSubmission.copy()
SS_norm[TARGET] = preds_test_norm
for i in range(1,16):
    SS_norm[f'feature_{i}'] = f_norm[f'f{i}']
SS_norm

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,57.518256,child_observe_total,teacher_emotional_total,child_observe_diligent,child_observe_attentive,child_observe_interested,child_height,pri_fees_amount,count_register_gender_female,id_team,child_age,ses_cat,count_children_present,child_years_in_programme,obs_materials_19,child_gender
1,ID_GQ6ONJ4FP,42.645501,child_observe_total,teacher_emotional_total,child_observe_attentive,count_register_year_2015,child_observe_interested,child_observe_concentrated,pri_year,language_child,pri_fees_amount,child_gender,teacher_emotional_understand,count_register_year_2020,id_ward_n,dc_best,id_facility_n
2,ID_YZ76CVRW3,47.425821,child_observe_total,id_enumerator,longitude,latitude,id_team,obs_toilet,language_child,pri_fees_amount,count_toilets_children,child_age,language_assessment,teacher_selfcare_total,language_match,pra_plans_0,ses_cat
3,ID_BNINCRXH8,67.754492,child_observe_total,id_enumerator,child_date,child_observe_diligent,child_observe_attentive,teacher_emotional_total,child_age,id_team,child_height,child_observe_concentrated,id_prov_n,id_dc_best,count_staff_all,child_years_in_programme,mn_best
4,ID_1U7GDTLRI,44.176424,pri_meal,child_observe_total,id_mn_best,id_team,child_observe_interested,id_ward,longitude,dc_best,teacher_emotional_understand,obs_handwashing,count_register_gender_female,id_prov,child_dob,count_children_precovid,pri_time_close_hours
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3675,ID_LBPQ2VMQZ,46.279520,child_age,child_observe_total,child_age_group,ses_proxy,child_observe_interested,id_team,id_ward,pri_language_97,child_years_in_programme,teacher_emotional_understand,teacher_selfcare_met,count_register_year_2020,teacher_duration,pri_funding_subsidy,child_stunted
3676,ID_H2RKKMMKK,52.676299,child_age,child_height,child_date,child_age_group,pri_fees_amount,obs_materials_19,obs_handwashing,id_team,ward_best,id_ward,count_children_present,child_dob,language_match,id_enumerator,id_dc_n
3677,ID_VY8KX7YTZ,42.188966,child_observe_total,child_observe_attentive,child_observe_diligent,child_observe_interested,child_date,child_height,child_age,child_observe_concentrated,mn_best,count_register_gender_female,language_match,teacher_emotional_understand,teacher_social_total,teacher_selfcare_total,child_gender
3678,ID_EO2MYZ4M7,33.140829,id_enumerator,language_match,latitude,obs_handwashing,id_dc_n,count_register_gender_female,count_staff_salary_unpaid,prov_best,pra_agency_choice,id_facility,pri_capacity,count_children_present,teacher_emotional_understand,obs_area,child_gender


In [105]:
# submission file
today = dt.now().strftime(format="%Y-%m-%d_%Hh%M")
SS_norm.to_csv(f'submissions/soumission-{today}.csv', index=False)

## Saving model

In [35]:
# Save the model as a pickle in a file
model = model_cbr
file_name = 'catboost_10.05605957'
joblib.dump(model, f'models/{file_name}.pkl')

['models/catboost_10.05605957.pkl']