# DataDrive2030 Early Learning Predictors Baseline CHALLENGE AT ZIndi

https://zindi.africa/competitions/datadrive2030-early-learning-predictors-challenge

## Importing modules

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# models
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# splitting
from sklearn.model_selection import train_test_split
# metrics
from sklearn.metrics import mean_squared_error
# model Interpretability library
import shap
# utils
import random
from tqdm import tqdm
from datetime import datetime as dt
import joblib

In [None]:
# Set seed for reproducability
SEED = 12
random.seed(SEED)
np.random.seed(SEED)

## Data loading

In [2]:
DATA_PATH = 'data/'
Train = pd.read_csv(DATA_PATH + 'Train.csv', low_memory=False)
Test = pd.read_csv(DATA_PATH + 'Test.csv', low_memory=False)
SampleSubmission = pd.read_csv(DATA_PATH + 'SampleSubmission.csv', low_memory=False)
VariableDescription = pd.read_csv(DATA_PATH + 'VariableDescription.csv', low_memory=False)

## EDA

In [3]:
Train

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7,target
0,ID_SYSJ2FM0D,2022.0,2022-02-03,59.000000,,,,,,Sometimes,...,,,,,,,,,,51.500000
1,ID_J5BTFOZR3,2019.0,,60.163933,,,,1st year in the programme,103.000000,Sometimes,...,,,,,,,,,,55.869999
2,ID_R00SN7AUD,2022.0,2022-03-11,69.000000,,,,,108.400002,Often,...,,,,,,,,,,47.520000
3,ID_BSSK60PAZ,2021.0,2021-10-13,53.000000,2020-01-15,20.0,No,1st year in the programme,98.099998,Almost always,...,,,,,,,,,,58.599998
4,ID_IZTY6TC4D,2021.0,2021-10-13,57.000000,2021-10-13,0.0,,2nd year in programme,114.000000,Almost always,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,76.599998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8580,ID_S6RCB2ULK,2021.0,2021-11-10,55.000000,2021-01-12,9.0,Yes,1st year in the programme,102.300003,Often,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,23.469999
8581,ID_8A6LW98KG,2021.0,2021-10-12,55.000000,2019-01-15,32.0,,2nd year in programme,102.599998,Often,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,45.639999
8582,ID_L52DMG5D1,2021.0,2021-11-02,56.000000,2018-01-07,45.0,,3rd year in programme,103.800003,Almost always,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,53.290001
8583,ID_QZQAO2GKX,2021.0,2021-11-05,57.000000,2021-01-15,9.0,Yes,1st year in the programme,102.400002,Almost always,...,,,,,,,,,,56.330002


In [4]:
Train.isnull().sum()

(child_id                   0
 data_year                  0
 child_date              1821
 child_age                  0
 child_enrolment_date    5964
                         ... 
 obs_heating_4           6324
 obs_heating_5           6324
 obs_heating_6           6324
 obs_heating_7           6324
 target                     0
 Length: 679, dtype: int64,
 (8585, 679))

In [5]:
Test.isnull().sum(), Test.shape

(child_id                   0
 data_year                  0
 child_date               754
 child_age                  0
 child_enrolment_date    2568
                         ... 
 obs_heating_3           2712
 obs_heating_4           2712
 obs_heating_5           2712
 obs_heating_6           2712
 obs_heating_7           2712
 Length: 678, dtype: int64,
 (3680, 678))

In [6]:
SampleSubmission

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
1,ID_GQ6ONJ4FP,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
2,ID_YZ76CVRW3,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
3,ID_BNINCRXH8,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
4,ID_1U7GDTLRI,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3675,ID_LBPQ2VMQZ,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
3676,ID_H2RKKMMKK,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
3677,ID_VY8KX7YTZ,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
3678,ID_EO2MYZ4M7,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature


In [7]:
VariableDescription

Unnamed: 0,Variable Name,Variable Label,Answer Label
0,child_id,Unique child ID,Open ended
1,data_year,Year data was collected,Open ended
2,child_date,ELOM date,Open ended
3,child_age,Child age in months,Open ended
4,child_enrolment_date,Date enrolled in ELP,Open ended
...,...,...,...
676,child_attends,Does child attend an ECD programme?,
677,child_attendance,How many days per week does the child attend t...,
678,child_languages,Child home language(s),
679,pri_fees_amount_pv,Facility monthly fee (present value),


## Preprocessing

In [8]:
features = []
cat_features = []
not_features = []
for k in Train.columns[1:]:
    if Train[k].isnull().sum() < 6000:
        features.append(k)
        if Train[k].dtype == 'O':
            cat_features.append(k)
            print('There is ' +
                  str(len(Train[k].value_counts()))+' Class in: ' + k)
    else:
        not_features.append(k)

print('----------------------------------')
print('We have '+str(len(features)) + ' features')
print('We have '+str(len(cat_features)) + ' categorical features')
print('We have '+str(len(not_features)) +
      ' features that have more than 6000 of missing values')

There is 254 Class in: child_date
There is 535 Class in: child_enrolment_date
There is 4 Class in: child_grant
There is 4 Class in: child_years_in_programme
There is 4 Class in: child_observe_attentive
There is 4 Class in: child_observe_concentrated
There is 4 Class in: child_observe_diligent
There is 4 Class in: child_observe_interested
There is 2 Class in: child_gender
There is 1018 Class in: child_dob
There is 3 Class in: child_stunted
There is 4 Class in: child_age_group
There is 153 Class in: id_mn_best
There is 10 Class in: prov_best
There is 50 Class in: id_dc_best
There is 50 Class in: dc_best
There is 153 Class in: mn_best
There is 5 Class in: pra_free_play
There is 4 Class in: pra_free_play_outdoor
There is 31 Class in: pra_groupings
There is 2 Class in: pra_groupings_1
There is 2 Class in: pra_groupings_2
There is 2 Class in: pra_groupings_3
There is 2 Class in: pra_groupings_4
There is 2 Class in: pra_groupings_5
There is 3 Class in: pra_engaged
There is 4 Class in: pra_age

In [9]:
Train = Train[features]
Test  = Test[features[:-1]]

In [10]:
max(Train['target']), min(Train['target'])

(96.80999755859376, 6.369999885559082)

In [11]:
Train = Train.fillna('')
Test  = Test.fillna('')

## Modeling

In [12]:
xtrain, xvalid, ytrain, yvalid = train_test_split(
    Train[Train.columns[:-1]],
    Train[Train.columns[-1]],
    test_size = 0.15,
    random_state = 42,
    shuffle = True
)

In [13]:
xtrain.shape, xvalid.shape, ytrain.shape, yvalid.shape

((7297, 276), (1288, 276), (7297,), (1288,))

### CatBoostRegressor

In [14]:
xtest = Test[features[:-1]]
train_dataset = Pool(data=xtrain, label=ytrain, cat_features=cat_features)
val_dataset = Pool(data=xvalid, label=yvalid, cat_features=cat_features)
model = CatBoostRegressor(
    iterations=30000, learning_rate=0.1, random_seed=123, verbose=300)
model.fit(train_dataset, eval_set=val_dataset,
          use_best_model=True, early_stopping_rounds=300)
preds_valid = model.predict(xvalid)
preds_test = model.predict(xtest)
print(np.sqrt(mean_squared_error(yvalid, preds_valid)))

0:	learn: 14.6142698	test: 15.0964155	best: 15.0964155 (0)	total: 582ms	remaining: 4h 50m 57s
300:	learn: 8.9822053	test: 10.2164290	best: 10.2159916 (296)	total: 2m 15s	remaining: 3h 42m 52s
600:	learn: 8.1682010	test: 9.9856494	best: 9.9855140 (599)	total: 4m 29s	remaining: 3h 39m 36s
900:	learn: 7.5958007	test: 9.9134383	best: 9.9132350 (899)	total: 6m 48s	remaining: 3h 39m 54s
1200:	learn: 7.1426209	test: 9.8500511	best: 9.8500511 (1200)	total: 9m 2s	remaining: 3h 36m 40s
1500:	learn: 6.7524077	test: 9.8216418	best: 9.8187530 (1458)	total: 10m 23s	remaining: 3h 17m 24s
1800:	learn: 6.3895611	test: 9.7910355	best: 9.7905310 (1795)	total: 12m 24s	remaining: 3h 14m 18s
2100:	learn: 6.0534060	test: 9.7750525	best: 9.7727602 (2091)	total: 14m 25s	remaining: 3h 11m 38s
2400:	learn: 5.7706819	test: 9.7689792	best: 9.7613182 (2165)	total: 16m 13s	remaining: 3h 6m 27s
2700:	learn: 5.5013285	test: 9.7449188	best: 9.7449188 (2700)	total: 17m 48s	remaining: 2h 59m 57s
3000:	learn: 5.2151345	te

In [None]:
xtest = Test[features[:-1]]
train_dataset = Pool(data=xtrain, label=ytrain, cat_features=cat_features)
val_dataset = Pool(data=xvalid, label=yvalid, cat_features=cat_features)
model_lgbm = LGBMRegressor(
    iterations=30000, learning_rate=0.1, random_seed=123, verbose=300)
model_lgbm.fit(train_dataset)
preds_valid = model_lgbm.predict(xvalid)
preds_test = model_lgbm.predict(xtest)
print(np.sqrt(mean_squared_error(yvalid, preds_valid)))

In [17]:
features = xtest.columns

In [18]:
f = {
    'f1': [], 'f2': [], 'f3': [], 'f4': [], 'f5': [],
    'f6': [], 'f7': [], 'f8': [], 'f9': [], 'f10': [],
    'f11': [], 'f12': [], 'f13': [], 'f14': [], 'f15': []
}

In [24]:
for shap_value in shap_values:
    arr = np.argsort(shap_value)[::-1][:15]
    for ind, a in enumerate(arr):
        name_f = f'f{ind+1}'
        f[name_f].append(features[a])

In [29]:
SampleSubmission['target'] = preds_test
for i in range(1,16):
    SampleSubmission[f'feature_{i}'] = f[f'f{i}']

# SampleSubmission['feature_1'] = f['f1']
# SampleSubmission['feature_2'] = f['f2']
# SampleSubmission['feature_3'] = f['f3']
# SampleSubmission['feature_4'] = f['f4']
# SampleSubmission['feature_5'] = f['f5']
# SampleSubmission['feature_6'] = f['f6']
# SampleSubmission['feature_7'] = f['f7']
# SampleSubmission['feature_8'] = f['f8']
# SampleSubmission['feature_9'] = f['f9']
# SampleSubmission['feature_10'] = f['f10']
# SampleSubmission['feature_11'] = f['f11']
# SampleSubmission['feature_12'] = f['f12']
# SampleSubmission['feature_13'] = f['f13']
# SampleSubmission['feature_14'] = f['f14']
# SampleSubmission['feature_15'] = f['f15']

ValueError: Length of values (7360) does not match length of index (3680)

In [21]:
SampleSubmission

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,58.764816,child_observe_diligent,teacher_emotional_total,child_observe_interested,child_observe_attentive,ses_cat,teacher_emotional_understand,child_gender,id_mn_best,id_enumerator,child_years_in_programme,ses_proxy,child_observe_total,teacher_social_total,language_child,census
1,ID_GQ6ONJ4FP,43.200392,teacher_emotional_total,child_observe_diligent,child_observe_interested,child_observe_attentive,child_observe_concentrated,teacher_emotional_understand,teacher_emotional_met,id_facility,child_observe_total,teacher_emotional_selfstarter,obs_water_running,obs_toilet_6,pra_agency_questions,mn_best,pri_calc_time_close
2,ID_YZ76CVRW3,48.647779,child_observe_diligent,child_observe_attentive,child_observe_interested,prov_best,id_team,teacher_emotional_independent,obs_toilet,ses_cat,child_date,id_ward_n,child_observe_concentrated,pra_plans_1,id_enumerator,certificate_registration_partial,teacher_emotional_confidence
3,ID_BNINCRXH8,70.838060,child_observe_diligent,child_observe_concentrated,id_enumerator,prov_best,child_age,child_observe_attentive,id_facility,child_years_in_programme,child_stunted,child_observe_interested,child_date,id_prov,teacher_emotional_selfstarter,teacher_emotional_understand,teacher_emotional_total
4,ID_1U7GDTLRI,45.069746,child_observe_diligent,child_observe_interested,pri_attendance,child_observe_attentive,id_ward,id_prov,pri_registered_programme,id_mn_best,id_facility,teacher_social_assistance,longitude,census,pri_language_1,obs_water_running,pri_calc_time_open
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3675,ID_LBPQ2VMQZ,49.087739,child_age,child_age_group,child_observe_interested,child_observe_attentive,child_years_in_programme,child_date,ses_proxy,child_zha,child_observe_concentrated,id_mn_best,teacher_emotional_independent,prov_best,teacher_emotional_understand,obs_materials_19,id_dc_best
3676,ID_H2RKKMMKK,51.138668,child_age,child_age_group,id_enumerator,ward_best,child_height,ses_cat,id_team,child_gender,obs_materials_19,id_mn_best,id_dc_best,obs_equipment_1,teacher_emotional_confidence,child_stunted,count_children_present
3677,ID_VY8KX7YTZ,42.787611,child_observe_diligent,child_observe_interested,child_observe_attentive,child_observe_concentrated,child_age,child_observe_total,teacher_emotional_selfstarter,pri_meals,child_date,child_gender,pri_meal_2,obs_materials,pri_attendance,obs_gate,obs_firstaid
3678,ID_EO2MYZ4M7,39.281417,longitude,id_facility,teacher_emotional_adjust,pri_calc_time_close,pra_agency_choice,teacher_social_assistance,latitude,pri_meal_2,id_mn_best,count_register_gender_female,pri_funding_donations,id_dc_best,grade_r,count_register_year_2020,id_enumerator


In [22]:
SampleSubmission.to_csv('submission.csv', index=False)

In [None]:
# Save the model as a pickle in a file
file_name = 'catboost_dummy_0.108665476'
joblib.dump(model_dummy, f'models/{file_name}.pkl')