In [146]:
# Install eli5, you can use any other model Interpretability libraries
!pip install eli5 -q

In [147]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [148]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import eli5
import warnings
warnings.filterwarnings('ignore')

In [149]:
# Load files
train = pd.read_csv('/content/drive/MyDrive/Elom/Train.csv')
test = pd.read_csv('/content/drive/MyDrive/Elom/Test (1).csv')
ss = pd.read_csv('/content/drive/MyDrive/Elom/SampleSubmission.csv')

In [150]:
# Preview train
train.head()

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7,target
0,ID_SYSJ2FM0D,2022.0,2022-02-03,59.0,,,,,,Sometimes,...,,,,,,,,,,51.5
1,ID_J5BTFOZR3,2019.0,,60.163933,,,,1st year in the programme,103.0,Sometimes,...,,,,,,,,,,55.869999
2,ID_R00SN7AUD,2022.0,2022-03-11,69.0,,,,,108.400002,Often,...,,,,,,,,,,47.52
3,ID_BSSK60PAZ,2021.0,2021-10-13,53.0,2020-01-15,20.0,No,1st year in the programme,98.099998,Almost always,...,,,,,,,,,,58.599998
4,ID_IZTY6TC4D,2021.0,2021-10-13,57.0,2021-10-13,0.0,,2nd year in programme,114.0,Almost always,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,76.599998


In [151]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8585 entries, 0 to 8584
Columns: 679 entries, child_id to target
dtypes: float64(160), object(519)
memory usage: 44.5+ MB


In [152]:
len(train.columns)

679

In [153]:
# Preview test
test.head()

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_4,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7
0,ID_0I0999N6S,2021.0,2021-09-20,57.0,,,Yes,2nd year in programme,108.0,Almost always,...,,,,,,,,,,
1,ID_GQ6ONJ4FP,2021.0,2021-10-21,54.0,2021-01-10,9.0,Yes,1st year in the programme,105.0,Almost always,...,,,,,,,,,,
2,ID_YZ76CVRW3,2021.0,2021-05-17,57.0,,,Yes,,101.5,Often,...,,,,,,,,,,
3,ID_BNINCRXH8,2022.0,2022-09-09,59.334702,,,,3rd year in programme,,Almost always,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,ID_1U7GDTLRI,2021.0,2021-10-12,54.0,2021-01-15,8.0,Yes,1st year in the programme,103.5,Often,...,,,,,,,,,,


In [154]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3680 entries, 0 to 3679
Columns: 678 entries, child_id to obs_heating_7
dtypes: float64(161), object(517)
memory usage: 19.0+ MB


In [155]:
len(test.columns)

678

In [156]:
# Preview submission file
ss.head()

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
1,ID_GQ6ONJ4FP,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
2,ID_YZ76CVRW3,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
3,ID_BNINCRXH8,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
4,ID_1U7GDTLRI,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature


In [157]:
varibles = pd.read_csv('/content/drive/MyDrive/Elom/VariableDescription.csv')
varibles

Unnamed: 0,Variable Name,Variable Label,Answer Label
0,child_id,Unique child ID,Open ended
1,data_year,Year data was collected,Open ended
2,child_date,ELOM date,Open ended
3,child_age,Child age in months,Open ended
4,child_enrolment_date,Date enrolled in ELP,Open ended
...,...,...,...
676,child_attends,Does child attend an ECD programme?,
677,child_attendance,How many days per week does the child attend t...,
678,child_languages,Child home language(s),
679,pri_fees_amount_pv,Facility monthly fee (present value),


In [158]:
# Prepare your sub file to look like the provided sample submission
ss.head()

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
1,ID_GQ6ONJ4FP,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
2,ID_YZ76CVRW3,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
3,ID_BNINCRXH8,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
4,ID_1U7GDTLRI,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature


In [159]:
# # One-hot encode categorical columns
    # data_cat = pd.get_dummies(data[cat_cols], drop_first=False)
    # data_num = data[num_cols]

    # # Combine numerical and categorical data
    # data_processed = pd.concat([data_cat, data_num], axis=1)

In [160]:
def cleaning(data):
    # Identify categorical columns with more than one unique value
    cat_cols = []
    for col in data.columns:
        if (data[col].dtype == 'object' or data[col].dtype.name == 'category') and data[col].nunique() > 1:
            cat_cols.append(col)
    
    # Identify date columns among categorical columns and drop them
    date_cols = []
    for col in cat_cols:
        try:
            pd.to_datetime(data[col], format='%Y-%m-%d')
            date_cols.append(col)
        except ValueError:
            pass
    data.drop(columns=date_cols, inplace=True)

    # Fill missing values for numerical columns with median
    num_cols = data.select_dtypes(include=['float32', 'float64']).columns.tolist()
    data[num_cols] = data[num_cols].fillna(data[num_cols].median())

    # Fill missing values for categorical columns with mode
    cat_cols = data.select_dtypes(include='object').columns.tolist()
    data[cat_cols] = data[cat_cols].fillna(data[cat_cols].mode().iloc[0])

    # Drop duplicates based on 'child_id'
    data.drop_duplicates(inplace=True)

    # Clean column names
    col_names = data.columns.tolist()
    col_names = [col.replace('[', '').replace(']', '').replace('<', '') for col in col_names]
    data.columns = col_names

    return data

In [161]:
train = cleaning(train)

In [162]:
test= cleaning(test)

In [163]:
train_columns = set(train.columns)
test_columns = set(test.columns)

different_columns = list(train_columns.symmetric_difference(test_columns))

if len(different_columns) == 1:
    print(f"The different column between train and test data is {different_columns[0]}.")
else:
    print("There is more than one different column between train and test data.")

The different column between train and test data is target.


In [164]:
# drop all different columns except for the target column in train data
target_column = 'target'  # replace 'target' with the actual name of your target column
train.drop(columns=[col for col in different_columns if col != target_column and col in train.columns], inplace=True)

# drop all different columns from test data
test.drop(columns=[col for col in different_columns if col in test.columns], inplace=True)

# drop 'other_practitioner' column from train and test data
train.drop(columns=['other_practitioner'], inplace=True)
test.drop(columns=['other_practitioner'], inplace=True)

In [165]:
# count missing values in each column
null_counts = test.isnull().sum()

# print columns with missing values
print(null_counts[null_counts > 0])

Series([], dtype: int64)


In [166]:
def get_dummy(data):
  # Fill missing values for numerical columns with median
    num_cols = data.select_dtypes(include=['float32', 'float64']).columns.tolist()
    data[num_cols] = data[num_cols].fillna(data[num_cols].median())

    # Fill missing values for categorical columns with mode
    cat_cols = data.select_dtypes(include='object').columns.tolist()
    data[cat_cols] = data[cat_cols].fillna(data[cat_cols].mode().iloc[0])
    # One-hot encode categorical columns
    data_cat = pd.get_dummies(data[cat_cols], drop_first=False)
    data_num = data[num_cols]

    # Combine numerical and categorical data
    data_processed = pd.concat([data_cat, data_num], axis=1)
    return data_processed

In [167]:
train.columns[train.isna().any()].tolist()

[]

In [168]:
test.columns[test.isna().any()].tolist()

[]

In [169]:
train = train.drop('child_id',axis=1)
train = get_dummy(train)

In [170]:
# # Fill missing values for numerical columns with median
# num_cols = t.select_dtypes(include=['float32', 'float64']).columns.tolist()
# data[num_cols] = data[num_cols].fillna(data[num_cols].median())
# # Fill missing values for categorical columns with mode
# cat_cols = train.select_dtypes(include='object').columns.tolist()
# train[cat_cols] = train[cat_cols].fillna(data[cat_cols].mode().iloc[0])
# # One-hot encode categorical columns
# train_cat = pd.get_dummies(train[cat_cols], drop_first=False)
# train_num = train[num_cols]

# # Combine numerical and categorical data
# train_processed = pd.concat([train_cat, data_num], axis=1)

In [171]:
x = train.drop('target',axis=1)
y = train.target

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

In [172]:
from sklearn.preprocessing import StandardScaler
# Instantiating standard scaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [173]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [174]:
import catboost as cb
from sklearn.metrics import r2_score

In [175]:
# iterations=1000,
#                           depth=4,random_seed =42, 
#                           learning_rate=0.01, 
#                           loss_function='RMSE', verbose=False
# loss_function='RMSE', verbose=False

In [176]:
# Create the model
catb = cb.CatBoostRegressor(iterations=900, depth=8,random_seed =42,
                            learning_rate=0.05,loss_function='RMSE', verbose=False)
catb.fit(X_train,y_train)

# Make predictions
y_pred_train = catb.predict(X_train)
y_pred_test = catb.predict(X_test)

In [177]:
test_rmse = mean_squared_error(y_test, y_pred_test, squared=False)
test_rmse

9.81384902356263

In [178]:
train_rmse = mean_squared_error(y_train, y_pred_train, squared=False)
train_rmse

7.253471753296894

In [179]:
# # define hyperparameters to search over
# depth_range = [2, 4, 6, 8, 10]
# learning_rate_range = [0.01, 0.02, 0.03, 0.04, 0.05]
# iterations_range = [500, 700, 900, 1100, 1300]

# # initialize lists to store results
# train_scores = []
# test_scores = []

# # iterate over hyperparameters and iterations and evaluate model performance
# for depth in depth_range:
#     for learning_rate in learning_rate_range:
#         for iterations in iterations_range:
#             # Create the model
#             catb = cb.CatBoostRegressor(iterations=iterations, depth=depth, random_seed=42, learning_rate=learning_rate, loss_function='RMSE', verbose=False)
#             catb.fit(X_train, y_train)

#             # Make predictions
#             y_pred_train = catb.predict(X_train)
#             y_pred_test = catb.predict(X_test)

#             # calculate R^2 scores and store results
#             train_score = catb.score(X_train, y_train)
#             test_score = catb.score(X_test, y_test)
#             train_scores.append(train_score)
#             test_scores.append(test_score)

# # find the index of the best hyperparameter values
# best_index = np.argmax(test_scores)
# best_depth = depth_range[(best_index // (len(learning_rate_range) * len(iterations_range)))]
# best_learning_rate = learning_rate_range[(best_index // len(iterations_range)) % len(learning_rate_range)]
# best_iterations = iterations_range[best_index % len(iterations_range)]
# print("Best depth:", best_depth)
# print("Best learning rate:", best_learning_rate)
# print("Best iterations:", best_iterations)


In [180]:
!pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [181]:
test_processed = test.drop('child_id',axis=1)
test_processed = get_dummy(test)

In [184]:
import shap

# Create a SHAP explainer for the model
explainer = shap.Explainer(catb)

# Define the number of predictors to include
num_predictors = 15

# Get the SHAP values for the test set
shap_values = explainer(test_processed)

# Reshape the SHAP values array and convert it to a pandas DataFrame
shap_values = shap_values.values.reshape((test_processed.shape[0], -1))
shap_df = pd.DataFrame(shap_values, columns=test_processed.columns)

# Get the top 15 features for each prediction in the test set
top_predictors = []
for i in range(shap_values.shape[0]):
    # Get the SHAP values for the current observation
    shap_values_obs = shap_df.iloc[[i]]

    # Get the absolute mean SHAP values for each feature
    abs_mean_shap = shap_values_obs.abs().mean()

    # Sort the features by absolute mean SHAP value in descending order
    top_preds = abs_mean_shap.sort_values(ascending=False)[:num_predictors]

    # Append the top predictors to the list
    top_predictors.append(list(top_preds.index))
    
print(top_predictors)


[['hle_ind_No', 'teacher_social_peers_A little of the time', 'child_id_ID_02Z1L8L4T', 'child_id_ID_08BOWNVG2', 'pri_languages_Setswana', 'teacher_social_peers_All of the time', 'child_id_ID_0IXUJCQ4H', 'child_id_ID_BZTBI2W35', 'child_id_ID_0JOU0DKOG', 'child_id_ID_06KP00IHN', 'child_id_ID_084IKLA70', 'obs_area_1 3 5', 'child_id_ID_06LOV8OK4', 'pri_support_providerother_PARENTS OF THE CHILDREN', 'child_id_ID_047ECGGYD'], ['hle_ind_No', 'teacher_social_peers_A little of the time', 'child_id_ID_02Z1L8L4T', 'pri_languages_Setswana', 'child_id_ID_08BOWNVG2', 'child_id_ID_0IXUJCQ4H', 'child_id_ID_BZTBI2W35', 'teacher_social_peers_All of the time', 'child_id_ID_0JOU0DKOG', 'child_id_ID_06KP00IHN', 'obs_area_1 3 5', 'child_id_ID_084IKLA70', 'child_id_ID_06LOV8OK4', 'pri_support_providerother_PARENTS OF THE CHILDREN', 'child_id_ID_047ECGGYD'], ['hle_ind_No', 'teacher_social_peers_A little of the time', 'pri_languages_Setswana', 'child_id_ID_02Z1L8L4T', 'child_id_ID_08BOWNVG2', 'child_id_ID_0IXU

In [185]:
# Make predictions on the test set
preds = catb.predict(test_processed)
preds

array([50.45475338, 50.02355707, 49.92942187, ..., 50.00431458,
       53.90490265, 49.73178359])

In [186]:
# Submission file preparation
predictors = pd.DataFrame(top_predictors, columns = ['feature_' + str(i) for i in range(1,16)])
predictors['target'] = preds
predictors['child_id'] = test.child_id
predictors = predictors[ss.columns.tolist()]
predictors.head()

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,50.454753,hle_ind_No,teacher_social_peers_A little of the time,child_id_ID_02Z1L8L4T,child_id_ID_08BOWNVG2,pri_languages_Setswana,teacher_social_peers_All of the time,child_id_ID_0IXUJCQ4H,child_id_ID_BZTBI2W35,child_id_ID_0JOU0DKOG,child_id_ID_06KP00IHN,child_id_ID_084IKLA70,obs_area_1 3 5,child_id_ID_06LOV8OK4,pri_support_providerother_PARENTS OF THE CHILDREN,child_id_ID_047ECGGYD
1,ID_GQ6ONJ4FP,50.023557,hle_ind_No,teacher_social_peers_A little of the time,child_id_ID_02Z1L8L4T,pri_languages_Setswana,child_id_ID_08BOWNVG2,child_id_ID_0IXUJCQ4H,child_id_ID_BZTBI2W35,teacher_social_peers_All of the time,child_id_ID_0JOU0DKOG,child_id_ID_06KP00IHN,obs_area_1 3 5,child_id_ID_084IKLA70,child_id_ID_06LOV8OK4,pri_support_providerother_PARENTS OF THE CHILDREN,child_id_ID_047ECGGYD
2,ID_YZ76CVRW3,49.929422,hle_ind_No,teacher_social_peers_A little of the time,pri_languages_Setswana,child_id_ID_02Z1L8L4T,child_id_ID_08BOWNVG2,child_id_ID_0IXUJCQ4H,child_id_ID_BZTBI2W35,teacher_social_peers_All of the time,child_id_ID_0JOU0DKOG,child_id_ID_06KP00IHN,obs_area_1 3 5,child_id_ID_084IKLA70,child_id_ID_06LOV8OK4,child_id_ID_047ECGGYD,child_id_ID_07XHLCP68
3,ID_BNINCRXH8,50.955816,hle_ind_No,teacher_social_peers_A little of the time,child_id_ID_02Z1L8L4T,child_id_ID_08BOWNVG2,pri_languages_Setswana,child_id_ID_0JOU0DKOG,teacher_social_peers_All of the time,child_id_ID_0IXUJCQ4H,child_id_ID_BZTBI2W35,child_id_ID_06KP00IHN,obs_area_1 3 5,child_id_ID_084IKLA70,child_id_ID_06LOV8OK4,teacher_social_nonaggressive_A little of the time,child_id_ID_047ECGGYD
4,ID_1U7GDTLRI,49.562216,hle_ind_No,teacher_social_peers_A little of the time,child_id_ID_02Z1L8L4T,pri_languages_Setswana,child_id_ID_08BOWNVG2,child_id_ID_0IXUJCQ4H,child_id_ID_BZTBI2W35,child_id_ID_0JOU0DKOG,child_id_ID_06KP00IHN,child_id_ID_084IKLA70,obs_area_1 3 5,child_id_ID_06LOV8OK4,pri_support_providerother_PARENTS OF THE CHILDREN,child_id_ID_047ECGGYD,child_id_ID_07XHLCP68


In [188]:
# Create csv to upload to Zindi for scoring
predictors.to_csv('submission.csv', index = False)

In [189]:
from google.colab import files

# Download csv file
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import make_scorer

# # Define the RMSE scorer
# scorer = make_scorer(mean_squared_error, squared=False)

# #Perform the grid search
# grid_search = GridSearchCV(estimator=catb, param_grid=grid, cv=5, n_jobs=-1, scoring=scorer)
# grid_result = grid_search.fit(X_train, y_train)

# # Print the best parameters and RMSE score
# print("Best Parameters: ", grid_result.best_params_)
# print("Best RMSE Score: ", -grid_result.best_score_)

In [None]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [190]:
from xgboost import XGBRegressor

In [191]:
# Instantiate XGBClassifier
reg = XGBRegressor(eta =0.5,min_split_loss=0.3,min_child_weight=4,colsample_bytree=0.6,subsample=0.6)

# Fit XGBClassifier
reg.fit(X_train, y_train)

# Predict on training and test sets
training_preds = reg.predict(X_train)
test_preds = reg.predict(X_test)

# Accuracy of training and test sets
training_rscore = r2_score(y_train, training_preds)
testing_rscore = r2_score(y_test, test_preds)

print('Training Accuracy: {:.4}%'.format(training_rscore * 100))
print('Validation accuracy: {:.4}%'.format(testing_rscore * 100))

Training Accuracy: 86.41%
Validation accuracy: 46.12%


In [192]:
mean_squared_error(y_train, training_preds, squared=False)

5.541789595741995

In [193]:
mean_squared_error(y_test, test_preds, squared=False)

11.483292410276476

In [None]:
# from sklearn.model_selection import GridSearchCV


# param_grid = {
#     'eta': [0.1, 0.5, 1.0],
#     'min_split_loss': [0.1, 0.5, 1.0],
#     'min_child_weight': [1, 2, 4],
#     'colsample_bytree': [0.6, 0.8, 1.0],
#     'subsample': [0.6, 0.8, 1.0]
# }

# # Instantiate XGBRegressor
# reg = XGBRegressor()

# # Instantiate GridSearchCV
# grid_search = GridSearchCV(reg, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')

# # Fit GridSearchCV
# grid_search.fit(X_train, y_train)

# # Predict on training and test sets using best estimator
# training_preds = grid_search.best_estimator_.predict(X_train)
# test_preds = grid_search.best_estimator_.predict(X_test)

# # Calculate RMSE of training and test sets using best estimator
# training_rmse = np.sqrt(mean_squared_error(y_train, training_preds))
# testing_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

# # Print best parameters and RMSE scores
# print("Best parameters: ", grid_search.best_params_)
# print("Training RMSE: ", training_rmse)
# print("Testing RMSE: ", testing_rmse)

In [194]:
from sklearn.linear_model import LinearRegression


In [195]:
# Instantiating logistic regression
lr = LinearRegression(fit_intercept=False,n_jobs=-1)

# Fitting the model to unbalanced classes data
lr.fit(X_train, y_train)

In [196]:
# Predicting on training and test sets
lr_train_pred = lr.predict(X_train)
lr_test_pred = lr.predict(X_test)
# Evaluating the model on the training set
lr_train_rmse = mean_squared_error(y_train,lr_train_pred,squared=False)

# Evaluating the model on the testing set
lr_test_rmse = mean_squared_error(y_test,lr_test_pred,squared=False)

# Evaluating the balanced classes data
print(f'Train RMSE - {lr_train_rmse} \nTest RMSE - {lr_test_rmse}')

Train RMSE - 347.07230678147334 
Test RMSE - 164698750343252.06
