In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

In [8]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [9]:
import pandas as pd

hh_features = pd.read_csv("/content/drive/MyDrive/DrivenData/train_hh_features.csv")
hh_gt       = pd.read_csv("/content/drive/MyDrive/DrivenData/train_hh_gt.csv")
rates_gt    = pd.read_csv("/content/drive/MyDrive/DrivenData/train_rates_gt.csv")

test_hh     = pd.read_csv("/content/drive/MyDrive/DrivenData/test_hh_features.csv")


In [6]:
hh_features.shape

(104234, 88)

In [7]:
hh_gt.shape

(104234, 3)

In [8]:
rates_gt.shape

(3, 20)

In [9]:
test_hh.shape

(103023, 88)

In [11]:
HH_KEY   = [ "survey_id", "hhid"]
RATE_KEY = "survey_id"


In [11]:
print("HH FEATURES COLUMNS:")
print(hh_features.columns.tolist())

print("\nHH GT COLUMNS:")
print(hh_gt.columns.tolist())

print("\nRATES GT COLUMNS:")
print(rates_gt.columns.tolist())

print("\nTEST HH COLUMNS:")
print(test_hh.columns.tolist())


HH FEATURES COLUMNS:
['hhid', 'com', 'weight', 'strata', 'utl_exp_ppp17', 'male', 'hsize', 'num_children5', 'num_children10', 'num_children18', 'age', 'owner', 'water', 'toilet', 'sewer', 'elect', 'water_source', 'sanitation_source', 'dweltyp', 'num_adult_female', 'num_adult_male', 'num_elderly', 'employed', 'sworkershh', 'share_secondary', 'educ_max', 'sfworkershh', 'any_nonagric', 'sector1d', 'region1', 'region2', 'region3', 'region4', 'region5', 'region6', 'region7', 'urban', 'consumed100', 'consumed200', 'consumed300', 'consumed400', 'consumed500', 'consumed600', 'consumed700', 'consumed800', 'consumed900', 'consumed1000', 'consumed1100', 'consumed1200', 'consumed1300', 'consumed1400', 'consumed1500', 'consumed1600', 'consumed1700', 'consumed1800', 'consumed1900', 'consumed2000', 'consumed2100', 'consumed2200', 'consumed2300', 'consumed2400', 'consumed2500', 'consumed2600', 'consumed2700', 'consumed2800', 'consumed2900', 'consumed3000', 'consumed3100', 'consumed3200', 'consumed3300

In [12]:
train = hh_features.merge(
    hh_gt,
    on=HH_KEY,
    how="left",
)

train = train.merge(
    rates_gt,
    on=RATE_KEY,
    how="left",

)


In [13]:
print(train[RATE_KEY].isna().mean())


0.0


In [14]:
train.shape

(104234, 108)

In [15]:
test_hh.shape

(103023, 88)

In [13]:
test = test_hh.merge(
    rates_gt,
    on=RATE_KEY,
    how="left"
)


In [17]:
test.shape

(103023, 107)

In [18]:

print(train.shape, test.shape)

train.isna().sum().sort_values(ascending=False).head()


(104234, 108) (103023, 107)


Unnamed: 0,0
sector1d,14129
dweltyp,1206
utl_exp_ppp17,85
consumed5000,66
consumed4800,56


In [14]:
TARGET = "cons_ppp17"   # replace with exact column
ID_COL = HH_KEY

X = train.drop(columns=HH_KEY + [TARGET])
X_test = test.drop(columns=HH_KEY)



In [20]:
# Household uniqueness
assert train[HH_KEY].drop_duplicates().shape[0] == train.shape[0]
assert test[HH_KEY].drop_duplicates().shape[0] == test.shape[0]

# Missing targets?
train.isna().sum().sort_values(ascending=False).head()


Unnamed: 0,0
sector1d,14129
dweltyp,1206
utl_exp_ppp17,85
consumed5000,66
consumed4800,56


In [21]:
TARGET = "cons_ppp17"   # example


In [15]:
X = train.drop(columns=HH_KEY + [TARGET])
y = train[TARGET]

X_test = test.drop(columns=HH_KEY)


In [23]:
X.shape, y.shape, X_test.shape

((104234, 105), (104234,), (103023, 105))

In [24]:
train.shape, test.shape

((104234, 108), (103023, 107))

In [25]:
cat_cols = X.select_dtypes(include="object").columns.tolist()


In [26]:
for col in cat_cols:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)


In [27]:
for col in cat_cols:
    X[col] = X[col].fillna("MISSING")
    X_test[col] = X_test[col].fillna("MISSING")


In [28]:
cat_cols = X.select_dtypes(include="object").columns.tolist()


In [29]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(
    loss_function="MAE",
    iterations=800,
    depth=6,
    learning_rate=0.04,
    random_seed=42,
    verbose=200
)

model.fit(X, y, cat_features=cat_cols)

0:	learn: 5.9878554	total: 1.58s	remaining: 21m 6s
200:	learn: 3.3040528	total: 2m 59s	remaining: 8m 56s
400:	learn: 3.1932824	total: 6m 6s	remaining: 6m 4s
600:	learn: 3.1355583	total: 9m 10s	remaining: 3m 2s
799:	learn: 3.1021131	total: 12m 10s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7cc60c22dc70>

In [37]:
test_preds = model.predict(X_test)
test_preds.shape

(103023,)

In [31]:
submission = pd.DataFrame({
    "survey_id": test["survey_id"],
    "hhid": test["hhid"],
    "cons_ppp17": test_preds
})

submission.to_csv("/content/drive/MyDrive/DrivenData/submission.csv", index=False)


In [32]:
submission.head()

Unnamed: 0,survey_id,hhid,cons_ppp17
0,400000,400001,10.430452
1,400000,400002,6.251836
2,400000,400003,7.819717
3,400000,400004,11.438084
4,400000,400005,4.927045


In [33]:
for col in cat_cols:
    if X[col].isna().any():
        print(col, "has NaNs")


In [38]:
y_pred = model.predict(X)
y_pred.shape

(104234,)

In [35]:
from sklearn.metrics import mean_absolute_percentage_error
TARGET = 'cons_ppp17'
y_true=train[TARGET]

mape = mean_absolute_percentage_error(y_true, y_pred)
print("MAPE (%):", mape * 100)


MAPE (%): 26.671324036353756


In [36]:
train[TARGET].isna().mean()


np.float64(0.0)

In [16]:
[c for c in train.columns if c.startswith('pct_hh_below')]


['pct_hh_below_3.17',
 'pct_hh_below_3.94',
 'pct_hh_below_4.60',
 'pct_hh_below_5.26',
 'pct_hh_below_5.88',
 'pct_hh_below_6.47',
 'pct_hh_below_7.06',
 'pct_hh_below_7.70',
 'pct_hh_below_8.40',
 'pct_hh_below_9.13',
 'pct_hh_below_9.87',
 'pct_hh_below_10.70',
 'pct_hh_below_11.62',
 'pct_hh_below_12.69',
 'pct_hh_below_14.03',
 'pct_hh_below_15.64',
 'pct_hh_below_17.76',
 'pct_hh_below_20.99',
 'pct_hh_below_27.37']

In [17]:
TARGETS = [
    'pct_hh_below_3.17',	'pct_hh_below_3.94',	'pct_hh_below_4.60',	'pct_hh_below_5.26'	,'pct_hh_below_5.88',	'pct_hh_below_6.47',	'pct_hh_below_7.06',	'pct_hh_below_7.70'	,'pct_hh_below_8.40',	'pct_hh_below_9.13'	,'pct_hh_below_9.87',	'pct_hh_below_10.70',	'pct_hh_below_11.62',	'pct_hh_below_12.69',	'pct_hh_below_14.03',	'pct_hh_below_15.64',	'pct_hh_below_17.76',	'pct_hh_below_20.99',	'pct_hh_below_27.37'
]


In [18]:
HH_KEY = ['survey_id', 'hhid']

DROP_COLS = [c for c in HH_KEY + TARGETS if c in train.columns]
if 'cons_ppp17' in train.columns:
    train = train.drop(columns=['cons_ppp17'])

X = train.drop(columns=DROP_COLS)


In [19]:
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

for c in cat_cols:
    X[c] = X[c].astype(str).fillna('MISSING')


In [20]:
from catboost import CatBoostRegressor
import pandas as pd

final_submission = pd.DataFrame({'survey_id': test['survey_id'].unique()})


In [25]:
for TAR in TARGETS:
    print(f"Training for target: {TAR}")

    y = train[TAR]

    model = CatBoostRegressor(
        iterations=800,
        learning_rate=0.05,
        depth=7,
        loss_function='MAE',
        eval_metric='MAPE',
        cat_features=cat_cols,
        verbose=False,
        random_seed=42
    )

    model.fit(X, y)

    # --- Predict on test households ---
    X_test = test[X.columns].copy()

    for c in cat_cols:
        X_test[c] = X_test[c].astype(str).fillna('MISSING')

    test['hh_pred'] = model.predict(X_test)

    # --- Aggregate to survey level ---
    agg = (
        test
        .groupby('survey_id')['hh_pred']
        .mean()
        .reset_index()
        .rename(columns={'hh_pred': TAR})
    )

    # --- Merge into final submission ---
    final_submission = final_submission.merge(
        agg,
        on='survey_id',
        how='left'
    )


Training for target: pct_hh_below_3.17
Training for target: pct_hh_below_3.94
Training for target: pct_hh_below_4.60
Training for target: pct_hh_below_5.26
Training for target: pct_hh_below_5.88
Training for target: pct_hh_below_6.47
Training for target: pct_hh_below_7.06
Training for target: pct_hh_below_7.70
Training for target: pct_hh_below_8.40
Training for target: pct_hh_below_9.13
Training for target: pct_hh_below_9.87
Training for target: pct_hh_below_10.70
Training for target: pct_hh_below_11.62
Training for target: pct_hh_below_12.69
Training for target: pct_hh_below_14.03
Training for target: pct_hh_below_15.64
Training for target: pct_hh_below_17.76
Training for target: pct_hh_below_20.99
Training for target: pct_hh_below_27.37


In [21]:
print([c for c in train.columns if 'cons_' in c.lower()])


[]


In [26]:
final_submission = final_submission.sort_values('survey_id')

final_submission.to_csv(
    '/content/drive/MyDrive/DrivenData/submission_3.csv',
    index=False
)
