# Exploring the data

Independent variable: Poverty score (self reported) categorical outcome on a 1-10 scale

Dependent variables divided into 2 categories:
- Education data
- Household data


In [66]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression





In [67]:
# Minimal preprocessing

ed = pd.read_csv('./data/train_data/ed_train_raw.csv') # education DataFrame
hh = pd.read_csv('./data/train_data/hh_train_raw.csv') # household DataFrame
poverty = pd.read_csv('./data/train_data/poverty_train_raw.csv') # poverty/labels


def preprocess_df(df, suffix:str):
    ## merge first three columns into psu_hh_idcode identifier
    uids = df['psu'].astype(str) + "_"  + df['hh'].astype(str) + "_" + df['idcode'].astype(str) 
    # delete the three columns
    df = df.drop(columns=['psu', 'hh', 'idcode'])

    ## Capitalize all Q's in column name prefixes. Add ED or HH prefix to identify variate group
    df.columns = [suffix + "_" + col.capitalize() for col in df.columns]

    # Insert uid as first column, lowercase, no prefix.
    df.insert(0, 'uid', uids)

    return df

ed = preprocess_df(ed, 'ED')
hh = preprocess_df(hh, 'HH')

for i in range(1,11):
    col = 'subjective_poverty_'+ str(i)
    poverty.loc[poverty[col]==1, 'poverty_score'] = i

poverty['uid'] = poverty['psu_hh_idcode']
y = poverty[['uid', 'poverty_score']]

ed = ed[ed['uid'].isin(poverty['uid'])]
# ed = ed.merge(poverty[['uid', 'poverty_score']], on='uid', how='left')
hh = hh[hh['uid'].isin(poverty['uid'])]
# hh = hh.merge(poverty[['uid', 'poverty_score']], on='uid', how='left')



In [62]:
# renames = {
#     'ED_Q01': 'read',
#     'ED_Q02': 'write',
#     'ED_Q03': 'attended_school',
#     'ED_Q04': 'school_grade',
#     'ED_Q05': 'school_grade_level',
#     'ED_Q06': 'highest_diploma',
#     'ED_Q07': 'years_preschool',
#     'ED_Q08': 'now_enrolled',
#     'ED_Q09': 'now_attend',
#     'ED_Q10': 'now_not_attend_reason',
#     'ED_Q11': 'past_not_enrolled_reason',
#     'ED_Q12': 'current_grade',
#     'ED_Q13': 'current_grade_level',
#     'ED_Q14': 'past_enrolled',
#     'ED_Q15': 'past_attend',
#     'ED_Q16': 'past_not_attend_reason',
#     'ED_Q17': 'past_not_enrolled_reason',
    
# }



In [63]:
# ed = ed.rename(columns={'ED_Q01': 'read'})
y

Unnamed: 0,uid,poverty_score
0,30_8_1,4.0
1,194_1_2,1.0
2,224_6_1,3.0
3,323_10_1,5.0
4,428_10_1,4.0
...,...,...
5332,571_8_1,3.0
5333,601_5_1,4.0
5334,782_1_1,2.0
5335,606_3_1,5.0


In [68]:
X_raw = pd.merge(ed, hh, on='uid', how='inner')
# To ensure rows match between X and y
Xy = pd.merge(X_raw, y, on='uid', how='left')
assert(Xy['poverty_score'].isna().sum() == 0)
y = Xy['poverty_score']
X_raw = Xy.drop(columns=['poverty_score'])


# Baseline models

Since tree-based models and kNN accept NaNs, train them on this minimal raw data to set a baseline.

## Random Forest

In [65]:


# Set up a grid of hyperparameters to search over
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Set up GridSearchCV to search over the hyperparameter grid
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV to the training data
grid_search.fit(X_raw, y)

# Get the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.2f}")


KeyboardInterrupt: 

**Results:** 

```
Best parameters: {
    'max_depth': None, 
    'min_samples_split': 10, 
    'n_estimators': 50}
Best cross-validation accuracy: 0.21
```

So, the accuracy to beat is 0.21

## HistGradientBoosting


In [None]:

# Set up parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'max_iter': [100, 200],
    'min_samples_leaf': [20, 50],
    'l2_regularization': [0, 1.0, 2.0]
}

# Initialize the model
hgb = HistGradientBoostingClassifier(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(
    hgb, 
    param_grid, 
    cv=5, 
    scoring='accuracy',
    n_jobs=-1  # Use all available cores
)

# Fit GridSearchCV to the training data
grid_search.fit(X_raw, y)

# Get the best parameters and score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.2f}")

Best parameters: {'l2_regularization': 2.0, 'learning_rate': 0.01, 'max_depth': 3, 'max_iter': 100, 'min_samples_leaf': 20}
Best cross-validation accuracy: 0.22


**Results:** 
```
Best parameters: {
    'l2_regularization': 2.0, 
    'learning_rate': 0.01, 
    'max_depth': 3, 
    'max_iter': 100, 
    'min_samples_leaf': 20}
```

Best cross-validation accuracy: 0.22

## Stacking

In [69]:
hgb = HistGradientBoostingClassifier(random_state=42, l2_regularization=2.0, learning_rate=0.01, max_depth=3, max_iter=100, min_samples_leaf=20)
rf = RandomForestClassifier(random_state=42, n_estimators=50, max_depth=None, min_samples_split=10)


# Define the base estimators
estimators = [
    ('rf', rf),
    ('hgb', hgb)
]

# Create the stacked model

# Using logistic regression as the final estimator
stacked_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5  # Number of folds for cross-validation during stacking
)

# Fit the stacked model
stacked_model.fit(X_raw, y)

# Evaluate using cross-validation
cv_scores = cross_val_score(stacked_model, X_raw, y, cv=5, scoring='accuracy')
print(f"Stacked model cross-validation accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")

Stacked model cross-validation accuracy: 0.22 (+/- 0.03)


Stacked model cross-validation accuracy: 0.22 (+/- 0.03)

# Preliminary Variable selection
The purpose of this notebook is to identify the variables that are not useful for the prediction of wealth.

Dropping useless variables 

In [1]:
hh_useless_columns=['HH_Hhid', 'HH_Q04', 'HH_Q08', 'HH_Q12', 'HH_Q18']
ed_useful_columns = [
    'uid',
    'ED_Q01', 'ED_Q02', 'ED_Q03', 'ED_Q04', 'ED_Q05', 'ED_Q06', 'ED_Q07', 'ED_Q08', 'ED_Q09', 'ED_Q10', 'ED_Q11', 
    'ED_Q14', 'ED_Q15', 'ED_Q16', 'ED_Q17', 'ED_Q18',
    'ED_Q19',
    'ED_Q23',
    'ED_Q26', 'ED_Q27', 'ED_Q28', 'ED_Q29',
    'ED_Q41', 
]
%store hh_useless_columns
%store ed_useful_columns


Stored 'hh_useless_columns' (list)
Stored 'ed_useful_columns' (list)


In [1]:
ed 

IPython will make a temporary file named: /var/folders/21/wpdrdb153rgcw7h7qz_vrd_40000gn/T/ipython_edit_m705y5td/ipython_edit_ex4hczmf.py
