# Predict caregivers’ depression

## Initial Configuration

In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve

# Silence every warning for the rest of the session; note that this also hides
# library deprecation notices.
warnings.filterwarnings("ignore")
# Fixed seed, passed as `random_state` to the estimator built in `LR`.
seed = 257248

In [79]:
def evaluate(rf, train, valid, test, train_y, valid_y, return_pred=False, version=1, return_auc=False, plot_auc=False):
    """Print train/validation metrics for a fitted model and predict on ``test``.

    Parameters
    ----------
    rf : fitted estimator
        Needs ``predict`` and ``score`` for ``version == 2``, ``predict`` for
        ``version == 3``, and ``predict_proba`` and ``score`` otherwise.
    train, valid, test
        Feature sets handed to the estimator unchanged.
    train_y, valid_y
        Targets matching ``train`` and ``valid``.
    return_pred : bool
        When true, the predictions for ``test`` are returned.
    version : int
        ``2``: metrics from hard label predictions (accuracy, AUROC, FP/FN
        counts). ``3``: regression-style RMSE. Any other value: accuracy plus
        AUROC computed from the second column of ``predict_proba``.
    return_auc : bool
        Only honoured together with ``return_pred`` when ``version == 1``.
    plot_auc : bool
        When true, draw train/validation ROC curves. Nothing is drawn for
        ``version == 3`` because that mode produces no classification scores.

    Returns
    -------
    ``(pred, auc_train, auc_valid)`` if ``return_pred and return_auc and
    version == 1``; ``pred`` if only ``return_pred`` is set; otherwise ``None``.
    """
    # Scores fed to the ROC plot. They stay None in regression mode, where the
    # original code hit an unbound name that the bare ``except`` silently hid.
    rf_train_y = None
    rf_valid_y = None

    if version == 2:
        rf_train_y = rf.predict(train)
        rf_valid_y = rf.predict(valid)
        pred = rf.predict(test)
        acr_train = rf.score(train, train_y)
        acr_valid = rf.score(valid, valid_y)
        print(f"Train Accuracy: {acr_train:.4f}; Validation Accuracy: {acr_valid:.4f}; Test: {np.sum(pred):>4d}")
        # Unpacking into four counts requires a 2x2 (binary) confusion matrix.
        # AUROC here is computed from hard labels, not probabilities.
        cm = confusion_matrix(train_y, rf_train_y)
        tn, fp, fn, tp = cm.ravel()
        print(f"Train AUROC: {roc_auc_score(train_y, rf_train_y):.4f}; FP: {fp:>4d}; FN: {fn:>4d}")
        cm = confusion_matrix(valid_y, rf_valid_y)
        tn, fp, fn, tp = cm.ravel()
        print(f"Valid AUROC: {roc_auc_score(valid_y, rf_valid_y):.4f}; FP: {fp:>4d}; FN: {fn:>4d}")
        print()
    elif version == 3:
        pred_train = rf.predict(train)
        pred_valid = rf.predict(valid)
        pred = rf.predict(test)
        # Take the square root explicitly instead of passing ``squared=False``,
        # which newer scikit-learn releases no longer accept. The result is
        # the same for a single-target ``y``.
        print(f"Train RMSE: {np.sqrt(mse(train_y, pred_train)):.4f}")
        print(f"Valid RMSE: {np.sqrt(mse(valid_y, pred_valid)):.4f}")
    else:
        # Column 1 of predict_proba is used as the ranking score.
        rf_train_y = rf.predict_proba(train)[:, 1]
        rf_valid_y = rf.predict_proba(valid)[:, 1]
        pred = rf.predict_proba(test)[:, 1]
        acr_train = rf.score(train, train_y)
        acr_valid = rf.score(valid, valid_y)
        print(f"Train Accuracy: {acr_train:.4f}; Validation Accuracy: {acr_valid:.4f}")
        auc_train = roc_auc_score(train_y, rf_train_y)
        auc_valid = roc_auc_score(valid_y, rf_valid_y)
        print(f"Train AUROC: {auc_train:.4f}")
        print(f"Valid AUROC: {auc_valid:.4f}")
        print()

    if plot_auc and rf_train_y is not None:
        # Plotting is best-effort: a failure must not abort the evaluation,
        # but it is reported instead of being silently discarded.
        try:
            fpr1, tpr1, _ = roc_curve(train_y, rf_train_y)
            fpr2, tpr2, _ = roc_curve(valid_y, rf_valid_y)
            plt.plot([0, 1], 'k--')
            plt.plot(fpr1, tpr1, label="Train")
            plt.plot(fpr2, tpr2, label="Valid")
            plt.legend()
            plt.xlabel("FPR")
            plt.ylabel("TPR")
            plt.title('AUROC Curve')
            plt.show()
        except Exception as exc:
            print(f"ROC plot skipped: {exc!r}")

    if return_pred and return_auc and version == 1:
        return pred, auc_train, auc_valid
    elif return_pred:
        return pred
    else:
        return

## Load Data

In [73]:
# Load the raw table and recode a few columns before modelling.
data = pd.read_pickle("../data/data.pkl")

# Shift "BB1" down by one.
data["BB1"] = data["BB1"].sub(1)

# Swap the 0/1 codes of these columns; any other value is left untouched.
swap_zero_one = {0: 1, 1: 0}
for column in ("G1eA", "G1gA", "G1hA", "G1iA"):
    data[column] = data[column].replace(swap_zero_one)

# Recode the -1 value of "G1fA" as category 4.
data["G1fA"] = data["G1fA"].replace({-1: 4})

print(len(data))

25394


In [74]:
# Partition the rows on the "G1eA" code: 2 versus every other value.
data_no_helper = data[data["G1eA"].eq(2)]
data_has_helper = data[data["G1eA"].ne(2)]

# Rows of the has-helper subset where any of these columns equals -1 are
# set aside and removed from the subset.
null_cols = ["G1gA", "G1hA", "G1iA"]
data_discard = data_has_helper[data_has_helper[null_cols].eq(-1).any(axis=1)]
data_has_helper = data_has_helper[
    ~data_has_helper.index.isin(data_discard.index)
]

# Keep the index of each subset for later use.
data_has_helper_index = data_has_helper.index
data_no_helper_index = data_no_helper.index

print(f"{len(data_has_helper)} {len(data_no_helper)} {len(data_discard)}")

24672 707 15


- `sex`  
    - `0` represents **male**
    - `1` represents **female**  
- `live_with_client`
    - `0` represents **No**
    - `1` represents **Yes**
    - `2` represents **No such helper**  
- `relationship`
    - `0` represents **Child or child-in-law or grandchild**
    - `1` represents **Spouse** 
    - `2` represents **Other relative**
    - `3` represents **Friend/neighbor**
    - `4` represents **Others (e.g. maid)**
- `advice`
    - `0` represents **No**
    - `1` represents **Yes**
- `iadl`
    - `0` represents **No**
    - `1` represents **Yes**
- `adl`
    - `0` represents **No**
    - `1` represents **Yes**

In [75]:
# Map the raw questionnaire codes to readable column names.
modify_column_names = dict(
    BB1="sex",
    G1eA="live_with_client",
    G1fA="relationship",
    G1gA="advice",
    G1hA="iadl",
    G1iA="adl",
)

# Rename both subsets in place with the same mapping.
for _frame in (data_has_helper, data_no_helper):
    _frame.rename(columns=modify_column_names, inplace=True)

## Model Fitting

### Logistic Regression

In [76]:
from sklearn.linear_model import LogisticRegression

In [80]:
def LR(train, valid, test, train_y, valid_y, plot_auc=True):
    """Fit a logistic regression on ``train`` and evaluate it.

    The model uses an L2 penalty with ``C=1e-5`` and the module-level ``seed``
    as ``random_state``. After fitting, the work is delegated to ``evaluate``
    with its default ``version`` and ``return_pred=True``.

    Parameters
    ----------
    train, valid, test
        Feature sets for fitting, validation and final prediction.
    train_y, valid_y
        Targets matching ``train`` and ``valid``.
    plot_auc : bool
        Forwarded to ``evaluate``. Previously this argument was ignored and
        plotting was always enabled.

    Returns
    -------
    Whatever ``evaluate`` returns for ``return_pred=True``, i.e. its
    predictions for ``test``.
    """
    reg = LogisticRegression(
        solver='lbfgs',
        max_iter=100,
        C=1e-5,
        penalty="l2",
        multi_class="ovr",
        class_weight=None,
        random_state=seed,
    )
    reg.fit(train, train_y)
    return evaluate(reg, train, valid, test, train_y, valid_y, return_pred=True, plot_auc=plot_auc)

In [None]:
# Example call, left disabled: `train`, `valid`, `test`, `train_y` and
# `valid_y` are not defined in the cells shown here.
# pred_test = LR(train, valid, test, train_y, valid_y)
print()