In [1]:
%load_ext nb_black
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics as m
import xgboost as xgb
import seaborn as sns
from imblearn.over_sampling import SMOTE  # doctest: +NORMALIZE_WHITESPACE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from collections import Counter

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings that may point
# at real problems in the cells below.
warnings.filterwarnings("ignore")
# Seed passed as `random_state` to the RandomForestRegressor models below.
random_seed = 8

Using TensorFlow backend.


<IPython.core.display.Javascript object>

In [2]:
# Columns to load from both snapshot files: the union of the features collected
# from each main diabetic-indicator regressor, the merge key, and the
# diagnosis / medication flags used to filter participants.
_indicator_features = [
    "AGE",
    "FIELD_31",
    "FIELD_33",
    "FIELD_38",
    "FIELD_40",
    "L100500",
    "L100700",
    "L100800",
    "L101200",
    "L101300",
    "L101600",
    "L101700",
    "L103000",
    "L103100",
    "L103300",
    "L104600",
    "L107400",
    "L190000",
    "L190300",
    "L190400",
    "S000100",
    "S000300",
    "S000501",
    "S000502",
    "SEX",
    "FIELD_1",
    "FIELD_2",
]
_merge_key = ["Unnamed: 0"]
_exclusion_flags = [
    "FIELD_16",
    "FIELD_23",
    "FIELD_15",
    "FIELD_22",
    "FIELD_17",
    "FIELD_24",
]

# np.unique returns the names sorted and de-duplicated as a NumPy array.
cols = np.unique(_indicator_features + _merge_key + _exclusion_flags)

<IPython.core.display.Javascript object>

Read Dataset
====

In [3]:
# Load the input and target snapshots, keeping only the columns listed in `cols`.
_X_PATH = "../XLable_onlyDiabeticRemoved.txt"
_Y_PATH = "../TargetLable_onlyDiabeticRemoved.txt"
_MERGE_KEY = "Unnamed: 0"

x_original = pd.read_csv(_X_PATH)[cols]
y_original = pd.read_csv(_Y_PATH)[cols]

# Inner-join the two snapshots on the shared key. Every other column exists in
# both frames, so pandas adds its default `_x` (input) / `_y` (target) suffixes.
data = x_original.merge(y_original, how="inner", on=_MERGE_KEY)

<IPython.core.display.Javascript object>

Prepare dataset
===

In [4]:
# Keep only participants whose input-snapshot flags are not 1 for any of the
# diagnosis / medication questions below.
_exclusion_flags = [
    "FIELD_16_x",  # diagnosed for diabetes
    "FIELD_23_x",  # on medication for diabetes
    "FIELD_15_x",  # diagnosed for high blood pressure
    "FIELD_22_x",  # on medication for high blood pressure
    "FIELD_17_x",  # diagnosed for hyperlipidemia
    "FIELD_24_x",  # on medication for hyperlipidemia
]
_keep_row = (data[_exclusion_flags] != 1).all(axis=1)

# Rows with any missing value are removed after the exclusions are applied.
data = data[_keep_row].dropna()
print(data.shape)

(41525, 67)


<IPython.core.display.Javascript object>

In [5]:
# Base names of the modelling columns; the merge suffixed the input-snapshot
# copy with `_x` and the target-snapshot copy with `_y`.
_model_columns = [
    "AGE",
    "FIELD_31",
    "FIELD_33",
    "FIELD_38",
    "FIELD_40",
    "L100500",
    "L100700",
    "L100800",
    "L101200",
    "L101300",
    "L101600",
    "L101700",
    "L103000",
    "L103100",
    "L103300",
    "L104600",
    "L107400",
    "L190000",
    "L190300",
    "L190400",
    "S000100",
    "S000300",
    "S000501",
    "S000502",
    "SEX",
    "FIELD_1",
    "FIELD_2",
]

# Inputs (`_x` columns) and targets (`_y` columns), in the same column order.
x = data[[name + "_x" for name in _model_columns]]

y = data[[name + "_y" for name in _model_columns]]

<IPython.core.display.Javascript object>

In [8]:
# Hold out 30% of the rows as the test split; x and y are split together so
# their rows stay aligned. The split uses its own fixed random_state (42), not
# the `random_seed` defined in the first cell.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=42, test_size=0.3)

<IPython.core.display.Javascript object>

Train Regression Models
==========

L104600_y # HBA1C
====

In [12]:
# Preview frame: the HbA1c (L104600) regressor's input features, the HbA1c
# target, and a class label derived from this-year L100800.
_hba1c_features = [
    "L104600_x",
    "L100800_x",
    "S000300_x",
    "AGE_x",
    "L101300_x",
    "S000501_x",
    "L101700_x",
    "S000502_x",
    "L190000_x",
    "L101600_x",
    "L190300_x",
]
# .copy() so the column assignments below modify an independent frame instead
# of an object derived from a slice of xtrain.
temp = xtrain[_hba1c_features].copy()
# Assign the target as a Series (index-aligned), not as a one-column DataFrame.
temp["L104600_y"] = ytrain["L104600_y"]

# Three bands on L100800 (presumably fasting glucose in mg/dL — TODO confirm):
# 0 for < 100, 1 for 100 to < 126, 2 for >= 126. Rows matching no condition
# (e.g. NaN) fall back to class 0.
conditions = [
    (temp.L100800_x < 100),
    (temp.L100800_x >= 100) & (temp.L100800_x < 126),
    (temp.L100800_x >= 126),
]
choices = [0, 1, 2]
temp["A_CLASS"] = np.select(conditions, choices, default=0)
temp.head()

Unnamed: 0,L104600_x,L100800_x,S000300_x,AGE_x,L101300_x,S000501_x,L101700_x,S000502_x,L190000_x,L101600_x,L190300_x,L104600_y
35813,5.30,104.0,26.7,38.0,10.0,120.0,15.0,81.0,5.62,39.0,4.01,5.67
53995,5.70,91.0,25.3,42.0,9.0,99.0,11.0,64.0,8.18,45.0,4.39,5.80
53547,5.33,95.0,21.7,44.0,16.0,122.0,19.0,85.0,4.02,38.0,4.45,5.51
170134,5.37,98.0,25.1,46.0,21.0,128.0,30.0,79.0,9.34,60.0,4.74,5.14
157606,5.91,87.0,18.6,37.0,13.0,106.0,12.0,75.0,5.78,44.0,3.95,5.75
75372,5.39,107.0,22.3,47.0,41.0,132.0,122.0,86.0,6.33,61.0,4.09,5.40
32024,5.20,80.0,19.8,39.0,7.0,105.0,14.0,67.0,5.62,59.0,4.08,5.59
29759,5.33,93.0,25.5,38.0,24.0,128.0,79.0,68.0,8.99,43.0,4.89,5.53
73911,5.25,95.0,23.7,46.0,11.0,128.0,14.0,84.0,5.61,41.0,4.53,5.40
128529,5.40,86.0,22.9,40.0,18.0,115.0,18.0,68.0,6.31,44.0,4.86,5.60


<IPython.core.display.Javascript object>

In [10]:
# Show the feature columns available in the training split.
xtrain.columns

# Disabled SMOTE oversampling experiment, kept for reference. None of the
# cells in view read X_res / y_res / y_res2.
# sm = SMOTE(random_state=random_seed)
# X_res, y_res = sm.fit_resample(xtrain, ytrain)

# print("Resampled dataset shape %s" % Counter(y_res))
# print(
#     y_res[y_res == 0].shape[0], y_res[y_res == 1].shape[0], y_res[y_res == 2].shape[0]
# )
# print(X_res.shape, y_res.shape)

# y_res2 = y_res.copy()

Index(['AGE_x', 'FIELD_31_x', 'FIELD_33_x', 'FIELD_38_x', 'FIELD_40_x',
       'L100500_x', 'L100700_x', 'L100800_x', 'L101200_x', 'L101300_x',
       'L101600_x', 'L101700_x', 'L103000_x', 'L103100_x', 'L103300_x',
       'L104600_x', 'L107400_x', 'L190000_x', 'L190300_x', 'L190400_x',
       'S000100_x', 'S000300_x', 'S000501_x', 'S000502_x', 'SEX_x',
       'FIELD_1_x', 'FIELD_2_x'],
      dtype='object')

<IPython.core.display.Javascript object>

In [None]:
# Predict next-year HbA1c (L104600_y) from this-year measurements.
# The feature list is defined once so fit and predict cannot drift apart.
_features_L104600 = [
    "L104600_x",
    "L100800_x",
    "S000300_x",
    "AGE_x",
    "L101300_x",
    "S000501_x",
    "L101700_x",
    "S000502_x",
    "L190000_x",
    "L101600_x",
    "L190300_x",
]

regr = RandomForestRegressor(
    max_depth=10, random_state=random_seed, verbose=0, n_estimators=300, max_features=4
)
# The target is passed as a 1-D Series; a one-column DataFrame makes
# scikit-learn emit a column-vector DataConversionWarning and ravel it anyway.
regr.fit(xtrain[_features_L104600], ytrain["L104600_y"])

# 1-D array of predictions, one per test row.
ypred = regr.predict(xtest[_features_L104600])

print("r2_score", m.r2_score(ytest["L104600_y"], ypred))

In [None]:
# Keep the HbA1c predictions under their own name; `ypred` is reassigned by
# the next regression cell.
pred_L104600 = ypred
pred_L104600.shape

L103000 #Triglyceride
=====

In [None]:
# Predict next-year triglyceride (L103000_y) from this-year measurements.
# The feature list is defined once so fit and predict cannot drift apart.
_features_L103000 = [
    "L103000_x",
    "L100700_x",
    "L101700_x",
    "S000300_x",
    "S000502_x",
    "FIELD_33_x",
    "L190000_x",
    "L100800_x",
    "L101600_x",
    "S000501_x",
]

regr = RandomForestRegressor(
    max_depth=10, random_state=random_seed, verbose=0, n_estimators=300, max_features=8
)
# The target is passed as a 1-D Series; a one-column DataFrame makes
# scikit-learn emit a column-vector DataConversionWarning and ravel it anyway.
regr.fit(xtrain[_features_L103000], ytrain["L103000_y"])

# 1-D array of predictions, one per test row.
ypred = regr.predict(xtest[_features_L103000])

print("r2_score", m.r2_score(ytest["L103000_y"], ypred))

In [None]:
# Keep the triglyceride predictions under their own name; `ypred` is
# reassigned by the next regression cell.
pred_L103000 = ypred

S000300 #BMI
===

In [None]:
# Predict next-year BMI (S000300_y) from this-year measurements.
# The feature list is defined once so fit and predict cannot drift apart.
_features_S000300 = [
    "S000300_x",
    "L103300_x",
    "S000501_x",
    "L100700_x",
    "SEX_x",
    "L103100_x",
    "L190300_x",
    "L190400_x",
    "L101300_x",
    "S000502_x",
]

regr = RandomForestRegressor(
    max_depth=10, random_state=random_seed, verbose=0, n_estimators=300, max_features=8
)
# The target is passed as a 1-D Series; a one-column DataFrame makes
# scikit-learn emit a column-vector DataConversionWarning and ravel it anyway.
regr.fit(xtrain[_features_S000300], ytrain["S000300_y"])

# 1-D array of predictions, one per test row.
ypred = regr.predict(xtest[_features_S000300])

print("r2_score", m.r2_score(ytest["S000300_y"], ypred))

In [None]:
# Keep the BMI predictions under their own name; `ypred` is reassigned by the
# next regression cell.
pred_S000300 = ypred

L101700 # γ-GTP (gamma-glutamyl transpeptidase)
===

In [None]:
# Predict the next-year L101700_y value from this-year measurements.
# The feature list is defined once so fit and predict cannot drift apart.
_features_L101700 = [
    "L101700_x",
    "L101300_x",
    "L103000_x",
    "SEX_x",
    "L100700_x",
    "L101200_x",
    "S000300_x",
    "S000501_x",
    "L100800_x",
    "S000502_x",
    "L101600_x",
    "S000100_x",
]

regr = RandomForestRegressor(
    max_depth=10, random_state=random_seed, verbose=0, n_estimators=300, max_features=8
)
# The target is passed as a 1-D Series; a one-column DataFrame makes
# scikit-learn emit a column-vector DataConversionWarning and ravel it anyway.
regr.fit(xtrain[_features_L101700], ytrain["L101700_y"])

# 1-D array of predictions, one per test row.
ypred = regr.predict(xtest[_features_L101700])

print("r2_score", m.r2_score(ytest["L101700_y"], ypred))

In [None]:
# Keep the L101700 predictions under their own name; `ypred` is reassigned by
# the next regression cell.
pred_L101700 = ypred

L100700 #Uric acid
====

In [None]:
# Predict next-year uric acid (L100700_y) from this-year measurements.
# The feature list is defined once so fit and predict cannot drift apart.
_features_L100700 = [
    "L100700_x",
    "L100500_x",
    "S000100_x",
    "S000300_x",
    "L103300_x",
    "L103000_x",
    "L103100_x",
    "S000501_x",
    "L101300_x",
    "L101700_x",
    "S000502_x",
    "L107400_x",
]

regr = RandomForestRegressor(
    max_depth=10, random_state=random_seed, verbose=0, n_estimators=300, max_features=8
)
# The target is passed as a 1-D Series; a one-column DataFrame makes
# scikit-learn emit a column-vector DataConversionWarning and ravel it anyway.
regr.fit(xtrain[_features_L100700], ytrain["L100700_y"])

# 1-D array of predictions, one per test row.
ypred = regr.predict(xtest[_features_L100700])

print("r2_score", m.r2_score(ytest["L100700_y"], ypred))

In [None]:
# Keep the uric-acid predictions under their own name for the combined
# next-year frame assembled later.
pred_L100700 = ypred

Categorical values
=====

In [None]:
# NOTE(review): not a model output — this copies the actual target-snapshot
# value from ytest. Confirm that using the known value here is intended.
pred_FIELD_33 = ytest.FIELD_33_y

In [None]:
# NOTE(review): not a model output — this copies the actual target-snapshot
# value from ytest. Confirm that using the known value here is intended.
pred_FIELD_38 = ytest.FIELD_38_y

In [None]:
# NOTE(review): not a model output — this copies the actual target-snapshot
# value from ytest. Confirm that using the known value here is intended.
pred_FIELD_40 = ytest.FIELD_40_y

In [None]:
# NOTE(review): not a model output — this copies the actual target-snapshot
# value from ytest. Confirm that using the known value here is intended.
pred_FIELD_31 = ytest.FIELD_31_y

In [None]:
# NOTE(review): not a model output — this copies the actual target-snapshot
# value from ytest. Confirm that using the known value here is intended.
pred_SEX = ytest.SEX_y

In [None]:
# Predicted next-year age = this-year age + 1.
# The previous `ytest.AGE_y + 1` added a year to a value that is already the
# next-year age, so P_AGE ended up one year ahead of the actual AGE_y.
# Assumes the target snapshot is taken one year after the input snapshot, as
# the "NextYear" naming suggests — TODO confirm.
pred_AGE = xtest.AGE_x + 1

Combine the predicted values to make dataframe
====

In [None]:
# Assemble the predicted next-year frame for the test split. `P_` columns hold
# predicted / carried-over values and `A_` columns hold actual values.
# `.values` drops the pandas index so every column lines up positionally with
# the prediction arrays. Each column is assigned exactly once (the earlier
# version assigned P_AGE three times).
NextYearData = pd.DataFrame(
    {
        "P_FIELD_1": ytest.FIELD_1_y.values,
        "P_FIELD_2": ytest.FIELD_2_y.values,
        "P_L104600": pred_L104600,
        "P_L103000": pred_L103000,
        "P_S000300": pred_S000300,
        "P_L101700": pred_L101700,
        "P_L100700": pred_L100700,
        "P_FIELD_33": pred_FIELD_33.values,
        "P_FIELD_38": pred_FIELD_38.values,
        "P_FIELD_40": pred_FIELD_40.values,
        "P_FIELD_31": pred_FIELD_31.values,
        "P_SEX": pred_SEX.values,
        "P_AGE": pred_AGE.values,
        "A_L100800": ytest.L100800_y.values,
    }
)

# Actual class from the actual next-year L100800 value: 0 for < 100,
# 1 for 100 to < 126, 2 for >= 126. Rows matching no condition (e.g. NaN)
# fall back to class 0.
conditions = [
    (NextYearData.A_L100800 < 100),
    (NextYearData.A_L100800 >= 100) & (NextYearData.A_L100800 < 126),
    (NextYearData.A_L100800 >= 126),
]
choices = [0, 1, 2]
NextYearData["A_CLASS"] = np.select(conditions, choices, default=0)
NextYearData.head()

In [None]:
# Write the predicted next-year frame as comma-separated text in the working
# directory. The row index is written too (pandas default), as an unnamed
# first column.
NextYearData.to_csv("Predicted_NextYearData.txt", sep=",")

==============================================
==========

In [None]:
# Actual next-year values for the test split, stored under un-suffixed names.
_actual_columns = [
    "FIELD_1",
    "FIELD_2",
    "L104600",
    "L103000",
    "S000300",
    "L101700",
    "L100700",
    "FIELD_33",
    "FIELD_38",
    "FIELD_40",
    "FIELD_31",
    "SEX",
    "AGE",
    "L100800",
]
# `.values` drops the pandas index, so the new frame gets a fresh 0..n-1 index.
NextYearData_actualData = pd.DataFrame(
    {name: ytest[name + "_y"].values for name in _actual_columns}
)

# Class label from L100800: 0 for < 100, 1 for 100 to < 126, 2 for >= 126;
# rows matching no condition fall back to 0.
_glucose = NextYearData_actualData["L100800"]
conditions = [
    _glucose < 100,
    _glucose.ge(100) & _glucose.lt(126),
    _glucose >= 126,
]
choices = [0, 1, 2]
NextYearData_actualData["CLASS"] = np.select(conditions, choices, default=0)
NextYearData_actualData.head()

NextYearData_actualData.to_csv("NextYearData_actualData_test.txt", sep=",")

==========================================================
===

In [None]:
# Actual this-year values for the test split, stored under un-suffixed names.
_actual_columns = [
    "FIELD_1",
    "FIELD_2",
    "L104600",
    "L103000",
    "S000300",
    "L101700",
    "L100700",
    "FIELD_33",
    "FIELD_38",
    "FIELD_40",
    "FIELD_31",
    "SEX",
    "AGE",
    "L100800",
]
# `.values` drops the pandas index, so the new frame gets a fresh 0..n-1 index.
ThisYearData_actualData = pd.DataFrame(
    {name: xtest[name + "_x"].values for name in _actual_columns}
)

# Class label from L100800: 0 for < 100, 1 for 100 to < 126, 2 for >= 126;
# rows matching no condition fall back to 0.
_glucose = ThisYearData_actualData["L100800"]
conditions = [
    _glucose < 100,
    _glucose.ge(100) & _glucose.lt(126),
    _glucose >= 126,
]
choices = [0, 1, 2]
ThisYearData_actualData["CLASS"] = np.select(conditions, choices, default=0)
ThisYearData_actualData.head()

ThisYearData_actualData.to_csv("ThisYearData_actualData_test.txt", sep=",")

============================================
=========

In [None]:
# Actual next-year values for the training split, stored under un-suffixed names.
_actual_columns = [
    "FIELD_1",
    "FIELD_2",
    "L104600",
    "L103000",
    "S000300",
    "L101700",
    "L100700",
    "FIELD_33",
    "FIELD_38",
    "FIELD_40",
    "FIELD_31",
    "SEX",
    "AGE",
    "L100800",
]
# `.values` drops the pandas index, so the new frame gets a fresh 0..n-1 index.
NextYearData_actualData_train = pd.DataFrame(
    {name: ytrain[name + "_y"].values for name in _actual_columns}
)

# Class label from L100800: 0 for < 100, 1 for 100 to < 126, 2 for >= 126;
# rows matching no condition fall back to 0.
_glucose = NextYearData_actualData_train["L100800"]
conditions = [
    _glucose < 100,
    _glucose.ge(100) & _glucose.lt(126),
    _glucose >= 126,
]
choices = [0, 1, 2]
NextYearData_actualData_train["CLASS"] = np.select(conditions, choices, default=0)
NextYearData_actualData_train.head()

NextYearData_actualData_train.to_csv("NextYearData_actualData_train.txt", sep=",")

=================================================
==============

In [None]:
# Actual this-year values for the training split, stored under un-suffixed names.
_actual_columns = [
    "FIELD_1",
    "FIELD_2",
    "L104600",
    "L103000",
    "S000300",
    "L101700",
    "L100700",
    "FIELD_33",
    "FIELD_38",
    "FIELD_40",
    "FIELD_31",
    "SEX",
    "AGE",
    "L100800",
]
# `.values` drops the pandas index, so the new frame gets a fresh 0..n-1 index.
ThisYearData_actualData_train = pd.DataFrame(
    {name: xtrain[name + "_x"].values for name in _actual_columns}
)

# Class label from L100800: 0 for < 100, 1 for 100 to < 126, 2 for >= 126;
# rows matching no condition fall back to 0.
_glucose = ThisYearData_actualData_train["L100800"]
conditions = [
    _glucose < 100,
    _glucose.ge(100) & _glucose.lt(126),
    _glucose >= 126,
]
choices = [0, 1, 2]
ThisYearData_actualData_train["CLASS"] = np.select(conditions, choices, default=0)
ThisYearData_actualData_train.head()

ThisYearData_actualData_train.to_csv("ThisYearData_actualData_train.txt", sep=",")