In [3]:
%load_ext nb_black
import numpy as np
import pandas as pd
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics as m
import xgboost as xgb
import seaborn as sns
from imblearn.over_sampling import SMOTE  # doctest: +NORMALIZE_WHITESPACE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
from numpy.random import RandomState
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTENC

warnings.filterwarnings("ignore")
random_seed = 8

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [4]:
data1 = pd.read_csv("../NextYearData_actualData_train.txt")
data2 = pd.read_csv("../ThisYearData_actualData_train.txt")
data3 = pd.read_csv("../ThisYearData_actualData_test.txt")

data4 = pd.read_csv("../NextYearData_actualData_test.txt")
data5 = pd.read_csv("../Predicted_NextYearData.txt")

<IPython.core.display.Javascript object>

In [5]:
data = pd.read_csv("../../sep19SexAndAgeAddedFINAL DATASET_ver2.txt")

# filter the data set
data = data[data.FIELD_16 != 1]  # exclude people who are diagnosed for (diabetes)
data = data[data.FIELD_23 != 1]  # exclude people who are on medication for diabetes

data = data[
    data.FIELD_15 != 1
]  # exclude people who are diagnosed for (high blood pressure)
data = data[
    data.FIELD_22 != 1
]  # exclude people who are on medication for high blood pressure

data = data[data.FIELD_17 != 1]  # exclude people who are diagnosed for hyperlipidemia
data = data[
    data.FIELD_24 != 1
]  # exclude people who are on medication for hyperlipidemia


conditions = [
    (data.L100800 < 100),
    (data.L100800 >= 100) & (data.L100800 < 126),
    (data.L100800 >= 126),
]
choices = [0, 1, 2]
data["CLASS"] = np.select(conditions, choices, default=0)

<IPython.core.display.Javascript object>

In [7]:
maindata = data.copy()

maindata = maindata[
    [
        "FIELD_1",
        "FIELD_2",
        "L104600",
        "L103000",
        "S000300",
        "L101700",
        "L100700",
        "FIELD_33",
        "FIELD_38",
        "FIELD_40",
        "FIELD_31",
        "SEX",
        "AGE",
        "L100800",
        "CLASS",
    ]
]
maindata["index"] = maindata.index
print(maindata.shape)

(466680, 16)


<IPython.core.display.Javascript object>

In [8]:
# exclude data points which are going to be used for trainig

print(data4.shape)

temp = pd.merge(
    maindata[["index", "FIELD_1", "FIELD_2"]],
    data4[["FIELD_1", "FIELD_2"]],
    how="inner",
    left_on=["FIELD_1", "FIELD_2"],
    right_on=["FIELD_1", "FIELD_2"],
)

print(temp.shape)

maindata = maindata.drop(temp["index"])
maindata = maindata.dropna()
maindata.shape

(12458, 16)
(12092, 3)


(130632, 16)

<IPython.core.display.Javascript object>

In [9]:
maindata.groupby(by="CLASS").size()

CLASS
0    95662
1    32739
2     2231
dtype: int64

<IPython.core.display.Javascript object>

In [13]:
diabetic = maindata[maindata.CLASS == 2]
preddiabetic = maindata[maindata.CLASS == 1].sample(
    10000, random_state=42
)  # .sample(diabetic.shape[0], random_state=42)
normal = maindata[maindata.CLASS == 0].sample(10000, random_state=42)
trainpd = pd.concat([diabetic, preddiabetic, normal])
trainpd = trainpd[
    [
        "L104600",
        "L103000",
        "S000300",
        "L101700",
        "L100700",
        "FIELD_33",
        "FIELD_38",
        "FIELD_40",
        "FIELD_31",
        "SEX",
        "AGE",
        "CLASS",
    ]
]
x = trainpd[
    [
        "L104600",
        "L103000",
        "S000300",
        "L101700",
        "L100700",
        "FIELD_33",
        "FIELD_38",
        "FIELD_40",
        "FIELD_31",
        "SEX",
        "AGE",
    ]
]

y = trainpd[["CLASS"]]

<IPython.core.display.Javascript object>

In [14]:
# trainpd.to_csv("training_set_2231_2231_2231.txt", sep=",")

<IPython.core.display.Javascript object>

In [12]:
_d = data4[data4.CLASS == 2]
_pd = data4[data4.CLASS == 1].sample(_d.shape[0], random_state=42)
_n = data4[data4.CLASS == 0].sample(_d.shape[0], random_state=42)


testpd = pd.concat([_d, _pd, _n])

testpd = testpd[
    [
        "L104600",
        "L103000",
        "S000300",
        "L101700",
        "L100700",
        "FIELD_33",
        "FIELD_38",
        "FIELD_40",
        "FIELD_31",
        "SEX",
        "AGE",
        "CLASS",
    ]
]

print(_d.shape, _pd.shape, _n.shape)

xtest = testpd[
    [
        "L104600",
        "L103000",
        "S000300",
        "L101700",
        "L100700",
        "FIELD_33",
        "FIELD_38",
        "FIELD_40",
        "FIELD_31",
        "SEX",
        "AGE",
    ]
]
ytest = testpd.CLASS

testpd.to_csv("test_set.txt", sep=",")

(236, 16) (236, 16) (236, 16)


<IPython.core.display.Javascript object>

Generate synthetic dataset 
===

In [15]:
# generate synthetic dataset to overcome class imbalance

sm = SMOTENC(random_state=42, categorical_features=[5, 6, 7, 8])
X_res, y_res = sm.fit_resample(x, y)
print("Resampled dataset samples per class {}".format(Counter(y_res)))

Resampled dataset samples per class Counter({2: 10000, 1: 10000, 0: 10000})


<IPython.core.display.Javascript object>

In [16]:
temp = pd.DataFrame(X_res)
temp.columns = x.columns
temp["CLASS"] = y_res
temp.tail()

Unnamed: 0,L104600,L103000,S000300,L101700,L100700,FIELD_33,FIELD_38,FIELD_40,FIELD_31,SEX,AGE,CLASS
29995,6.794957,166.036652,26.902443,59.530543,3.85672,2.0,2.0,0.0,0.0,0.0,46.481674,2
29996,7.54871,345.330811,26.070456,57.237375,5.7899,1.0,3.0,3.0,0.0,0.0,50.050501,2
29997,6.052311,47.459136,16.946018,13.934409,3.293441,1.0,1.0,0.0,0.0,1.0,43.196773,2
29998,6.227596,161.0,23.103716,19.23048,4.59492,1.0,0.0,0.0,0.0,1.0,53.82032,2
29999,6.578945,105.087072,23.226388,26.174144,5.382586,1.0,0.0,4.0,0.0,0.0,58.630608,2


<IPython.core.display.Javascript object>

In [17]:
temp.to_csv("g_training_set_10000_10000_10000.txt", sep=",")

<IPython.core.display.Javascript object>

In [18]:
temp.shape

(30000, 12)

<IPython.core.display.Javascript object>

In [19]:
gx = temp[
    [
        "L104600",
        "L103000",
        "S000300",
        "L101700",
        "L100700",
        "FIELD_33",
        "FIELD_38",
        "FIELD_40",
        "FIELD_31",
        "SEX",
        "AGE",
    ]
]
gy = temp.CLASS

<IPython.core.display.Javascript object>

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=5,
    max_features="auto",
)
rf.fit(x, y)

In [None]:
from matplotlib import pyplot as plt

ypred = rf.predict(xtest)

score = rf.score(xtest, ytest)
print(score)

feat_importances = pd.Series(rf.feature_importances_, index=xtest.columns)
feat_importances.nlargest(16).plot(kind="barh")
plt.show()

print(feat_importances.nlargest(16).index)

In [None]:
print("Accuracy on training set: {:.3f}".format(rf.score(x, y)))
print("Accuracy on test set: {:.3f}".format(rf.score(xtest, ytest)))

confmatrx = pd.DataFrame(m.confusion_matrix(ytest, ypred))
confmatrx.head()

In [None]:
from sklearn.metrics import classification_report

print(classification_report(ytest, ypred))