In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
import seaborn as sns
sns.set(rc={'figure.figsize':(10,8)})

In [2]:
data=pd.read_csv("Processed-nhanes3.csv")

In [3]:
data.columns

Index(['agestrat', 'sex', 'ses', 'hf', 'stroke', 'mi', 'fast', 'metabolic',
       'mortstat', 'race_hispanic', 'race_multiracial',
       'race_non hispanic black', 'race_non hispanic white'],
      dtype='object')

In [4]:
data.head()

Unnamed: 0,agestrat,sex,ses,hf,stroke,mi,fast,metabolic,mortstat,race_hispanic,race_multiracial,race_non hispanic black,race_non hispanic white
0,0,1,1,0,0,0,0,0,alive,0,0,1,0
1,0,0,0,0,0,0,0,0,alive,0,0,1,0
2,1,0,0,0,0,0,0,0,alive,1,0,0,0
3,1,1,0,0,0,0,0,0,alive,1,0,0,0
4,1,1,0,0,0,0,1,0,alive,0,0,1,0


#### Removing Age 

In [5]:
data['agestrat'].value_counts()

0    4089
1    3507
3    3264
2    2574
Name: agestrat, dtype: int64

In [6]:
for i in range(4):
    count = data[data['agestrat'] == i]['mortstat'].value_counts()
    print(f'| {i:1d} | Alive : {count["alive"]:7.2f} | Dead: {count["dead"]:7.2f} |')

| 0 | Alive : 3969.00 | Dead:  120.00 |
| 1 | Alive : 3222.00 | Dead:  285.00 |
| 2 | Alive : 1937.00 | Dead:  637.00 |
| 3 | Alive :  964.00 | Dead: 2300.00 |


In [7]:
data1 = data[(data['agestrat'] == 3) & (data['mortstat'] == 'dead')].sample(964)

In [8]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 964 entries, 2727 to 1558
Data columns (total 13 columns):
agestrat                   964 non-null int64
sex                        964 non-null int64
ses                        964 non-null int64
hf                         964 non-null int64
stroke                     964 non-null int64
mi                         964 non-null int64
fast                       964 non-null int64
metabolic                  964 non-null int64
mortstat                   964 non-null object
race_hispanic              964 non-null int64
race_multiracial           964 non-null int64
race_non hispanic black    964 non-null int64
race_non hispanic white    964 non-null int64
dtypes: int64(12), object(1)
memory usage: 105.4+ KB


In [9]:
data = pd.concat([data[(data['agestrat'] == 3) & (data['mortstat'] == 'alive')], data1], ignore_index=True)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1928 entries, 0 to 1927
Data columns (total 13 columns):
agestrat                   1928 non-null int64
sex                        1928 non-null int64
ses                        1928 non-null int64
hf                         1928 non-null int64
stroke                     1928 non-null int64
mi                         1928 non-null int64
fast                       1928 non-null int64
metabolic                  1928 non-null int64
mortstat                   1928 non-null object
race_hispanic              1928 non-null int64
race_multiracial           1928 non-null int64
race_non hispanic black    1928 non-null int64
race_non hispanic white    1928 non-null int64
dtypes: int64(12), object(1)
memory usage: 195.9+ KB


In [11]:
data['mortstat'].value_counts()

alive    964
dead     964
Name: mortstat, dtype: int64

In [12]:
# sns.heatmap(data.corr(), cmap=sns.diverging_palette(220, 10, as_cmap=True),square=True)

#### Feature selection

In [13]:
Y = data['mortstat']

In [14]:
del data['mortstat']

In [15]:
X = data

In [16]:
X.shape

(1928, 12)

In [17]:
Y = np.ravel(Y)

In [18]:
Y = np.array(Y == 'alive', dtype=int)

In [19]:
# xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.20, random_state=1)

In [20]:
X.shape

(1928, 12)

In [21]:
lr = LogisticRegression(penalty='l1', C=0.06, solver='saga')

In [22]:
lr.fit(X,Y)

LogisticRegression(C=0.06, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [23]:
lr.coef_

array([[ 0.        , -0.41482026, -0.11418146, -0.19317142, -0.56730156,
        -0.4061759 , -0.15855221,  0.        , -0.1381529 ,  0.        ,
         0.        ,  0.        ]])

In [24]:
sfm = SelectFromModel(lr)
sfm.fit(X,Y)

SelectFromModel(estimator=LogisticRegression(C=0.06, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=None)

In [25]:
H_columns = X.columns[sfm.get_support()]

In [26]:
H = X[H_columns]

In [27]:
H.head()

Unnamed: 0,sex,ses,hf,stroke,mi,fast,race_hispanic
0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,1,0,0,0,0,0


In [28]:
H.shape

(1928, 7)

In [29]:
X_T = X[[x for x in X.columns if x != 'metabolic']]

In [30]:
X_T.head()

Unnamed: 0,agestrat,sex,ses,hf,stroke,mi,fast,race_hispanic,race_multiracial,race_non hispanic black,race_non hispanic white
0,3,0,0,0,0,0,0,1,0,0,0
1,3,0,0,0,0,0,0,0,0,1,0
2,3,0,0,0,0,0,0,1,0,0,0
3,3,0,0,0,0,0,0,1,0,0,0
4,3,0,1,0,0,0,0,0,0,0,1


In [31]:
# xtrain, xtest, ytrain, ytest = train_test_split(X_T, Y, test_size=0.20, random_state=1)

In [32]:
lr.fit(X_T,data['metabolic'])

LogisticRegression(C=0.06, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [33]:
lr.coef_

array([[ 0.        , -0.08454058,  0.        ,  0.        ,  0.        ,
         0.        ,  0.2909827 ,  0.        ,  0.        ,  0.14619004,
         0.        ]])

In [34]:
sfm.fit(X_T,data['metabolic'])

SelectFromModel(estimator=LogisticRegression(C=0.06, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=None)

In [35]:
K_columns = X_T.columns[sfm.get_support()]

In [36]:
K = X_T[K_columns]

In [37]:
K.head()

Unnamed: 0,sex,fast,race_non hispanic black
0,0,0,0
1,0,0,1
2,0,0,0
3,0,0,0
4,0,0,0


### Regressing Y on H

In [38]:
# xtrain, xtest, ytrain, ytest = train_test_split(H, Y, test_size=0.20, random_state=1)

In [39]:
clf = AdaBoostClassifier()

In [40]:
clf.fit(H,Y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [41]:
Y_prob = clf.predict_proba(H)

#### Y Residual

In [42]:
Y[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [43]:
Y_prob[:10,1]

array([0.50219761, 0.50426774, 0.50219761, 0.50219761, 0.50204782,
       0.50138893, 0.50426774, 0.50219761, 0.50219761, 0.49931871])

In [44]:
Y_res = np.abs(Y - Y_prob[:,1])

#### T Residual

In [45]:
clf.fit(K,data['metabolic'])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [46]:
T_prob = clf.predict_proba(K)

In [47]:
T_prob[:10,1]

array([0.49551286, 0.4975955 , 0.49551286, 0.49551286, 0.49551286,
       0.49629114, 0.49551286, 0.49551286, 0.49551286, 0.4942086 ,
       0.4942086 , 0.4942086 , 0.4975955 , 0.4942086 , 0.4942086 ,
       0.49551286, 0.4942086 , 0.49551286, 0.49551286, 0.4942086 ,
       0.49551286, 0.4942086 , 0.4942086 , 0.49551286, 0.4942086 ,
       0.49629114, 0.49551286, 0.4942086 , 0.49551286, 0.49551286,
       0.4942086 , 0.49551286, 0.49745129, 0.49551286, 0.49551286,
       0.49551286, 0.4942086 , 0.49551286, 0.4942086 , 0.4942086 ,
       0.49551286, 0.4942086 , 0.49551286, 0.49551286, 0.4942086 ,
       0.49551286, 0.49629114, 0.4942086 , 0.4942086 , 0.4942086 ,
       0.49629114, 0.49551286, 0.4942086 , 0.50083841, 0.49551286,
       0.4942086 , 0.49551286, 0.49551286, 0.4942086 , 0.49551286,
       0.49745129, 0.4942086 , 0.49551286, 0.4942086 , 0.49551286,
       0.49629114, 0.49551286, 0.4942086 , 0.49629114, 0.49551286,
       0.4942086 , 0.4975955 , 0.49551286, 0.4942086 , 0.49551

In [48]:
data['metabolic'].values - T_prob[:,1]

array([-0.49551286, -0.4975955 , -0.49551286, ..., -0.4942086 ,
       -0.49629114,  0.5057914 ])

In [49]:
T_res = np.abs(data['metabolic'].values - T_prob[:,1])

### Regressing Y_res on T_res

In [50]:
Y_res[:10]

array([0.49780239, 0.49573226, 0.49780239, 0.49780239, 0.49795218,
       0.49861107, 0.49573226, 0.49780239, 0.49780239, 0.50068129])

In [51]:
T_res[:10]

array([0.49551286, 0.4975955 , 0.49551286, 0.49551286, 0.49551286,
       0.49629114, 0.49551286, 0.49551286, 0.49551286, 0.4942086 ])

In [52]:
from sklearn.linear_model import LinearRegression

In [53]:
clf = LinearRegression(fit_intercept=False)

In [54]:
T_res = T_res - np.mean(T_res)

In [55]:
clf.fit(T_res.reshape(-1,1),Y_res)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
         normalize=False)

In [56]:
clf.coef_

array([-0.01093892])

In [57]:
clf.intercept_

0.0

In [58]:
T_res.mean()

9.700055420026329e-17