In [27]:
import os
from tqdm import tqdm
import json
import gc
import numpy as np
import pandas as pd
from sklearn.preprocessing import *
from sklearn.svm import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import *
from sklearn.metrics import *

pd.options.display.max_columns = 999

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
data = pd.read_sas('./master.sas7bdat')
gc.collect()

250

In [3]:
target_var = 'censor_po'
varstodrop = ['fuyrs_po']

data.drop(varstodrop,axis=1, inplace=True)

In [4]:
cat_vars = [ele for ele in data.columns.tolist() if data[ele].dtype=="O"]
print(cat_vars)

['raceclass']


In [6]:
def label_encoding(input_data, col):
    try:
        print("Label encoding {}...".format(col))
        le = LabelEncoder()
        output = le.fit_transform(input_data[col])
        print("Finished.")
        return output
    except Exception as e:
        _err_msg = "Failed in label encoding. Error: {}".format(e)
        print(_err_msg); raise

In [8]:
encoded_data = label_encoding(data, 'raceclass')
data['raceclass'] = encoded_data

Label encoding raceclass...
Finished.


In [11]:
print("make sure no missing values")
missing_vals = data.isnull().sum()
print(missing_vals)

make sure no missing values
female             0
baseline_age       0
cvd_hx_baseline    0
raceclass          0
edu                0
yrsdiab            0
ulcer              0
protein            0
hartfail           0
neuropat           0
depressn           0
eyedisea           0
histhart           0
cigarett           0
alcohol            0
wt_kg              0
ht_cm              0
waist_cm           0
ffilam             0
MNSIscor           0
feeling            0
censor_po          0
sbp                0
dbp                0
hr                 0
potassium          0
a2rb               0
acei               0
beta_blocker       0
sulfonylurea       0
                  ..
aspirin            0
anti_depress       0
hba1c              0
chol               0
trig               0
vldl               0
ldl                0
hdl                0
fpg                0
alt                0
cpk                0
screat             0
gfr                0
ualb               0
ucreat             0
uacr  

In [15]:
print("split training and validation data")
train, test = train_test_split(data, test_size=0.3, random_state=2019)

split training and validation data


In [20]:
print("Check target variable ratio")
print("For training:")
print(train[target_var].value_counts() / train.shape[0])

print("For test:")
print(test[target_var].value_counts() / test.shape[0])
print("They are very close.")

Check target variable ratio
For training:
1.0    0.811427
0.0    0.188573
Name: censor_po, dtype: float64
For test:
1.0    0.809392
0.0    0.190608
Name: censor_po, dtype: float64
They are very close.


### Logictic regression

In [28]:
def validate(truth, preds, threshold=0.5):
    bin_preds = [1 if ele >= threshold else 0 for ele in preds]
    _accurary_score = accuracy_score(truth, bin_preds)
    _precision_score = precision_score(truth, bin_preds)
    _recall_score = recall_score(truth, bin_preds)
    _auc_score = roc_auc_score(truth, preds)
    
    print("Accuracy: {}".format(_accurary_score))
    print("Precision: {}".format(_precision_score))
    print("Recall: {}".format(_recall_score))
    print("AUC: {}".format(_auc_score))

In [21]:
print("Play with feature tuning :)")

Play with feature tuning :)


In [35]:
lr = LogisticRegression(random_state=2019, max_iter=200, n_jobs=-1)

lr.fit(train.drop([target_var], axis=1), train[target_var])

lr_tra_preds = lr.predict_proba(train.drop([target_var], axis=1))[:, 1].squeeze()
lr_tst_preds = lr.predict_proba(test.drop([target_var], axis=1))[:, 1].squeeze()

  " = {}.".format(effective_n_jobs(self.n_jobs)))


In [36]:
print("For training: ")
validate(train[target_var], lr_tra_preds)

print("For testing: ")
validate(test[target_var], lr_tst_preds)

For training: 
Accuracy: 0.8179396092362344
Precision: 0.8264742014742015
Recall: 0.9817584823057278
AUC: 0.7317717983272787
For testing: 
Accuracy: 0.8080110497237569
Precision: 0.8197424892703863
Recall: 0.9778156996587031
AUC: 0.6624622842162536
