In [493]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import Imputer
from sklearn.metrics import roc_auc_score
import matplotlib
import matplotlib.pyplot as plt

In [494]:
def logreg_auc(variables, target, basetable):
    """Fit a logistic regression on the given columns and return its AUC.

    The model is fitted and scored on the same rows of ``basetable``, so the
    result is an in-sample AUC.

    Parameters
    ----------
    variables : list
        Column labels of ``basetable`` used as predictors.
    target : label or list of labels
        Column(s) of ``basetable`` holding the outcome; the selected values
        are flattened to one dimension before fitting.
    basetable : pandas.DataFrame
        Table containing both the predictor and the target columns.

    Returns
    -------
    The value returned by ``roc_auc_score`` for the fitted model's scores.
    """
    features = basetable[variables]
    outcome = basetable[target].values.ravel()
    model = linear_model.LogisticRegression().fit(features, outcome)
    # Keep only the second probability column (index 1) as the score.
    scores = model.predict_proba(features)[:, 1]
    return roc_auc_score(outcome, scores)

def next_best(current_variables,candidate_variables, target, basetable):
    """Pick the candidate that gives the highest AUC when added to the model.

    Each candidate is tried in turn by scoring ``current_variables + [candidate]``
    with ``logreg_auc``. On ties the candidate that appears later in
    ``candidate_variables`` wins, because the comparison is ``>=``.

    Parameters
    ----------
    current_variables : list
        Variables already selected.
    candidate_variables : iterable
        Variables still available for selection.
    target, basetable
        Passed through unchanged to ``logreg_auc``.

    Returns
    -------
    The winning candidate, or ``None`` when ``candidate_variables`` is empty.
    """
    winner, winner_auc = None, -1
    for candidate in candidate_variables:
        trial_auc = logreg_auc(current_variables + [candidate], target, basetable)
        if trial_auc >= winner_auc:
            winner, winner_auc = candidate, trial_auc
    return winner

In [495]:
# Load the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("data/data.csv")

In [496]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0.1,Unnamed: 0,YEAR,SERIAL,DUID,PID,MEPSID,PANEL,PSUANN,STRATANN,PSUPLD,...,HEARTCONEV,HYPERTENEV,STROKEAGE,STROKEV,HYPERTENAGE,CHEARTDIAGE,HEARTATTAGE,CHOLHIGHAGE,DIABETICAGE,HPTOTNIGHT
0,1,2010,1,10007,101,1000710115,15,2,1024,2.0,...,1,2,0,1,24,96,96,27,96,0
1,2,2010,1,10007,102,1000710215,15,2,1024,2.0,...,1,1,0,1,96,96,96,96,96,0
2,3,2010,1,10007,103,1000710315,15,2,1024,2.0,...,0,0,0,0,96,96,96,96,96,0
3,4,2010,1,10007,104,1000710415,15,2,1024,2.0,...,0,0,0,0,96,96,96,96,96,0
4,5,2010,2,10008,101,1000810115,15,1,1021,1.0,...,1,1,0,1,96,96,96,96,96,0


In [497]:
# Split the column names by dtype: object/bool columns are treated as
# categorical, int64/float64 columns as numeric.
cat_features = [
    name
    for name, dtype in zip(data.columns, data.dtypes)
    if (dtype == "object" or dtype == "bool")
]
numeric_features = [
    name
    for name, dtype in zip(data.columns, data.dtypes)
    if dtype in ["int64", "float64"]
]

In [498]:
# Show which columns were classified as categorical.
cat_features

[]

In [499]:
#numeric_features

In [500]:
# Number of rows with a non-zero DIABWEIGHT.
# The column is excluded from the candidates because too many values are 0.
(data['DIABWEIGHT'] != 0).sum()

15102

In [501]:
# Inspect the distinct MARSTAT codes.
data['MARSTAT'].unique()
# TODO: combine codes 0 and 99 into a single category

array([10,  0, 50, 30, 99, 20, 40], dtype=int64)

In [502]:
# Inspect the distinct EDUC codes.
data['EDUC'].unique()

array([604, 401,   0, 603, 601, 109, 999, 301, 201, 302, 103, 101, 998,
       203, 105, 500, 202, 107, 204, 996, 108, 106, 102, 104, 997, 402,
       403, 602, 100, 200, 600, 300], dtype=int64)

In [503]:
# Inspect the distinct EDUCYR values.
data['EDUCYR'].unique()
# TODO: check whether the correlation is high enough to keep only one of the two
# education columns (presumably EDUC and EDUCYR — confirm)

array([14., 96., 17.,  8., 99., 12.,  9., 11.,  2.,  0., 98.,  4., 16.,
       10., 13.,  6., 15.,  7.,  5.,  1.,  3., 97., nan])

In [504]:
# Columns considered as predictors.
candidate_variables = [
    'YEAR', 'PERWEIGHT', 'SAQWEIGHT', 'AGE',
    'SEX', 'MARSTAT', 'BIRTHYR', 'EDUCYR',
    'HIDEG', 'INCTOT', 'FTOTVAL', 'DIABETICEV',
]

In [505]:
# Name of the column used as the prediction target.
target_variable = "HEARTATTEV"

In [506]:
# Turn the target into a binary 0/1 label:
#   * keep only the rows whose target code is 1 or 2 (selected directly, instead
#     of dropping the rows of a mask negated with unary minus),
#   * renumber the index 0..n-1,
#   * recode 1 -> 0 and 2 -> 1 with one mapping, so the result does not depend on
#     the order of two separate assignments.
# NOTE: this cell overwrites `data` and is not idempotent. Running it again on the
# already recoded column would keep only the rows equal to 1 and recode them to 0.
# Re-run the cell that loads `data` before re-running this one.
data = data[data[target_variable].isin([1, 2])].reset_index(drop = True)
data[target_variable] = data[target_variable].map({1: 0, 2: 1})

In [507]:
# Split the frame into the predictor columns and the target column.
train = data[candidate_variables]
Label = data[target_variable]

In [508]:
# Preview the first ten target values.
Label.head(10)

0    0
1    0
2    0
3    0
4    0
5    0
6    1
7    0
8    0
9    0
Name: HEARTATTEV, dtype: int64

In [511]:
# Number of rows and predictor columns in the training frame.
train.shape

(179785, 12)

In [510]:
# Report every column of `train` that contains non-finite entries (NaN or inf):
# print the column position and name, then the number of such entries.
train_values = train.values  # materialise the array once instead of once per column
for j, column_name in enumerate(train.columns):
    n_not_finite = np.sum(~np.isfinite(train_values[:, j]))
    if n_not_finite > 0:
        print(j, column_name)
        print(n_not_finite)

7 EDUCYR
26499
8 HIDEG
51560


In [512]:
# Replace missing values with the column mean, then wrap the imputed values in a
# DataFrame again, reusing the original column names.
# NOTE(review): `Imputer` is the legacy scikit-learn API; newer releases provide
# sklearn.impute.SimpleImputer instead — confirm the installed version before upgrading.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
train_imp = imp.fit(train).transform(train)
train = pd.DataFrame(train_imp, columns=train.columns)

In [513]:
# `basetable` is another name for the same object as `train` (no copy is made),
# so adding the "Label" column here also adds it to `train`.
basetable = train
basetable["Label"] = Label

In [515]:
# Baseline: logistic regression on AGE alone, scored on the same rows it was
# fitted on. The `logreg` estimator created here is refitted by a later cell.
X = basetable[["AGE"]]
y = basetable["Label"]
logreg = linear_model.LogisticRegression().fit(X, y)
predictions = logreg.predict_proba(X)[:, 1]
auc = roc_auc_score(y, predictions)

In [518]:
# Forward selection: repeatedly add the candidate that gives the highest AUC,
# up to `max_number_variables` variables.
# NOTE: each chosen variable is removed from `candidate_variables` in place, so
# re-running this cell without re-running the cell that defines the candidate
# list starts from a shorter list and gives a different result.
max_number_variables = 5
number_iterations = min(max_number_variables, len(candidate_variables))
current_variables = []
for _step in range(number_iterations):
    next_var = next_best(current_variables, candidate_variables, ["Label"], basetable)
    current_variables.append(next_var)
    candidate_variables.remove(next_var)
print(current_variables)

['BIRTHYR', 'DIABETICEV', 'YEAR', 'SEX', 'HIDEG']


In [521]:
# Refit the existing `logreg` estimator on the selected variables; as the last
# expression of the cell, the value returned by fit() is displayed.
logreg.fit(basetable[current_variables],Label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [522]:
# Score `logreg` on the selected columns of `basetable` and compute the AUC
# against the "Label" column of the same table.
predictions = logreg.predict_proba(basetable[current_variables])[:,1]
auc = roc_auc_score(basetable["Label"], predictions)

In [523]:
# Display the AUC of the selected-variable model.
print(auc)

0.8429913306560539
