###Part 1: Building the model

In [1]:
import numpy as np
import pandas as pd
from collections import OrderedDict
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('https://github.com/ArinB/CA05-B-Logistic-Regression/raw/master/cvd_data.csv')

In [3]:
df

Unnamed: 0,cvd_4types,age_s1,race,educat,mstat,hip,neck20,waist,av_weight_kg,cgpkyr,tea15,srhype,parrptdiab,bend25,happy25,tired25,hlthlm25
0,0,54,1,2,1,110.0,40.0,108.0,87.5,34.0,0,1,0,1,2,3,4
1,0,56,3,2,1,113.0,34.0,107.0,83.5,0.0,0,0,0,2,2,1,3
2,0,54,1,3,1,110.0,44.5,105.0,86.2,49.5,0,0,0,3,2,6,4
3,0,54,1,3,1,129.0,42.5,110.0,89.1,0.0,0,0,0,3,2,1,3
4,0,51,3,2,1,122.0,37.0,113.0,81.3,0.0,0,0,0,2,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3237,1,66,1,2,1,95.0,41.5,99.0,88.2,0.0,0,1,0,2,2,5,4
3238,1,54,1,3,1,99.0,34.9,99.0,83.3,30.0,0,0,0,3,3,3,4
3239,1,55,1,4,1,101.0,37.0,91.0,75.2,0.0,0,0,0,3,2,5,4
3240,1,53,1,2,1,98.0,39.0,93.0,79.0,0.0,0,0,0,2,2,5,4


In [None]:
x = df.drop(['cvd_4types'], axis=1)

In [None]:
x

Unnamed: 0,age_s1,race,educat,mstat,hip,neck20,waist,av_weight_kg,cgpkyr,tea15,srhype,parrptdiab,bend25,happy25,tired25,hlthlm25
0,54,1,2,1,110.0,40.0,108.0,87.5,34.0,0,1,0,1,2,3,4
1,56,3,2,1,113.0,34.0,107.0,83.5,0.0,0,0,0,2,2,1,3
2,54,1,3,1,110.0,44.5,105.0,86.2,49.5,0,0,0,3,2,6,4
3,54,1,3,1,129.0,42.5,110.0,89.1,0.0,0,0,0,3,2,1,3
4,51,3,2,1,122.0,37.0,113.0,81.3,0.0,0,0,0,2,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3237,66,1,2,1,95.0,41.5,99.0,88.2,0.0,0,1,0,2,2,5,4
3238,54,1,3,1,99.0,34.9,99.0,83.3,30.0,0,0,0,3,3,3,4
3239,55,1,4,1,101.0,37.0,91.0,75.2,0.0,0,0,0,3,2,5,4
3240,53,1,2,1,98.0,39.0,93.0,79.0,0.0,0,0,0,2,2,5,4


In [None]:
y = df.cvd_4types

In [None]:
y

0       0
1       0
2       0
3       0
4       0
       ..
3237    1
3238    1
3239    1
3240    1
3241    1
Name: cvd_4types, Length: 3242, dtype: int64

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [None]:
log_reg = LogisticRegression(random_state=99, max_iter=4000)

In [None]:
log_reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=4000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=99, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
y_pred = log_reg.predict(x_test)

###Part 2: Feature Importance


In [None]:
log_reg.coef_

array([[ 9.19841611e-04, -1.06192366e+00,  2.33442405e-01,
        -1.29985974e-01, -5.89594487e-02, -3.89559933e-02,
         8.22304681e-02, -3.18132639e-02,  1.10453295e-03,
        -4.45842228e-02,  1.26927728e-01,  6.51701397e-01,
         1.20024373e-01, -1.07540455e-01,  1.00660291e-01,
        -5.78098245e-01]])

In [None]:
x.columns

Index(['age_s1', 'race', 'educat', 'mstat', 'hip', 'neck20', 'waist',
       'av_weight_kg', 'cgpkyr', 'tea15', 'srhype', 'parrptdiab', 'bend25',
       'happy25', 'tired25', 'hlthlm25'],
      dtype='object')

In [None]:
coefficient_list = zip(x.columns, log_reg.coef_[0])

In [None]:
coef_dict = {x.columns[i]: log_reg.coef_[0][i] for i in range(len(x.columns))}

In [None]:
coef_dict

{'age_s1': 0.0009198416105173559,
 'av_weight_kg': -0.03181326388533544,
 'bend25': 0.12002437289135785,
 'cgpkyr': 0.0011045329505004508,
 'educat': 0.23344240519558723,
 'happy25': -0.10754045485631271,
 'hip': -0.05895944866696345,
 'hlthlm25': -0.5780982446495613,
 'mstat': -0.12998597366354597,
 'neck20': -0.03895599334749571,
 'parrptdiab': 0.6517013971899595,
 'race': -1.0619236612472764,
 'srhype': 0.12692772779698583,
 'tea15': -0.044584222813828664,
 'tired25': 0.10066029095358553,
 'waist': 0.08223046814012146}

In [None]:
sorted_pairs = sorted(coef_dict.items(), key=lambda k: abs(k[1]), reverse=True)
ordered_dict = OrderedDict(sorted_pairs)

In [None]:
ordered_dict

OrderedDict([('race', -1.0619236612472764),
             ('parrptdiab', 0.6517013971899595),
             ('hlthlm25', -0.5780982446495613),
             ('educat', 0.23344240519558723),
             ('mstat', -0.12998597366354597),
             ('srhype', 0.12692772779698583),
             ('bend25', 0.12002437289135785),
             ('happy25', -0.10754045485631271),
             ('tired25', 0.10066029095358553),
             ('waist', 0.08223046814012146),
             ('hip', -0.05895944866696345),
             ('tea15', -0.044584222813828664),
             ('neck20', -0.03895599334749571),
             ('av_weight_kg', -0.03181326388533544),
             ('cgpkyr', 0.0011045329505004508),
             ('age_s1', 0.0009198416105173559)])

In [None]:
#ordered by value of coefficients
for key, value in sorted(ordered_dict.items(), key=lambda item: item[1]):
    print(key,': ', value)

race :  -1.0619236612472764
hlthlm25 :  -0.5780982446495613
mstat :  -0.12998597366354597
happy25 :  -0.10754045485631271
hip :  -0.05895944866696345
tea15 :  -0.044584222813828664
neck20 :  -0.03895599334749571
av_weight_kg :  -0.03181326388533544
age_s1 :  0.0009198416105173559
cgpkyr :  0.0011045329505004508
waist :  0.08223046814012146
tired25 :  0.10066029095358553
bend25 :  0.12002437289135785
srhype :  0.12692772779698583
educat :  0.23344240519558723
parrptdiab :  0.6517013971899595


Our coefficients have been listed in order of importance. The top 3 most impactful variables are race, quality of life, and marital status. For example, we can say that non-white individuals have a lower rate of heart disease. Additionally, as quality of life increases, we expect to see a lower likelihood of CVD. Finally, individuals who are married are less likely to have heart disease.

In [None]:
coef_dict.items()

dict_items([('age_s1', 0.0009198416105173559), ('race', -1.0619236612472764), ('educat', 0.23344240519558723), ('mstat', -0.12998597366354597), ('hip', -0.05895944866696345), ('neck20', -0.03895599334749571), ('waist', 0.08223046814012146), ('av_weight_kg', -0.03181326388533544), ('cgpkyr', 0.0011045329505004508), ('tea15', -0.044584222813828664), ('srhype', 0.12692772779698583), ('parrptdiab', 0.6517013971899595), ('bend25', 0.12002437289135785), ('happy25', -0.10754045485631271), ('tired25', 0.10066029095358553), ('hlthlm25', -0.5780982446495613)])

###Part 3: Model Evaluation



In [None]:
log_reg.score(x_test, y_test)

0.7040690505548706

In [None]:
roc_auc_score(y_test, y_pred)

0.6742558458039629

In [None]:
confusion_matrix(y_test, y_pred)

array([[166, 186],
       [ 69, 390]])

What do these metrics tell us about our model?
What insights can we derive from them?

Our performance metrics indicate to us that we have a decent predictive model. It's accuracy score is not bad but it also misclassified a lot of false positives. This is an issue with our model that would need to be addressed in future iterations. In conclusion, I would recommend to further hone in on the precision score of our model. We could also invest in more resources for non-white, single people as they are a greater risk than others.  