In [1]:
import numpy as np
import pandas as pd


In [8]:
# Read the preprocessed diabetes CSV from GitHub into the DataFrame `dpp`.
# `url` holds only the file location; `names` supplies the column labels.

url = 'https://raw.githubusercontent.com/Manoj-A-Thomas/data/data/diabetes-data-preprocessed.csv'
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dpp = pd.read_csv(url, names=names)

In [10]:
# Preview only the first rows instead of rendering the whole frame.
dpp.head(10)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [12]:
# Sanity check: (rows, columns) -- 9 columns are expected, one per entry in `names`.
dpp.shape

(768, 9)

In [13]:
# Inspect each column's dtype and non-null count before modelling.
dpp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
preg     768 non-null int64
plas     768 non-null int64
pres     768 non-null int64
skin     768 non-null int64
test     768 non-null int64
mass     768 non-null float64
pedi     768 non-null float64
age      768 non-null int64
class    768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [14]:
# Spot-check 10 random rows; the seed is fixed so a re-run shows the same rows.
dpp.sample(10, random_state=42)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
482,4,85,58,22,49,27.8,0.306,28,0
523,9,130,70,0,0,34.2,0.652,45,1
556,1,97,70,40,0,38.1,0.218,30,0
297,0,126,84,29,215,30.7,0.52,24,0
610,3,106,54,21,158,30.9,0.292,24,0
692,2,121,70,32,95,39.1,0.886,23,0
382,1,109,60,8,182,25.4,0.947,21,0
519,6,129,90,7,326,19.6,0.582,60,0
246,10,122,68,0,0,31.2,0.258,41,0
20,3,126,88,41,235,39.3,0.704,27,0


In [16]:
# Split the frame into NumPy arrays: X takes the first 8 columns (the
# predictors) and y takes the last column ('class') as the target.
array = dpp.values
X, y = array[:, :8], array[:, 8]

In [17]:
# Expect (n_rows, 8): one column per predictor.
X.shape

(768, 8)

In [18]:
# Expect a 1-D vector with one target value per row of X.
y.shape

(768,)

In [21]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [24]:
# Feature selection - filter method, using the chi-squared test

test = SelectKBest(score_func=chi2, k=4) #out of the 8 predictors, keep the 4 highest-scoring features
# NOTE: the name `fit` is rebound to the RFE result in a later cell.
fit = test.fit(X,y)

In [25]:
# Print NumPy arrays with 3 decimal places from here on.
np.set_printoptions(precision=3)
print(fit.scores_) #one chi-squared score per column of X, in column order; the 4 largest are test, plas, age, mass

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]


In [29]:
# Reduce X to the k=4 columns chosen by SelectKBest.
# Note: this selects columns only; the values are not scaled.

features = fit.transform(X)

In [31]:
print(features[0:5,:])      #first 5 rows of the reduced matrix; columns keep X's order (plas, test, mass, age)

[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]


In [33]:
# Feature Selection - Filter Method - Correlation Coefficient
# Confirm the selected features are numeric before computing correlations.

features.dtype

dtype('float64')

In [35]:
# The frame is built without column names, so labels 0-3 are the selected
# features in X's column order (plas, test, mass, age per the printed rows).
features_df = pd.DataFrame(features)
features_df.corr()      #values near +1 or -1 mean a strong linear relationship; values near 0 mean a weak one
#rule of thumb: when two features correlate above ~0.7 (in absolute value), consider eliminating one of them

Unnamed: 0,0,1,2,3
0,1.0,0.331357,0.221071,0.263514
1,0.331357,1.0,0.197859,-0.042163
2,0.221071,0.197859,1.0,0.036242
3,0.263514,-0.042163,0.036242,1.0


In [36]:
# Feature selection - Wrapper Method - RFE

from sklearn.feature_selection import RFE 
from sklearn.linear_model import LogisticRegression

In [38]:
# Create the logistic regression model and wrap it in recursive feature
# elimination (RFE).

# NOTE(review): solver='liblinear' is pinned on the assumption that the recorded
# ranking was produced with scikit-learn's old default solver; the default later
# changed to 'lbfgs', which can give a different ranking -- confirm.
model = LogisticRegression(solver='liblinear')
# n_features_to_select is passed by keyword: current scikit-learn releases
# reject it as a positional argument.
rfe = RFE(model, n_features_to_select=3)      # choose the top 3 features
# NOTE: this rebinds `fit`, which earlier cells used for the SelectKBest result.
fit = rfe.fit(X,y)



In [41]:
print('selected features: %s' %(fit.support_))
print('Feature Ranking: %s' %(fit.ranking_))

#top 3 features (support True / rank 1 at column positions 0, 5, 6): preg, mass, pedi

selected features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]


In [43]:
# Pretty Function for the Linear Regression 

def pretty_print_coefs(coefs, names=None, sort=False):
    """Format model coefficients as a readable linear-combination string.

    Args:
        coefs: Sized iterable of numeric coefficients.
        names: Optional labels, one per coefficient. Any iterable works
            (list, tuple, NumPy array, ...). When omitted, the labels
            X0, X1, ... are generated. If `names` and `coefs` differ in
            length, the extra items are dropped (zip semantics).
        sort: When True, order the terms by decreasing absolute coefficient.

    Returns:
        A string of terms "<coef rounded to 3 decimals> * <name> " joined
        by " + ".
    """
    # Use `is None`, not `== None`: on a NumPy array `==` compares element-wise
    # and the resulting array cannot be used as an `if` condition.
    if names is None:
        names = ["X%s" % i for i in range(len(coefs))]
    terms = zip(coefs, names)
    if sort:
        terms = sorted(terms, key=lambda term: -np.abs(term[0]))
    # Distinct loop names avoid shadowing the `coefs` parameter.
    return " + ".join("%s * %s " % (round(coef, 3), name)
                      for coef, name in terms)

In [44]:
# `ridge` is not defined by any earlier cell, so it is fitted here to keep the
# notebook runnable from a fresh kernel.
# NOTE(review): the original fitting cell is missing; Ridge(alpha=1.0), which is
# scikit-learn's default, is assumed -- confirm it reproduces the recorded output.
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)
ridge.fit(X, y)

print('Ridge Model: ', pretty_print_coefs(ridge.coef_))

# Xi is the i-th column of X, counting from 0:
# X0=preg, X1=plas, X2=pres, X3=skin, X4=test, X5=mass, X6=pedi, X7=age.
# The sign of a coefficient only gives the direction of the effect; coefficients
# closest to 0 contribute least (a rough guide only, since the features are unscaled).
# With that in mind, 0.0 * X3 (skin) and -0.0 * X4 (test) are candidates to eliminate,
# while -0.002 * X2 (pres) needs more checking before removal:
# compare the model's results before and after dropping it.

Ridge Model:  0.021 * X0  + 0.006 * X1  + -0.002 * X2  + 0.0 * X3  + -0.0 * X4  + 0.013 * X5  + 0.145 * X6  + 0.003 * X7 
