In [30]:
# import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
import seaborn


In [31]:
# Read the wdbc diagnosis data set csv into memory (this is not housing data).
# low_memory must be the boolean False: the original passed the string 'False',
# which is truthy and therefore did not actually switch low-memory parsing off.
df = pd.read_csv("../data/wdbc.csv", low_memory=False)

# Render floats in fixed-point notation ('%f'); the original author added this
# as a workaround for display-format run-time errors.
pd.set_option('display.float_format', lambda x: '%f' % x)

 **Attribute Information:**
1) ID number
2) Diagnosis (M = malignant, B = benign)
3-32)

Ten real-valued features are computed for each cell nucleus:

	a) radius (mean of distances from center to points on the perimeter)
	b) texture (standard deviation of gray-scale values)
	c) perimeter
	d) area
	e) smoothness (local variation in radius lengths)
	f) compactness (perimeter^2 / area - 1.0)
	g) concavity (severity of concave portions of the contour)
	h) concave points (number of concave portions of the contour)
	i) symmetry 
	j) fractal dimension ("coastline approximation" - 1)

The mean, standard error, and "worst" or largest (mean of the three
largest values) of these features were computed for each image,
resulting in 30 features.  For instance, field 3 is Mean Radius, field
13 is Radius SE, field 23 is Worst Radius.

All feature values are recoded with four significant digits.

Missing attribute values: none

Class distribution: 357 benign, 212 malignant

In [32]:
# Upper case all data frame column names (a concrete list, not a lazy map object)
df.columns = [name.upper() for name in df.columns]

# Check dimensions of the data set
rows, features = df.shape

# Single-argument print(...) calls behave identically on Python 2 and Python 3,
# unlike the original Python-2-only print statements. The two spaces after the
# dash reproduce the original output exactly.
print("Number of rows in data set -  {}".format(rows))
print("Number of features in data set -  {}".format(features))
print(df.head(2))

# Last expression of the cell: displayed as the summary-statistics table
df.describe()

Number of rows in data set -  569
Number of features in data set -  32
       ID RESULT  RADIUS_MN  TEXTURE_MN  PERIMETER_MN     AREA_MN  \
0  842302      M  17.990000   10.380000    122.800000 1001.000000   
1  842517      M  20.570000   17.770000    132.900000 1326.000000   

   SMOOTHNESS_MN  COMPACTNESS_MN  CONCAVITY_MN  CONCAVE_MN    ...      \
0       0.118400        0.277600      0.300100    0.147100    ...       
1       0.084740        0.078640      0.086900    0.070170    ...       

   RADIUS_L  TEXTURE_L  PERIMETER_L      AREA_L  SMOOTHNESS_L  COMPACTNESS_L  \
0 25.380000  17.330000   184.600000 2019.000000      0.162200       0.665600   
1 24.990000  23.410000   158.800000 1956.000000      0.123800       0.186600   

   CONCAVITY_L  CONCAVE_L  SYMMETRY_L  FRACTAL_L  
0     0.711900   0.265400    0.460100   0.118900  
1     0.241600   0.186000    0.275000   0.089020  

[2 rows x 32 columns]


Unnamed: 0,ID,RADIUS_MN,TEXTURE_MN,PERIMETER_MN,AREA_MN,SMOOTHNESS_MN,COMPACTNESS_MN,CONCAVITY_MN,CONCAVE_MN,SYMMETRY_MN,...,RADIUS_L,TEXTURE_L,PERIMETER_L,AREA_L,SMOOTHNESS_L,COMPACTNESS_L,CONCAVITY_L,CONCAVE_L,SYMMETRY_L,FRACTAL_L
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371831.432337,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020585.612224,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320502.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [35]:
from sklearn.preprocessing import LabelBinarizer

# Encode the two-class diagnosis column RESULT as a 0/1 indicator.
# Learn the class labels first, then overwrite the column with the encoded values.
lb = LabelBinarizer()
lb.fit(df['RESULT'])
df['RESULT'] = lb.transform(df['RESULT'])

In [37]:
# Coerce the RESULT column to a numeric dtype via pd.to_numeric
df['RESULT']=pd.to_numeric(df['RESULT'])

In [38]:
# Summary statistics of the encoded target; for a 0/1 column the mean is the share of rows coded 1
df['RESULT'].describe()

count   569.000000
mean      0.372583
std       0.483918
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000
Name: RESULT, dtype: float64

### Normalize variables

*Standardization (z-score normalization) rescales each variable to have zero mean and unit standard deviation.*

In [39]:
# Keep only the feature columns: drop the identifier (ID) and the target (RESULT)
df_numeric = df.drop(['ID', 'RESULT'], axis=1)
# print(...) with one argument works on both Python 2 and Python 3
print(df_numeric.head(2))

# z-score every column: subtract the column mean, divide by the column standard deviation.
# NOTE(review): `input` shadows the Python builtin of the same name; the name is
# kept here only because later cells read it.
input = (df_numeric - df_numeric.mean()) / df_numeric.std()

   RADIUS_MN  TEXTURE_MN  PERIMETER_MN     AREA_MN  SMOOTHNESS_MN  \
0  17.990000   10.380000    122.800000 1001.000000       0.118400   
1  20.570000   17.770000    132.900000 1326.000000       0.084740   

   COMPACTNESS_MN  CONCAVITY_MN  CONCAVE_MN  SYMMETRY_MN  FRACTAL_MN  \
0        0.277600      0.300100    0.147100     0.241900    0.078710   
1        0.078640      0.086900    0.070170     0.181200    0.056670   

     ...      RADIUS_L  TEXTURE_L  PERIMETER_L      AREA_L  SMOOTHNESS_L  \
0    ...     25.380000  17.330000   184.600000 2019.000000      0.162200   
1    ...     24.990000  23.410000   158.800000 1956.000000      0.123800   

   COMPACTNESS_L  CONCAVITY_L  CONCAVE_L  SYMMETRY_L  FRACTAL_L  
0       0.665600     0.711900   0.265400    0.460100   0.118900  
1       0.186600     0.241600   0.186000    0.275000   0.089020  

[2 rows x 30 columns]


In [40]:
# Peek at the first standardized row (print call form is valid on Python 2 and 3)
print(input.head(1))

   RADIUS_MN  TEXTURE_MN  PERIMETER_MN  AREA_MN  SMOOTHNESS_MN  \
0   1.096100   -2.071512      1.268817 0.983510       1.567087   

   COMPACTNESS_MN  CONCAVITY_MN  CONCAVE_MN  SYMMETRY_MN  FRACTAL_MN  \
0        3.280628      2.650542    2.530249     2.215566    2.253764   

     ...      RADIUS_L  TEXTURE_L  PERIMETER_L   AREA_L  SMOOTHNESS_L  \
0    ...      1.885031  -1.358098     2.301575 1.999478      1.306537   

   COMPACTNESS_L  CONCAVITY_L  CONCAVE_L  SYMMETRY_L  FRACTAL_L  
0       2.614365     2.107672   2.294058    2.748204   1.935312  

[1 rows x 30 columns]


In [41]:
# Re-attach the identifier and target columns to the standardized features,
# column-wise, to form the modelling frame. The original first bound `data`
# to an empty DataFrame that was overwritten on the next line; that dead
# assignment is removed.
data = pd.concat([df[['ID', 'RESULT']], input], axis=1)

In [42]:
# Show the first rows of the modelling frame, then all column names joined
# with ' + ' so they can be pasted into a model formula.
# Single-argument print(...) is valid on both Python 2 and Python 3.
print(data.head(3))
print(' + '.join(data.columns))

         ID  RESULT  RADIUS_MN  TEXTURE_MN  PERIMETER_MN  AREA_MN  \
0    842302       1   1.096100   -2.071512      1.268817 0.983510   
1    842517       1   1.828212   -0.353322      1.684473 1.907030   
2  84300903       1   1.578499    0.455786      1.565126 1.557513   

   SMOOTHNESS_MN  COMPACTNESS_MN  CONCAVITY_MN  CONCAVE_MN    ...      \
0       1.567087        3.280628      2.650542    2.530249    ...       
1      -0.826235       -0.486643     -0.023825    0.547662    ...       
2       0.941382        1.052000      1.362280    2.035440    ...       

   RADIUS_L  TEXTURE_L  PERIMETER_L   AREA_L  SMOOTHNESS_L  COMPACTNESS_L  \
0  1.885031  -1.358098     2.301575 1.999478      1.306537       2.614365   
1  1.804340  -0.368879     1.533776 1.888827     -0.375282      -0.430066   
2  1.510541  -0.023953     1.346291 1.455004      0.526944       1.081980   

   CONCAVITY_L  CONCAVE_L  SYMMETRY_L  FRACTAL_L  
0     2.107672   2.294058    2.748204   1.935312  
1    -0.146620   1.

In [43]:
# (rows, columns) of the modelling frame; print call form works on Python 2 and 3
print(data.shape)

(569, 32)


## Build model

In [44]:
import statsmodels.formula.api as smf

**Model the diagnosis (RESULT) as a function of the ten mean-value (`_MN`) features**

In [46]:
# Logistic regression of RESULT on the ten *_MN features.
print("Logistic regression model ")

# The formula literal is split over several lines for readability; adjacent
# string literals concatenate to exactly the original single-line formula.
lreg1_formula = (
    'RESULT ~ RADIUS_MN + TEXTURE_MN + PERIMETER_MN + AREA_MN + '
    'SMOOTHNESS_MN + COMPACTNESS_MN + CONCAVITY_MN + CONCAVE_MN + '
    'SYMMETRY_MN + FRACTAL_MN '
)
lreg1_model = smf.logit(formula=lreg1_formula, data=data)
lreg1 = lreg1_model.fit()

print(lreg1.summary())


Logistic regression model 
Optimization terminated successfully.
         Current function value: 0.128410
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:                 RESULT   No. Observations:                  569
Model:                          Logit   Df Residuals:                      558
Method:                           MLE   Df Model:                           10
Date:                Mon, 03 Apr 2017   Pseudo R-squ.:                  0.8055
Time:                        01:04:07   Log-Likelihood:                -73.065
converged:                       True   LL-Null:                       -375.72
                                        LLR p-value:                1.282e-123
                     coef    std err          z      P>|z|      [95.0% Conf. Int.]
----------------------------------------------------------------------------------
Intercept          0.4870      0.564      0.863      0.388        -0.619

In [49]:
# Fitted coefficients of the first model; print call form works on Python 2 and 3
print(lreg1.params)

Intercept         0.487017
RADIUS_MN        -7.221851
TEXTURE_MN        1.654756
PERIMETER_MN     -1.737630
AREA_MN          14.004846
SMOOTHNESS_MN     1.074953
COMPACTNESS_MN   -0.077235
CONCAVITY_MN      0.675123
CONCAVE_MN        2.592874
SYMMETRY_MN       0.446256
FRACTAL_MN       -0.482484
dtype: float64


In [48]:
# Odds ratios: exponentiated coefficients of the first model
print("Odds Ratios")
print(np.exp(lreg1.params))

# Odds ratios with their confidence-interval bounds.
# Append the coefficients to the interval table as a third column, label the
# columns, then exponentiate the whole table for display.
params = lreg1.params
conf = lreg1.conf_int().assign(OR=params)
conf.columns = ['Lower CI', 'Upper CI', 'OR']
print(np.exp(conf))


Odds Ratios
Intercept              1.627454
RADIUS_MN              0.000730
TEXTURE_MN             5.231804
PERIMETER_MN           0.175937
AREA_MN          1208445.767572
SMOOTHNESS_MN          2.929856
COMPACTNESS_MN         0.925673
CONCAVITY_MN           1.964275
CONCAVE_MN            13.368140
SYMMETRY_MN            1.562452
FRACTAL_MN             0.617248
dtype: float64
                Lower CI            Upper CI             OR
Intercept       0.538466            4.918796       1.627454
RADIUS_MN       0.000000    102335945.217832       0.000730
TEXTURE_MN      3.036540            9.014133       5.231804
PERIMETER_MN    0.000000   4941387162.006423       0.175937
AREA_MN        11.691772 124903319434.974350 1208445.767572
SMOOTHNESS_MN   1.214230            7.069548       2.929856
COMPACTNESS_MN  0.112713            7.602216       0.925673
CONCAVITY_MN    0.552327            6.985677       1.964275
CONCAVE_MN      1.526800          117.046894      13.368140
SYMMETRY_MN     0.882

In [56]:
# Second logistic regression: texture, area, smoothness and concave features,
# each in its *_MN, *_SE and *_L variant.
print("Logistic regression model ")

# Adjacent string literals concatenate to exactly the original formula text,
# including its irregular spacing.
lreg2_formula = (
    'RESULT ~  TEXTURE_MN +  AREA_MN + SMOOTHNESS_MN + CONCAVE_MN + '
    'TEXTURE_SE+ AREA_SE + SMOOTHNESS_SE +  CONCAVE_SE +'
    'TEXTURE_L+ AREA_L + SMOOTHNESS_L +  CONCAVE_L '
)
lreg2_model = smf.logit(formula=lreg2_formula, data=data)
lreg2 = lreg2_model.fit()

print(lreg2.summary())

Logistic regression model 
Optimization terminated successfully.
         Current function value: 0.057369
         Iterations 12
                           Logit Regression Results                           
Dep. Variable:                 RESULT   No. Observations:                  569
Model:                          Logit   Df Residuals:                      556
Method:                           MLE   Df Model:                           12
Date:                Mon, 03 Apr 2017   Pseudo R-squ.:                  0.9131
Time:                        01:25:02   Log-Likelihood:                -32.643
converged:                       True   LL-Null:                       -375.72
                                        LLR p-value:                4.051e-139
                    coef    std err          z      P>|z|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------
Intercept         1.8741      0.807      2.321      0.020         0.292   

In [57]:
# Odds ratios: exponentiated coefficients of the second model
print("Odds Ratios")
print(np.exp(lreg2.params))

# Odds ratios with their confidence-interval bounds.
# Append the coefficients to the interval table as a third column, label the
# columns, then exponentiate the whole table for display.
params = lreg2.params
conf = lreg2.conf_int().assign(OR=params)
conf.columns = ['Lower CI', 'Upper CI', 'OR']
print(np.exp(conf))


Odds Ratios
Intercept           6.515172
TEXTURE_MN          0.651920
AREA_MN             0.016376
SMOOTHNESS_MN       0.535937
CONCAVE_MN          7.898563
TEXTURE_SE          0.271023
AREA_SE          5247.456499
SMOOTHNESS_SE       1.354028
CONCAVE_SE          0.450856
TEXTURE_L          26.707105
AREA_L          13382.546278
SMOOTHNESS_L        3.830140
CONCAVE_L          17.252942
dtype: float64
               Lower CI         Upper CI           OR
Intercept      1.338633        31.709571     6.515172
TEXTURE_MN     0.083950         5.062517     0.651920
AREA_MN        0.000017        15.694363     0.016376
SMOOTHNESS_MN  0.088178         3.257385     0.535937
CONCAVE_MN     0.313858       198.775399     7.898563
TEXTURE_SE     0.047147         1.557973     0.271023
AREA_SE       32.290546    852751.126395  5247.456499
SMOOTHNESS_SE  0.268423         6.830229     1.354028
CONCAVE_SE     0.027543         7.380093     0.450856
TEXTURE_L      1.515244       470.729255    26.707105
AR