# Breast Cancer prediction machine learning algorithm
### Column `class`: 0 = no cancer, 1 = positive for cancer

### The data was obtained from https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)
### Missing values with "?" were replaced with "NaN" to make it easy to find and delete rows with missing data
### No other pre-processing was done
### Data description can be found at https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names

In [2]:
# Importing pandas to read the CSV file

In [3]:
import pandas as pd

In [4]:
# Load the breast cancer table from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='wisconsin_breast_cancer.csv')

In [5]:
# Preview the first five rows to check that the file was parsed as expected
data.head(n=5)

Unnamed: 0,id,thickness,size,shape,adhesion,single,nuclei,chromatin,nucleoli,mitosis,class
0,1000025,5,1,1,1,2,1.0,3,1,1,0
1,1002945,5,4,4,5,7,10.0,3,2,1,0
2,1015425,3,1,1,1,2,2.0,3,1,1,0
3,1016277,6,8,8,1,3,4.0,3,7,1,0
4,1017023,4,1,1,3,2,1.0,3,1,1,0


In [6]:
# (number of rows, number of columns) of the loaded table
data.shape

(699, 11)

In [7]:
# There are 699 rows and 11 columns in this CSV file

# Delete rows with missing data

In [8]:
# Count the missing (NaN) cells in each column
data.isnull().sum()

id            0
thickness     0
size          0
shape         0
adhesion      0
single        0
nuclei       16
chromatin     0
nucleoli      0
mitosis       0
class         0
dtype: int64

In [9]:
# Keep only complete rows: a row is dropped as soon as any of its cells is NaN
data = data.dropna(axis=0, how='any')

In [10]:
# previously we had 699 rows. The above code deleted 16 rows with missing values and now we have 683 rows

In [14]:
# Feature data set: the nine measurement columns (the id and class columns are left out)
feature_columns = [
    'thickness',
    'size',
    'shape',
    'adhesion',
    'single',
    'nuclei',
    'chromatin',
    'nucleoli',
    'mitosis',
]
x = data[feature_columns]

In [15]:
# Show the first five rows to confirm that every feature column was selected
x.head(n=5)

Unnamed: 0,thickness,size,shape,adhesion,single,nuclei,chromatin,nucleoli,mitosis
0,5,1,1,1,2,1.0,3,1,1
1,5,4,4,5,7,10.0,3,2,1
2,3,1,1,1,2,2.0,3,1,1
3,6,8,8,1,3,4.0,3,7,1
4,4,1,1,3,2,1.0,3,1,1


In [19]:
# Target vector: the class label (0 = no cancer, 1 = positive for cancer)
y = data['class']
# A bare `y.isnull().sum()` in the middle of a cell is evaluated but never displayed
# (only the last expression of a cell is shown), so the check is made explicit:
# no label may be missing before we train on it.
assert y.isnull().sum() == 0, "class column still contains missing values"
# Show the first five labels
y.head()

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64

# Getting ready to do classification
### import train_test_split from sklearn for the train/test split
### then import logistic regression from sklearn


In [22]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection. Fall back to the old module
# only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split features and labels into training and test sets.
# random_state=0 fixes the shuffle so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

In [23]:
# Train a logistic regression model on the training split
from sklearn.linear_model import LogisticRegression
# Pin the solver explicitly: the results recorded in this notebook were produced with
# 'liblinear' (the default at the time), while newer scikit-learn releases default to
# 'lbfgs', which can give slightly different coefficients and predictions.
logreg = LogisticRegression(solver='liblinear')
logreg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
# Predict a class label (0 or 1) for every row of x_test and store the result in y_pred_class
y_pred_class=logreg.predict(x_test)

In [25]:
# Now let us see how our model is performing.
# We will start with accuracy: the fraction of test samples that were classified correctly
from sklearn import metrics
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3
print(metrics.accuracy_score(y_test, y_pred_class))

0.929824561404


In [26]:
## Not bad at all. About 93% of the held-out test samples were classified correctly by our model
# Now let us see how our "intelligent" model compares to a dumb model which always predicts "0" ("not cancer"),
# which is the most prevalent class

In [28]:
# Number of test samples per class; as you can see, "0" is the more prevalent class
y_test.value_counts()

0    107
1     64
Name: class, dtype: int64

In [29]:
# Now let us see the proportion of "0" in y_test. The labels are 0/1, so the mean is the
# proportion of 1s and 1 - mean is the proportion of 0s. This is the accuracy of a dumb
# model that predicts 0 all the time
1-y_test.mean()

0.6257309941520468

In [30]:
# looks like we did better than the dumb model!

# Now let us create a confusion matrix to identify sensitivity specificity & all  the other good statistical stuff

In [32]:
# Rows are the actual classes and columns are the predicted classes.
# print() call form works on both Python 2 and Python 3 (the print statement is Python 2 only).
print(metrics.confusion_matrix(y_test, y_pred_class))

[[103   4]
 [  8  56]]


In [38]:
#  let us see what this means
#
#                Predicted 0    Predicted 1
#                                
#Actual  0        103              4
#Actual  1          8             56


In [37]:
# Confusion matrix layout: row = actual class, column = predicted class
confusion = metrics.confusion_matrix(y_test, y_pred_class)
# Unpack row by row:
#   first row  (actual 0) -> true negatives, false positives
#   second row (actual 1) -> false negatives, true positives
(TN, FP), (FN, TP) = confusion

In [39]:
# Sensitivity (recall) of our logreg model: of all actual cancer cases, the share we detected.
# float() keeps the division exact on Python 2; print() works on both Python 2 and 3.
print(TP / float(TP + FN))

0.875


In [40]:
# Our model's sensitivity is 87.5%

In [41]:
# Specificity: of all actual non-cancer cases, the share we correctly predicted as 0.
# float() keeps the division exact on Python 2; print() works on both Python 2 and 3.
print(TN / float(TN + FP))

0.96261682243


In [42]:
# Looks like our model has pretty good specificity 96.2%

In [43]:
# False positive rate: predicting cancer when the patient does not have cancer
# (equal to 1 - specificity).
# float() keeps the division exact on Python 2; print() works on both Python 2 and 3.
print(FP / float(TN + FP))

0.0373831775701


In [44]:
# pretty awesome 3.7%

In [45]:
# Precision (positive predictive value): when the model predicts cancer, how often it is right.
# float() keeps the division exact on Python 2; print() works on both Python 2 and 3.
print(TP / float(TP + FP))

0.933333333333


In [46]:
# 93.3% of the time 

In [47]:
# Negative predictive value: when the model predicts no cancer, how often it is right.
# float() keeps the division exact on Python 2; print() works on both Python 2 and 3.
print(TN / float(TN + FN))

0.927927927928


In [51]:
# Suppose you want the probability that a given set of features comes back as cancerous.
# The predict_proba function provides that.
# First, look at the predicted class labels for the first ten test samples
logreg.predict(x_test)[:10]


array([0, 0, 1, 1, 0, 0, 0, 1, 0, 0])

In [52]:
# Now the predicted class probabilities for the same first ten test samples
# (one row per sample, one column per class)
logreg.predict_proba(x_test)[:10]

array([[ 0.92925368,  0.07074632],
       [ 0.97810202,  0.02189798],
       [ 0.05497954,  0.94502046],
       [ 0.01971146,  0.98028854],
       [ 0.98442876,  0.01557124],
       [ 0.96262046,  0.03737954],
       [ 0.9421951 ,  0.0578049 ],
       [ 0.00202679,  0.99797321],
       [ 0.9598277 ,  0.0401723 ],
       [ 0.98330704,  0.01669296]])

In [53]:
# The first column is the probability of it being benign. The second column is the probability of it being cancerous