In [1]:
import pandas as pd

# Load the mammographic masses dataset into a DataFrame.
# Column (feature) names are supplied explicitly, and every '?' placeholder in
# the file is parsed as NaN so missing values can be detected with pandas.
column_names = ['BI-RADS', 'age', 'shape', 'margin', 'density', 'severity']
masses_data = pd.read_csv(
    'mammographic_masses.data.txt',
    names=column_names,
    na_values=['?']
)
masses_data.head()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [2]:
# Summary statistics for every column. A `count` lower than the total number
# of rows means that column contains missing (NaN) values.
masses_data.describe()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [3]:
# Number of rows in the raw frame, i.e. before rows with missing values are removed.
n_records = masses_data.shape[0]
print("Total number of records BEFORE cleaning: {}".format(n_records))

Total number of records BEFORE cleaning: 961


In [4]:
# Show every record that has at least one missing (NaN) value.
# `isnull().any(axis=1)` tests all columns of the frame at once, so the filter
# does not need to be edited when a column is added or renamed.
masses_data.loc[masses_data.isnull().any(axis=1)]

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
1,4.0,43.0,1.0,1.0,,1
4,5.0,74.0,1.0,5.0,,1
5,4.0,65.0,1.0,,3.0,0
6,4.0,70.0,,,3.0,0
7,5.0,42.0,1.0,,3.0,0
9,5.0,60.0,,5.0,1.0,1
12,4.0,64.0,1.0,,3.0,0
19,4.0,40.0,1.0,,,0
20,,66.0,,,1.0,1
22,4.0,43.0,1.0,,,0


In [5]:
# Remove every record that contains at least one missing value, then re-check
# the summary statistics: all columns should now report the same count.
# NOTE(review): the summary reports a BI-RADS max of 55 while the 75th
# percentile is 5 -- this looks like a data-entry error; confirm against the
# dataset documentation before relying on this feature.
masses_data = masses_data.dropna()
masses_data.describe()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
count,830.0,830.0,830.0,830.0,830.0,830.0
mean,4.393976,55.781928,2.781928,2.813253,2.915663,0.485542
std,1.888371,14.671782,1.242361,1.567175,0.350936,0.500092
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [6]:
# Class balance of the cleaned dataset (severity: 1 = malignant, 0 = benign).
n_records = len(masses_data)

# Count the records in each class by summing the boolean masks.
n_malignant = int((masses_data['severity'] == 1).sum())
n_benign = int((masses_data['severity'] == 0).sum())

# Share of malignant cases, expressed as a percentage of all cleaned records.
malignant_percent = 100.00 * n_malignant / n_records

print("Total number of records AFTER cleaning: {}".format(n_records))
print("Breast cancer classified as malignant: {}".format(n_malignant))
print("Breast cancer classified as benign: {}".format(n_benign))
print("Percentage of Breast cancer classified as malignant: {:.2f}%".format(malignant_percent))

Total number of records AFTER cleaning: 830
Breast cancer classified as malignant: 403
Breast cancer classified as benign: 427
Percentage of Breast cancer classified as malignant: 48.55%


In [7]:
import numpy as np

# Names of the predictor columns, declared once and reused for the selection
# below so the list and the extracted matrix cannot drift apart.
feature_names = ['BI-RADS', 'age', 'shape', 'margin', 'density']

# Feature matrix (one row per record, one column per entry of feature_names)
# and the target vector (the severity label), both as NumPy arrays.
features_data = masses_data[feature_names].values
outputs = masses_data['severity'].values

print("Features values converted to Numpy Arrays: \n{}".format(features_data))

Features values converted to Numpy Arrays: 
[[ 5. 67.  3.  5.  3.]
 [ 5. 58.  4.  5.  3.]
 [ 4. 28.  1.  1.  3.]
 ...
 [ 4. 64.  4.  5.  3.]
 [ 5. 66.  4.  5.  3.]
 [ 4. 62.  3.  3.  3.]]


In [8]:
from sklearn import preprocessing

# Standardize every feature column with StandardScaler (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split and before cross-validation, so held-out rows influence the scaling
# statistics. Consider fitting it inside a Pipeline per fold instead.
scaler = preprocessing.StandardScaler()
scaler.fit(features_data)
all_features_scaled = scaler.transform(features_data)
print("Features values normalized: \n{}".format(all_features_scaled))

Features values normalized: 
[[ 0.3211177   0.7650629   0.17563638  1.39618483  0.24046607]
 [ 0.3211177   0.15127063  0.98104077  1.39618483  0.24046607]
 [-0.20875843 -1.89470363 -1.43517241 -1.157718    0.24046607]
 ...
 [-0.20875843  0.56046548  0.98104077  1.39618483  0.24046607]
 [ 0.3211177   0.69686376  0.98104077  1.39618483  0.24046607]
 [-0.20875843  0.42406719  0.17563638  0.11923341  0.24046607]]


## Decision Tree

In [9]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global random number generator.
np.random.seed(1234)

# Hold out 25% of the records for testing and train on the remaining 75%.
# random_state pins the shuffle so the split is the same on every run.
training_inputs, testing_inputs, training_classes, testing_classes = train_test_split(
    all_features_scaled,
    outputs,
    train_size=0.75,
    test_size=0.25,
    random_state=1)

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters and a fixed seed, fitted on the
# training split only. `fit` returns the estimator, which the cell displays.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X=training_inputs, y=training_classes)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [11]:
# Mean accuracy of the fitted decision tree on the held-out test split.
# `clf` is rebound by later cells, so run this right after the fit above.
clf.score(X=testing_inputs, y=testing_classes)

0.7788461538461539

## Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


# Random forest of 10 trees with a fixed seed.
clf = RandomForestClassifier(random_state=1, n_estimators=10)

# Rather than relying on one train/test split, score the model with 10-fold
# cross-validation and report the mean accuracy across folds.
cv_scores = cross_val_score(estimator=clf, X=all_features_scaled, y=outputs, cv=10)

cv_scores.mean()

0.7841435428123644

## Support Vector Machine (SVM)

In [13]:
from sklearn import svm

# Support vector classifier with an RBF kernel; C is the regularization strength.
C = 1.0
svc = svm.SVC(C=C, kernel='rbf', gamma='auto')

# Mean accuracy over 10-fold cross-validation on the standardized features.
cv_scores = cross_val_score(estimator=svc, X=all_features_scaled, y=outputs, cv=10)

cv_scores.mean()

0.8286970180373061

## Logistic Regression

Logistic regression is a natural baseline here because this is a binary classification problem (benign vs. malignant).

In [20]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (liblinear solver), scored with 10-fold cross-validation
# on the standardized features; the cell displays the mean accuracy.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours -- re-run the notebook top to bottom to confirm the result.
clf = LogisticRegression(solver='liblinear')
cv_scores = cross_val_score(estimator=clf, X=all_features_scaled, y=outputs, cv=10)

cv_scores.mean()

0.8251843611379315

## Naive Bayes

In [15]:
from sklearn.naive_bayes import MultinomialNB

# MultinomialNB cannot take negative inputs, so the raw features are rescaled
# with MinMaxScaler here instead of reusing the standardized matrix (which
# contains negative values).
# NOTE(review): `scaler` rebinds the name used earlier for the StandardScaler.
# NOTE(review): MultinomialNB is designed for count-like features; GaussianNB
# may be a better fit for these inputs -- worth comparing.
scaler = preprocessing.MinMaxScaler()
scaler.fit(features_data)
all_features_minmax = scaler.transform(features_data)

clf = MultinomialNB()
cv_scores = cross_val_score(estimator=clf, X=all_features_minmax, y=outputs, cv=10)

cv_scores.mean()

0.7832007472398305

## KNN

In [16]:
from sklearn import neighbors

# K-nearest-neighbours classifier with K = 10, scored with 10-fold
# cross-validation on the standardized features (mean accuracy is displayed).
clf = neighbors.KNeighborsClassifier(n_neighbors=10)
cv_scores = cross_val_score(estimator=clf, X=all_features_scaled, y=outputs, cv=10)

cv_scores.mean()

0.794959629458601

In [17]:
# Sweep K from 1 to 49 and print the mean 10-fold cross-validation accuracy
# for each value, to see how sensitive KNN is to the neighbourhood size.
for k in range(1, 50):
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    cv_scores = cross_val_score(estimator=clf, X=all_features_scaled, y=outputs, cv=10)
    print(k, cv_scores.mean())

(1, 0.7443092929208122)
(2, 0.727223528259379)
(3, 0.7708328085862615)
(4, 0.7733148622364021)
(5, 0.7855526636161371)
(6, 0.7973699676755803)
(7, 0.7962522564124093)
(8, 0.7950320445545247)
(9, 0.8010424975161973)
(10, 0.794959629458601)
(11, 0.8010274547668024)
(12, 0.7985884303765585)
(13, 0.8022175811258974)
(14, 0.8021301232805789)
(15, 0.8046569553475225)
(16, 0.8009553895022599)
(17, 0.7961640989043282)
(18, 0.7996917985530974)
(19, 0.7973398821767909)
(20, 0.8021455158613549)
(21, 0.8009539901767349)
(22, 0.7961053272322741)
(23, 0.8021738522032381)
(24, 0.8010120621860264)
(25, 0.8033930145669788)
(26, 0.7997642136490212)
(27, 0.805832388788604)
(28, 0.805759623861299)
(29, 0.8106373228104055)
(30, 0.8070225151476988)
(31, 0.805832388788604)
(32, 0.8034080573163737)
(33, 0.8046132264248633)
(34, 0.8010127618487889)
(35, 0.8033790213117278)
(36, 0.8009837258441431)
(37, 0.8058033527839582)
(38, 0.8046128765934819)
(39, 0.8034080573163737)
(40, 0.8058467318752361)
(41, 0.8082423

## Model Results





|     Model   | Decision Tree  | Random Forest |  SVM  | Logistic Regression | Naive Bayes |  KNN  |
|-------------|----------------|---------------|-------|---------------------|-------------|-------|
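|   Accuracy  |     0.779      |     0.784     | 0.829 |        0.825        |    0.783    | 0.811 |

Values are rounded to three decimals. The Decision Tree accuracy comes from a single 75/25 train/test split; all other values are mean accuracies over 10-fold cross-validation. The KNN value appears to be the best score from the K sweep (0.8106 at K = 29).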