### Attribute Information:
6 Attributes in total (1 goal field, 1 non-predictive, 4 predictive attributes)
 1. BI-RADS assessment: 1 to 5 (ordinal, non-predictive!)
 2. Age: patient's age in years (integer)
 3. Shape: mass shape: round=1 oval=2 lobular=3 irregular=4 (nominal)
 4. Margin: mass margin: circumscribed=1 microlobulated=2 obscured=3 ill-defined=4 spiculated=5 (nominal)
 5. Density: mass density high=1 iso=2 low=3 fat-containing=4 (ordinal)
 6. Severity: benign=0 or malignant=1 (binominal, goal field!)

http://archive.ics.uci.edu/ml/datasets/mammographic+mass

# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

# read the csv
column_names = ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']
df = pd.read_csv('mammographic_masses.data.txt', na_values=['?'], names=column_names, header=None)

In [2]:
df.head()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [3]:
df.shape

(961, 6)

In [4]:
#display all rows
pd.set_option('display.max_rows', 1000)

In [5]:
df.dtypes

BI-RADS     float64
Age         float64
Shape       float64
Margin      float64
Density     float64
Severity      int64
dtype: object

In [6]:
df.describe()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [7]:
# find the number of missing values for each column
df.isnull().sum()

BI-RADS      2
Age          5
Shape       31
Margin      48
Density     76
Severity     0
dtype: int64

In [8]:
# find the rows with missing data to see if there's any sort of correlation to avoid bias
df.loc[(df['BI-RADS'].isnull()) | (df['Age'].isnull()) | (df['Shape'].isnull()) | (df['Margin'].isnull()) |(df['Density'].isnull()) ]

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
1,4.0,43.0,1.0,1.0,,1
4,5.0,74.0,1.0,5.0,,1
5,4.0,65.0,1.0,,3.0,0
6,4.0,70.0,,,3.0,0
7,5.0,42.0,1.0,,3.0,0
9,5.0,60.0,,5.0,1.0,1
12,4.0,64.0,1.0,,3.0,0
19,4.0,40.0,1.0,,,0
20,,66.0,,,1.0,1
22,4.0,43.0,1.0,,,0


In [9]:
# since the missing data looks randomly distributed, drop rows with NaN values
df.dropna(inplace=True)
df.describe()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
count,830.0,830.0,830.0,830.0,830.0,830.0
mean,4.393976,55.781928,2.781928,2.813253,2.915663,0.485542
std,1.888371,14.671782,1.242361,1.567175,0.350936,0.500092
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [10]:
# print shape of dataframe after dropping rows with NaN values
df.shape

(830, 6)

In [11]:
feature_columns = df[['Age', 'Shape', 'Margin', 'Density']]
target_column = df[['Severity']]

In [12]:
# normalise the data
feature_values = df[['Age', 'Shape', 'Margin', 'Density']].values
target_values = df[['Severity']].values

from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
features_scaled = scaler.fit_transform(feature_values)

# Classification

In [13]:
# all imports
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score

### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

lrc = LogisticRegression()
cv_scores = cross_val_score(lrc, features_scaled, target_column, cv=10)
print(f"The mean cv_score for lrc is {cv_scores.mean()}")


The mean cv_score for lrc is 0.8072289156626505


### Decision Trees

In [15]:
from sklearn.tree import DecisionTreeClassifier

dtc_1 = DecisionTreeClassifier(criterion='gini', min_samples_split=3, max_depth=7, min_samples_leaf=15, random_state=1)
cv_scores = cross_val_score(dtc_1, features_scaled, target_column, cv=10)
print(f"The mean cv_score for dtc_1 is {cv_scores.mean()}")

dtc_2 = DecisionTreeClassifier(criterion='entropy', min_samples_split=3, max_depth=7, min_samples_leaf=15, random_state=1)
cv_scores = cross_val_score(dtc_2, features_scaled, target_column, cv=10)
print(f"The mean cv_score for dtc_2 is {cv_scores.mean()}")

dtc_3 = DecisionTreeClassifier(criterion='entropy', min_samples_split=5, max_depth=10, min_samples_leaf=20, random_state=1)
cv_scores = cross_val_score(dtc_3, features_scaled, target_column, cv=10)
print(f"The mean cv_score for dtc_3 is {cv_scores.mean()}")

dtc_4 = DecisionTreeClassifier(criterion='entropy', min_samples_split=10, max_depth=10, min_samples_leaf=25, random_state=1)
cv_scores = cross_val_score(dtc_4, features_scaled, target_column, cv=10)
print(f"The mean cv_score for dtc_4 is {cv_scores.mean()}")

The mean cv_score for dtc_1 is 0.7903614457831326
The mean cv_score for dtc_2 is 0.7915662650602411
The mean cv_score for dtc_3 is 0.8036144578313253
The mean cv_score for dtc_4 is 0.8012048192771084


### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

rfc_1 = RandomForestClassifier(n_estimators = 10, criterion='gini', min_samples_split=3, max_depth=7, min_samples_leaf=15, max_features='auto',random_state=1)
cv_scores = cross_val_score(rfc_1, features_scaled, target_column, cv=10)
print(f"The mean cv_score for rfc_1 is {cv_scores.mean()}")

rfc_2 = RandomForestClassifier(n_estimators = 12, criterion='entropy', min_samples_split=3, max_depth=7, min_samples_leaf=15, max_features='auto',random_state=1)
cv_scores = cross_val_score(rfc_1, features_scaled, target_column, cv=10)
print(f"The mean cv_score for rfc_2 is {cv_scores.mean()}")

rfc_3 = RandomForestClassifier(n_estimators = 20, criterion='entropy', min_samples_split=10, max_depth=10, min_samples_leaf=25, max_features='auto',random_state=1)
cv_scores = cross_val_score(rfc_3, features_scaled, target_column, cv=10)
print(f"The mean cv_score for rfc_3 is {cv_scores.mean()}")

The mean cv_score for rfc_1 is 0.8
The mean cv_score for rfc_2 is 0.8
The mean cv_score for rfc_3 is 0.8012048192771086


### Support Vector Machine

In [17]:
from sklearn import svm

C=1.0
svc_1 = svm.SVC(kernel='linear', C=C)
cv_scores = cross_val_score(svc_1, features_scaled, target_column, cv=10)
print(f"The mean cv_score for svc_1 is {cv_scores.mean()}")

svc_2 = svm.SVC(kernel='poly', C=C)
cv_scores = cross_val_score(svc_2, features_scaled, target_column, cv=10)
print(f"The mean cv_score for svc_2 is {cv_scores.mean()}")

svc_3 = svm.SVC(kernel='sigmoid', C=C)
cv_scores = cross_val_score(svc_3, features_scaled, target_column, cv=10)
print(f"The mean cv_score for svc_3 is {cv_scores.mean()}")

svc_4 = svm.SVC(kernel='rbf', C=C)
cv_scores = cross_val_score(svc_4, features_scaled, target_column, cv=10)
print(f"The mean cv_score for svc_4 is {cv_scores.mean()}")

The mean cv_score for svc_1 is 0.7975903614457832
The mean cv_score for svc_2 is 0.7903614457831326
The mean cv_score for svc_3 is 0.7457831325301204
The mean cv_score for svc_4 is 0.8012048192771084


### KNN

In [18]:
from sklearn import neighbors

for n in range(2, 50):
    knc = neighbors.KNeighborsClassifier(n_neighbors=n, weights='distance')
    cv_scores = cross_val_score(knc, features_scaled, target_column, cv=10)
    print(f"For {n} number of neighbors, the mean cv_score is {cv_scores.mean()}")

For 2 number of neighbors, the mean cv_score is 0.7108433734939759
For 3 number of neighbors, the mean cv_score is 0.7421686746987951
For 4 number of neighbors, the mean cv_score is 0.7397590361445783
For 5 number of neighbors, the mean cv_score is 0.7481927710843375
For 6 number of neighbors, the mean cv_score is 0.7506024096385542
For 7 number of neighbors, the mean cv_score is 0.7578313253012048
For 8 number of neighbors, the mean cv_score is 0.7578313253012048
For 9 number of neighbors, the mean cv_score is 0.7614457831325301
For 10 number of neighbors, the mean cv_score is 0.7614457831325301
For 11 number of neighbors, the mean cv_score is 0.7626506024096387
For 12 number of neighbors, the mean cv_score is 0.7626506024096387
For 13 number of neighbors, the mean cv_score is 0.7614457831325302
For 14 number of neighbors, the mean cv_score is 0.7626506024096387
For 15 number of neighbors, the mean cv_score is 0.7662650602409639
For 16 number of neighbors, the mean cv_score is 0.76506

### Naive Bayes

In [19]:
from sklearn.naive_bayes import MultinomialNB

scaler = preprocessing.MinMaxScaler()
features_minmax = scaler.fit_transform(feature_columns)

nbc = MultinomialNB()
cv_scores = cross_val_score(nbc, features_minmax, target_column, cv=10)
print(f"The mean cv_score for nbc is {cv_scores.mean()}")

The mean cv_score for nbc is 0.7855421686746988


### Neural Networks

In [30]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

def creating_model():
    model = Sequential()
    model.add(Dense(6, input_dim=4, kernel_initializer='normal', activation='relu'))
    #model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    # output layer with binary classification
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    #model.summary()
    return model

estimator_1 = KerasClassifier(build_fn=creating_model, epochs=50, verbose=0)
cv_scores = cross_val_score(estimator_1, features_scaled, target_column, cv=10)
print(f"The mean cv_score for estimator_1 is {cv_scores.mean()}")

estimator_2 = KerasClassifier(build_fn=creating_model, epochs=100, verbose=0)
cv_scores = cross_val_score(estimator_2, features_scaled, target_column, cv=10)
print(f"The mean cv_score for estimator_2 is {cv_scores.mean()}")


estimator_3 = KerasClassifier(build_fn=creating_model, epochs=150, verbose=0)
cv_scores = cross_val_score(estimator_3,features_scaled, target_column, cv=10)
print(f"The mean cv_score for estimator_3 is {cv_scores.mean()}")


The mean cv_score for estimator_1 is 0.8060240924358368
The mean cv_score for estimator_2 is 0.8012048184871674
The mean cv_score for estimator_3 is 0.8036144554615021


The models don't differ much in terms of their cross validation score. Neural networks work the best with 50 epochs. A lot of the models seem to have a cross validation score of 0.8012. 