In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import svm, tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
%matplotlib inline

# Data Preprocessing

In [2]:
mass_train = pd.read_csv('../data/mass_case_description_train_set.csv', sep=',')
mass_test = pd.read_csv('../data/mass_case_description_test_set.csv', sep=',')

calc_train = pd.read_csv('../data/calc_case_description_train_set.csv', sep=',')
calc_test = pd.read_csv('../data/calc_case_description_test_set.csv', sep=',')

In [3]:
# Merge train and test sets together
mass_total = pd.concat([mass_train, mass_test])

calc_total = pd.concat([calc_train, calc_test])

In [4]:
mass_total.head()

Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00001,3,LEFT,CC,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
1,P_00001,3,LEFT,MLO,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,Mass-Training_P_00001_LEFT_MLO/1.3.6.1.4.1.959...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...
2,P_00004,3,LEFT,CC,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,Mass-Training_P_00004_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.95...
3,P_00004,3,LEFT,MLO,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,Mass-Training_P_00004_LEFT_MLO/1.3.6.1.4.1.959...,Mass-Training_P_00004_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00004_LEFT_MLO_1/1.3.6.1.4.1.9...
4,P_00004,3,RIGHT,MLO,1,mass,OVAL,CIRCUMSCRIBED,4,BENIGN,5,Mass-Training_P_00004_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_00004_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_00004_RIGHT_MLO_1/1.3.6.1.4.1....


In [5]:
print(mass_total.shape)

(1696, 14)


In [6]:
calc_train.head()

Unnamed: 0,patient_id,breast density,left or right breast,image view,abnormality id,abnormality type,calc type,calc distribution,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00005,3,RIGHT,CC,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,Calc-Training_P_00005_RIGHT_CC/1.3.6.1.4.1.959...,Calc-Training_P_00005_RIGHT_CC_1/1.3.6.1.4.1.9...,Calc-Training_P_00005_RIGHT_CC_1/1.3.6.1.4.1.9...
1,P_00005,3,RIGHT,MLO,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,Calc-Training_P_00005_RIGHT_MLO/1.3.6.1.4.1.95...,Calc-Training_P_00005_RIGHT_MLO_1/1.3.6.1.4.1....,Calc-Training_P_00005_RIGHT_MLO_1/1.3.6.1.4.1....
2,P_00007,4,LEFT,CC,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,Calc-Training_P_00007_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00007_LEFT_CC_1/1.3.6.1.4.1.95...,Calc-Training_P_00007_LEFT_CC_1/1.3.6.1.4.1.95...
3,P_00007,4,LEFT,MLO,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,Calc-Training_P_00007_LEFT_MLO/1.3.6.1.4.1.959...,Calc-Training_P_00007_LEFT_MLO_1/1.3.6.1.4.1.9...,Calc-Training_P_00007_LEFT_MLO_1/1.3.6.1.4.1.9...
4,P_00008,1,LEFT,CC,1,calcification,,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3,Calc-Training_P_00008_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00008_LEFT_CC_1/1.3.6.1.4.1.95...,Calc-Training_P_00008_LEFT_CC_1/1.3.6.1.4.1.95...


In [7]:
print(calc_total.shape)

(1872, 14)


In [8]:
excluded = ['patient_id', 'abnormality type', 'image file path', 'cropped image file path', 'ROI mask file path']

# Remove unneeded columns in mass abnormality data and calcification abnormality data
mass_total = mass_total.loc[:, ~mass_total.columns.isin(excluded)]

calc_total = calc_total.loc[:, ~calc_total.columns.isin(excluded)]

## Replace NaN

In [9]:
mass_total[mass_total.isnull().any(axis=1)]

Unnamed: 0,breast_density,left or right breast,image view,abnormality id,mass shape,mass margins,assessment,pathology,subtlety
56,2,RIGHT,CC,1,ARCHITECTURAL_DISTORTION,,0,MALIGNANT,4
57,2,RIGHT,MLO,1,ARCHITECTURAL_DISTORTION,,0,MALIGNANT,4
158,4,LEFT,CC,1,FOCAL_ASYMMETRIC_DENSITY,,3,MALIGNANT,1
159,4,LEFT,MLO,1,FOCAL_ASYMMETRIC_DENSITY,,3,MALIGNANT,1
161,2,LEFT,MLO,1,ASYMMETRIC_BREAST_TISSUE,,3,BENIGN_WITHOUT_CALLBACK,3
...,...,...,...,...,...,...,...,...,...
204,1,RIGHT,MLO,1,FOCAL_ASYMMETRIC_DENSITY,,3,BENIGN_WITHOUT_CALLBACK,5
206,3,LEFT,CC,3,ARCHITECTURAL_DISTORTION,,4,MALIGNANT,2
208,3,LEFT,MLO,4,ARCHITECTURAL_DISTORTION,,4,MALIGNANT,2
285,2,RIGHT,CC,1,ASYMMETRIC_BREAST_TISSUE,,3,BENIGN_WITHOUT_CALLBACK,5


In [10]:
mass_total['mass shape'].value_counts()

IRREGULAR                                   464
OVAL                                        412
LOBULATED                                   384
ROUND                                       164
ARCHITECTURAL_DISTORTION                    103
IRREGULAR-ARCHITECTURAL_DISTORTION           52
LYMPH_NODE                                   35
ASYMMETRIC_BREAST_TISSUE                     25
FOCAL_ASYMMETRIC_DENSITY                     25
LOBULATED-IRREGULAR                           6
OVAL-LYMPH_NODE                               6
LOBULATED-LYMPH_NODE                          4
ROUND-OVAL                                    3
LOBULATED-ARCHITECTURAL_DISTORTION            2
IRREGULAR-FOCAL_ASYMMETRIC_DENSITY            2
LOBULATED-OVAL                                1
ROUND-IRREGULAR-ARCHITECTURAL_DISTORTION      1
ROUND-LOBULATED                               1
OVAL-LOBULATED                                1
IRREGULAR-ASYMMETRIC_BREAST_TISSUE            1
Name: mass shape, dtype: int64

In [11]:
mass_total['mass margins'].value_counts()

CIRCUMSCRIBED                               392
ILL_DEFINED                                 370
SPICULATED                                  363
OBSCURED                                    247
MICROLOBULATED                              129
ILL_DEFINED-SPICULATED                       30
CIRCUMSCRIBED-ILL_DEFINED                    29
OBSCURED-ILL_DEFINED                         24
CIRCUMSCRIBED-OBSCURED                       22
MICROLOBULATED-ILL_DEFINED                    5
OBSCURED-ILL_DEFINED-SPICULATED               5
OBSCURED-SPICULATED                           4
CIRCUMSCRIBED-OBSCURED-ILL_DEFINED            4
CIRCUMSCRIBED-MICROLOBULATED-ILL_DEFINED      3
MICROLOBULATED-ILL_DEFINED-SPICULATED         2
MICROLOBULATED-SPICULATED                     2
CIRCUMSCRIBED-MICROLOBULATED                  2
OBSCURED-CIRCUMSCRIBED                        2
CIRCUMSCRIBED-SPICULATED                      1
Name: mass margins, dtype: int64

In [12]:
mass_total = mass_total.fillna({'mass margins': 'CIRCUMSCRIBED', 'mass shape': 'IRREGULAR'})

In [13]:
# Do final check that there is no null values
mass_total[mass_total.isnull().any(axis=1)]

Unnamed: 0,breast_density,left or right breast,image view,abnormality id,mass shape,mass margins,assessment,pathology,subtlety


In [14]:
calc_total[calc_total.isnull().any(axis=1)]

Unnamed: 0,breast density,left or right breast,image view,abnormality id,calc type,calc distribution,assessment,pathology,subtlety
4,1,LEFT,CC,1,,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3
5,1,LEFT,CC,2,,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3
6,1,LEFT,CC,3,,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3
7,1,LEFT,MLO,1,,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3
8,1,LEFT,MLO,2,,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3
...,...,...,...,...,...,...,...,...,...
229,3,RIGHT,MLO,2,SKIN,,2,BENIGN_WITHOUT_CALLBACK,4
288,2,LEFT,CC,1,COARSE-LUCENT_CENTER,,2,BENIGN_WITHOUT_CALLBACK,3
289,2,LEFT,CC,2,COARSE-LUCENT_CENTER,,2,BENIGN_WITHOUT_CALLBACK,3
290,2,LEFT,MLO,1,COARSE-LUCENT_CENTER,,2,BENIGN_WITHOUT_CALLBACK,3


In [15]:
calc_total['calc type'].value_counts()

PLEOMORPHIC                                                 813
AMORPHOUS                                                   181
PUNCTATE                                                    132
LUCENT_CENTER                                               110
FINE_LINEAR_BRANCHING                                       102
VASCULAR                                                     90
COARSE                                                       39
PLEOMORPHIC-FINE_LINEAR_BRANCHING                            31
ROUND_AND_REGULAR-LUCENT_CENTER                              31
PUNCTATE-PLEOMORPHIC                                         30
ROUND_AND_REGULAR                                            27
ROUND_AND_REGULAR-LUCENT_CENTER-PUNCTATE                     24
ROUND_AND_REGULAR-EGGSHELL                                   23
DYSTROPHIC                                                   20
LUCENT_CENTERED                                              18
AMORPHOUS-PLEOMORPHIC                   

In [16]:
calc_total['calc distribution'].value_counts()

CLUSTERED              935
SEGMENTAL              202
LINEAR                 112
REGIONAL               102
DIFFUSELY_SCATTERED     40
CLUSTERED-LINEAR        29
LINEAR-SEGMENTAL         7
CLUSTERED-SEGMENTAL      5
REGIONAL-REGIONAL        1
Name: calc distribution, dtype: int64

In [17]:
calc_total = calc_total.fillna({'calc type': 'PLEOMORPHIC', 'calc distribution':'CLUSTERED'})

In [18]:
# Do final check that there is no more null values
calc_total[calc_total.isnull().any(axis=1)]

Unnamed: 0,breast density,left or right breast,image view,abnormality id,calc type,calc distribution,assessment,pathology,subtlety


## One-Hot Encoding

### Mass

In [19]:
mass_total.dtypes

breast_density           int64
left or right breast    object
image view              object
abnormality id           int64
mass shape              object
mass margins            object
assessment               int64
pathology               object
subtlety                 int64
dtype: object

In [20]:
mass_total = pd.get_dummies(mass_total, columns=['left or right breast', 'image view', 'mass shape', 'mass margins'])

In [21]:
mass_total.dtypes

breast_density                                            int64
abnormality id                                            int64
assessment                                                int64
pathology                                                object
subtlety                                                  int64
left or right breast_LEFT                                 uint8
left or right breast_RIGHT                                uint8
image view_CC                                             uint8
image view_MLO                                            uint8
mass shape_ARCHITECTURAL_DISTORTION                       uint8
mass shape_ASYMMETRIC_BREAST_TISSUE                       uint8
mass shape_FOCAL_ASYMMETRIC_DENSITY                       uint8
mass shape_IRREGULAR                                      uint8
mass shape_IRREGULAR-ARCHITECTURAL_DISTORTION             uint8
mass shape_IRREGULAR-ASYMMETRIC_BREAST_TISSUE             uint8
mass shape_IRREGULAR-FOCAL_ASYMMETRIC_DE

In [22]:
mass_total.head()

Unnamed: 0,breast_density,abnormality id,assessment,pathology,subtlety,left or right breast_LEFT,left or right breast_RIGHT,image view_CC,image view_MLO,mass shape_ARCHITECTURAL_DISTORTION,...,mass margins_MICROLOBULATED,mass margins_MICROLOBULATED-ILL_DEFINED,mass margins_MICROLOBULATED-ILL_DEFINED-SPICULATED,mass margins_MICROLOBULATED-SPICULATED,mass margins_OBSCURED,mass margins_OBSCURED-CIRCUMSCRIBED,mass margins_OBSCURED-ILL_DEFINED,mass margins_OBSCURED-ILL_DEFINED-SPICULATED,mass margins_OBSCURED-SPICULATED,mass margins_SPICULATED
0,3,1,4,MALIGNANT,4,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,3,1,4,MALIGNANT,4,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,3,1,4,BENIGN,3,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,3,1,4,BENIGN,3,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,3,1,4,BENIGN,5,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### Calc

In [23]:
calc_total.dtypes

breast density           int64
left or right breast    object
image view              object
abnormality id           int64
calc type               object
calc distribution       object
assessment               int64
pathology               object
subtlety                 int64
dtype: object

In [24]:
calc_total = pd.get_dummies(calc_total, columns=['left or right breast', 'image view', 'calc type', 'calc distribution'])

In [25]:
calc_total.dtypes

breast density                          int64
abnormality id                          int64
assessment                              int64
pathology                              object
subtlety                                int64
                                        ...  
calc distribution_LINEAR                uint8
calc distribution_LINEAR-SEGMENTAL      uint8
calc distribution_REGIONAL              uint8
calc distribution_REGIONAL-REGIONAL     uint8
calc distribution_SEGMENTAL             uint8
Length: 65, dtype: object

In [26]:
calc_total.head()

Unnamed: 0,breast density,abnormality id,assessment,pathology,subtlety,left or right breast_LEFT,left or right breast_RIGHT,image view_CC,image view_MLO,calc type_AMORPHOUS,...,calc type_VASCULAR-COARSE-LUCENT_CENTERED,calc distribution_CLUSTERED,calc distribution_CLUSTERED-LINEAR,calc distribution_CLUSTERED-SEGMENTAL,calc distribution_DIFFUSELY_SCATTERED,calc distribution_LINEAR,calc distribution_LINEAR-SEGMENTAL,calc distribution_REGIONAL,calc distribution_REGIONAL-REGIONAL,calc distribution_SEGMENTAL
0,3,1,3,MALIGNANT,3,0,1,1,0,1,...,0,1,0,0,0,0,0,0,0,0
1,3,1,3,MALIGNANT,3,0,1,0,1,1,...,0,1,0,0,0,0,0,0,0,0
2,4,1,4,BENIGN,4,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,1,4,BENIGN,4,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,1,1,2,BENIGN_WITHOUT_CALLBACK,3,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


# Model Training

## SVM on Mass Abnormality Data

In [28]:
x_mass = mass_total.loc[:, ~mass_total.columns.isin(['pathology'])]

y_mass = mass_total['pathology']

In [29]:
x_mass.head()

Unnamed: 0,breast_density,abnormality id,assessment,subtlety,left or right breast_LEFT,left or right breast_RIGHT,image view_CC,image view_MLO,mass shape_ARCHITECTURAL_DISTORTION,mass shape_ASYMMETRIC_BREAST_TISSUE,...,mass margins_MICROLOBULATED,mass margins_MICROLOBULATED-ILL_DEFINED,mass margins_MICROLOBULATED-ILL_DEFINED-SPICULATED,mass margins_MICROLOBULATED-SPICULATED,mass margins_OBSCURED,mass margins_OBSCURED-CIRCUMSCRIBED,mass margins_OBSCURED-ILL_DEFINED,mass margins_OBSCURED-ILL_DEFINED-SPICULATED,mass margins_OBSCURED-SPICULATED,mass margins_SPICULATED
0,3,1,4,4,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,3,1,4,4,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,1,4,3,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1,4,3,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,3,1,4,5,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
y_mass.head()

0    MALIGNANT
1    MALIGNANT
2       BENIGN
3       BENIGN
4       BENIGN
Name: pathology, dtype: object

In [31]:
# Split data
x_mass_train, x_mass_test, y_mass_train, y_mass_test = train_test_split(x_mass, y_mass, random_state=1234, test_size=0.2)

In [33]:
# Print shapes of each set (before applying StandardScaler().fit())
print('Shape before applying StandardScaler fit:')
print(f'Shape of x_mass_train = {x_mass_train.shape}')
print(f'Shape of x_mass_test = {x_mass_test.shape}')
print(f'Shape of y_mass_train = {y_mass_train.shape}')
print(f'Shape of y_mass_test = {y_mass_test.shape}')

Shape before applying StandardScaler fit:
Shape of x_mass_train = (1356, 47)
Shape of x_mass_test = (340, 47)
Shape of y_mass_train = (1356,)
Shape of y_mass_test = (340,)


In [34]:
# Create SVC object
clf1 = svm.SVC()

# Call fit method on x and y training data
clf1.fit(x_mass_train, y_mass_train)

# Generate predictions on x testing data
predictions1 = clf1.predict(x_mass_test)

# Calculate accuracy score on y testing data and test predictions
score1 = accuracy_score(y_mass_test, predictions1)

# Print accuracy score
print(score1)

0.7941176470588235


## SVM on Calcification Abnormality Data

In [36]:
x_calc = calc_total.loc[:, ~calc_total.columns.isin(['pathology'])]

y_calc = calc_total['pathology']

In [37]:
x_calc.head()

Unnamed: 0,breast density,abnormality id,assessment,subtlety,left or right breast_LEFT,left or right breast_RIGHT,image view_CC,image view_MLO,calc type_AMORPHOUS,calc type_AMORPHOUS-PLEOMORPHIC,...,calc type_VASCULAR-COARSE-LUCENT_CENTERED,calc distribution_CLUSTERED,calc distribution_CLUSTERED-LINEAR,calc distribution_CLUSTERED-SEGMENTAL,calc distribution_DIFFUSELY_SCATTERED,calc distribution_LINEAR,calc distribution_LINEAR-SEGMENTAL,calc distribution_REGIONAL,calc distribution_REGIONAL-REGIONAL,calc distribution_SEGMENTAL
0,3,1,3,3,0,1,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0
1,3,1,3,3,0,1,0,1,1,0,...,0,1,0,0,0,0,0,0,0,0
2,4,1,4,4,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,1,4,4,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,1,2,3,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [38]:
y_calc.head()

0                  MALIGNANT
1                  MALIGNANT
2                     BENIGN
3                     BENIGN
4    BENIGN_WITHOUT_CALLBACK
Name: pathology, dtype: object

In [39]:
# Split data
x_calc_train, x_calc_test, y_calc_train, y_calc_test = train_test_split(x_calc, y_calc, random_state=1234, test_size=0.2)

In [41]:
clf1.fit(x_calc_train, y_calc_train)

# Generate predictions on x testing data
predictions2 = clf1.predict(x_calc_test)

# Calculate accuracy score on y testing data and test predictions
score2 = accuracy_score(y_calc_test, predictions2)

# Print accuracy score
print(score2)

0.7386666666666667
