In [2]:
import datetime
import gc
import numpy as np
import os
import random
from scipy import misc
import string
import time
import sys
import sklearn.metrics as skm
import collections
from sklearn.svm import SVC
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
from sklearn import metrics
import dwdii_bc_model_helper as bc

# Seed Python's built-in `random` module so code drawing from it is repeatable
# across runs. Note that NumPy's global RNG is not seeded in this cell.
random.seed(20275)
# Show floating-point array values with two digits of precision when printed.
np.set_printoptions(precision=2)

In [3]:
# Image directory handed to bc.load_data (relative to the working directory).
imagePath = "png"
# Alternate absolute image location, currently disabled:
#imagePath = "/root/bc_data/Data_Thresholded/DDSM"
# CSV files passed to the bc loaders for the training and test sets.
trainDataPath = "data/ddsm_train.csv"
testDataPath = "data/ddsm_test.csv"
# Passed to bc.load_data as imgResize; the loaded arrays are 150x150 per image.
imgResize = (150, 150)

In [4]:
# Show what is in the data directory so the CSV paths configured above can be checked.
os.listdir('data')

['ddsm_test.csv', 'ddsm_train.csv', 'ddsm_val.csv', 'mias_all.csv']

In [5]:
# Load the training metadata with balanceViaRemoval enabled; with verbose=True
# the helper prints per-class counts before and after balancing.
# Only metaData is referenced again in the cells shown here (for its length);
# meta2 and mCounts are not used further in this section.
# NOTE(review): the exact meaning of the three return values is defined in
# dwdii_bc_model_helper - confirm there.
metaData, meta2, mCounts = bc.load_training_metadata(trainDataPath, balanceViaRemoval=True, verbose=True)

Raw Balance
----------------
benign 531
malignant 739
normal 2685
balanaceViaRemoval.avgE: 1318
balanaceViaRemoval.theshold: 1318.0

After Balancing
----------------
benign 531
malignant 739
normal 862


In [6]:
# Load the training images and labels. maxData is the number of entries in the
# (balanced) training metadata loaded above. With verboseFreq=50 the helper
# appears to log one progress line per 50 images - see the output below.
maxData = len(metaData)
X_data, Y_data = bc.load_data(trainDataPath, imagePath, maxData=maxData, verboseFreq=50, imgResize=imgResize)
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(X_data.shape)
print(Y_data.shape)

Raw Balance
----------------
benign 531
malignant 739
normal 2685
balanaceViaRemoval.avgE: 1318
balanaceViaRemoval.theshold: 1318.0

After Balancing
----------------
benign 531
malignant 739
normal 862


  Y_data = np.zeros([total, 1], dtype=np.int8)


0.0000: A_1058_1.LEFT_MLO.LJPEG.png
0.0235: B_3422_1.RIGHT_MLO.LJPEG.png
0.0469: A_0578_1.RIGHT_MLO.LJPEG.png
0.0704: C_0363_1.RIGHT_MLO.LJPEG.png
0.0938: B_3393_1.LEFT_MLO.LJPEG.png
0.1173: C_0192_1.RIGHT_CC.LJPEG.png
0.1407: B_3025_1.LEFT_CC.LJPEG.png
0.1642: A_0313_1.RIGHT_CC.LJPEG.png
0.1876: B_3010_1.RIGHT_MLO.LJPEG.png
0.2111: C_0397_1.LEFT_CC.LJPEG.png
0.2345: A_1082_1.RIGHT_MLO.LJPEG.png
0.2580: C_0190_1.RIGHT_CC.LJPEG.png
0.2814: C_0275_1.LEFT_CC.LJPEG.png
0.3049: C_0367_1.RIGHT_MLO.LJPEG.png
0.3283: C_0370_1.LEFT_CC.LJPEG.png
0.3518: A_0589_1.RIGHT_CC.LJPEG.png
0.3752: A_0522_1.RIGHT_MLO.LJPEG.png
0.3987: B_3509_1.LEFT_CC.LJPEG.png
0.4221: A_0510_1.LEFT_CC.LJPEG.png
0.4456: C_0174_1.RIGHT_CC.LJPEG.png
0.4690: A_1007_1.RIGHT_MLO.LJPEG.png
0.4925: A_0569_1.RIGHT_MLO.LJPEG.png
0.5159: C_0022_1.RIGHT_CC.LJPEG.png
0.5394: C_0488_1.LEFT_CC.LJPEG.png
0.5629: A_0574_1.RIGHT_MLO.LJPEG.png
0.5863: B_3012_1.RIGHT_CC.LJPEG.png
0.6098: B_3110_1.LEFT_CC.LJPEG.png
0.6332: C_0101_1.RIGHT_MLO

In [7]:
# Load the held-out test images and labels from testDataPath.
# NOTE(review): maxData is still derived from the *training* metadata, so it is
# larger than the test set (the progress fractions in the output are relative
# to the training size). The resulting arrays have the test-set length, so the
# value is kept as-is here; confirm how bc.load_data treats an oversized
# maxData before changing it.
maxData = len(metaData)
X_test, Y_test = bc.load_data(testDataPath, imagePath, maxData=maxData, verboseFreq=50, imgResize=imgResize)
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(X_test.shape)
print(Y_test.shape)

Raw Balance
----------------
benign 142
malignant 179
normal 658
balanaceViaRemoval.avgE: 326
balanaceViaRemoval.theshold: 326.0

After Balancing
----------------
benign 142
malignant 179
normal 215
0.0000: A_1105_1.RIGHT_CC.LJPEG.png
0.0235: C_0320_1.LEFT_CC.LJPEG.png
0.0469: A_1076_1.RIGHT_CC.LJPEG.png
0.0704: B_3357_1.LEFT_CC.LJPEG.png
0.0938: B_3504_1.RIGHT_MLO.LJPEG.png
0.1173: C_0500_1.LEFT_CC.LJPEG.png
0.1407: C_0166_1.RIGHT_CC.LJPEG.png
0.1642: A_0498_1.LEFT_CC.LJPEG.png
0.1876: A_0293_1.RIGHT_CC.LJPEG.png
0.2111: A_1061_1.RIGHT_MLO.LJPEG.png
0.2345: B_3133_1.RIGHT_MLO.LJPEG.png
(536L, 150L, 150L)
(536L, 1L)


In [8]:
# Give the loaded training arrays their train-split names. These are plain
# rebindings of the same objects; nothing is copied.
X_train, Y_train = X_data, Y_data

In [9]:
# Report the shapes of both splits. Single-argument print(...) produces the
# same output on Python 2 and Python 3.
print(X_train.shape)
print(X_test.shape)

print(Y_train.shape)
print(Y_test.shape)

# Every image must have exactly one label row in each split.
assert X_train.shape[0] == Y_train.shape[0], "train features/labels row mismatch"
assert X_test.shape[0] == Y_test.shape[0], "test features/labels row mismatch"

(2132L, 150L, 150L)
(536L, 150L, 150L)
(2132L, 1L)
(536L, 1L)


In [10]:
def yDist(y):
    """Tally how many times each label occurs in a column of labels.

    Args:
        y: Indexable object exposing ``shape``; ``y.shape[0]`` is the number
            of rows and ``y[i][0]`` is read as the label of row ``i``.

    Returns:
        A ``collections.defaultdict(int)`` mapping each label to its count.
    """
    label_counts = collections.defaultdict(int)
    # Extract the first column row by row, then count each value.
    labels = [y[row][0] for row in range(y.shape[0])]
    for label in labels:
        label_counts[label] += 1
    return label_counts

# Show the per-class label counts of each split. Single-argument print(...)
# produces the same output on Python 2 and Python 3.
print("Y_train Dist: " + str(yDist(Y_train)))
print("Y_test Dist: " + str(yDist(Y_test)))

Y_train Dist: defaultdict(<type 'int'>, {0: 862, 1: 531, 2: 739})
Y_test Dist: defaultdict(<type 'int'>, {0: 215, 1: 142, 2: 179})


In [11]:
# Fetch the class-name -> numeric-label mapping from the helper module and
# show it together with the number of classes.
bcTypes = bc.bcNumerics()
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(bcTypes)
print(len(bcTypes))

{'benign': 1, 'malignant': 2, 'normal': 0}
3


In [12]:
# Flatten every training image into one feature row for the SVM. The row count
# comes from the array itself, so the cell keeps working if the sample count
# changes.
X_train_s = X_train.reshape((X_train.shape[0], -1))

In [13]:
# Flatten every test image into one feature row, matching the training layout.
# The row count comes from the array itself rather than a hard-coded number.
X_test_s = X_test.reshape((X_test.shape[0], -1))

In [14]:
# Support vector classifier with gamma fixed at 0.001. Every other
# hyperparameter is left at the library default; the repr printed after
# fitting shows kernel='rbf' and C=1.0.
model = SVC(gamma=0.001)

In [15]:
# Y_train is a column vector of shape (n_samples, 1), while fit expects a 1-D
# label array. Flattening it here gives the same labels and avoids the
# column_or_1d DataConversionWarning.
model.fit(X_train_s, Y_train.ravel())

  y_ = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
# Predict a class label for each flattened test image.
predicted = model.predict(X_test_s)
# Second name for the true test labels; the accuracy cell refers to them as `expected`.
expected = Y_test

In [17]:
# Confusion matrix of true labels against predictions (rows = true class,
# columns = predicted class). Uses `expected`, the same name the accuracy cell
# uses for the true labels.
svm_matrix = skm.confusion_matrix(expected, predicted)
svm_matrix

array([[113,  35,  67],
       [ 14,  67,  61],
       [ 46,  26, 107]])

In [19]:
# Overall fraction of test images classified correctly. `skm` is the same
# sklearn.metrics module used for the confusion matrix; single-argument
# print(...) produces the same output on Python 2 and Python 3.
print(skm.accuracy_score(expected, predicted))

0.535447761194
