In [1]:
#!pip install opencv-python

In [2]:
#!pip install imutils

In [3]:
#!pip install Keras

In [40]:
#!pip install tensorflow

## Step 1: Split train, validation and test Datasets

In [29]:
import os
import imutils
from imutils import paths
import random
import shutil
import os

In [30]:
import tensorflow

In [31]:
# import the necessary packages
import os

# Folder holding the *original* images. The default is the location used on
# the author's machine; set the CELL_IMAGES_DIR environment variable to point
# at the dataset elsewhere instead of editing this cell.
ORIG_INPUT_DATASET = os.environ.get(
    "CELL_IMAGES_DIR", '/Users/Xingkang/Desktop/cell_images')

# Base folder that will receive the training / validation / testing copies of
# the images. Override with the SPLIT_OUTPUT_DIR environment variable.
BASE_PATH = os.environ.get("SPLIT_OUTPUT_DIR", '/Users/Xingkang/Desktop')

# derive the training, validation, and testing directories
TRAIN_PATH = os.path.join(BASE_PATH, "training")
VAL_PATH = os.path.join(BASE_PATH, "validation")
TEST_PATH = os.path.join(BASE_PATH, "testing")

# fraction of all images assigned to training (the remainder is the test set)
TRAIN_SPLIT = 0.8

# fraction of the *training* images that is set aside for validation
VAL_SPLIT = 0.1

In [32]:
# materialise every image path that imutils' list_images yields for the
# original dataset folder
imagePaths = list(paths.list_images(ORIG_INPUT_DATASET))

In [33]:
# number of image paths collected
len(imagePaths)

27558

In [34]:
# The listing order comes from the filesystem and is not guaranteed to be the
# same on every machine, so a fixed seed alone does not give a reproducible
# split. Sorting first pins the starting order; it also makes this cell
# idempotent (re-running it no longer reshuffles an already-shuffled list).
imagePaths.sort()
random.seed(42)
random.shuffle(imagePaths)

In [35]:
# cut the shuffled list once: everything before index i is used for training,
# everything from index i onwards is held out for testing
i = int(TRAIN_SPLIT * len(imagePaths))
trainPaths, testPaths = imagePaths[:i], imagePaths[i:]

In [36]:
# we'll be using part of the training data for validation.
# The training portion is re-derived from the shuffled master list instead of
# slicing `trainPaths` in place, so running this cell more than once does not
# keep shrinking the training set.
trainValPaths = imagePaths[:int(len(imagePaths) * TRAIN_SPLIT)]
i = int(len(trainValPaths) * VAL_SPLIT)
valPaths = trainValPaths[:i]
trainPaths = trainValPaths[i:]

In [37]:
# one (split name, image paths, output directory) tuple per split to build
datasets = list(zip(
    ("training", "validation", "testing"),
    (trainPaths, valPaths, testPaths),
    (TRAIN_PATH, VAL_PATH, TEST_PATH),
))

In [14]:
# loop over the datasets
# NOTE: the loop variable is deliberately not called `imagePaths`; reusing
# that name would overwrite the notebook-level shuffled list with the paths of
# the last split once the loop finishes.
for (dType, splitPaths, baseOutput) in datasets:
    # show which data split we are creating
    print("[INFO] building '{}' split".format(dType))
    # if the base output directory does not exist, create it
    if not os.path.exists(baseOutput):
        print("[INFO] 'creating {}' directory".format(baseOutput))
        os.makedirs(baseOutput, exist_ok=True)
    # loop over the input image paths of this split
    for inputPath in splitPaths:
        # the file name is the last path component, the class label is the
        # name of the folder that directly contains the image
        filename = os.path.basename(inputPath)
        label = os.path.basename(os.path.dirname(inputPath))
        # build the path to the label directory
        labelPath = os.path.join(baseOutput, label)
        # if the label output directory does not exist, create it
        if not os.path.exists(labelPath):
            print("[INFO] 'creating {}' directory".format(labelPath))
            os.makedirs(labelPath, exist_ok=True)
        # copy the image (with its metadata) into the label directory
        shutil.copy2(inputPath, os.path.join(labelPath, filename))

[INFO] building 'training' split
[INFO] 'creating /Users/Xingkang/Desktop/training' directory
[INFO] 'creating /Users/Xingkang/Desktop/training/Parasitized' directory
[INFO] 'creating /Users/Xingkang/Desktop/training/Uninfected' directory
[INFO] building 'validation' split
[INFO] 'creating /Users/Xingkang/Desktop/validation' directory
[INFO] 'creating /Users/Xingkang/Desktop/validation/Parasitized' directory
[INFO] 'creating /Users/Xingkang/Desktop/validation/Uninfected' directory
[INFO] building 'testing' split
[INFO] 'creating /Users/Xingkang/Desktop/testing' directory
[INFO] 'creating /Users/Xingkang/Desktop/testing/Parasitized' directory
[INFO] 'creating /Users/Xingkang/Desktop/testing/Uninfected' directory


In [2]:
# import the necessary packages
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers import SGD
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import numpy as np
import argparse

In [16]:
# count the images that ended up in each split directory
totalTrain, totalVal, totalTest = (
    len(list(paths.list_images(splitDir)))
    for splitDir in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

In [17]:
# number of images found in the testing directory
print(totalTest)

5512


In [18]:
# number of images found in the training directory
print(totalTrain)

19842


In [19]:
# number of images found in the validation directory
print(totalVal)

2204


## Step 2: Extract features and labels for all three datasets

In [20]:
# take VGG16 from tensorflow.keras like every other Keras import in this
# notebook; mixing the standalone `keras` package with `tensorflow.keras`
# objects is a known source of incompatibilities
from tensorflow.keras.applications import VGG16

In [21]:
# VGG16 with ImageNet weights and without its top layers (include_top=False);
# it is used below purely as a fixed feature extractor
conv_base = VGG16(include_top=False, weights="imagenet")
conv_base.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, None, 3)]   0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

In [22]:
# augmentation object for the training images: pixel values are rescaled by
# 1/255 and random rotations, zooms, shifts, shears and horizontal flips are
# enabled
trainAug = ImageDataGenerator(
    rescale=1.0 / 255,
    horizontal_flip=True,
    rotation_range=20,
    shear_range=0.05,
    zoom_range=0.05,
    width_shift_range=0.05,
    height_shift_range=0.05,
    fill_mode="nearest",
)

# validation and testing images are only rescaled, never augmented
valAug = ImageDataGenerator(rescale=1.0 / 255)

In [24]:
# number of images per batch produced by every generator
BS = 200

# settings shared by all three generators, kept in one place so the splits
# cannot drift apart (label encoding, image size, colour mode, batch size)
flowKwargs = dict(
    class_mode="binary",
    target_size=(150, 150),
    color_mode="rgb",
    batch_size=BS)

# initialize the training generator (augmented images, shuffled order)
trainGen = trainAug.flow_from_directory(
    TRAIN_PATH,
    shuffle=True,
    **flowKwargs)

# initialize the validation generator (rescaling only, fixed order)
valGen = valAug.flow_from_directory(
    VAL_PATH,
    shuffle=False,
    **flowKwargs)

# initialize the testing generator (rescaling only, fixed order)
testGen = valAug.flow_from_directory(
    TEST_PATH,
    shuffle=False,
    **flowKwargs)

Found 19842 images belonging to 2 classes.
Found 2204 images belonging to 2 classes.
Found 5512 images belonging to 2 classes.


In [25]:
# create function to extract features and labels
def extract_features(generator, sample_count, model=None, feature_shape=(4, 4, 512)):
    """Run batches through a feature extractor and collect features and labels.

    Batches are written at the position given by the number of samples
    actually received so far, so the function does not depend on a global
    batch size and copes with a short final batch or with a batch that
    overshoots ``sample_count`` (the surplus rows are dropped).

    Args:
        generator: iterable yielding ``(inputs_batch, labels_batch)`` pairs.
        sample_count: number of samples to collect; iteration stops as soon
            as this many have been stored.
        model: object whose ``predict(inputs_batch)`` returns one feature
            array per input. Defaults to the notebook-level ``conv_base``.
        feature_shape: shape of the features of a single sample. The default
            ``(4, 4, 512)`` is the value this function previously hard-coded
            (it matches the feature arrays shown in this notebook for
            150x150 inputs).

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays with shapes
        ``(sample_count, *feature_shape)`` and ``(sample_count,)``. Rows stay
        zero if the generator is exhausted before ``sample_count`` is reached.
    """
    if model is None:
        model = conv_base

    features = np.zeros(shape=(sample_count,) + tuple(feature_shape))
    labels = np.zeros(shape=(sample_count,))

    # nothing to collect: do not pull (and run inference on) a batch at all
    if sample_count == 0:
        return features, labels

    processed = 0
    for inputs_batch, labels_batch in generator:
        features_batch = model.predict(inputs_batch)

        # never write past the end of the pre-allocated arrays
        take = min(len(features_batch), sample_count - processed)
        features[processed:processed + take] = features_batch[:take]
        labels[processed:processed + take] = labels_batch[:take]

        previous = processed
        processed += take

        # report progress each time another full 1000 samples are done
        if processed // 1000 > previous // 1000:
            print("processed size =", processed)
        if processed >= sample_count:
            break

    return features, labels

In [26]:
# extract conv_base features and labels for the validation images
valFeatures,valLabels = extract_features(valGen,totalVal)

processed size = 1000
processed size = 2000


In [29]:
# sanity check: one feature array per validation image
valFeatures.shape

(2204, 4, 4, 512)

In [30]:
# extract features and labels for the training images; trainGen is built from
# trainAug with shuffle=True, so the features come from randomly augmented
# images in shuffled order (a single pass over the training set)
trainFeatures,trainLabels = extract_features(trainGen,totalTrain)

processed size = 1000
processed size = 2000
processed size = 3000
processed size = 4000
processed size = 5000
processed size = 6000
processed size = 7000
processed size = 8000
processed size = 9000
processed size = 10000
processed size = 11000
processed size = 12000
processed size = 13000
processed size = 14000
processed size = 15000
processed size = 16000
processed size = 17000
processed size = 18000
processed size = 19000
processed size = 20000


In [31]:
# persist the extracted validation and training arrays so the expensive
# feature extraction does not have to be repeated
np.save(file='val_features', arr=valFeatures)
np.save(file='val_labels', arr=valLabels)
np.save(file='train_features', arr=trainFeatures)
np.save(file='train_labels', arr=trainLabels)

In [32]:
# sanity check: one feature array per training image
trainFeatures.shape

(19842, 4, 4, 512)

In [33]:
# sanity check: one label per training image
trainLabels.shape

(19842,)

In [36]:
# extract conv_base features and labels for the testing images
testFeatures,testLabels = extract_features(testGen,totalTest)

processed size = 1000
processed size = 2000
processed size = 3000
processed size = 4000
processed size = 5000


In [37]:
# persist the extracted testing arrays alongside the other splits
np.save(file='test_features', arr=testFeatures)
np.save(file='test_labels', arr=testLabels)

In [3]:
import numpy as np

# reload the cached feature / label arrays of each split from disk
trainFeatures, trainLabels = np.load('train_features.npy'), np.load('train_labels.npy')
valFeatures, valLabels = np.load('val_features.npy'), np.load('val_labels.npy')
testFeatures, testLabels = np.load('test_features.npy'), np.load('test_labels.npy')

In [7]:
# record the version of the `python` executable found on the shell PATH
!python --version

Python 3.8.3


In [13]:
# record the installed TensorFlow version.
# NOTE(review): this relies on IPython automagic; `%pip show tensorflow` is
# the explicit form.
pip show tensorflow

Name: tensorflow
Version: 2.4.1
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /Users/Xingkang/opt/anaconda3/lib/python3.8/site-packages
Requires: grpcio, numpy, astunparse, protobuf, absl-py, opt-einsum, flatbuffers, termcolor, wrapt, keras-preprocessing, wheel, google-pasta, gast, tensorboard, tensorflow-estimator, h5py, six, typing-extensions
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [12]:
# confirm the reloaded training features have the expected shape
trainFeatures.shape

(19842, 4, 4, 512)

In [19]:
# mean of the training labels, used as a class-balance check
trainLabels.mean()

0.49828646305815943

In [24]:
# inspect the extracted features of the first training sample
trainFeatures[0]

array([[[0.        , 0.        , 0.0788542 , ..., 0.        ,
         0.9115749 , 0.        ],
        [0.42573977, 0.        , 1.56391287, ..., 0.        ,
         0.75693053, 0.        ],
        [0.93625277, 0.        , 1.07402086, ..., 0.        ,
         0.36188185, 0.        ],
        [0.43122604, 0.        , 1.96310353, ..., 0.        ,
         1.03826785, 0.        ]],

       [[0.        , 0.        , 2.09876704, ..., 0.        ,
         0.78235775, 0.        ],
        [1.01027966, 0.        , 2.41391921, ..., 0.        ,
         0.49252808, 0.        ],
        [1.49402761, 0.        , 1.34955549, ..., 0.        ,
         0.28055608, 0.        ],
        [0.90597242, 0.        , 1.86991072, ..., 0.        ,
         0.95605409, 0.        ]],

       [[0.        , 0.        , 1.91962194, ..., 0.        ,
         0.81638527, 0.        ],
        [1.05807376, 0.        , 1.86351085, ..., 0.        ,
         0.81781983, 0.        ],
        [1.35148668, 0.        , 0.5

In [39]:
# NOTE(review): this displays the whole training feature array; a small slice
# such as trainFeatures[:1] would keep the notebook output short
trainFeatures

array([[[[0.00000000e+00, 0.00000000e+00, 7.88542032e-02, ...,
          0.00000000e+00, 9.11574900e-01, 0.00000000e+00],
         [4.25739765e-01, 0.00000000e+00, 1.56391287e+00, ...,
          0.00000000e+00, 7.56930530e-01, 0.00000000e+00],
         [9.36252773e-01, 0.00000000e+00, 1.07402086e+00, ...,
          0.00000000e+00, 3.61881852e-01, 0.00000000e+00],
         [4.31226045e-01, 0.00000000e+00, 1.96310353e+00, ...,
          0.00000000e+00, 1.03826785e+00, 0.00000000e+00]],

        [[0.00000000e+00, 0.00000000e+00, 2.09876704e+00, ...,
          0.00000000e+00, 7.82357752e-01, 0.00000000e+00],
         [1.01027966e+00, 0.00000000e+00, 2.41391921e+00, ...,
          0.00000000e+00, 4.92528081e-01, 0.00000000e+00],
         [1.49402761e+00, 0.00000000e+00, 1.34955549e+00, ...,
          0.00000000e+00, 2.80556083e-01, 0.00000000e+00],
         [9.05972421e-01, 0.00000000e+00, 1.86991072e+00, ...,
          0.00000000e+00, 9.56054091e-01, 0.00000000e+00]],

        [[0.00000000

## Step 3: Model Building

In [8]:
from sklearn.svm import SVC

In [9]:
from tensorflow.keras import layers
from tensorflow.keras import models

In [10]:
# fully connected classifier on the extracted features: the (4, 4, 512)
# feature array is flattened, passed through two ReLU layers and reduced to a
# single sigmoid output
ann = models.Sequential()
ann.add(layers.Flatten(input_shape=(4, 4, 512)))
ann.add(layers.Dense(3000, activation='relu'))
ann.add(layers.Dense(1000, activation='relu'))
ann.add(layers.Dense(1, activation='sigmoid'))

In [11]:
# binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reported metric
ann.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# train on the cached training features for 5 epochs
ann.fit(x=trainFeatures, y=trainLabels, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe6c3834070>

In [14]:
# loss and accuracy of the dense model on the validation features
ann.evaluate(valFeatures,valLabels)



[0.2044825404882431, 0.926950991153717]

In [15]:
# loss and accuracy of the dense model on the testing features
ann.evaluate(testFeatures,testLabels)



[0.18221238255500793, 0.9277939200401306]

In [69]:
# small convolutional head on top of the extracted (4, 4, 512) features
cnn = models.Sequential()

# convolutional part
cnn.add(layers.Conv2D(filters=500, kernel_size=3, activation='relu',
                      input_shape=(4, 4, 512)))
cnn.add(layers.MaxPool2D(2, 2))

# dense part (a Dropout(0.5) layer after Flatten was tried and is disabled)
cnn.add(layers.Flatten())
cnn.add(layers.Dense(500, activation='relu'))
cnn.add(layers.Dense(64, activation='relu'))
cnn.add(layers.Dense(1, activation='sigmoid'))

In [70]:
# same loss / optimizer / metric setup as the dense model
cnn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# train on the cached training features for 10 epochs
cnn.fit(x=trainFeatures, y=trainLabels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe5a3947100>

In [71]:
# loss and accuracy of the convolutional model on the validation features
cnn.evaluate(valFeatures,valLabels)



[0.19289305806159973, 0.9419237971305847]

## Step 4: Make predictions on the test data

In [73]:
# evaluate the convolutional model on the testing features; despite its name,
# `accuracy` holds the list [loss, accuracy] (accuracy is the only metric
# passed to compile)
accuracy = cnn.evaluate(testFeatures,testLabels)



In [75]:
# index 1 of the evaluate result is the accuracy (index 0 is the loss)
print(f'The test accuracy is {accuracy[1]}')

The test accuracy is 0.944847583770752
