In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install a headless Java 8 JDK quietly (stdout is discarded).
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import zipfile
import os


# Point JAVA_HOME at the Java 8 install and select its `java` binary as the
# system default, then print the active version as a sanity check.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!java -version

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.metrics import TruePositives, TrueNegatives, FalsePositives, FalseNegatives

In [None]:
# Google Drive folder that is later exported as KAGGLE_CONFIG_DIR.
# NOTE(review): presumably this folder holds the Kaggle API credentials file — confirm.
path_API_autentification_token= '/content/drive/MyDrive/BigData Pneumonia Project'

In [None]:
import zipfile
import os

os.environ['KAGGLE_CONFIG_DIR'] = path_API_autentification_token

!kaggle datasets download -d paultimothymooney/chest-xray-pneumonia

zip_ref = zipfile.ZipFile('chest-xray-pneumonia.zip', 'r')
zip_ref.extractall('/tmp')
zip_ref.close()

In [None]:
# Locations of the three dataset splits inside the extracted archive.
train_dir, val_dir, test_dir = (
    '/tmp/chest_xray/train',
    '/tmp/chest_xray/val',
    '/tmp/chest_xray/test',
)

In [None]:
# Target image size (height, width) and number of images per batch.
img_height, img_width = 128, 128
batch_size = 32

In [None]:
def _load_split(directory):
    """Load one dataset split from ``directory``.

    Images are read in grayscale, resized to (img_height, img_width) and
    grouped into batches of ``batch_size``.
    """
    return tf.keras.preprocessing.image_dataset_from_directory(
        directory,
        color_mode='grayscale',
        image_size=(img_height, img_width),
        batch_size=batch_size,
    )


# Same loading options for every split; only the source directory differs.
train_df = _load_split(train_dir)

val_df = _load_split(val_dir)

test_df = _load_split(test_dir)

In [None]:
def _collect_labels(dataset):
    """Return the label of every sample in ``dataset`` as a list of NumPy values.

    The dataset is unbatched first so that each element yields one label.
    """
    return [label.numpy() for _, label in dataset.unbatch()]


# Gather the labels in the same order as before: train, test, validation.
train_labels = _collect_labels(train_df)
test_labels = _collect_labels(test_df)
val_labels = _collect_labels(val_df)



In [None]:
# Print a Label/count table for each split, in the order train, test, validation.
for heading, split_labels in (
    ("Count values of instances per label in train dataset\n", train_labels),
    ("\n\nCount values of instances per label in test dataset\n", test_labels),
    ("\n\nCount values of instances per label in validation dataset\n", val_labels),
):
    print(heading)
    # np.unique returns (distinct label values, occurrences of each value).
    values_and_counts = np.unique(split_labels, return_counts = True)
    print(pd.DataFrame(values_and_counts, index=['Label', 'count'], columns=['Normal','Pneumonia']))

In [None]:
# Preview up to nine images from the first training batch in a 3x3 grid,
# each titled with its class name.
plt.figure(figsize=(10, 10))
for images, labels in train_df.take(1):
    # Never index past the end of the batch if it holds fewer than nine images.
    for i in range(min(9, images.shape[0])):
        plt.subplot(3, 3, i + 1)
        # The dataset is loaded in grayscale, so each image is single-channel.
        # Without an explicit colormap matplotlib would paint it in false
        # colour; fix the 0-255 range so brightness is comparable across tiles.
        plt.imshow(
            np.squeeze(images[i].numpy().astype("uint8")),
            cmap="gray",
            vmin=0,
            vmax=255,
        )
        plt.title(train_df.class_names[labels[i]])
        plt.axis("off")

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Apply the same cache-then-prefetch pipeline to all three splits.
train_df, val_df, test_df = [
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_df, val_df, test_df)
]

In [None]:
# Install the BigDL DLlib (Spark 3) wheel, build 0.14.0b20211107, straight from SourceForge.
!pip install https://sourceforge.net/projects/analytics-zoo/files/dllib-py-spark3/bigdl_dllib_spark3-0.14.0b20211107-py3-none-manylinux1_x86_64.whl

In [None]:
# Install findspark (imported and initialised in a later cell).
!pip install findspark

In [None]:
from bigdl.dllib.nn.layer import *
from bigdl.dllib.nn.criterion import *
from bigdl.dllib.optim.optimizer import *
from bigdl.dllib.nncontext import *
from bigdl.dllib import keras
from bigdl.dllib.keras.layers import *
from bigdl.dllib.keras.models import *
from bigdl.dllib.nnframes import *
from bigdl.dllib.nn.criterion import *

from pyspark import SparkContext
from pyspark.sql import SparkSession
from IPython.display import Markdown, display

from keras.preprocessing.image import ImageDataGenerator
from keras.utils import load_img
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense, Flatten, BatchNormalization, Conv2D, MaxPool2D
from keras.optimizers import Adam
from keras import backend as K
from keras.preprocessing import image

In [None]:
# Initialise findspark before the Spark context is created.
# NOTE(review): presumably this makes the Spark installation discoverable on sys.path — confirm.
import findspark
findspark.init()

In [None]:
# Spark context: create it through init_nncontext in local cluster mode,
# then wrap it in a SparkSession.
sc = init_nncontext(cluster_mode="local")
spark = SparkSession(sc)

In [None]:
batch_size = 32

# ImageDataGenerator performs data augmentation and prepares batches of images
# for training and evaluation.
# Data transformation: every split is rescaled from [0, 255] to [0, 1].
# Random flips are applied to the TRAINING images only. Augmenting the
# validation/test images would make the reported metrics non-deterministic and
# would measure performance on altered images rather than the real ones.
train_datagen = ImageDataGenerator(rescale=1./255, horizontal_flip=True, vertical_flip=True)
validation_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

# Import the data.
# batch_size determines the number of samples per batch during training or evaluation.
# target_size specifies the size that the images will be resized to.
# NOTE(review): the training generator uses a literal batch size of 64 rather
# than `batch_size`; kept as-is — confirm this is intentional.

train_generator= train_datagen.flow_from_directory(train_dir, target_size = (64,64), batch_size = 64, class_mode="binary" )
validation_generator = validation_datagen.flow_from_directory(val_dir, target_size = (64,64), batch_size =  batch_size, class_mode="binary")
test_generator = test_datagen.flow_from_directory(test_dir, target_size = (64,64), batch_size= batch_size, class_mode="binary")

In [None]:
def _materialize(generator):
    """Draw one full pass of batches from ``generator`` and stack them.

    A single ``next(generator)`` yields only one batch, so using it directly
    would train/evaluate on a handful of images. ``len(generator)`` is the
    number of batches in one pass over the directory.

    Returns:
        (images, labels): two NumPy arrays concatenated along the first axis.
    """
    batches = [next(generator) for _ in range(len(generator))]
    images = np.concatenate([batch_images for batch_images, _ in batches])
    labels = np.concatenate([batch_labels for _, batch_labels in batches])
    return images, labels


# Use every image of each split instead of just the first batch.
X_train, Y_train = _materialize(train_generator)
X_val, Y_val = _materialize(validation_generator)
X_test, Y_test = _materialize(test_generator)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Reshape, Conv2D, MaxPooling2D, Flatten, Dense

# Small CNN for 64x64x3 inputs: three conv + max-pool stages, then a
# 128-unit dense layer and a 2-way softmax output.
model = Sequential([
    # Reshape to the same shape as the input; it also declares the input shape.
    Reshape((64, 64, 3), input_shape=(64, 64, 3)),
    Conv2D(32, (3, 3), activation="relu", name="conv1"),
    MaxPooling2D(),
    Conv2D(32, (3, 3), activation="relu", name="conv2"),
    MaxPooling2D(),
    Conv2D(32, (3, 3), activation="relu", name="conv3"),
    MaxPooling2D(),
    Flatten(),
    Dense(128, activation="relu", name="fc1"),
    Dense(2, activation="softmax", name="fc2"),
])

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# (integer class labels against the 2-way softmax), accuracy as the metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 30 epochs in mini-batches of 50, validating on (X_val, Y_val)
# after each epoch.
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=50,
    epochs=30,
    validation_data=(X_val, Y_val),
)

In [None]:
# Evaluate on the held-out test arrays. The result is indexed as
# [loss, accuracy]; `accuracy` keeps the full result for later cells.
accuracy = model.evaluate(X_test, Y_test, batch_size=20)
test_loss, test_accuracy = accuracy[0], accuracy[1]
print("Loss: ", test_loss)
print("Accuracy: ", test_accuracy)

In [None]:
# Persist the trained model to 'bigdl.h5' (relative path, so it lands in the
# current working directory).
model.save('bigdl.h5')