In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=03174a0cd7c3721e7fc75ab5e4e121ea1b5ba910aac5ede6033c1feff764c7a2
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [None]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.types import *
from pyspark.sql.functions import col, split
from pyspark.ml.feature import StringIndexer
import pyspark.sql.functions as fn
import shutil
import io
import numpy as np
import pandas as pd
from PIL import Image
import warnings
import time
import os

# Install a blanket 'ignore' filter: every Python warning is silenced from here on.
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def timing(start):
    """Print how many wall-clock seconds have passed since ``start``.

    ``start`` is a timestamp taken earlier with ``time.time()``; the usual
    pattern is ``start = time.time()`` at the top of the cell being measured
    and ``timing(start)`` at its end.
    """
    elapsed = time.time() - start
    print(f'Elapsed time: {elapsed:.2f} s')

In [None]:
start = time.time()

# Create the session (or reuse a running one) named 'SparkCPU', requesting
# 15g of driver memory.
spark = (
    SparkSession.builder
    .appName('SparkCPU')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

timing(start)

Elapsed time: 0.07 s


In [None]:
# Dataset root on the mounted Drive; the read cell loads `<image_path>/test`
# and `<image_path>/test.txt` from it.
image_path = '/content/drive/MyDrive/gpudb'

## 1. Extract

In [None]:
start = time.time()

# Read every file below <image_path>/test (sub-folders included) as raw bytes.
test_images = (
    spark.read
    .format("binaryFile")
    .option("recursiveFileLookup", "true")
    .load(image_path + "/test")
)

# Label file, one record per line: [patient id] [filename] [class] [data source]
test_txt = spark.read.text(image_path + "/test.txt")

timing(start)

Elapsed time: 0.22 s


## 2. Transform

In [None]:
def extract_size(content):
    """Return the size reported by PIL for an image given as raw file bytes.

    The bytes are wrapped in an in-memory buffer and opened with
    ``Image.open``; the result is the image's ``size`` attribute
    (the UDF built on top of this treats it as ``(width, height)``).
    """
    return Image.open(io.BytesIO(content)).size

@fn.pandas_udf("width: int, height: int")
def extract_size_udf(content_series):
    """Pandas UDF that maps a batch of raw image bytes to a width/height struct.

    Args:
        content_series: pandas Series whose elements are the raw bytes of one
            image file each.

    Returns:
        pandas DataFrame with ``width`` and ``height`` columns, one row per
        input element, matching the struct declared in the decorator.
    """
    sizes = content_series.apply(extract_size)
    # Name the columns explicitly: they then line up with the declared struct
    # fields by name, and an empty batch still yields a two-column frame
    # instead of a frame with no columns.
    return pd.DataFrame(list(sizes), columns=["width", "height"])


def transform_merge(image, text):
    """Join image rows with their label rows and add size and label-index columns.

    Args:
        image: DataFrame with at least ``path`` and ``content`` columns.
        text: DataFrame with a single ``value`` column holding space-separated
            records (patient id, file name, class, ...).

    Returns:
        DataFrame with columns ``path``, ``file_name``, ``size``, ``content``,
        ``class`` and ``label`` (the StringIndexer encoding of ``class``).
    """
    # The last path segment is the file name and serves as the join key.
    images_named = image.withColumn(
        "file_name", fn.substring_index(image.path, "/", -1)
    )

    # Build the split expression once; fields 0, 1 and 2 are the ones kept.
    fields = split(col("value"), " ")
    records = text.select(
        fields.getItem(0).alias("patient_id"),
        fields.getItem(1).alias("file_name"),
        fields.getItem(2).alias("class"),
    )

    joined = images_named.join(records, ['file_name'], how='inner')
    selected = joined.select(
        fn.col("path"),
        fn.col("file_name"),
        extract_size_udf(fn.col("content")).alias("size"),
        fn.col("content"),
        fn.col("class"),
    )

    # Fit the indexer on the selected rows and append the numeric label.
    label_model = StringIndexer(inputCol="class", outputCol="label").fit(selected)
    return label_model.transform(selected)

In [None]:
start = time.time()

# Join the test images with their label records; the result also carries the
# image size struct and the indexed `label` column.
test_df = transform_merge(test_images, test_txt)

timing(start)

Elapsed time: 1.37 s


## 3. Load

In [None]:
start = time.time()

# Persist the transformed rows as a Parquet-backed table. Overwrite mode keeps
# the cell re-runnable: with the default save mode, saveAsTable raises
# AnalysisException as soon as the table already exists.
(
    test_df.write
    .format("parquet")
    .mode("overwrite")
    .option("mergeSchema", True)
    .saveAsTable("covid_test_binary")
)

timing(start)

AnalysisException: ignored

# Train Model

In [None]:
# NOTE(review): no version is pinned here — confirm that the tensorflow_io
# release pip resolves is compatible with the runtime's TensorFlow build.
!pip install tensorflow_io

Collecting tensorflow_io
  Downloading tensorflow_io-0.33.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (28.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.6/28.6 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_io
Successfully installed tensorflow_io-0.33.0


In [None]:
# Show the column names of the transformed DataFrame.
test_df.columns

['path', 'file_name', 'size', 'content', 'class', 'label']

In [None]:
import tensorflow as tf
import tensorflow_io as tfio
def get_dataset(file_path, image_size=(32, 32)):
    """Build a ``tf.data`` pipeline of ``(image, label)`` pairs from a Parquet table.

    Args:
        file_path: Directory holding the Parquet part files of the table.
        image_size: ``(height, width)`` each decoded image is resized to.
            Defaults to ``(32, 32)``, the shape the pipeline has always
            produced.

    Returns:
        ``tf.data.Dataset`` yielding ``(image, label)`` where ``image`` is a
        ``uint8`` tensor of shape ``(height, width, 3)`` and ``label`` is an
        ``int32`` scalar. Element order is not deterministic.
    """
    autotune = tf.data.experimental.AUTOTUNE
    # Match only the data files: a Spark output directory also contains marker
    # files such as _SUCCESS, which are not valid Parquet.
    filenames = tf.data.Dataset.list_files(file_path + '/*.parquet',
                                           shuffle=True)

    def parquet_ds(file):
        # The label column is produced by StringIndexer, which emits doubles,
        # so it is read as float64 here and converted to int32 in parse().
        return tfio.IODataset.from_parquet(
            file,
            {'content': tf.string,
             'label': tf.float64})

    ds = filenames.interleave(parquet_ds,
                              num_parallel_calls=autotune,
                              deterministic=False)

    def parse(example):
        # 'content' holds the bytes of an encoded image file, so it has to go
        # through an image decoder; expand_animations=False guarantees a 3-D
        # tensor that tf.image.resize can handle.
        image = tf.io.decode_image(example['content'],
                                   channels=3,
                                   expand_animations=False)
        image = tf.image.resize(image, image_size)
        # resize() returns float32; round back to the uint8 pixel range.
        image = tf.cast(tf.round(image), tf.uint8)
        label = tf.cast(example['label'], tf.int32)
        return image, label

    return ds.map(parse, num_parallel_calls=autotune)

In [None]:
# NOTE(review): this assumes the table directory lives under image_path; unless
# the session sets a warehouse location, confirm where saveAsTable put it.
temp_file_path = image_path+'/spark-warehouse/covid_test_binary'
autotune = tf.data.experimental.AUTOTUNE
# List only the Parquet part files; a bare '/*' would also pick up marker files
# such as _SUCCESS that cannot be read as Parquet.
filenames = tf.data.Dataset.list_files(temp_file_path+'/*.parquet',
                        shuffle=True)
#data = get_dataset(image_path+'/spark-warehouse/covid_test_binary')

In [None]:
# A tf.data.Dataset is not subscriptable; pull its first element through an
# iterator instead of indexing.
next(iter(filenames))

TypeError: ignored

In [None]:
def parquet_ds(file):
    """Open one Parquet file as a dataset exposing 'content' and 'label'."""
    # Debug aid: when called from interleave() this prints a symbolic tensor,
    # not a concrete path string.
    print(file)
    column_spec = {'content': tf.string,
                   'label': tf.int32}
    return tfio.IODataset.from_parquet(file, column_spec)


# Read the listed files in parallel; element order is not preserved.
ds = filenames.interleave(
    parquet_ds,
    num_parallel_calls=autotune,
    deterministic=False,
)

Tensor("args_0:0", shape=(), dtype=string)


NotImplementedError: ignored