In [1]:
import pandas as pd
import tensorflow as tf
from object_detection.utils import dataset_util
from collections import namedtuple, OrderedDict
from pyspark.sql.types import *
from pyspark.sql import Row, SparkSession
from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import udf
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, IntegerType, FloatType, ArrayType
import cv2
import numpy as np

In [2]:
spark = SparkSession.builder\
    .master("local[4]")\
    .appName("sql.functions tests")\
    .getOrCreate()
sc = spark.sparkContext

In [3]:
tiff_train_loc = '/Users/johnmorin/KaylaTek/Data/images/train/*.tif'
tiff_test_loc = '/Users/johnmorin/KaylaTek/Data/images/test/*.tif'
tiff_train_labels = '/Users/johnmorin/KaylaTek/Data/images/training_data.csv'
tiff_test_labels = '/Users/johnmorin/KaylaTek/Data/images/test_data.csv'

In [4]:
#train_file_loc = '/Users/johnmorin/Documents/GitHub/Obj-Det-Tutorial/TensorFlow-Object-Detection-API-Tutorial-Train-Multiple-Objects-Windows-10/images/train_labels.csv'
#test_file_loc = '/Users/johnmorin/Documents/GitHub/Obj-Det-Tutorial/TensorFlow-Object-Detection-API-Tutorial-Train-Multiple-Objects-Windows-10/images/test_labels.csv'

In [5]:
#TRAINING_FILE_LOCATION = '/Users/johnmorin/Documents/GitHub/Obj-Det-Tutorial/TensorFlow-Object-Detection-API-Tutorial-Train-Multiple-Objects-Windows-10/images/train/*.jpg'
#TEST_FILE_LOCATION = '/Users/johnmorin/Documents/GitHub/Obj-Det-Tutorial/TensorFlow-Object-Detection-API-Tutorial-Train-Multiple-Objects-Windows-10/images/train/*.jpg'

In [6]:
#image_df.printSchema()

In [7]:
#tif_image_df = spark.read.format('image').load('/Users/johnmorin/KaylaTek/Data/images/train/frame_000000.tif')

In [8]:
#to_shape = tf.cast(tf.stack([1024, 1280, 3]), tf.int32)
#image = tf.reshape(tf.decode_raw(encoded_inputs, tf.uint8), to_shape)

In [9]:
def class_text_to_int(row_label):
    if row_label == 'nine':
        return 1
    elif row_label == 'ten':
        return 2
    elif row_label == 'jack':
        return 3
    elif row_label == 'queen':
        return 4
    elif row_label == 'king':
        return 5
    elif row_label == 'ace':
        return 6
    else:
        return None

In [31]:
def load_label_files(label_filename):
    df = pd.read_csv(label_filename)
    grouped = df['filename'].unique()
    
    csv_df = pd.DataFrame()
    for files in grouped:
        mydf = df[df['filename'] == files]
        #print(mydf.head())
        height = int(mydf['h'].unique()[0])
        width = int(mydf['w'].unique()[0])
        filename = mydf.filename.unique()[0]
        num_label = mydf['classes'].tolist()
        label = mydf['classes_text'].tolist()
        #print(list(label))
        #num_label = mydf['class'].map(class_text_to_int).tolist()
        # Creating proportional locations for the bounding boxes instead of pixel values
        xmin = mydf['xmin'].map(lambda x: x/width).tolist()
        ymin = mydf['ymin'].map(lambda x: x/height).tolist()
        xmax = mydf['xmax'].map(lambda x: x/width).tolist()
        ymax = mydf['ymax'].map(lambda x: x/height).tolist()
        temp = pd.DataFrame({'image/filename':filename, 'image/width':width, 'image/height':height, 
                        'image/object/class/label':[num_label], 'image/object/bbox/xmin':[xmin], 
                        'image/object/bbox/ymin':[ymin],'image/object/bbox/xmax':[xmax],
                        'image/object/bbox/ymax':[ymax], 'image/object/class/text':[label],
                        # Change Number of Channels to correct number
                        'image/channels': int(3)
                        #'image/encoded': dataset_util.bytes_feature(encoded_inputs),
                        })
        csv_df = pd.concat([csv_df, temp])
    csv_df.reset_index(inplace= True)
    csv_df.drop('index', axis=1, inplace = True)
    csv_df['image/format'] = '.tif'
    return csv_df

In [32]:
# Use OpenCV to read in binary for image file
#image_df = image_df.withColumn('file_locs', image_df.image.origin.substr(6, 300))
#cv2_udf = udf(lambda x: cv2.imread(x, 1).tostring(), BinaryType())
#image_df = image_df.withColumn('cv2', cv2_udf(image_df.file_locs))

In [19]:
image_df = spark.read.format('image').load(tiff_train_loc)
image_df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



In [20]:
image_df.first().image.origin

'file:/Users/johnmorin/KaylaTek/Data/images/train/frame_000365.tif'

In [76]:
def create_tfrecord(pandas_df, tf_record_loc, image_location, which):
    pand_df = sqlContext.createDataFrame(pandas_df)
    image_df = spark.read.format('image').load(image_location)
    if which is 'train':
        image_df = image_df.withColumn('filename', image_df.image.origin.substr(50, 300))
    if which is 'test':
        image_df = image_df.withColumn('filename', image_df.image.origin.substr(49, 300))
    combined_df = image_df.join(pand_df, \
                            image_df.filename == pand_df['image/filename'], \
                            'inner').drop(image_df.filename)
    # Original Code to add encoded data to 
    #combined_df = combined_df.withColumn('image/encoded', combined_df['image'].data)
    # Code to encode tiffs into binary format when pyspark won't read them
    combined_df = combined_df.withColumn('image/source_id', combined_df.image.origin.substr(6, 300))
    cv2_udf = udf(lambda x: cv2.imread(x, 1).tostring(), BinaryType())
    combined_df = combined_df.withColumn('image/encoded', cv2_udf(combined_df['image/source_id']))
    #print(combined_df.first())
    # Cleaning up dataframe before writing it to tfrecord
    combined_df = combined_df.drop('image')
    
    # Writing Pyspark Dataframe to tfrecord
    combined_df.write.format('tfrecords').option('recordType', 'Example').save(tf_record_loc)
    #combined_df.printSchema()

In [33]:
train_df = load_label_files(tiff_train_labels)
test_df = load_label_files(tiff_test_labels)

In [77]:
create_tfrecord(train_df, 'tf_tiff_train', tiff_train_loc, 'train')
create_tfrecord(test_df, 'tf_tiff_test', tiff_test_loc, 'test')

Row(image=Row(origin='file:/Users/johnmorin/KaylaTek/Data/images/train/frame_000365.tif', height=-1, width=-1, nChannels=-1, mode=-1, data=bytearray(b'')), filename='frame_000365.tif')
Row(image=Row(origin='file:/Users/johnmorin/KaylaTek/Data/images/test/frame_000403.tif', height=-1, width=-1, nChannels=-1, mode=-1, data=bytearray(b'')), filename='frame_000403.tif')


In [15]:
#label_files = spark.read.format('csv') \
#    .option("header", "true") \
#    .option("inferSchema", 'true') \
#    .load('train_label_file.csv')

In [86]:
tfrecords = '/tf_card_train/*.*'
tf_single = 'pand_tf_train.record/part-r-00000'

In [87]:
tf_rec = spark.read.format("tfrecords").option("recordType", "Example").load(tfrecords)

Py4JJavaError: An error occurred while calling o3033.load.
: org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input Pattern file:/tf_card_train/*.* matches 0 files
	at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:323)
	at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:265)
	at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits(FileInputFormat.java:387)
	at org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:130)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD$$anonfun$aggregate$1.apply(RDD.scala:1124)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.aggregate(RDD.scala:1117)
	at org.tensorflow.spark.datasources.tfrecords.TensorFlowInferSchema$.apply(TensorFlowInferSchema.scala:39)
	at org.tensorflow.spark.datasources.tfrecords.TensorflowRelation$$anonfun$3.apply(TensorflowRelation.scala:42)
	at org.tensorflow.spark.datasources.tfrecords.TensorflowRelation$$anonfun$3.apply(TensorflowRelation.scala:42)
	at scala.Option.getOrElse(Option.scala:121)
	at org.tensorflow.spark.datasources.tfrecords.TensorflowRelation.x$1$lzycompute(TensorflowRelation.scala:42)
	at org.tensorflow.spark.datasources.tfrecords.TensorflowRelation.x$1(TensorflowRelation.scala:32)
	at org.tensorflow.spark.datasources.tfrecords.TensorflowRelation.tfSchema$lzycompute(TensorflowRelation.scala:32)
	at org.tensorflow.spark.datasources.tfrecords.TensorflowRelation.tfSchema(TensorflowRelation.scala:32)
	at org.tensorflow.spark.datasources.tfrecords.TensorflowRelation.schema(TensorflowRelation.scala:59)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:403)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
	at sun.reflect.GeneratedMethodAccessor86.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [18]:
tf_rec.printSchema()

root
 |-- image/object/class/text: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- image/width: long (nullable = true)
 |-- image/height: long (nullable = true)
 |-- image/object/bbox/ymax: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- image/object/bbox/xmin: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- image/filename: string (nullable = true)
 |-- image/format: string (nullable = true)
 |-- image/source_id: string (nullable = true)
 |-- image/object/class/label: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- image/encoded: string (nullable = true)
 |-- image/object/bbox/xmax: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- image/object/bbox/ymin: array (nullable = true)
 |    |-- element: float (containsNull = true)

