In [20]:
import pandas as pd
import tensorflow as tf
from object_detection.utils import dataset_util
from collections import namedtuple, OrderedDict
from pyspark.sql.types import *
from pyspark.sql import Row, SparkSession
from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import udf
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, IntegerType, FloatType, ArrayType
import cv2
import numpy as np

In [21]:
# Build (or reuse) the SparkSession for this notebook, running against the
# "local[4]" master, and keep a handle on its SparkContext.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("sql.functions tests")
    .getOrCreate()
)
sc = spark.sparkContext

In [22]:
# Glob patterns for the training/test TIFF frames and the CSV label files that
# describe them.
# NOTE(review): these are absolute paths on one user's machine — adjust them
# before running the notebook anywhere else.
tiff_train_loc = '/Users/johnmorin/KaylaTek/Data/images/train/*.tif'
tiff_test_loc = '/Users/johnmorin/KaylaTek/Data/images/test/*.tif'
tiff_train_labels = '/Users/johnmorin/KaylaTek/Data/images/training_data.csv'
tiff_test_labels = '/Users/johnmorin/KaylaTek/Data/images/test_data.csv'

In [23]:
#train_file_loc = '/Users/johnmorin/Documents/GitHub/Obj-Det-Tutorial/TensorFlow-Object-Detection-API-Tutorial-Train-Multiple-Objects-Windows-10/images/train_labels.csv'
#test_file_loc = '/Users/johnmorin/Documents/GitHub/Obj-Det-Tutorial/TensorFlow-Object-Detection-API-Tutorial-Train-Multiple-Objects-Windows-10/images/test_labels.csv'

In [24]:
#TRAINING_FILE_LOCATION = '/Users/johnmorin/Documents/GitHub/Obj-Det-Tutorial/TensorFlow-Object-Detection-API-Tutorial-Train-Multiple-Objects-Windows-10/images/train/*.jpg'
#TEST_FILE_LOCATION = '/Users/johnmorin/Documents/GitHub/Obj-Det-Tutorial/TensorFlow-Object-Detection-API-Tutorial-Train-Multiple-Objects-Windows-10/images/train/*.jpg'

In [25]:
#tif_image_df = spark.read.format('image').load('/Users/johnmorin/KaylaTek/Data/images/train/frame_000000.tif')

In [26]:
#to_shape = tf.cast(tf.stack([1024, 1280, 3]), tf.int32)
#image = tf.reshape(tf.decode_raw(encoded_inputs, tf.uint8), to_shape)

In [27]:
def class_text_to_int(row_label):
    """Translate a text label into its integer class id.

    The labels 'nine', 'ten', 'jack', 'queen', 'king' and 'ace' map to the
    ids 1 through 6, in that order. Any other value yields None.
    """
    known_labels = ('nine', 'ten', 'jack', 'queen', 'king', 'ace')
    # Class ids are the 1-based positions in `known_labels`; labels are
    # compared with == one after another, first match wins.
    for class_id, known in enumerate(known_labels, start=1):
        if row_label == known:
            return class_id
    return None

In [28]:
def load_label_files(label_filename, image_format='.tif', channels=3):
    """Collapse a per-box label CSV into one row per image.

    The CSV is read with pandas and must provide the columns ``filename``,
    ``w``, ``h``, ``classes``, ``classes_text``, ``xmin``, ``ymin``, ``xmax``
    and ``ymax``. All rows sharing a ``filename`` are merged into a single
    record whose per-box values are stored as lists. Box coordinates are
    converted from pixels to proportions by dividing by the width/height
    found on the first row of that file.

    Parameters
    ----------
    label_filename : str or path-like
        Anything ``pandas.read_csv`` accepts.
    image_format : str, optional
        Value written to the ``image/format`` column (default ``'.tif'``).
    channels : int, optional
        Value written to the ``image/channels`` column (default ``3``).

    Returns
    -------
    pandas.DataFrame
        One row per distinct filename, in order of first appearance in the
        CSV, with a fresh 0..n-1 index. An empty CSV yields an empty frame
        that still carries every output column. Rows whose ``filename`` is
        missing are ignored (pandas drops NaN group keys).
    """
    output_columns = [
        'image/filename', 'image/width', 'image/height',
        'image/object/class/label',
        'image/object/bbox/xmin', 'image/object/bbox/ymin',
        'image/object/bbox/xmax', 'image/object/bbox/ymax',
        'image/object/class/text',
        'image/channels', 'image/format',
    ]
    labels = pd.read_csv(label_filename)

    # Collect plain dicts and build the frame once at the end; concatenating
    # inside the loop copies the accumulated frame on every iteration.
    records = []
    # sort=False keeps the files in order of first appearance.
    for _, boxes in labels.groupby('filename', sort=False):
        height = int(boxes['h'].iloc[0])
        width = int(boxes['w'].iloc[0])
        records.append({
            'image/filename': boxes['filename'].iloc[0],
            'image/width': width,
            'image/height': height,
            'image/object/class/label': boxes['classes'].tolist(),
            # Proportional locations for the bounding boxes instead of
            # pixel values.
            'image/object/bbox/xmin': (boxes['xmin'] / width).tolist(),
            'image/object/bbox/ymin': (boxes['ymin'] / height).tolist(),
            'image/object/bbox/xmax': (boxes['xmax'] / width).tolist(),
            'image/object/bbox/ymax': (boxes['ymax'] / height).tolist(),
            'image/object/class/text': boxes['classes_text'].tolist(),
            'image/channels': int(channels),
            'image/format': image_format,
        })

    return pd.DataFrame(records, columns=output_columns)

In [29]:
# Create UDF
#cv2_udf = udf(lambda x: cv2.imread(x, 1).tostring(), BinaryType())

In [30]:
# Use OpenCV to read in binary for image file
#image_df = image_df.withColumn('file_locs', image_df.image.origin.substr(6, 300))

#image_df = image_df.withColumn('cv2', cv2_udf(image_df.file_locs))

In [31]:
# Load the training TIFFs through Spark's 'image' data source and print the
# resulting schema (a single `image` struct column).
image_df = spark.read.format('image').load(tiff_train_loc)
image_df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



In [32]:
# Display the origin URI of the first image row; create_tfrecord derives the
# bare file name and the local path from this string.
image_df.first().image.origin

'file:/Users/johnmorin/KaylaTek/Data/images/train/frame_000365.tif'

In [33]:
def create_tfrecord(pandas_df, tf_record_loc, image_location, which):
    """Join per-image labels with the image files and write them as TFRecords.

    Parameters
    ----------
    pandas_df : pandas.DataFrame
        Label table with an ``image/filename`` column holding bare file names
        (for example the output of ``load_label_files``).
    tf_record_loc : str
        Output location handed to the ``tfrecords`` writer.
    image_location : str
        Path or glob given to Spark's ``image`` reader.
    which : str
        Split name (``'train'`` or ``'test'``). Kept so existing callers keep
        working; it no longer influences the result because the file name is
        now taken from the last path component of the image origin instead of
        a per-split character offset.

    Raises
    ------
    IOError
        Raised inside the Spark job when OpenCV cannot read an image file.

    Notes
    -----
    Relies on the notebook-level ``spark`` session. Only images whose file
    name appears in ``pandas_df`` are written (inner join).
    """
    # Local import: the helpers below are otherwise only reachable through
    # the notebook's wildcard import of pyspark.sql.functions.
    from pyspark.sql import functions as F

    labels_sdf = spark.createDataFrame(pandas_df)
    images_sdf = spark.read.format('image').load(image_location)

    # Everything after the last '/' of the origin URI is the bare file name,
    # regardless of how long the directory part is.
    images_sdf = images_sdf.withColumn(
        'filename', F.substring_index(images_sdf.image.origin, '/', -1))
    combined_df = images_sdf.join(
        labels_sdf,
        images_sdf.filename == labels_sdf['image/filename'],
        'inner').drop(images_sdf.filename)

    # Strip the 'file:' (or 'file://') scheme so OpenCV receives a plain
    # local path, without truncating long paths.
    combined_df = combined_df.withColumn(
        'image/source_id',
        F.regexp_replace(combined_df.image.origin, r'^file:(?://)?', ''))

    def _read_image_bytes(path):
        # Flag 1 is cv2.IMREAD_COLOR. imread signals failure by returning
        # None rather than raising, so turn that into an explicit error.
        pixels = cv2.imread(path, 1)
        if pixels is None:
            raise IOError('cv2.imread could not read image: {}'.format(path))
        # Raw pixel buffer of the decoded array (not a compressed image
        # file), so a reader has to know the image shape to rebuild it.
        return pixels.tobytes()

    # The TIFFs are re-read with OpenCV because the bytes from Spark's image
    # reader were not usable for them (see the original note in this cell).
    cv2_udf = udf(_read_image_bytes, BinaryType())
    combined_df = combined_df.withColumn(
        'image/encoded', cv2_udf(combined_df['image/source_id']))

    # Drop the Spark image struct before writing the TFRecords.
    combined_df = combined_df.drop('image')

    # Write the PySpark DataFrame as tf.Example records.
    combined_df.write.format('tfrecords').option('recordType', 'Example').save(tf_record_loc)

In [34]:
# Build the one-row-per-image label tables for the training and test splits.
train_df = load_label_files(tiff_train_labels)
test_df = load_label_files(tiff_test_labels)

In [36]:
# Write one TFRecord dataset per split, to 'tf_tiff_train' and 'tf_tiff_test'.
create_tfrecord(train_df, 'tf_tiff_train', tiff_train_loc, 'train')
create_tfrecord(test_df, 'tf_tiff_test', tiff_test_loc, 'test')

In [None]:
#label_files = spark.read.format('csv') \
#    .option("header", "true") \
#    .option("inferSchema", 'true') \
#    .load('train_label_file.csv')

In [None]:
# Paths used to read TFRecords back in for inspection.
# NOTE(review): neither path matches the 'tf_tiff_train' / 'tf_tiff_test'
# outputs written by the create_tfrecord calls, and '/tf_card_train' is rooted
# at '/' — confirm these are the intended locations.
tfrecords = '/tf_card_train/*.*'
tf_single = 'pand_tf_train.record/part-r-00000'

In [None]:
# Read the TFRecord files back as tf.Example records to check what was written.
tf_rec = spark.read.format("tfrecords").option("recordType", "Example").load(tfrecords)

In [None]:
# Print the schema Spark derived for the TFRecord data.
tf_rec.printSchema()

In [19]:
import os