In [608]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [609]:
# Local Spark session (master 'local[*]'); memory limits are supplied as builder options.
spark_session = (
    SparkSession.builder
    .master('local[*]')
    .config('spark.driver.memory', '10G')
    .config('spark.executor.memory', '8G')
    .getOrCreate()
)

In [610]:
# Read the COCO 2014 train/val annotation JSON into a DataFrame; Spark infers the schema.
# NOTE(review): the path looks like a directory holding several annotation files - confirm.
df_trainval_2014 = (
    spark_session
    .read
    .json('file:///workspace/d/ds_ai_stuff/datasets/mscoco/annotations_trainval2014/annotations')
)

## Data exploration for annotations_trainval2014

In [611]:
# Print the schema Spark inferred for the annotation JSON (nested arrays of structs).
df_trainval_2014.printSchema()

root
 |-- annotations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- area: double (nullable = true)
 |    |    |-- bbox: array (nullable = true)
 |    |    |    |-- element: double (containsNull = true)
 |    |    |-- caption: string (nullable = true)
 |    |    |-- category_id: long (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- image_id: long (nullable = true)
 |    |    |-- iscrowd: long (nullable = true)
 |    |    |-- keypoints: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- num_keypoints: long (nullable = true)
 |    |    |-- segmentation: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- keypoints: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- skelet

In [612]:
# One row per annotation, keeping only the fields needed for captioning.
# 'annotations' is already an array of structs, so it is exploded directly; zipping the
# projected id/caption/image_id arrays back into a struct first is unnecessary work.
captions_df = (
    df_trainval_2014
    .select(F.explode(F.col('annotations')).alias('annotation'))
    .select(
        F.col('annotation.id').alias('id'),
        F.col('annotation.caption').alias('caption'),
        F.col('annotation.image_id').alias('image_id'),
    )
    # DataFrame alias so later cells can reference columns as 'captions_df.<column>'.
    .alias('captions_df')
)

In [613]:
# Preview the first rows with full, untruncated caption text.
captions_df.show(truncate=False)

+---+-----------------------------------------------------------------------------+--------+
|id |caption                                                                      |image_id|
+---+-----------------------------------------------------------------------------+--------+
|48 |A very clean and well decorated empty bathroom                               |318556  |
|67 |A panoramic view of a kitchen and all of its appliances.                     |116100  |
|126|A blue and white bathroom with butterfly themed wall tiles.                  |318556  |
|148|A panoramic photo of a kitchen and dining room                               |116100  |
|173|A graffiti-ed stop sign across the street from a red car                     |379340  |
|188|A vandalized stop sign and a red beetle on the road                          |379340  |
|219|A bathroom with a border of butterflies and blue paint on the walls above it.|318556  |
|255|An angled view of a beautifully decorated bathroom.              

In [614]:
# One row per image entry: id, file name and pixel dimensions.
# 'images' is already an array of structs, so it is exploded directly; zipping the
# projected field arrays back into a struct first is unnecessary work.
images_df = (
    df_trainval_2014
    .select(F.explode(F.col('images')).alias('image'))
    .select(
        F.col('image.id').alias('image_id'),
        F.col('image.file_name').alias('path'),
        F.col('image.height').alias('height'),
        F.col('image.width').alias('width'),
    )
    # DataFrame alias so later cells can reference columns as 'images_df.<column>'.
    .alias('images_df')
)

In [616]:
# Preview the first image rows without truncating the file names.
images_df.show(truncate=False)

+--------+-------------------------------+------+-----+
|image_id|path                           |height|width|
+--------+-------------------------------+------+-----+
|57870   |COCO_train2014_000000057870.jpg|480   |640  |
|384029  |COCO_train2014_000000384029.jpg|429   |640  |
|222016  |COCO_train2014_000000222016.jpg|640   |480  |
|520950  |COCO_train2014_000000520950.jpg|427   |640  |
|69675   |COCO_train2014_000000069675.jpg|480   |640  |
|547471  |COCO_train2014_000000547471.jpg|640   |383  |
|122688  |COCO_train2014_000000122688.jpg|640   |480  |
|392136  |COCO_train2014_000000392136.jpg|347   |500  |
|398494  |COCO_train2014_000000398494.jpg|640   |427  |
|90570   |COCO_train2014_000000090570.jpg|429   |640  |
|504616  |COCO_train2014_000000504616.jpg|640   |480  |
|161919  |COCO_train2014_000000161919.jpg|612   |612  |
|457732  |COCO_train2014_000000457732.jpg|640   |427  |
|44404   |COCO_train2014_000000044404.jpg|480   |640  |
|4428    |COCO_train2014_000000004428.jpg|640   

In [618]:
# Attach image file name and dimensions to every annotation row via an inner join on image_id,
# then drop rows that are exact duplicates across all selected columns.
joined_df = (
    captions_df
    .join(
        images_df,
        on=F.col('captions_df.image_id') == F.col('images_df.image_id'),
        how='inner',
    )
    .select(
        F.col('captions_df.id').alias('id'),
        F.col('captions_df.image_id').alias('image_id'),
        F.col('captions_df.caption').alias('caption'),
        F.col('images_df.path').alias('filename'),
        F.col('images_df.height').alias('height'),
        F.col('images_df.width').alias('width'),
    )
    .distinct()
)

In [619]:
# Preview the joined rows (default show(): first 20 rows, long strings truncated).
joined_df.show()

+-------+--------+--------------------+--------------------+------+-----+
|     id|image_id|             caption|            filename|height|width|
+-------+--------+--------------------+--------------------+------+-----+
| 567271|      77|A young man ridin...|COCO_train2014_00...|   375|  500|
| 569752|      77|a group of teenag...|COCO_train2014_00...|   375|  500|
| 573184|      77|A time lapse imag...|COCO_train2014_00...|   375|  500|
| 579604|      77|Group of boys per...|COCO_train2014_00...|   375|  500|
| 582316|      77|some male skatebo...|COCO_train2014_00...|   375|  500|
| 471474|      77|                null|COCO_train2014_00...|   375|  500|
| 502475|      77|                null|COCO_train2014_00...|   375|  500|
| 642036|      77|                null|COCO_train2014_00...|   375|  500|
|1473417|      77|                null|COCO_train2014_00...|   375|  500|
|1727807|      77|                null|COCO_train2014_00...|   375|  500|
|1734810|      77|                null

## Number of entries per image id

In [620]:
# Count how many annotation ids each image has.
(
    joined_df
    .groupBy('image_id')
    .agg(F.count('id'))
    .show()
)

+--------+---------+
|image_id|count(id)|
+--------+---------+
|      77|       13|
|     113|       23|
|     196|       47|
|     241|       19|
|     415|        7|
|     474|        7|
|     486|       13|
|     502|        6|
|     564|       22|
|     656|        8|
|     724|        9|
|     730|       11|
|     831|       10|
|     965|       10|
|    1059|        6|
|    1145|       46|
|    1224|        7|
|    1353|       12|
|    1360|       12|
|    1374|        7|
+--------+---------+
only showing top 20 rows



## Total number of entries without duplicates

In [621]:
# Total number of de-duplicated joined rows (triggers a full Spark job).
joined_df.count()

1513549

## Let's filter out all rows with an empty or null caption

In [622]:
# Keep only rows whose caption is present and has at least one character.
caption_col = F.col('caption')
no_null_captions_df = joined_df.filter(
    caption_col.isNotNull() & (F.length(caption_col) > 0)
)

In [623]:
# Schema of the caption metadata that passed the null/empty filter.
no_null_captions_df.printSchema()

# Read the files under the train2014 directory through Spark's 'image' data source
# and print the resulting schema.
img_df = (
    spark_session.read
    .format('image')
    .load('file:///workspace/d/ds_ai_stuff/datasets/mscoco/train2014/train2014/')
)
img_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- image_id: long (nullable = true)
 |-- caption: string (nullable = true)
 |-- filename: string (nullable = true)
 |-- height: long (nullable = true)
 |-- width: long (nullable = true)

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



## Total number of entries after filtering null captions

In [624]:
# Number of rows that remain once null/empty captions are removed.
no_null_captions_df.count()

616767

In [634]:
# Export the metadata rows of one image as Parquet (presumably a fixture for the project's tests,
# given the target path - confirm).
# mode('overwrite') keeps the cell re-runnable: the default save mode raises an error when the
# target path already exists, which breaks "Restart & Run All" after the first execution.
(
    no_null_captions_df
    .filter(F.col('filename') == F.lit('COCO_train2014_000000005882.jpg'))
    .coalesce(1)  # collapse to one partition so a single part file is written
    .write
    .mode('overwrite')
    .parquet('file:///workspace/d/cptr-vistion-transformer/tests/resources/metadata/')
)

## Captions stats

* *TOTAL ROWS*: _1.513.549_
* *TOTAL ROWS EXCLUDING NULL CAPTIONS*: _616.767_

## Let's look at the caption lengths, sorted in descending order

In [None]:
# Add the character length of each caption and list the longest captions first.
(
    no_null_captions_df
    .withColumn('caption_len', F.length(F.col('caption')))
    .orderBy(F.col('caption_len').desc())
    .show()
)