## Reference:
- https://www.kaggle.com/kaushal2896/global-wheat-detection-starter-eda

In [None]:
from imutils import paths
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

In [None]:
# Load the annotation table shipped with the competition and preview it
train_csv_path = '../input/global-wheat-detection/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head()

According to [the competition's data description](https://www.kaggle.com/c/global-wheat-detection/data), the training CSV contains:

- image_id - the unique image ID
- width, height - the width and height of the images
- bbox - a bounding box, formatted as a Python-style list of [xmin, ymin, width, height]
- etc.

It's important to note that not all images have bounding boxes.

In [None]:
# How many unique images?
# (size of the array of distinct image IDs)
train_df["image_id"].unique().size

In [None]:
# Total number of entries (rows) in the annotation table
len(train_df)

In [None]:
# Total number of images in the training directory
# (left disabled; when enabled it lists the image files under "train" and counts them)
#len(list(paths.list_images("train")))

The training directory holds 3422 images, while `train.csv` references only 3373 unique image IDs. This means that `3422 - 3373`, i.e. **49 images**, do not have any annotations. [This notebook](https://www.kaggle.com/kaushal2896/global-wheat-detection-starter-eda) does an excellent job of providing more insights. Be sure to check it out.

In [None]:
from tqdm import tqdm
import ast

In [None]:
# Separating out the coordinates: one list per bbox component
xmin, ymin, width, height = [], [], [], []
component_lists = (xmin, ymin, width, height)

for raw_bbox in tqdm(train_df["bbox"]):
    # Each entry is a stringified Python list; turn it back into a real list
    parsed_bbox = ast.literal_eval(raw_bbox)
    # Position k of the parsed list feeds the k-th component list
    for position, component in enumerate(component_lists):
        component.append(parsed_bbox[position])

In [None]:
# Sanity check: the four component lists should all have the same length
tuple(len(values) for values in (xmin, ymin, width, height))

In [None]:
# Attach the parsed bbox components to the table as individual columns.
# NOTE(review): "width" and "height" reuse the names of the CSV columns that
# the description above lists as the image dimensions, so those values are
# replaced by the per-box width/height here -- confirm this is intended.
bbox_columns = {
    "xmin": xmin,
    "ymin": ymin,
    "width": width,
    "height": height,
}
for column_name, column_values in bbox_columns.items():
    train_df[column_name] = column_values
train_df.head()

In [None]:
# Visualizing some samples from the training set

# np.random.choice samples WITH replacement by default, which can put the same
# image in the grid more than once. Draw distinct images whenever there are
# enough unique IDs, and only fall back to replacement when there are not.
unique_image_ids = np.unique(train_df["image_id"].tolist())
sample_indices = np.random.choice(
    unique_image_ids, 8, replace=len(unique_image_ids) < 8
)

fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))

# `ax.flat` walks the 2x4 grid row by row, so panel k shows sample k
for col, image_id in zip(ax.flat, sample_indices):
    img = plt.imread("train/" + image_id + ".jpg")
    col.grid(False)
    col.set_xticks([])
    col.set_yticks([])
    col.imshow(img)
plt.show()

In [None]:
# Visualize the images with bounding boxes
import matplotlib.patches as patches

def get_bbox(image_id, df, col, color='white'):
    """Draw every bounding box recorded for ``image_id`` onto the axes ``col``.

    Args:
        image_id: value matched against ``df['image_id']``.
        df: DataFrame providing ``image_id``, ``xmin``, ``ymin``, ``width``
            and ``height`` columns.
        col: object exposing ``add_patch`` (a matplotlib Axes in this notebook).
        color: edge color used for the rectangle outlines.

    Returns:
        None. The rectangles are added to ``col`` as a side effect.
    """
    matching_rows = df[df['image_id'] == image_id]

    for box in matching_rows.itertuples(index=False):
        # Unfilled outline anchored at (xmin, ymin) with the box's width/height
        outline = patches.Rectangle(
            (box.xmin, box.ymin),
            box.width,
            box.height,
            linewidth=2,
            edgecolor=color,
            facecolor='none',
        )
        col.add_patch(outline)

In [None]:
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))

# Walk the 2x2 grid row by row; panel k shows the k-th sampled image
for count, col in enumerate(ax.ravel()):
    image_id = sample_indices[count]
    img = plt.imread("train/" + image_id + ".jpg")
    col.grid(False)
    col.set_xticks([])
    col.set_yticks([])
    # Boxes are added to the axes first, then the image is drawn
    get_bbox(image_id, train_df, col, color='red')
    col.imshow(img)
plt.show()

In [None]:
# Images without bounding box
# First collect the expected file path of every annotated image. The paths are
# later set-compared with the entries of `all_images`, so they are built with
# os.path.join rather than a hard-coded "/" (presumably `paths.list_images`
# joins paths the same way -- verify on non-POSIX systems).
annotated_image_ids = train_df["image_id"].unique()
images_w_bbox = [
    os.path.join("train", image_id + ".jpg")
    for image_id in annotated_image_ids
]

# Every image file found under the "train" directory
all_images = list(paths.list_images("train"))

In [None]:
# Peek at the first five expected paths of annotated images
images_w_bbox[:5]

In [None]:
# Peek at the first five paths found under the "train" directory
all_images[:5]

In [None]:
# Paths present on disk but absent from the annotation table.
# A set has no defined iteration order, so the difference is sorted to keep
# the list (and any "first N" slice taken from it) stable across runs.
images_wo_bbox = sorted(set(all_images) - set(images_w_bbox))
images_wo_bbox[:5]

In [None]:
# Visualizing some images without any wheat heads

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))

# Walk the 2x2 grid row by row; panel k shows the k-th un-annotated image
for count, col in enumerate(ax.ravel()):
    image_path = images_wo_bbox[count]
    img = plt.imread(image_path)
    col.grid(False)
    col.set_xticks([])
    col.set_yticks([])
    col.imshow(img)
plt.show()

In [None]:
# Serialize `train_df` to a .csv file
# (index=False keeps the DataFrame's row index out of the file)
train_df.to_csv("train_df.csv", index=False)
# Shell escape: print the first five lines of the file that was just written
!head -5 train_df.csv