# Annotations Ingestion

In [None]:
!curl http://images.cocodataset.org/annotations/annotations_trainval2014.zip >> anns.zip
!unzip anns.zip
!rm anns.zip

# Annotations Loading

In [None]:
import json
import pandas as pd


# Load the val2014 caption annotations. The encoding is given explicitly
# because JSON text is UTF-8 while the platform default encoding may differ.
with open("annotations/captions_val2014.json", 'r', encoding="utf-8") as f:
    coco_data = json.load(f)

# Index the image records by id so each caption can find its image directly.
images = {image['id']: image for image in coco_data['images']}
annotations = coco_data['annotations']

# Build one row per caption: [file name, width, height, caption text].
data = []
for ann in annotations:
    image_id = ann['image_id']
    caption = ann['caption']

    # Single dict lookup; captions without a matching image record are skipped.
    image_info = images.get(image_id)
    if image_info is None:
        continue

    image_name = image_info['file_name']
    width = image_info['width']
    height = image_info['height']

    data.append([image_name, width, height, caption])

df = pd.DataFrame(data, columns=['image_name', 'width', 'height', 'caption'])

# Annotations Exploration

## Quick Look

In [None]:
# Preview the first rows of the caption table.
df.head()

In [None]:
# Summarize the columns, dtypes and non-null counts of the caption table.
df.info()

In [None]:
# Number of distinct widths and distinct heights in the table.
len(df["width"].unique()), len(df["height"].unique())

In [None]:
# Histogram of the width and height columns.
df[["width", "height"]].hist()

In [None]:
# Mean width and mean height over all caption rows.
df["width"].mean(), df["height"].mean()

In [None]:
# Print the captions for index labels 0 through 15 (label slicing with .loc
# includes the end label).
for text in df["caption"].loc[:15]:
    print(text)

In [None]:
# Count the words in each caption. Splitting on any whitespace run (rather
# than on a single " ") keeps doubled, leading or trailing spaces from being
# counted as empty "words".
df["words_count"] = df["caption"].map(lambda string: len(string.split()))

In [None]:
# Preview the first per-caption word counts.
df["words_count"].head()

In [None]:
# Histogram of the per-caption word counts.
df["words_count"].hist()

In [None]:
# Mean number of words per caption.
df["words_count"].mean()

## Data Cleaning

### Choosing Standard Widths & Heights

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter width against height; the very low alpha makes frequently occurring
# size pairs stand out as darker points.
plt.grid(True)
# Pass the frame as `data=`: a positional first argument is only interpreted
# as `data` by newer seaborn releases, while the keyword works on older ones too.
sns.scatterplot(data=df[["width", "height"]], x="width", y="height", alpha=3e-3)
plt.show()


In [None]:
import numpy as np

# Candidate width/height windows, each given as min/max bounds for width
# ("min_w", "max_w") and height ("min_h", "max_h"). They are consumed by
# filter_df. NOTE(review): 9999 presumably stands for "no upper bound" - confirm.
set1 = dict(min_w=460, max_w=540, min_h=300, max_h=400)

set2 = dict(min_w=400, max_w=500, min_h=630, max_h=9999)

set3 = dict(min_w=630, max_w=9999, min_h=300, max_h=400)

def filter_df(df, cords_set):
    """Return the rows of ``df`` whose size lies strictly inside a window.

    Args:
        df: DataFrame with numeric ``width`` and ``height`` columns.
        cords_set: Mapping with the keys ``min_w``, ``max_w``, ``min_h`` and
            ``max_h``. All four bounds are exclusive.

    Returns:
        The rows of ``df`` (original index preserved) for which
        ``min_w < width < max_w`` and ``min_h < height < max_h``.
    """
    # Vectorized comparisons replace the per-element lambdas: same strict
    # bounds, no Python-level call per row, and an empty frame still yields a
    # boolean mask.
    width_ok = (df["width"] > cords_set["min_w"]) & (df["width"] < cords_set["max_w"])
    height_ok = (df["height"] > cords_set["min_h"]) & (df["height"] < cords_set["max_h"])
    return df[width_ok & height_ok]

# Apply each of the three size windows to the full caption table.
df1, df2, df3 = (filter_df(df, window) for window in (set1, set2, set3))

In [None]:
# Number of rows that fall inside each size window.
len(df1), len(df2), len(df3)

In [None]:
# Frequency of each distinct width.
df["width"].value_counts()

In [None]:
# Frequency of each distinct height.
df["height"].value_counts()

In [None]:
# The ten most frequent (width, height) combinations.
pair_counts = df.groupby(["width", "height"]).size()
pair_counts.sort_values(ascending=False).head(10)

### Pipeline Steps

In [None]:
# (width, height) pairs kept for the pipeline, with the step each one needs.
target_pairs = [
    (640, 480),  # The perfect sizes
    (480, 640),  # Flip axis
    (427, 640),  # Resize width and flip axis
    (640, 426),  # Resize height
    (640, 427),  # Resize height
    (640, 428),  # Resize height
]
# Test every (width, height) pair for membership in one vectorized call rather
# than a Python-level df.apply over every row. The result is always a boolean
# array, so the selection also works when df is empty.
size_mask = pd.MultiIndex.from_arrays([df["width"], df["height"]]).isin(target_pairs)
df_filtered = df[size_mask]

In [None]:
# Summarize the rows that remain after keeping only the target size pairs.
df_filtered.info()