# Connection to DF2 on S3

In [2]:
import boto3
import os
import pandas as pd
import json, csv
import matplotlib.image as img
from PIL import Image

In [2]:
# Setting up connection to DF2 bucket
bucket = 'descriptiveworld-datasets'
subfolder = 'DeepFashion2/'

In [3]:
from sagemaker import get_execution_role
role = get_execution_role()

In [4]:
# Check connection
conn = boto3.client('s3')
contents = conn.list_objects(Bucket=bucket, Prefix=subfolder)['Contents']
for f in contents:
    print(f['Key'])

DeepFashion2/
DeepFashion2/test/image/000001.jpg
DeepFashion2/test/image/000002.jpg
DeepFashion2/test/image/000003.jpg
DeepFashion2/test/image/000004.jpg
DeepFashion2/test/image/000005.jpg
DeepFashion2/test/image/000006.jpg
DeepFashion2/test/image/000007.jpg
DeepFashion2/test/image/000008.jpg
DeepFashion2/test/image/000009.jpg
DeepFashion2/test/image/000010.jpg
DeepFashion2/test/image/000011.jpg
DeepFashion2/test/image/000012.jpg
DeepFashion2/test/image/000013.jpg
DeepFashion2/test/image/000014.jpg
DeepFashion2/test/image/000015.jpg
DeepFashion2/test/image/000016.jpg
DeepFashion2/test/image/000017.jpg
DeepFashion2/test/image/000018.jpg
DeepFashion2/test/image/000019.jpg
DeepFashion2/test/image/000020.jpg
DeepFashion2/test/image/000021.jpg
DeepFashion2/test/image/000022.jpg
DeepFashion2/test/image/000023.jpg
DeepFashion2/test/image/000024.jpg
DeepFashion2/test/image/000025.jpg
DeepFashion2/test/image/000026.jpg
DeepFashion2/test/image/000027.jpg
DeepFashion2/test/image/000028.jpg
DeepFa

### Making the folders locally for the dataset

In [5]:
if not os.path.exists('df2'):
    os.mkdir('df2')
if not os.path.exists('df2/train'):
    os.mkdir('df2/train')
if not os.path.exists('df2/val'):
    os.mkdir('df2/val')
if not os.path.exists('df2/test'):
    os.mkdir('df2/test')

## Determining the Training Dataset

In [7]:
train_df = pd.read_json("train.json")

In [3]:
train_df.head()

Unnamed: 0,item2,source,pair_id,item3,item4,item6,item5,item8,item7,img,segmentation,scale,viewpoint,zoom_in,landmarks,style,bounding_box,category_id,occlusion,category_name
0,"{'segmentation': [[460, 438, 374, 484, 251, 52...",user,1,,,,,,,000001.jpg,"[[257, 35, 261, 89, 228, 123, 137, 103, 45, 91...",3,2,2,"[182, 54, 1, 45, 91, 1, 137, 103, 1, 228, 123,...",1,"[0, 29, 466, 622]",1,2,short sleeve top
1,"{'segmentation': [[220.25, 187.55, 259.6, 177....",shop,1,,,,,,,000002.jpg,"[[145.21, 314.0, 162.67, 312.8, 175.12, 313.05...",3,2,2,"[127, 335, 1, 73, 340, 1, 107, 354, 1, 140, 35...",1,"[1, 300, 367, 701]",1,2,short sleeve top
2,,user,2,,,,,,,000003.jpg,"[[338, 64, 299, 133, 228, 189, 183, 121, 160, ...",3,1,2,"[240, 108, 2, 160, 63, 2, 183, 121, 2, 228, 18...",1,"[1, 52, 467, 831]",11,1,long sleeve dress
3,,user,2,,,,,,,000004.jpg,"[[266, 160, 257, 191, 233, 218, 184, 210, 149,...",3,2,2,"[223, 187, 1, 149, 190, 2, 184, 210, 2, 233, 2...",1,"[0, 113, 467, 623]",11,1,long sleeve dress
4,,user,2,,,,,,,000005.jpg,"[[204.0, 143.0, 182.0, 137.0, 167.0, 130.0, 16...",3,1,2,"[205, 143, 2, 162, 129, 2, 192, 164, 2, 222, 1...",1,"[1, 98, 467, 814]",11,1,long sleeve dress


In [4]:
train_df.groupby('category_name').count()['img']

category_name
long sleeve dress        7387
long sleeve outwear     11138
long sleeve top         25085
short sleeve dress      16706
short sleeve outwear      434
short sleeve top        53914
shorts                  11999
skirt                    7471
sling                    1618
sling dress              4027
trousers                23250
vest                    12803
vest dress              16121
Name: img, dtype: int64

In [8]:
# Removing the too small (too little samples) categories
train_df = train_df[(train_df.category_name != 'short sleeve outwear') & (train_df.category_name != 'sling')]
train_df.groupby('category_name').count()['img']

category_name
long sleeve dress       7387
long sleeve outwear    11138
long sleeve top        25085
short sleeve dress     16706
short sleeve top       53914
shorts                 11999
skirt                   7471
sling dress             4027
trousers               23250
vest                   12803
vest dress             16121
Name: img, dtype: int64

In [11]:
# Deselecting the least clear pictures of the items
train = train_df[train_df.occlusion < 2]
train.groupby('category_name').count()['img']

category_name
long sleeve dress       4542
long sleeve outwear     7377
long sleeve top        16536
short sleeve dress      9083
short sleeve top       32660
shorts                  5680
skirt                   3529
sling dress             2382
trousers                7602
vest                    7081
vest dress              7700
Name: img, dtype: int64

In [12]:
# Making columns for the bounding box coordinates
train[['x1', 'y1', 'x2', 'y2']] = pd.DataFrame(
    train.bounding_box.tolist(), index=train.index)
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,item2,source,pair_id,item3,item4,item6,item5,item8,item7,img,...,landmarks,style,bounding_box,category_id,occlusion,category_name,x1,y1,x2,y2
2,,user,2,,,,,,,000003.jpg,...,"[240, 108, 2, 160, 63, 2, 183, 121, 2, 228, 18...",1,"[1, 52, 467, 831]",11,1,long sleeve dress,1,52,467,831
3,,user,2,,,,,,,000004.jpg,...,"[223, 187, 1, 149, 190, 2, 184, 210, 2, 233, 2...",1,"[0, 113, 467, 623]",11,1,long sleeve dress,0,113,467,623
4,,user,2,,,,,,,000005.jpg,...,"[205, 143, 2, 162, 129, 2, 192, 164, 2, 222, 1...",1,"[1, 98, 467, 814]",11,1,long sleeve dress,1,98,467,814
6,,shop,2,,,,,,,000007.jpg,...,"[225, 117, 1, 173, 120, 2, 189, 142, 2, 218, 1...",2,"[23, 106, 430, 696]",11,1,long sleeve dress,23,106,430,696
7,,shop,2,,,,,,,000008.jpg,...,"[235, 113, 1, 187, 114, 2, 207, 135, 2, 233, 1...",1,"[23, 94, 432, 674]",11,1,long sleeve dress,23,94,432,674


In [14]:
# Selecting columns of interest
wanted_columns = ["img", "category_id", "category_name", 'x1', 'y1', 'x2', 'y2']

In [14]:
train_final = train.loc[:, wanted_columns]
train_final.head()

Unnamed: 0,img,category_id,category_name,x1,y1,x2,y2
2,000003.jpg,11,long sleeve dress,1,52,467,831
3,000004.jpg,11,long sleeve dress,0,113,467,623
4,000005.jpg,11,long sleeve dress,1,98,467,814
6,000007.jpg,11,long sleeve dress,23,106,430,696
7,000008.jpg,11,long sleeve dress,23,94,432,674


### Collecting the training images from S3 to local

In [3]:
# Read spliced dataset
df_train = pd.read_parquet('train_set.parquet.gzip')

In [7]:
img_list_train = list(df_train["img"])
img_list_train[:5]

['000003.jpg', '000004.jpg', '000005.jpg', '000007.jpg', '000008.jpg']

In [None]:
# Transferring trainig images from s3 to local
s3 = boto3.resource('s3')

for image in img_list_train:
    orig_img = 'DeepFashion2/train/image/' + image
    dest_img = '/home/ec2-user/SageMaker/df2/train/' + image
    
    # Connect to S3 bucket and download image
    s3.Bucket(bucket).download_file(orig_img, dest_img)

In [19]:
len(img_list_train)

104172

### Continuing to prepare the final train dataset

In [12]:
# determine the dimensions of each image
for file in list(df_train['img']):
    i = Image.open('./df2/train/'+file)
    dims.append([*i.size])
dims_df = pd.DataFrame(dims, columns=['i_w', 'i_h'])

In [14]:
most_common_w = dims_df.groupby('i_w').count().sort_values(by='i_h', ascending=False).iloc[:1]
most_common_h = dims_df.groupby('i_h').count().sort_values(by='i_w', ascending=False).iloc[:1]

In [16]:
train_df = pd.concat([df_train.reset_index(drop=True), dims_df.reset_index(drop=True)], axis=1)

In [28]:
IMG_DIM = 512

In [19]:
# determine which images are outside the range of the image dim
toosmall = train_df[(train_df['i_w'] < IMG_DIM) | (train_df['i_h'] < IMG_DIM)]
print(len(toosmall))

2548


In [20]:
# Removing small images
train_final = train_df[~train_df.img.isin(toosmall.img)]
train_final.groupby('category_name').count()['img']

category_name
long sleeve dress       4425
long sleeve outwear     7244
long sleeve top        16171
short sleeve dress      8798
short sleeve top       31891
shorts                  5564
skirt                   3367
sling dress             2317
trousers                7482
vest                    6904
vest dress              7461
Name: img, dtype: int64

In [6]:
# Remake the category_id to be continuour
categories = list(df_train["category_name"].unique())
categories

['long sleeve dress',
 'long sleeve top',
 'short sleeve top',
 'skirt',
 'shorts',
 'long sleeve outwear',
 'trousers',
 'vest dress',
 'short sleeve dress',
 'vest',
 'sling dress']

In [13]:
dict_class = dict(zip(categories, range(0,11)))

In [14]:
dict_class

{'long sleeve dress': 0,
 'long sleeve top': 1,
 'short sleeve top': 2,
 'skirt': 3,
 'shorts': 4,
 'long sleeve outwear': 5,
 'trousers': 6,
 'vest dress': 7,
 'short sleeve dress': 8,
 'vest': 9,
 'sling dress': 10}

In [15]:
df_train["category_id"] = df_train['category_name'].apply(lambda x: dict_class[x])

In [16]:
df_train['category_id'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [19]:
df_train.head()

Unnamed: 0,img,category_id,category_name,x1,y1,x2,y2,i_w,i_h
0,000003.jpg,0,long sleeve dress,1,52,467,831,468,832
1,000004.jpg,0,long sleeve dress,0,113,467,623,468,624
2,000005.jpg,0,long sleeve dress,1,98,467,814,468,832
3,000007.jpg,0,long sleeve dress,23,106,430,696,468,702
4,000008.jpg,0,long sleeve dress,23,94,432,674,468,702


In [18]:
# Save spliced dataset
df_train.to_parquet('train_set.parquet.gzip', compression='gzip')

## Determining the Validation Dataset

In [8]:
val_df = pd.read_json("validation.json")

In [9]:
val_df.groupby('category_name').count()['img']

category_name
long sleeve dress       1403
long sleeve outwear     1694
long sleeve top         4414
short sleeve dress      3026
short sleeve outwear     107
short sleeve top        9603
shorts                  1200
skirt                   2156
sling                    285
sling dress              726
trousers                2792
vest                    1753
vest dress              2994
Name: img, dtype: int64

In [10]:
# Removing the same categories as training
val_df = val_df[(val_df.category_name != 'short sleeve outwear') & (val_df.category_name != 'sling')]
val_df.groupby('category_name').count()['img']

category_name
long sleeve dress      1403
long sleeve outwear    1694
long sleeve top        4414
short sleeve dress     3026
short sleeve top       9603
shorts                 1200
skirt                  2156
sling dress             726
trousers               2792
vest                   1753
vest dress             2994
Name: img, dtype: int64

In [11]:
val = val_df[val_df.occlusion < 2]
val.groupby('category_name').count()['img']

category_name
long sleeve dress       785
long sleeve outwear    1065
long sleeve top        2863
short sleeve dress     1656
short sleeve top       7065
shorts                  513
skirt                  1209
sling dress             429
trousers                690
vest                    925
vest dress             1406
Name: img, dtype: int64

In [12]:
# Making columns for the bounding box coordinates
val[['x1', 'y1', 'x2', 'y2']] = pd.DataFrame(
    val.bounding_box.tolist(), index=val.index)
val.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,item2,source,pair_id,item3,item4,item5,img,segmentation,scale,viewpoint,...,landmarks,style,bounding_box,category_id,occlusion,category_name,x1,y1,x2,y2
3,,user,3,,,,000004.jpg,"[[255.0586538462, 215.8183302808, 242.38266178...",1,3,...,"[255, 217, 1, 275, 214, 1, 0, 0, 0, 0, 0, 0, 0...",1,"[174, 205, 327, 393]",10,1,short sleeve dress,174,205,327,393
4,,user,4,,,,000005.jpg,"[[372.7580174927, 63.6168263224, 380.588088296...",3,2,...,"[426, 12, 1, 0, 0, 0, 346, 40, 2, 380, 70, 2, ...",1,"[48, 0, 467, 623]",1,1,short sleeve top,48,0,467,623
7,,user,5,,,,000008.jpg,"[[136.7588121221, 107.582797825, 136.758812122...",3,2,...,"[153, 163, 1, 137, 108, 1, 129, 145, 2, 141, 1...",2,"[42, 102, 223, 370]",12,1,vest dress,42,102,223,370
10,,user,6,,,,000011.jpg,"[[0.6857142857, 45.2571428571, 14.4, 50.057142...",3,2,...,"[224, 43, 1, 119, 12, 1, 170, 124, 1, 257, 221...",1,"[0, 0, 467, 466]",10,1,short sleeve dress,0,0,467,466
12,,user,6,,,,000013.jpg,"[[105.8021978022, 19.1538461538, 114.010989011...",2,2,...,"[164, 18, 1, 105, 18, 1, 124, 47, 2, 167, 49, ...",1,"[32, 9, 313, 492]",10,1,short sleeve dress,32,9,313,492


In [20]:
val_df = val.loc[:, wanted_columns]
val_df.head()

Unnamed: 0,img,category_id,category_name,x1,y1,x2,y2
3,000004.jpg,10,short sleeve dress,174,205,327,393
4,000005.jpg,1,short sleeve top,48,0,467,623
7,000008.jpg,12,vest dress,42,102,223,370
10,000011.jpg,10,short sleeve dress,0,0,467,466
12,000013.jpg,10,short sleeve dress,32,9,313,492


### Collecting the validation images from s3

In [20]:
val_df = pd.read_parquet('val_set.parquet.gzip')

In [17]:
img_list_val = list(val_df["img"])
img_list_val[:5]

['000004.jpg', '000005.jpg', '000008.jpg', '000011.jpg', '000013.jpg']

In [22]:
# Transferring trainig images from s3 to local
s3 = boto3.resource('s3')

for image in img_list_val:
    orig_dir = 'DeepFashion2/validation/image/' + image
    dest_dir = '/home/ec2-user/SageMaker/df2/val/' + image
    
    # Connect to S3 bucket and download image
    s3.Bucket(bucket).download_file(orig_dir, dest_dir)

In [18]:
len(img_list_val)

18606

### Continuing to Prepare the final val dataset

In [23]:
val_dim = []
for file in list(img_list_val):
    i = Image.open('./df2/val/'+file)
    val_dim.append([*i.size])
val_dim_df = pd.DataFrame(val_dim, columns=['i_w', 'i_h'])

In [24]:
val_w = val_dim_df.groupby('i_w').count().sort_values(by='i_h', ascending=False).iloc[:1]
val_h = val_dim_df.groupby('i_h').count().sort_values(by='i_w', ascending=False).iloc[:1]

In [25]:
df_val = pd.concat([val_df.reset_index(drop=True), val_dim_df.reset_index(drop=True)], axis=1)

In [29]:
toosmall_val = df_val[(df_val['i_w'] < IMG_DIM) | (df_val['i_h'] < IMG_DIM)]
print(len(toosmall_val))

10377


In [30]:
val_final = df_val[~df_val.img.isin(toosmall_val.img)]
val_final.groupby('category_name').count()['img']

category_name
long sleeve dress       331
long sleeve outwear     548
long sleeve top        1241
short sleeve dress      834
short sleeve top       3500
shorts                  183
skirt                   244
sling dress             205
trousers                171
vest                    400
vest dress              572
Name: img, dtype: int64

In [22]:
val_df["category_id"] = val_df['category_name'].apply(lambda x: dict_class[x])

In [25]:
val_df['category_id'].unique()

array([ 3,  9, 10,  8,  7,  2,  4,  5,  1,  6,  0])

In [23]:
val_df.head()

Unnamed: 0,img,category_id,category_name,x1,y1,x2,y2,i_w,i_h
7035,010580.jpg,3,skirt,234,574,465,916,750,1145
7221,010846.jpg,9,vest,256,313,421,516,640,892
7222,010848.jpg,10,sling dress,260,233,385,598,640,816
7223,010849.jpg,10,sling dress,345,365,568,1022,880,1161
7224,010850.jpg,9,vest,186,315,439,549,640,960


In [26]:
# Save spliced dataset
val_df.to_parquet('val_set.parquet.gzip', compression='gzip')

## Collecting test examples

In [None]:
# orig_file = 'DeepFashion2/train/train.json'
# dest_file = '/home/ec2-user/SageMaker/train.json'

# # Connect to S3 bucket and download file - train.json
# s3 = boto3.resource('s3')
# s3.Bucket(bucket).download_file(orig_file, dest_file)

## Pulling samples from test folder

In [25]:
test_list = []
for i in contents[1:101]:
    image = i['Key']
    test_list.append(image[-10:])

In [28]:
# transferring images from s3 to local
s3 = boto3.resource('s3')

for image in test_list:
    orig_img = 'DeepFashion2/test/image/' + image
    dest_img = '/home/ec2-user/SageMaker/effdet_df2/test/' + image
    
    # Connect to S3 bucket and download image
    s3.Bucket(bucket).download_file(orig_img, dest_img)