#### Data Prep

In [1]:
import pandas as pd
import os

In [2]:
# download tar file from buckets
!gsutil cp gs://runwai-dataset/images.tgz ./data

Copying gs://runwai-dataset/images.tgz...
\ [1 files][  6.4 GiB/  6.4 GiB]   53.6 MiB/s                                   
Operation completed over 1 objects/6.4 GiB.                                      


Run in a terminal, from the `data` directory the archive was copied into:  
```tar -xvf images.tgz```  
Then run the following commands to check that all images were extracted:  
```cd images```  
```ls | wc -l```  
The output should be 44096.

In [3]:
# File paths
# DATA_DIR defaults to the original location on the notebook VM; set the RUNWAI_DATA_DIR
# environment variable to point the notebook at a copy of the data stored elsewhere.
DATA_DIR = os.environ.get("RUNWAI_DATA_DIR", "/home/jupyter/runwai/data")
IMAGES_DIR = f"{DATA_DIR}/images"  # extracted image files
LABELS_DIR = f"{DATA_DIR}/labels"  # annotation text files; the generated CSV is written here too
FABRIC_PATH = f"{LABELS_DIR}/fabric_ann.txt"
PATTERN_PATH = f"{LABELS_DIR}/pattern_ann.txt"
SHAPE_PATH = f"{LABELS_DIR}/shape_anno_all.txt"

In [4]:
# Remove the CSV produced by a previous run (if any) before it is regenerated below
stale_csv = f"{LABELS_DIR}/small_labels.csv"
if not os.path.exists(stale_csv):
  print("The file does not exist")
else:
  os.remove(stale_csv)
  print("Previous CSV is deleted")

The file does not exist


In [5]:
# Load the three space-separated annotation files into DataFrames.
# header=None: the first line of each file is read as data, so column names are supplied here.
fabric_df = pd.read_csv(
    FABRIC_PATH,
    sep=' ',
    header=None,
    names=["image", "upper_fabric", "lower_fabric", "outer_fabric"],
)
pattern_df = pd.read_csv(
    PATTERN_PATH,
    sep=' ',
    header=None,
    names=["image", "upper_color", "lower_color", "outer_color"],
)
# NOTE: "cradigan" (sic) is kept as-is because the column-drop step further down
# refers to the column by this exact spelling; rename both together if fixing it.
shape_df = pd.read_csv(
    SHAPE_PATH,
    sep=' ',
    header=None,
    names=[
        "image", "sleeve_len", "lower_clothing_len", "socks", "hat",
        "glasses", "neckwear", "wrist_acs", "rings", "waist_acs", "neckline",
        "cradigan", "navel",
    ],
)

In [6]:
# Preview the parsed shape annotations (pandas abbreviates the display of long frames)
shape_df

Unnamed: 0,image,sleeve_len,lower_clothing_len,socks,hat,glasses,neckwear,wrist_acs,rings,waist_acs,neckline,cradigan,navel
0,MEN-Denim-id_00000080-01_7_additional.jpg,5,3,0,0,0,0,0,0,3,2,1,1
1,MEN-Denim-id_00000089-01_7_additional.jpg,0,3,0,0,0,0,0,0,3,2,1,1
2,MEN-Denim-id_00000089-02_7_additional.jpg,3,3,0,0,0,0,0,0,3,4,1,1
3,MEN-Denim-id_00000089-03_7_additional.jpg,1,3,0,0,0,0,0,0,3,2,1,1
4,MEN-Denim-id_00000089-04_7_additional.jpg,3,3,0,0,0,0,0,0,3,4,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
42539,WOMEN-Tees_Tanks-id_00007979-04_4_full.jpg,0,0,0,0,0,0,0,1,0,2,1,1
42540,WOMEN-Tees_Tanks-id_00007979-04_7_additional.jpg,0,0,3,0,0,0,0,1,0,2,1,1
42541,WOMEN-Tees_Tanks-id_00007981-03_1_front.jpg,0,4,3,0,0,0,1,1,3,2,1,1
42542,WOMEN-Tees_Tanks-id_00007981-03_3_back.jpg,0,4,3,0,0,0,1,1,3,6,2,2


In [7]:
# Discard the columns that are not carried into the merged label table,
# leaving image, sleeve_len, lower_clothing_len and neckline
shape_df_small = shape_df.drop(
    columns=[
        "socks", "hat", "glasses", "neckwear", "wrist_acs",
        "rings", "waist_acs", "cradigan", "navel",
    ]
)

In [8]:
# Preview the reduced shape table to confirm only the wanted columns remain
shape_df_small

Unnamed: 0,image,sleeve_len,lower_clothing_len,neckline
0,MEN-Denim-id_00000080-01_7_additional.jpg,5,3,2
1,MEN-Denim-id_00000089-01_7_additional.jpg,0,3,2
2,MEN-Denim-id_00000089-02_7_additional.jpg,3,3,4
3,MEN-Denim-id_00000089-03_7_additional.jpg,1,3,2
4,MEN-Denim-id_00000089-04_7_additional.jpg,3,3,4
...,...,...,...,...
42539,WOMEN-Tees_Tanks-id_00007979-04_4_full.jpg,0,0,2
42540,WOMEN-Tees_Tanks-id_00007979-04_7_additional.jpg,0,0,2
42541,WOMEN-Tees_Tanks-id_00007981-03_1_front.jpg,0,4,2
42542,WOMEN-Tees_Tanks-id_00007981-03_3_back.jpg,0,4,6


In [9]:
# Join the three annotation tables on the image file name.
# No `how` is given, so pandas performs its default inner join: only images
# present in all three tables end up in the result.
fabric_pattern_df = fabric_df.merge(pattern_df, on='image')
small_all = fabric_pattern_df.merge(shape_df_small, on='image')

In [10]:
# Preview the merged table: one row per image with fabric, pattern and shape codes
small_all

Unnamed: 0,image,upper_fabric,lower_fabric,outer_fabric,upper_color,lower_color,outer_color,sleeve_len,lower_clothing_len,neckline
0,MEN-Denim-id_00000080-01_7_additional.jpg,1,1,7,3,4,7,5,3,2
1,MEN-Denim-id_00000089-01_7_additional.jpg,1,1,7,3,3,7,0,3,2
2,MEN-Denim-id_00000089-02_7_additional.jpg,1,1,7,2,3,7,3,3,4
3,MEN-Denim-id_00000089-03_7_additional.jpg,1,1,7,3,3,7,1,3,2
4,MEN-Denim-id_00000089-04_7_additional.jpg,0,1,7,3,3,7,3,3,4
...,...,...,...,...,...,...,...,...,...,...
42539,WOMEN-Tees_Tanks-id_00007979-04_4_full.jpg,1,1,7,3,3,7,0,0,2
42540,WOMEN-Tees_Tanks-id_00007979-04_7_additional.jpg,1,0,7,3,3,7,0,0,2
42541,WOMEN-Tees_Tanks-id_00007981-03_1_front.jpg,1,0,7,5,3,7,0,4,2
42542,WOMEN-Tees_Tanks-id_00007981-03_3_back.jpg,1,0,7,5,3,7,0,4,6


In [11]:
# Per-column mean and standard deviation of the integer attribute codes.
# numeric_only=True excludes the string `image` column explicitly: older pandas dropped
# it silently with a FutureWarning, while pandas >= 2.0 raises a TypeError instead.
# NOTE: the values are category codes, so these figures only give a rough view of
# how the codes are distributed.
print(small_all.mean(numeric_only=True))
print(small_all.std(numeric_only=True))

upper_fabric          1.775174
lower_fabric          0.878643
outer_fabric          6.370957
upper_color           2.479198
lower_color           2.782813
outer_color           6.468386
sleeve_len            1.325780
lower_clothing_len    2.121897
neckline              3.752915
dtype: float64
upper_fabric          1.629063
lower_fabric          1.292736
outer_fabric          1.730588
upper_color           1.402494
lower_color           0.982287
outer_color           1.444305
sleeve_len            1.311799
lower_clothing_len    1.834552
neckline              2.030285
dtype: float64


  """Entry point for launching an IPython kernel.
  


In [12]:
# Annotation look-up tables: integer code -> human-readable label.
# Each list is ordered by code, so the position in the list is the code itself.
# NOTE: the label 'NA' is a plain string here; pandas.read_csv treats "NA" as a missing
# value by default, so it is read back as NaN unless keep_default_na=False is used.
fabric_dict = dict(enumerate([
    'denim', 'cotton', 'leather', 'furry',
    'knitted', 'chiffon', 'other', 'NA',
]))
color_dict = dict(enumerate([
    'floral', 'graphic', 'striped', 'pure color',
    'lattice', 'other', 'color block', 'NA',
]))
neckline_dict = dict(enumerate([
    'V-shape', 'square', 'round', 'standing',
    'lapel', 'suspenders', 'NA',
]))
sleeve_len_dict = dict(enumerate([
    'sleeveless', 'short-sleeve', 'medium-sleeve',
    'long-sleeve', 'not long-sleeve', 'NA',
]))
lower_clothing_len_dict = dict(enumerate([
    'three-point', 'medium short', 'three-quarter', 'long', 'NA',
]))

In [13]:
# Translate every integer code into its text label.
# The outer keys name the columns; each inner dict is that column's code -> label table,
# so a given number is only replaced according to the column it appears in.
column_label_maps = {
    "upper_fabric": fabric_dict,
    "lower_fabric": fabric_dict,
    "outer_fabric": fabric_dict,
    "upper_color": color_dict,
    "lower_color": color_dict,
    "outer_color": color_dict,
    "sleeve_len": sleeve_len_dict,
    "lower_clothing_len": lower_clothing_len_dict,
    "neckline": neckline_dict,
}
small_full_descriptions = small_all.replace(column_label_maps)

In [14]:
# Preview the labelled table to check that the codes were replaced by their text labels
small_full_descriptions

Unnamed: 0,image,upper_fabric,lower_fabric,outer_fabric,upper_color,lower_color,outer_color,sleeve_len,lower_clothing_len,neckline
0,MEN-Denim-id_00000080-01_7_additional.jpg,cotton,cotton,,pure color,lattice,,,long,round
1,MEN-Denim-id_00000089-01_7_additional.jpg,cotton,cotton,,pure color,pure color,,sleeveless,long,round
2,MEN-Denim-id_00000089-02_7_additional.jpg,cotton,cotton,,striped,pure color,,long-sleeve,long,lapel
3,MEN-Denim-id_00000089-03_7_additional.jpg,cotton,cotton,,pure color,pure color,,short-sleeve,long,round
4,MEN-Denim-id_00000089-04_7_additional.jpg,denim,cotton,,pure color,pure color,,long-sleeve,long,lapel
...,...,...,...,...,...,...,...,...,...,...
42539,WOMEN-Tees_Tanks-id_00007979-04_4_full.jpg,cotton,cotton,,pure color,pure color,,sleeveless,three-point,round
42540,WOMEN-Tees_Tanks-id_00007979-04_7_additional.jpg,cotton,denim,,pure color,pure color,,sleeveless,three-point,round
42541,WOMEN-Tees_Tanks-id_00007981-03_1_front.jpg,cotton,denim,,other,pure color,,sleeveless,,round
42542,WOMEN-Tees_Tanks-id_00007981-03_3_back.jpg,cotton,denim,,other,pure color,,sleeveless,,


In [15]:
# Write the labelled table to the labels directory as CSV.
# The row index is written as well (to_csv default), appearing as an unnamed first column.
# NOTE: cells holding the string 'NA' are parsed as missing values by a default
# pandas.read_csv; readers that need the label should pass keep_default_na=False.
output_csv = f"{LABELS_DIR}/small_labels.csv"
small_full_descriptions.to_csv(output_csv)