# Import Libraries

In [1]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

# Dataset Download

In [3]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d gopalbhattrai/pascal-voc-2012-dataset

Dataset URL: https://www.kaggle.com/datasets/gopalbhattrai/pascal-voc-2012-dataset
License(s): unknown
Downloading pascal-voc-2012-dataset.zip to /content
100% 3.51G/3.52G [00:43<00:00, 98.6MB/s]
100% 3.52G/3.52G [00:43<00:00, 87.6MB/s]


In [70]:
!unzip "/content/pascal-voc-2012-dataset.zip" -d "/content/dataset"

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/dataset/VOC2012_train_val/VOC2012_train_val/SegmentationClass/2008_001874.png  
  inflating: /content/dataset/VOC2012_train_val/VOC2012_train_val/SegmentationClass/2008_001876.png  
  inflating: /content/dataset/VOC2012_train_val/VOC2012_train_val/SegmentationClass/2008_001882.png  
  inflating: /content/dataset/VOC2012_train_val/VOC2012_train_val/SegmentationClass/2008_001885.png  
  inflating: /content/dataset/VOC2012_train_val/VOC2012_train_val/SegmentationClass/2008_001895.png  
  inflating: /content/dataset/VOC2012_train_val/VOC2012_train_val/SegmentationClass/2008_001896.png  
  inflating: /content/dataset/VOC2012_train_val/VOC2012_train_val/SegmentationClass/2008_001926.png  
  inflating: /content/dataset/VOC2012_train_val/VOC2012_train_val/SegmentationClass/2008_001966.png  
  inflating: /content/dataset/VOC2012_train_val/VOC2012_train_val/SegmentationClass/2008_001971.png  
  inflating: /con

# Data Preparation

In [2]:
# Get the path of each xml file
xml_train_val = glob("VOC2012_train_val/VOC2012_train_val/Annotations/*.xml")
xml_test = glob("VOC2012_test/VOC2012_test/Annotations/*.xml")

In [3]:
# Read xml flies
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find("filename").text
    # width and height of the image
    width = root.find("size").find("width").text
    height = root.find("size").find("height").text
    objs = root.findall("object")
    parser = []
    for obj in objs:
        name = obj.find("name").text
        bndbox = obj.find("bndbox")
        xmin = bndbox.find("xmin").text
        xmax = bndbox.find("xmax").text
        ymin = bndbox.find("ymin").text
        ymax = bndbox.find("ymax").text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])

    return parser

In [4]:
parser_train_val = list(map(extract_text, xml_train_val))
parser_test = list(map(extract_text, xml_test))

In [5]:
train_val_data = reduce(lambda x, y : x+y, parser_train_val)
test_data = reduce(lambda x, y : x+y, parser_test)

In [6]:
df_train_val = pd.DataFrame(train_val_data, columns = ["filename", "width", "height", "name", "xmin", "xmax", "ymin", "ymax"])
df_test = pd.DataFrame(test_data, columns = ["filename", "width", "height", "name", "xmin", "xmax", "ymin", "ymax"])

In [7]:
df_train_val["name"].value_counts()

person         17401
chair           3056
car             2492
dog             1598
bottle          1561
cat             1277
bird            1271
pottedplant     1202
sheep           1084
boat            1059
aeroplane       1002
tvmonitor        893
sofa             841
bicycle          837
horse            803
motorbike        801
diningtable      800
cow              771
train            704
bus              685
Name: name, dtype: int64

In [8]:
 df_train_val.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,2007_000027.jpg,486,500,person,174,349,101,351
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123
3,2007_000032.jpg,500,281,person,195,213,180,229
4,2007_000032.jpg,500,281,person,26,44,189,238


In [9]:
# type conversion
cols = ["width", "height", "xmin", "xmax", "ymin", "ymax"]
df_train_val['ymin'] = df_train_val['ymin'].astype(float).round().astype(int)
df_train_val[cols] = df_train_val[cols].astype(int)
df_test[cols] = df_test[cols].astype(int)


In [10]:
df_train_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40138 entries, 0 to 40137
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  40138 non-null  object
 1   width     40138 non-null  int32 
 2   height    40138 non-null  int32 
 3   name      40138 non-null  object
 4   xmin      40138 non-null  int32 
 5   xmax      40138 non-null  int32 
 6   ymin      40138 non-null  int32 
 7   ymax      40138 non-null  int32 
dtypes: int32(6), object(2)
memory usage: 1.5+ MB


In [11]:
# center x, center y
df_train_val["center_x"] = ((df_train_val["xmax"]+df_train_val["xmin"])/2)/df_train_val["width"]
df_train_val["center_y"] = ((df_train_val["ymax"]+df_train_val["ymin"])/2)/df_train_val["height"]
df_test["center_x"] = ((df_test["xmax"]+df_test["xmin"])/2)/df_test["width"]
df_test["center_y"] = ((df_test["ymax"]+df_test["ymin"])/2)/df_test["height"]
# w
df_train_val["w"] = (df_train_val["xmax"]-df_train_val["xmin"])/df_train_val["width"]
df_test["w"] = (df_test["xmax"]-df_test["xmin"])/df_test["width"]
# h
df_train_val["h"] = (df_train_val["ymax"]-df_train_val["ymin"])/df_train_val["height"]
df_test["h"] = (df_test["ymax"]-df_test["ymin"])/df_test["height"]

In [12]:
df_train_val.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,2007_000027.jpg,486,500,person,174,349,101,351,0.538066,0.452,0.360082,0.5
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377


### Split data into train and valid

In [13]:
images = df_train_val["filename"].unique()

In [14]:
img_df = pd.DataFrame(images, columns = ["filename"])
img_train = tuple(img_df.sample(frac=0.8)["filename"]) # shuffle and pick 80% of images

In [15]:
img_val = tuple(img_df.query(f"filename not in {img_train}")["filename"]) # take the rest 20% images

In [16]:
len(img_train), len(img_val)

(13700, 3425)

In [17]:
train_df = df_train_val.query(f"filename in {img_train}")
val_df = df_train_val.query(f"filename in {img_val}")

In [18]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,2007_000027.jpg,486,500,person,174,349,101,351,0.538066,0.452,0.360082,0.5
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377


In [19]:
val_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
40,2007_000272.jpg,333,500,person,25,304,71,500,0.493994,0.571,0.837838,0.858
41,2007_000323.jpg,500,375,person,277,500,3,375,0.777,0.504,0.446,0.992
42,2007_000323.jpg,500,375,person,12,305,3,375,0.317,0.504,0.586,0.992
62,2007_000480.jpg,500,375,person,293,419,162,375,0.712,0.716,0.252,0.568
63,2007_000480.jpg,500,375,person,114,228,165,373,0.342,0.717333,0.228,0.554667


In [20]:
df_test.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,2008_000200.jpg,500,375,person,119,184,76,311,0.303,0.516,0.13,0.626667
1,2008_000200.jpg,500,375,person,266,338,43,323,0.604,0.488,0.144,0.746667
2,2008_000210.jpg,500,333,person,21,439,1,333,0.46,0.501502,0.836,0.996997
3,2008_000216.jpg,500,371,person,60,197,52,371,0.257,0.570081,0.274,0.859838
4,2008_000216.jpg,500,371,person,169,277,133,371,0.446,0.679245,0.216,0.641509


### Assign id number to object names

In [21]:
# Label encoding
def label_encoding(x):
    labels = {"person":0, "car":1, "chair":2, "bottle":3, "pottedplant":4, "bird":5, "dog":6,
              "sofa":7, "bicycle":8, "horse":9, "boat":10, "motorbike":11, "cat":12, "tvmonitor":13,
              "cow":14, "sheep":15, "aeroplane":16, "train":17, "diningtable":18, "bus":19}
    return labels[x]

In [22]:
train_df["id"] = train_df["name"].apply(label_encoding)
val_df["id"] = val_df["name"].apply(label_encoding)
df_test["id"] = df_test["name"].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["id"] = train_df["name"].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df["id"] = val_df["name"].apply(label_encoding)


In [23]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,2007_000027.jpg,486,500,person,174,349,101,351,0.538066,0.452,0.360082,0.5,0
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665,16
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555,16
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377,0
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377,0
5,2007_000033.jpg,500,366,aeroplane,9,499,107,263,0.508,0.505464,0.98,0.42623,16
6,2007_000033.jpg,500,366,aeroplane,421,482,200,226,0.903,0.581967,0.122,0.071038,16
7,2007_000033.jpg,500,366,aeroplane,325,411,188,223,0.736,0.561475,0.172,0.095628,16
8,2007_000039.jpg,500,375,tvmonitor,156,344,89,279,0.5,0.490667,0.376,0.506667,13
9,2007_000042.jpg,500,335,train,263,500,32,295,0.763,0.48806,0.474,0.785075,17


### Save Image and Labels in text

In [24]:
import os
from shutil import move

In [25]:
train_folder = "data_images/train"
test_folder = "data_images/test"
val_folder = "data_images/validation"

In [26]:
cols = ["filename", "id", "center_x", "center_y", "w", "h"]
groupby_obj_train = train_df[cols].groupby("filename")
groupby_obj_val = val_df[cols].groupby("filename")
groupby_obj_test = df_test[cols].groupby("filename")

In [38]:
# Save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join("VOC2012_test/VOC2012_test/JPEGImages",filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst) # move image to the destination folder

    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+".txt")
    group_obj.get_group(filename).set_index("filename").to_csv(text_filename, sep = " ", index=False,header=False)

In [39]:
filename_series = pd.Series(groupby_obj_test.groups.keys())

In [None]:
filename_series.apply(save_data, args=(test_folder,groupby_obj_test))

In [81]:
import os
import shutil

# Define the source and destination folders
folder_to_save = '/content/data_images'
destination_folder = '/content/drive/MyDrive/Models/YOLO'

# Walk through the directory tree and copy files
for root, dirs, files in os.walk(folder_to_save):
    # Construct the corresponding directory structure in the destination folder
    dest_root = os.path.join(destination_folder, os.path.relpath(root, folder_to_save))
    os.makedirs(dest_root, exist_ok=True)

    # Copy files in the current directory
    for file_name in files:
        source_file = os.path.join(root, file_name)
        destination_file = os.path.join(dest_root, file_name)
        shutil.copy(source_file, destination_file)
