In [99]:
import os
import pandas as pd
from glob import glob
from functools import reduce
from xml.etree import ElementTree as et

In [100]:
#load all xml file ans store in a list
xmlfiles = glob ('./images/*.xml')
#data clean
replace_text = lambda x: x.replace ('\\','/')
xmlfiles = list(map(replace_text,xmlfiles))


In [101]:
xmlfiles

[]

In [102]:
# read xml files and extract  file name ans (width , high) of the image from object ( xmin ,xmax,ymin,ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    image_name=root.find('filename').text
    width= root.find('size').find('width').text
    height= root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs: 
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox .find('xmin').text
        xmax = bndbox .find('xmax').text
        ymin = bndbox .find('ymin').text
        ymax = bndbox .find('ymax').text
        parser.append([image_name,width,height,name,xmin,xmax,ymin,ymax])
    
    return (parser)
    



In [103]:
parser_all = list(map(extract_text,xmlfiles))

In [104]:
data = reduce(lambda x,y : x+y,parser_all)

TypeError: reduce() of empty iterable with no initial value

In [105]:
df = pd.DataFrame(data, columns=['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])

In [106]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,0AHV0ASNCXTM.jpg,32,32,chair,1,32,1,32
1,0CXQ0Z6D2130.jpg,32,32,chair,1,31,2,30
2,0FSL56XHX45X.jpg,32,32,chair,3,29,1,32
3,0TVYI0FKFM75.jpg,32,32,chair,6,27,1,30
4,0ZA6E9VEPXO7.jpg,32,32,chair,1,32,1,31


In [107]:
df.shape


(285, 8)

In [108]:
df['name'].value_counts()

name
person    88
bottle    55
plant     49
car       48
chair     45
Name: count, dtype: int64

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  285 non-null    object
 1   width     285 non-null    object
 2   height    285 non-null    object
 3   name      285 non-null    object
 4   xmin      285 non-null    object
 5   xmax      285 non-null    object
 6   ymin      285 non-null    object
 7   ymax      285 non-null    object
dtypes: object(8)
memory usage: 17.9+ KB


In [110]:
# type convert
cols = ['width','height', 'xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  285 non-null    object
 1   width     285 non-null    int64 
 2   height    285 non-null    int64 
 3   name      285 non-null    object
 4   xmin      285 non-null    int64 
 5   xmax      285 non-null    int64 
 6   ymin      285 non-null    int64 
 7   ymax      285 non-null    int64 
dtypes: int64(6), object(2)
memory usage: 17.9+ KB


In [111]:
#center x , center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
#w
df['w'] = (df['xmax']-df['xmin'])/df['width']
df['h'] = (df['ymax']-df['ymin'])/df['height']          

In [112]:
df.head()


Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,0AHV0ASNCXTM.jpg,32,32,chair,1,32,1,32,0.515625,0.515625,0.96875,0.96875
1,0CXQ0Z6D2130.jpg,32,32,chair,1,31,2,30,0.5,0.5,0.9375,0.875
2,0FSL56XHX45X.jpg,32,32,chair,3,29,1,32,0.5,0.515625,0.8125,0.96875
3,0TVYI0FKFM75.jpg,32,32,chair,6,27,1,30,0.515625,0.484375,0.65625,0.90625
4,0ZA6E9VEPXO7.jpg,32,32,chair,1,32,1,31,0.515625,0.5,0.96875,0.9375


In [113]:
images = df['filename'].unique()

In [114]:
len(images)

204

In [115]:
# 80% train 20% test 
img_df = pd.DataFrame(images, columns=['filename'])
img_train = tuple(img_df.sample(frac = 0.8)['filename'])  #shuffle and pick 80% of images


In [116]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename'])

In [117]:
len(img_train),len(img_test)

(163, 41)

In [118]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [119]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,0AHV0ASNCXTM.jpg,32,32,chair,1,32,1,32,0.515625,0.515625,0.96875,0.96875
2,0FSL56XHX45X.jpg,32,32,chair,3,29,1,32,0.5,0.515625,0.8125,0.96875
3,0TVYI0FKFM75.jpg,32,32,chair,6,27,1,30,0.515625,0.484375,0.65625,0.90625
4,0ZA6E9VEPXO7.jpg,32,32,chair,1,32,1,31,0.515625,0.5,0.96875,0.9375
5,1-liter-drinking-water-pet-bottle-995-w300_2_1...,281,500,bottle,58,181,1,485,0.425267,0.486,0.437722,0.968


In [120]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
1,0CXQ0Z6D2130.jpg,32,32,chair,1,31,2,30,0.5,0.5,0.9375,0.875
18,110_1.jpg,320,213,car,28,320,1,185,0.54375,0.43662,0.9125,0.86385
20,114_1.jpg,320,203,car,72,149,130,187,0.345313,0.780788,0.240625,0.280788
21,114_1.jpg,320,203,car,187,295,124,178,0.753125,0.743842,0.3375,0.26601
22,114_1.jpg,320,203,car,138,184,125,162,0.503125,0.706897,0.14375,0.182266


In [121]:
def label_encoding (x) :
    labels = {'person':0 , 'car':1 ,'chair':2, 'plant':3,'bottle':4 }
    return labels[x]


In [122]:
# Utiliser .loc pour éviter le SettingWithCopyWarning
train_df.loc[:, 'id'] = train_df['name'].apply(label_encoding)
test_df.loc[:, 'id'] = test_df['name'].apply(label_encoding)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[:, 'id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.loc[:, 'id'] = test_df['name'].apply(label_encoding)


In [123]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,0AHV0ASNCXTM.jpg,32,32,chair,1,32,1,32,0.515625,0.515625,0.96875,0.96875,2
2,0FSL56XHX45X.jpg,32,32,chair,3,29,1,32,0.5,0.515625,0.8125,0.96875,2
3,0TVYI0FKFM75.jpg,32,32,chair,6,27,1,30,0.515625,0.484375,0.65625,0.90625,2
4,0ZA6E9VEPXO7.jpg,32,32,chair,1,32,1,31,0.515625,0.5,0.96875,0.9375,2
5,1-liter-drinking-water-pet-bottle-995-w300_2_1...,281,500,bottle,58,181,1,485,0.425267,0.486,0.437722,0.968,4
6,10.png,607,412,person,65,154,124,340,0.180395,0.563107,0.146623,0.524272,0
7,10.png,607,412,person,291,398,292,412,0.567545,0.854369,0.176277,0.291262,0
8,100.jpg,320,240,car,26,317,19,233,0.535937,0.525,0.909375,0.891667,1
9,102.jpg,320,213,car,66,264,1,209,0.515625,0.492958,0.61875,0.976526,1
10,104_1.jpg,320,213,car,49,289,87,197,0.528125,0.666667,0.75,0.516432,1


In [124]:
import os
from shutil import move 

In [125]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

FileExistsError: [WinError 183] Impossible de créer un fichier déjà existant: 'data_images/train'

In [126]:
cols = ['filename','id','center_x','center_y','w','h']
groupeby_obj_train = train_df[cols].groupby('filename')
groupeby_obj_test = test_df[cols].groupby('filename')

In [131]:
# groupeby_obj_train.get_group('10.png').set_index('filename').to_csv('sample.txt',index=False,header=False)
def save_data(filename, folder_path, group_obj):
    src = os.path.join('data_images', filename)
    dst = os.path.join(folder_path, filename)

    # Check if file exists before moving
    if not os.path.exists(src):
        print(f"File not found: {src}")
        return  # Skip this file if it doesn't exist

    # Move the file
    move(src, dst)

    # Save label
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0] + '.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)

    

In [132]:
filename_series = pd.Series(groupeby_obj_train.groups.keys())

In [133]:
filename_series.apply(save_data,args=(train_folder,groupeby_obj_train))

File not found: data_images\0AHV0ASNCXTM.jpg
File not found: data_images\0FSL56XHX45X.jpg


0      None
1      None
2      None
3      None
4      None
       ... 
158    None
159    None
160    None
161    None
162    None
Length: 163, dtype: object

In [134]:
filename_series_test = pd.Series(groupeby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupeby_obj_test))

File not found: data_images\0CXQ0Z6D2130.jpg


0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
37    None
38    None
39    None
40    None
dtype: object