#**Mount Google Drive**

In [None]:
# Colab-only helper used to expose Google Drive inside the runtime.
from google.colab import drive
# Mount Drive at /content/drive/; force_remount=True requests a fresh mount
# even when a mount already exists from an earlier run of this cell.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


# Navigate to the project directory

In [None]:
%cd /content/drive/MyDrive/visual-pollution-object-detection

/content/drive/MyDrive/visual-pollution-object-detection


In [None]:
%cd preprocessing-dataset

/content/drive/MyDrive/visual-pollution-object-detection/preprocessing-dataset


In [None]:
import pandas as pd
import os
import shutil

In [None]:
ls

 annotations.csv  'Pre-processing The dataset.ipynb'


#**Load the original CSV file into a DataFrame**

In [None]:
# Read the annotations from the current working directory.
# NOTE: a later cell writes the modified frame back to this same file name, so
# re-running this cell after that point loads the already-modified coordinates.
df = pd.read_csv('annotations.csv')

In [None]:
df

Unnamed: 0,class,image_path,name,xmax,xmin,ymax,ymin
0,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,797.0,701.0,262.0,211.0
1,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,932.0,786.0,329.0,238.0
2,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,736.0,657.0,275.0,229.0
3,7.0,ea906a663da6321bcef78be4b7d1afff.jpg,BAD_BILLBOARD,986.0,786.0,136.0,0.0
4,8.0,1c7d48005a12d1b19261b8e71df7cafe.jpg,SAND_ON_ROAD,667.0,549.0,228.0,179.0
...,...,...,...,...,...,...,...
19945,4.0,081e7bb3832ec5bb25276db161a96274.jpg,CONSTRUCTION_ROAD,1025.0,600.0,408.0,148.0
19946,2.0,1ff38a7af7f13b1201d17c6e1829373a.jpg,POTHOLES,657.0,418.0,364.0,282.0
19947,2.0,1ff38a7af7f13b1201d17c6e1829373a.jpg,POTHOLES,507.0,338.0,436.0,283.0
19948,7.0,ac97490f13140fc1bfe613ec69301b34.jpg,BAD_BILLBOARD,956.0,713.0,110.0,25.0


In [None]:
df.shape

(19950, 7)

#**Modify Annotations**

The bounding box coordinates in the ``xmax``, ``xmin``, ``ymax`` and ``ymin`` columns are incorrect.

Thus, I will correct the bbox coordinates by multiplying them by 2.

#Slicing columns

In [None]:
# Select the four bounding-box columns by name instead of by position
# (columns 3..6), so the selection does not depend on the CSV column order.
bbox_annots = df[['xmax', 'xmin', 'ymax', 'ymin']]
bbox_annots

Unnamed: 0,xmax,xmin,ymax,ymin
0,797.0,701.0,262.0,211.0
1,932.0,786.0,329.0,238.0
2,736.0,657.0,275.0,229.0
3,986.0,786.0,136.0,0.0
4,667.0,549.0,228.0,179.0
...,...,...,...,...
19945,1025.0,600.0,408.0,148.0
19946,657.0,418.0,364.0,282.0
19947,507.0,338.0,436.0,283.0
19948,956.0,713.0,110.0,25.0


# Correct the bounding box coordinates by multiplying them by 2

In [None]:
# Scale every bounding-box coordinate by a factor of 2.
bbox_annots2 = bbox_annots.mul(2)
bbox_annots2

Unnamed: 0,xmax,xmin,ymax,ymin
0,1594.0,1402.0,524.0,422.0
1,1864.0,1572.0,658.0,476.0
2,1472.0,1314.0,550.0,458.0
3,1972.0,1572.0,272.0,0.0
4,1334.0,1098.0,456.0,358.0
...,...,...,...,...
19945,2050.0,1200.0,816.0,296.0
19946,1314.0,836.0,728.0,564.0
19947,1014.0,676.0,872.0,566.0
19948,1912.0,1426.0,220.0,50.0


In [None]:
# Write the doubled coordinates back into the main frame, one column at a time
# (same column order as before: xmax, xmin, ymax, ymin).
for bbox_col in ('xmax', 'xmin', 'ymax', 'ymin'):
    df[bbox_col] = bbox_annots2[bbox_col]

#Check the modified dataset annotations

In [None]:
df

Unnamed: 0,class,image_path,name,xmax,xmin,ymax,ymin
0,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1594.0,1402.0,524.0,422.0
1,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1864.0,1572.0,658.0,476.0
2,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1472.0,1314.0,550.0,458.0
3,7.0,ea906a663da6321bcef78be4b7d1afff.jpg,BAD_BILLBOARD,1972.0,1572.0,272.0,0.0
4,8.0,1c7d48005a12d1b19261b8e71df7cafe.jpg,SAND_ON_ROAD,1334.0,1098.0,456.0,358.0
...,...,...,...,...,...,...,...
19945,4.0,081e7bb3832ec5bb25276db161a96274.jpg,CONSTRUCTION_ROAD,2050.0,1200.0,816.0,296.0
19946,2.0,1ff38a7af7f13b1201d17c6e1829373a.jpg,POTHOLES,1314.0,836.0,728.0,564.0
19947,2.0,1ff38a7af7f13b1201d17c6e1829373a.jpg,POTHOLES,1014.0,676.0,872.0,566.0
19948,7.0,ac97490f13140fc1bfe613ec69301b34.jpg,BAD_BILLBOARD,1912.0,1426.0,220.0,50.0


In [None]:
df.shape

(19950, 7)

#The annotations.csv file contains only one row for the visual pollution of type BAD STREETLIGHT, so I decided to delete this row's information, because it will not be of any use or importance in the model training process.

In [None]:
# Pull out every annotation row labelled BAD_STREETLIGHT for inspection.
is_bad_streetlight = df['name'].eq('BAD_STREETLIGHT')
BAD_STREETLIGHT = df.loc[is_bad_streetlight]

BAD_STREETLIGHT

Unnamed: 0,class,image_path,name,xmax,xmin,ymax,ymin
19353,6.0,53d3797457a0d2e3afe146e2f797e77e.jpg,BAD_STREETLIGHT,1970.0,718.0,322.0,-94.0


In [None]:
print(len(BAD_STREETLIGHT))

1


In [None]:
# Remove the BAD_STREETLIGHT rows by rebinding `df` to the filtered result
# instead of mutating the frame with inplace=True. The remaining rows keep
# their original index labels (the index is not reset here).
df = df.drop(index=df.index[df['name'] == 'BAD_STREETLIGHT'])

In [None]:
df

Unnamed: 0,class,image_path,name,xmax,xmin,ymax,ymin
0,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1594.0,1402.0,524.0,422.0
1,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1864.0,1572.0,658.0,476.0
2,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1472.0,1314.0,550.0,458.0
3,7.0,ea906a663da6321bcef78be4b7d1afff.jpg,BAD_BILLBOARD,1972.0,1572.0,272.0,0.0
4,8.0,1c7d48005a12d1b19261b8e71df7cafe.jpg,SAND_ON_ROAD,1334.0,1098.0,456.0,358.0
...,...,...,...,...,...,...,...
19945,4.0,081e7bb3832ec5bb25276db161a96274.jpg,CONSTRUCTION_ROAD,2050.0,1200.0,816.0,296.0
19946,2.0,1ff38a7af7f13b1201d17c6e1829373a.jpg,POTHOLES,1314.0,836.0,728.0,564.0
19947,2.0,1ff38a7af7f13b1201d17c6e1829373a.jpg,POTHOLES,1014.0,676.0,872.0,566.0
19948,7.0,ac97490f13140fc1bfe613ec69301b34.jpg,BAD_BILLBOARD,1912.0,1426.0,220.0,50.0


In [None]:
df.shape

(19949, 7)

#Check the total number of labels after dropping out of one class (BAD STREETLIGHT)

In [None]:
# Sorted array of the distinct class names that remain in the annotations.
df_class = df['name'].sort_values().unique()
df_class

array(['BAD_BILLBOARD', 'BROKEN_SIGNAGE', 'CLUTTER_SIDEWALK',
       'CONSTRUCTION_ROAD', 'FADED_SIGNAGE', 'GARBAGE', 'GRAFFITI',
       'POTHOLES', 'SAND_ON_ROAD', 'UNKEPT_FACADE'], dtype=object)

In [None]:
print(len(df_class))

10


#Check out the total number of images after dropping out of one class with its corresponding image (BAD STREETLIGHT)

In [None]:
# Distinct image file names (sorted), followed by how many there are.
df_images = df['image_path'].sort_values().unique()
print(len(df_images))

7873


#Write the pandas DataFrame to a CSV file(save the modified csv annotations)

In [None]:
# Keep a one-time copy of annotations.csv as it currently is on disk before it
# is overwritten, so the previous contents can be restored. The existence check
# prevents later runs from replacing that copy.
if not os.path.exists("annotations_original.csv"):
    shutil.copy("annotations.csv", "annotations_original.csv")

# Overwrite annotations.csv with the modified frame (the index is not written).
# NOTE: running the notebook from the top again after this point would double
# the already-doubled coordinates; restore annotations_original.csv first.
df.to_csv("annotations.csv", index=False)

#**Load the modified CSV file into the DataFrame and verify the changes**

In [None]:
df = pd.read_csv('annotations.csv')

In [None]:
df

Unnamed: 0,class,image_path,name,xmax,xmin,ymax,ymin
0,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1594.0,1402.0,524.0,422.0
1,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1864.0,1572.0,658.0,476.0
2,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1472.0,1314.0,550.0,458.0
3,7.0,ea906a663da6321bcef78be4b7d1afff.jpg,BAD_BILLBOARD,1972.0,1572.0,272.0,0.0
4,8.0,1c7d48005a12d1b19261b8e71df7cafe.jpg,SAND_ON_ROAD,1334.0,1098.0,456.0,358.0
...,...,...,...,...,...,...,...
19944,4.0,081e7bb3832ec5bb25276db161a96274.jpg,CONSTRUCTION_ROAD,2050.0,1200.0,816.0,296.0
19945,2.0,1ff38a7af7f13b1201d17c6e1829373a.jpg,POTHOLES,1314.0,836.0,728.0,564.0
19946,2.0,1ff38a7af7f13b1201d17c6e1829373a.jpg,POTHOLES,1014.0,676.0,872.0,566.0
19947,7.0,ac97490f13140fc1bfe613ec69301b34.jpg,BAD_BILLBOARD,1912.0,1426.0,220.0,50.0


In [None]:
df.shape

(19949, 7)

#**Split the modified csv annotations into train.csv and test.csv files**

#**1- train.csv annotations**

#Slicing rows

In [None]:
# The first 15858 annotation rows (by position) form the training split.
# Per the recorded outputs, the unique-image counts of the two splits
# (6300 and 1573) add up to the 7873 images counted earlier, so this row
# boundary does not place any image in both splits.
train_df = df.iloc[:15858]

#Check the number of train images


In [None]:
# Count the distinct images referenced by the training rows.
train_images = train_df['image_path'].sort_values().unique()
print(len(train_images))

6300


#Verify that the sliced train csv dataset contains all types of visual pollution (10 categories)

In [None]:
# Sorted array of the distinct class names present in the training rows.
train_class = train_df['name'].sort_values().unique()
train_class

array(['BAD_BILLBOARD', 'BROKEN_SIGNAGE', 'CLUTTER_SIDEWALK',
       'CONSTRUCTION_ROAD', 'FADED_SIGNAGE', 'GARBAGE', 'GRAFFITI',
       'POTHOLES', 'SAND_ON_ROAD', 'UNKEPT_FACADE'], dtype=object)

In [None]:
print(len(train_class))

10


#Write the pandas dataframe to train.csv file(save the annotations)

In [None]:
train_df.to_csv("train.csv", index=False)

#**Load the train CSV file into a Pandas DataFrame**
#to verify that everything is saved correctly

In [None]:
train_df = pd.read_csv('train.csv')

In [None]:
train_df

Unnamed: 0,class,image_path,name,xmax,xmin,ymax,ymin
0,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1594.0,1402.0,524.0,422.0
1,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1864.0,1572.0,658.0,476.0
2,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1472.0,1314.0,550.0,458.0
3,7.0,ea906a663da6321bcef78be4b7d1afff.jpg,BAD_BILLBOARD,1972.0,1572.0,272.0,0.0
4,8.0,1c7d48005a12d1b19261b8e71df7cafe.jpg,SAND_ON_ROAD,1334.0,1098.0,456.0,358.0
...,...,...,...,...,...,...,...
15853,1.0,6028f8d801929a376ba22f33873316d2.jpg,FADED_SIGNAGE,1672.0,1530.0,422.0,290.0
15854,7.0,6028f8d801929a376ba22f33873316d2.jpg,BAD_BILLBOARD,1334.0,1154.0,316.0,218.0
15855,7.0,6028f8d801929a376ba22f33873316d2.jpg,BAD_BILLBOARD,1142.0,954.0,306.0,210.0
15856,7.0,6028f8d801929a376ba22f33873316d2.jpg,BAD_BILLBOARD,338.0,70.0,380.0,236.0


In [None]:
train_df.shape

(15858, 7)

#**2- test.csv annotations**

#Slicing rows

In [None]:
# Every row from position 15858 onward (the rows after the training slice)
# forms the test split.
test_df = df.iloc[15858:]

#Check the number of test images

In [None]:
# Count the distinct images referenced by the test rows.
test_images = test_df['image_path'].sort_values().unique()
print(len(test_images))

1573


#Verify that the sliced test csv dataset contains all types of visual pollution (10 categories)

In [None]:
# Sorted array of the distinct class names present in the test rows.
test_class = test_df['name'].sort_values().unique()
test_class

array(['BAD_BILLBOARD', 'BROKEN_SIGNAGE', 'CLUTTER_SIDEWALK',
       'CONSTRUCTION_ROAD', 'FADED_SIGNAGE', 'GARBAGE', 'GRAFFITI',
       'POTHOLES', 'SAND_ON_ROAD', 'UNKEPT_FACADE'], dtype=object)

In [None]:
print(len(test_class))

10


#Write the pandas dataframe to test.csv file(save the annotations)

In [None]:
test_df.to_csv("test.csv", index=False)

#**Load the test CSV file into a Pandas DataFrame**
#to verify that everything is saved correctly

In [None]:
test_df = pd.read_csv('test.csv')

In [None]:
test_df

Unnamed: 0,class,image_path,name,xmax,xmin,ymax,ymin
0,2.0,f3170bf0bc4fd71db4be995d3b7adc86.jpg,POTHOLES,830.0,604.0,516.0,372.0
1,2.0,f3170bf0bc4fd71db4be995d3b7adc86.jpg,POTHOLES,1186.0,976.0,534.0,376.0
2,2.0,f3170bf0bc4fd71db4be995d3b7adc86.jpg,POTHOLES,320.0,-60.0,1124.0,650.0
3,9.0,dbd930c9a67c931a447f09c241abd272.jpg,CLUTTER_SIDEWALK,800.0,-94.0,688.0,414.0
4,9.0,dbd930c9a67c931a447f09c241abd272.jpg,CLUTTER_SIDEWALK,1492.0,494.0,692.0,406.0
...,...,...,...,...,...,...,...
4086,4.0,081e7bb3832ec5bb25276db161a96274.jpg,CONSTRUCTION_ROAD,2050.0,1200.0,816.0,296.0
4087,2.0,1ff38a7af7f13b1201d17c6e1829373a.jpg,POTHOLES,1314.0,836.0,728.0,564.0
4088,2.0,1ff38a7af7f13b1201d17c6e1829373a.jpg,POTHOLES,1014.0,676.0,872.0,566.0
4089,7.0,ac97490f13140fc1bfe613ec69301b34.jpg,BAD_BILLBOARD,1912.0,1426.0,220.0,50.0


In [None]:
test_df.shape

(4091, 7)

#**Split images into train and test images**
80% train images of the total 7873 images = 6300 train images

20% test images of the total 7873 images = 1573 test images

#**1- Move train images to a new folder**

In [None]:
# Reload the saved training annotations and list the distinct image files
# they reference (sorted), then report how many there are.
train_df = pd.read_csv('train.csv')
images_name = train_df['image_path'].sort_values().unique()
print(len(images_name))

6300


In [None]:
# Source folder holding all images, and destination folder for the training images.
images_path = '/content/drive/MyDrive/visual-pollution-object-detection/preprocessing-dataset/images'
train_dir = '/content/drive/MyDrive/visual-pollution-object-detection/preprocessing-dataset/train'

# Make sure the destination is a directory. If it were missing, shutil.copy
# would treat train_dir as a target *file* name and each copy would overwrite
# the previous one.
os.makedirs(train_dir, exist_ok=True)

# images_name already holds each file name exactly once, so a single pass with
# one copy per image is enough; filtering train_df again per image would copy
# the same file once for every annotation row it has.
for image_id in images_name:
    shutil.copy(os.path.join(images_path, image_id), train_dir)

# Count the files now present in the destination folder.
total_train_images = os.listdir(train_dir)
print(len(total_train_images))

6300


#**2- Move test images to a new folder**


In [None]:
# Reload the saved test annotations and list the distinct image files
# they reference (sorted), then report how many there are.
test_df = pd.read_csv('test.csv')
images_name = test_df['image_path'].sort_values().unique()
print(len(images_name))

1573


In [None]:
# Source folder holding all images, and destination folder for the test images.
images_path = '/content/drive/MyDrive/visual-pollution-object-detection/preprocessing-dataset/images'
test_dir = '/content/drive/MyDrive/visual-pollution-object-detection/preprocessing-dataset/test'

# Make sure the destination is a directory. If it were missing, shutil.copy
# would treat test_dir as a target *file* name and each copy would overwrite
# the previous one.
os.makedirs(test_dir, exist_ok=True)

# images_name already holds each file name exactly once, so a single pass with
# one copy per image is enough; filtering test_df again per image would copy
# the same file once for every annotation row it has.
for image_id in images_name:
    shutil.copy(os.path.join(images_path, image_id), test_dir)

# Count the files now present in the destination folder.
total_test_images = os.listdir(test_dir)
print(len(total_test_images))

1573


#**Due to the huge size of the dataset, I decided to train the model on 700 images**
**500 train images and**
**200 test images**

The whole model training process will be demonstrated in the ``visual pollution-object-detection`` jupyter notebook.

#**First --------------> Split the last created train csv annotations into train.csv and test.csv files**

In [None]:
df = pd.read_csv('train.csv')

In [None]:
df

Unnamed: 0,class,image_path,name,xmax,xmin,ymax,ymin
0,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1594.0,1402.0,524.0,422.0
1,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1864.0,1572.0,658.0,476.0
2,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,1472.0,1314.0,550.0,458.0
3,7.0,ea906a663da6321bcef78be4b7d1afff.jpg,BAD_BILLBOARD,1972.0,1572.0,272.0,0.0
4,8.0,1c7d48005a12d1b19261b8e71df7cafe.jpg,SAND_ON_ROAD,1334.0,1098.0,456.0,358.0
...,...,...,...,...,...,...,...
15853,1.0,6028f8d801929a376ba22f33873316d2.jpg,FADED_SIGNAGE,1672.0,1530.0,422.0,290.0
15854,7.0,6028f8d801929a376ba22f33873316d2.jpg,BAD_BILLBOARD,1334.0,1154.0,316.0,218.0
15855,7.0,6028f8d801929a376ba22f33873316d2.jpg,BAD_BILLBOARD,1142.0,954.0,306.0,210.0
15856,7.0,6028f8d801929a376ba22f33873316d2.jpg,BAD_BILLBOARD,338.0,70.0,380.0,236.0


In [None]:
df.shape

(15858, 7)

#**1- train.csv annotations**

#Slicing rows

In [None]:
# The first 1200 annotation rows (by position) form the reduced training subset.
# NOTE(review): the cut is by row position; confirm that no image has rows on
# both sides of position 1200, otherwise it would land in both reduced splits.
train_df = df.iloc[:1200]

#Check the number of train images

In [None]:
# Count the distinct images referenced by the reduced training rows.
images_name = train_df['image_path'].sort_values().unique()
print(len(images_name))

500


#Verify that the sliced train csv dataset contains all types of visual pollution (10 categories)

In [None]:
# Distinct class names (sorted) in the reduced training rows, and their count.
train_labels = train_df['name'].sort_values().unique()
print(len(train_labels))

10


In [None]:
train_labels

array(['BAD_BILLBOARD', 'BROKEN_SIGNAGE', 'CLUTTER_SIDEWALK',
       'CONSTRUCTION_ROAD', 'FADED_SIGNAGE', 'GARBAGE', 'GRAFFITI',
       'POTHOLES', 'SAND_ON_ROAD', 'UNKEPT_FACADE'], dtype=object)

#**Write the pandas dataframe to a train.csv file(save the annotations)**

In [None]:
train_df.to_csv("/content/drive/MyDrive/visual-pollution-object-detection/data/train_labels.csv", index=False)

#**2- test.csv annotations**

#Slicing rows

In [None]:
# Rows at positions 1200 through 1700 (501 rows) form the reduced test subset.
# NOTE(review): the cut is by row position; confirm that no image has rows on
# both sides of position 1200 (shared with the training subset), and that the
# image at the end of this slice is not cut off at position 1701.
test_df = df.iloc[1200:1701]

#Check the number of test images

In [None]:
# Count the distinct images referenced by the reduced test rows.
images_name = test_df['image_path'].sort_values().unique()
print(len(images_name))

200


#Verify that the sliced test csv dataset contains all types of visual pollution (10 categories)

In [None]:
# Distinct class names (sorted) in the reduced test rows, and their count.
test_labels = test_df['name'].sort_values().unique()
print(len(test_labels))

10


#Write the pandas dataframe to a test.csv file(save the annotations)

In [None]:
test_df.to_csv("/content/drive/MyDrive/visual-pollution-object-detection/data/test_labels.csv", index=False)

#**Second --------------> Split images from train folder into train and test** 
Approximately 71% of the 700-image subset (taken from the 6300 train images) = 500 train images

Approximately 29% of the 700-image subset = 200 test images

#**1- Move 500 train images into a train folder**

In [None]:
# Reload the reduced training labels and list the distinct image files they
# reference (sorted), then report how many there are.
train_df = pd.read_csv('/content/drive/MyDrive/visual-pollution-object-detection/data/train_labels.csv')
images_name = train_df['image_path'].sort_values().unique()
print(len(images_name))

500


In [None]:
total_train_images = os.listdir('/content/drive/MyDrive/visual-pollution-object-detection/preprocessing-dataset/train')
print(len(total_train_images))

6300


In [None]:
# Source: the 6300-image train folder; destination: the project's images/train folder.
images_path = '/content/drive/MyDrive/visual-pollution-object-detection/preprocessing-dataset/train'
train_dir = '/content/drive/MyDrive/visual-pollution-object-detection/images/train'

# Make sure the destination is a directory. If it were missing, shutil.copy
# would treat train_dir as a target *file* name and each copy would overwrite
# the previous one.
os.makedirs(train_dir, exist_ok=True)

# Copy each selected image once (images_name holds unique file names).
for img_id in images_name:
    shutil.copy(os.path.join(images_path, img_id), train_dir)

# Count the files now present in the destination folder.
total_train_images = os.listdir(train_dir)
print(len(total_train_images))

500


#**2- Move 200 test images into a test folder**

In [None]:
# Reload the reduced test labels and list the distinct image files they
# reference (sorted), then report how many there are.
test_df = pd.read_csv('/content/drive/MyDrive/visual-pollution-object-detection/data/test_labels.csv')
images_name = test_df['image_path'].sort_values().unique()
print(len(images_name))

200


In [None]:
total_train_images = os.listdir('/content/drive/MyDrive/visual-pollution-object-detection/preprocessing-dataset/train')
print(len(total_train_images))

6300


In [None]:
# Source: the 6300-image train folder; destination: the project's images/test folder.
images_path = '/content/drive/MyDrive/visual-pollution-object-detection/preprocessing-dataset/train'
test_dir = '/content/drive/MyDrive/visual-pollution-object-detection/images/test'

# Make sure the destination is a directory. If it were missing, shutil.copy
# would treat test_dir as a target *file* name and each copy would overwrite
# the previous one.
os.makedirs(test_dir, exist_ok=True)

# Copy each selected image once (images_name holds unique file names).
for img_id in images_name:
    shutil.copy(os.path.join(images_path, img_id), test_dir)

# Count the files now present in the destination folder.
total_test_images = os.listdir(test_dir)
print(len(total_test_images))

200
