# Create Dataset for models

In [38]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!tar -xf "/content/drive/MyDrive/parking_model/PKLot.tar.gz" --directory "/content/drive/MyDrive/parking_model/"

This notebook is used to create dataset for all the models such as CNN and CNN-SVM models. 
 - For CNN-SVM models .csv files are created for train and test set to extract features separately using extract_features_from_images.py file
 - For CNN models, images are copied from PKLot Segmented folder into separate dataset folder with below folder structure
     - cnn_dataset
         - train
             - Empty
             - Occupied
         - valid
             - Empty
             - Occupied

In [None]:
from shutil import copy2
from glob import glob
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

np.random.seed(1381)

%matplotlib inline

In [None]:
# Set the folders and its path to create train and validation split
# Model is trained on images from one parking lot and tested on another one
root_dir = "/content/drive/MyDrive/parking_model/PKLot/PKLotSegmented/"
classes = ["Empty", "Occupied"]

datasets = {
    "train": ["UFPR04/Sunny/", "UFPR04/Rainy/", "UFPR04/Cloudy/", "UFPR05/Sunny/", "UFPR05/Rainy/", "UFPR05/Cloudy/"],
    "valid": ["PUC/Sunny/","PUC/Rainy/", "PUC/Cloudy/"]
}

In [None]:
# Split the images into train and validation dataset and write the list as csv file
train_dataset = []
valid_dataset = []
                  
for c in classes:
    for data_type in datasets:
        for folder in datasets[data_type]:
            dir_content = [d for d in os.listdir(os.path.join(root_dir,folder)) if os.path.isdir(os.path.join(root_dir,folder,d))]
            #print(dir_content)
            for d in dir_content:
                folder_path = os.path.join(root_dir,folder,d, c)
                images = glob(os.path.join(folder_path, "*.jpg"))
                if len(images)>0:
                    if data_type == 'train':
                        sample_size = 0.3
                    else:
                        sample_size = 0.1
                    random_sample = np.random.choice(images, replace=False,size=int(len(images)*sample_size))
                    #print(len(random_sample))
                    for img in random_sample:
                        image_name = img.split("/")[-1]
                        temp = {}
                        temp['image_name'] = image_name
                        temp["label"] = c
                        temp['folder_path'] = os.path.join(folder,d)
                        temp['data_type'] = data_type
                        if data_type == 'train':
                            train_dataset.append(temp)
                        else:
                            valid_dataset.append(temp)

In [None]:
# convert the list into dataframe to store them as csv file
df_train = pd.DataFrame(train_dataset)
df_valid = pd.DataFrame(valid_dataset)

# shuffle the dataset
df_train = df_train.sample(frac=1, random_state=1431)
df_valid = df_valid.sample(frac=1, random_state=1431)

In [None]:
print("Train dataset size: ", df_train.shape)
print("Valid dataset size: ", df_valid.shape)

Train dataset size:  (81406, 4)
Valid dataset size:  (42384, 4)


In [None]:
def show_label_distribution(df):
    df_class = pd.DataFrame(df['label'].value_counts())
    df_class = df_class.reset_index()
    df_class = df_class.rename(columns={'index': 'label', 'label': 'count'})
    df_class['percentage'] = round((df_class['count'] / sum(df_class['count']))*100, 2)
    return df_class

In [None]:
train_df = show_label_distribution(df_train)
print("Train dataset ", train_df)
valid_df = show_label_distribution(df_valid)
print("Valid dataset ", valid_df)

Train dataset        label  count  percentage
0  Occupied  43027       52.85
1     Empty  38379       47.15
Valid dataset        label  count  percentage
0     Empty  22980       54.22
1  Occupied  19404       45.78


### Create index file for CNN-SVM models

In [None]:
df_train.to_csv('/content/drive/MyDrive/parking_model/dataset_train.csv', index=False)
df_valid.to_csv('/content/drive/MyDrive/parking_model/dataset_valid.csv', index=False)

In [None]:
df_train.head(3)

Unnamed: 0,image_name,label,folder_path,data_type
75820,2013-03-14_17_26_05#014.jpg,Occupied,UFPR05/Cloudy/2013-03-14,train
29303,2013-03-10_10_30_05#017.jpg,Empty,UFPR05/Sunny/2013-03-10,train
21360,2013-02-23_09_45_03#027.jpg,Empty,UFPR05/Sunny/2013-02-23,train


In [None]:
df_valid.head(3)

Unnamed: 0,image_name,label,folder_path,data_type
39424,2012-09-28_08_01_02#004.jpg,Occupied,PUC/Cloudy/2012-09-28,valid
26136,2012-09-18_15_30_13#054.jpg,Occupied,PUC/Sunny/2012-09-18,valid
7062,2012-11-10_15_53_05#086.jpg,Empty,PUC/Sunny/2012-11-10,valid


## Create train and valid dataset for CNN models

In [None]:
import pandas as pd
import os
from shutil import copy2

In [None]:
root_dir = "/content/drive/MyDrive/parking_model/PKLot/PKLotSegmented/"
df_train = pd.read_csv("/content/drive/MyDrive/parking_model/dataset_train.csv")
df_valid = pd.read_csv("/content/drive/MyDrive/parking_model/dataset_valid.csv")

In [None]:
cnn_dataset = "/content/drive/MyDrive/parking_model/cnn_dataset/"

if not os.path.isdir(cnn_dataset):
    os.makedirs(cnn_dataset)
    print('CNN dataset folder created at ', cnn_dataset)

Create sample datataset. To create full train and valid dataset set the sample size to 100000 (size greater than number of images in df_train and df_valid to include all the images)

In [None]:
# set sample size to create sample dataset
train_sample_size = 100000
valid_sample_size = 100000

### Copy train dataset

In [None]:
if not os.path.isdir(os.path.join(cnn_dataset, 'train', 'Occupied')):
    os.makedirs(os.path.join(cnn_dataset, 'train', 'Occupied'))
    print('train/Occupied folder created at ', cnn_dataset)

if not os.path.isdir(os.path.join(cnn_dataset, 'train', 'Empty')):
    os.makedirs(os.path.join(cnn_dataset, 'train', 'Empty'))
    print('train/Empty folder created at ', cnn_dataset)

count = 1
for idx in df_train.index.values[:train_sample_size]:
    src = os.path.join(root_dir, df_train.loc[idx, 'folder_path'], df_train.loc[idx, 'label'], df_train.loc[idx, 'image_name'])
    if df_train.loc[idx, 'label'] == 'Occupied':
        dst = os.path.join(cnn_dataset, 'train', 'Occupied')
    else:
        dst = os.path.join(cnn_dataset, 'train', 'Empty')
        
    copy2(src, dst)
    count+=1

print("Total train images:", count)

Total train images: 81407


In [47]:
#!zip -r cnn_dataset.zip . -i drive/MyDrive/parking_model/cnn_dataset/
!zip -vr drive/MyDrive/parking_model/cnn_dataset.zip drive/MyDrive/parking_model/cnn_dataset/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: drive/MyDrive/parking_model/cnn_dataset/valid/Empty/2012-11-11_14_44_08#091.jpg	(in=2517) (out=2355) (deflated 6%)
  adding: drive/MyDrive/parking_model/cnn_dataset/valid/Empty/2012-10-12_05_37_28#035.jpg	(in=1322) (out=1146) (deflated 13%)
  adding: drive/MyDrive/parking_model/cnn_dataset/valid/Empty/2012-10-28_18_42_20#089.jpg	(in=2084) (out=1923) (deflated 8%)
  adding: drive/MyDrive/parking_model/cnn_dataset/valid/Empty/2012-10-12_08_02_35#047.jpg	(in=1678) (out=1513) (deflated 10%)
  adding: drive/MyDrive/parking_model/cnn_dataset/valid/Empty/2012-10-25_14_08_36#097.jpg	(in=2697) (out=2536) (deflated 6%)
  adding: drive/MyDrive/parking_model/cnn_dataset/valid/Empty/2012-09-29_06_22_03#042.jpg	(in=1270) (out=1091) (deflated 14%)
  adding: drive/MyDrive/parking_model/cnn_dataset/valid/Empty/2012-09-20_14_44_36#095.jpg	(in=1982) (out=1821) (deflated 8%)
  adding: drive/MyDrive/parking_model/cnn_dataset/valid/E

In [46]:
!pwd

/content


### Copy valid dataset

In [None]:
if not os.path.isdir(os.path.join(cnn_dataset, 'valid', 'Occupied')):
    os.makedirs(os.path.join(cnn_dataset, 'valid', 'Occupied'))
    print('valid/Occupied folder created at ', cnn_dataset)

if not os.path.isdir(os.path.join(cnn_dataset, 'valid', 'Empty')):
    os.makedirs(os.path.join(cnn_dataset, 'valid', 'Empty'))
    print('valid/Empty folder created at ', cnn_dataset)

count = 1
for idx in df_valid.index.values[:valid_sample_size]:
    src = os.path.join(root_dir, df_valid.loc[idx, 'folder_path'], df_valid.loc[idx, 'label'], df_valid.loc[idx, 'image_name'])
    if df_valid.loc[idx, 'label'] == 'Occupied':
        dst = os.path.join(cnn_dataset, 'valid', 'Occupied')
    else:
        dst = os.path.join(cnn_dataset, 'valid', 'Empty')
        
    copy2(src, dst)
    count+=1
    
print("Total valid images:", count)

valid/Occupied folder created at  /content/drive/MyDrive/parking_model/cnn_dataset/
valid/Empty folder created at  /content/drive/MyDrive/parking_model/cnn_dataset/
Total valid images: 42385
