<a href="https://colab.research.google.com/github/lawsonk16/IGARSS22/blob/main/Data_Prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
import os
sys.path.append('/content/drive/MyDrive/Colab Notebooks/scripts/')
from simclr import *

In [None]:
from tqdm import tqdm
import shutil
import numpy as np
from collections import Counter

def semi_train_val_test(base_folder, new_folder, val_p = 20, test_p = 20, max_samples = 1000000000):
    '''
    Copy a class-per-folder image dataset into flat train/val/test folders.

    IN:
      - base_folder: folder of images with sub-folder names corresponding to
                     class names (entries that are not folders are ignored)
      - new_folder: folder where the train, val and test splits will be placed
      - val_p: percentage as integer that should be placed in the val folder
      - test_p: percentage as integer that should be placed in the test folder
      - max_samples: integer value for the max samples per class. If a class has
                     more examples than this, a random selection up to this number
                     will be used
    OUT: no variables returned, creates train, val and test folders in
         new_folder, with image chips copied and renamed to
         '<class name>_<original file name>' for use in semi-supervised
         learning. The names of classes that reach max_samples are printed.
    RAISES: ValueError if val_p or test_p is negative or they sum to more
            than 100.

    The shuffle uses numpy's global random state, so call np.random.seed(...)
    beforehand to get a reproducible split.
    '''

    if val_p < 0 or test_p < 0 or val_p + test_p > 100:
        raise ValueError(
            f'val_p and test_p must be non-negative and sum to at most 100, '
            f'got val_p={val_p}, test_p={test_p}')

    # create every split folder independently; makedirs also creates
    # new_folder (and any missing parents) and tolerates existing folders,
    # so a partially created output tree no longer crashes or lacks a split
    split_folders = {
        name: os.path.join(new_folder, name) for name in ('train', 'val', 'test')
    }
    for split_folder in split_folders.values():
        os.makedirs(split_folder, exist_ok=True)

    # get list of class names: only sub-folders count as classes, and sorting
    # makes the processing order independent of the filesystem
    cls_fs = sorted(
        name for name in os.listdir(base_folder)
        if os.path.isdir(os.path.join(base_folder, name))
    )

    # process on a class by class basis
    for cls_fp in tqdm(cls_fs):

        # get a list of images within this class, shuffle them
        # (sorted first so a fixed numpy seed always yields the same split)
        fp = os.path.join(base_folder, cls_fp)
        images = sorted(os.listdir(fp))
        num_samples = len(images)
        np.random.shuffle(images)

        # report classes that hit the per-class cap
        if num_samples >= max_samples:
            print(cls_fp)

        # split using the percentages of the (possibly capped) class size
        num_used = min(num_samples, max_samples)
        test_index = int(num_used * test_p / 100)
        val_index = int(num_used * val_p / 100) + test_index
        samples = {
            'test': images[:test_index],
            'val': images[test_index:val_index],
            'train': images[val_index:num_used],
        }

        # Copy the selected images to the correct new folders, prefixing
        # each file name with its class name
        for split_name, split_samples in samples.items():
            for t in split_samples:
                src = os.path.join(fp, t)
                dst = os.path.join(split_folders[split_name], f'{cls_fp}_{t}')
                shutil.copy2(src, dst)

    return

def get_semi_class_counts(root_fp):
    '''
    Print how many files of each class every split folder contains.

    IN:
      - root_fp: folder containing one sub-folder per split, whose files are
                 named '<class name>_<original file name>'. A trailing slash
                 is optional; entries that are not folders are ignored.
    OUT: no variables returned, prints one line per split (in alphabetical
         order) with the split name and a Counter of class name -> file count

    NOTE: the class name is taken as the text before the first underscore,
          so a class name that itself contains '_' is cut short.
    '''
    for split in sorted(os.listdir(root_fp)):
        split_fp = os.path.join(root_fp, split)

        # skip stray files (e.g. an archive saved next to the split folders)
        if not os.path.isdir(split_fp):
            continue

        # sorting first keeps classes with equal counts in alphabetical order
        class_names = [f.split('_')[0] for f in sorted(os.listdir(split_fp))]
        print(split, Counter(class_names))
    return

# DOTA Data

In [None]:
# Copy the DOTA classification archive from Google Drive into the current
# directory, then unpack it quietly.
# NOTE(review): the unzip path assumes the current directory is /content - confirm
! cp '/content/drive/MyDrive/Colab Notebooks/SCHOOL/Final Project/Data/DOTA/DOTA_class.zip' .
! unzip -q '/content/DOTA_class.zip'

In [None]:
base_folder = '/content/content/DOTA_class/'

# Report how many files each DOTA class folder holds
for cls in os.listdir(base_folder):
    cls_fp = f'{base_folder}{cls}/'
    cls_ct = len(os.listdir(cls_fp))
    print(f'{cls} : {cls_ct}')

baseball-diamond : 412
roundabout : 437
soccer-ball-field : 338
small-vehicle : 126499
container-crane : 142
ship : 32973
harbor : 6016
basketball-court : 529
plane : 7988
helicopter : 635
bridge : 2075
large-vehicle : 22218
swimming-pool : 2181
storage-tank : 5346
tennis-court : 2425
ground-track-field : 331


In [None]:
# DOTA split settings: 20% val, 20% test, at most 2500 samples per class
val_p, test_p, max_samples = 20, 20, 2500
base_folder = '/content/content/DOTA_class/'

# The output folder name records the settings used to build it
new_folder = f'/content/DOTA_val_{val_p}_test_{test_p}_max_{max_samples}/'
semi_train_val_test(
    base_folder,
    new_folder,
    val_p=val_p,
    test_p=test_p,
    max_samples=max_samples,
)

 12%|█▎        | 2/16 [00:00<00:01, 12.88it/s]

small-vehicle


 25%|██▌       | 4/16 [00:00<00:02,  4.89it/s]

ship


 38%|███▊      | 6/16 [00:01<00:01,  5.54it/s]

harbor


 50%|█████     | 8/16 [00:01<00:02,  3.82it/s]

plane


 69%|██████▉   | 11/16 [00:02<00:01,  3.58it/s]

large-vehicle


 81%|████████▏ | 13/16 [00:03<00:00,  3.60it/s]

storage-tank


100%|██████████| 16/16 [00:04<00:00,  3.53it/s]


In [None]:
# Print the per-class file counts of each split folder under new_folder
get_semi_class_counts(new_folder)

val Counter({'harbor': 500, 'large-vehicle': 500, 'plane': 500, 'ship': 500, 'small-vehicle': 500, 'storage-tank': 500, 'tennis-court': 485, 'swimming-pool': 436, 'bridge': 415, 'helicopter': 127, 'basketball-court': 105, 'roundabout': 87, 'baseball-diamond': 82, 'soccer-ball-field': 67, 'ground-track-field': 66, 'container-crane': 28})
test Counter({'harbor': 500, 'large-vehicle': 500, 'plane': 500, 'ship': 500, 'small-vehicle': 500, 'storage-tank': 500, 'tennis-court': 485, 'swimming-pool': 436, 'bridge': 415, 'helicopter': 127, 'basketball-court': 105, 'roundabout': 87, 'baseball-diamond': 82, 'soccer-ball-field': 67, 'ground-track-field': 66, 'container-crane': 28})
train Counter({'harbor': 1500, 'large-vehicle': 1500, 'plane': 1500, 'ship': 1500, 'small-vehicle': 1500, 'storage-tank': 1500, 'tennis-court': 1455, 'swimming-pool': 1309, 'bridge': 1245, 'helicopter': 381, 'basketball-court': 319, 'roundabout': 263, 'baseball-diamond': 248, 'soccer-ball-field': 204, 'ground-track-fiel

In [None]:
# Zip the DOTA split folder and copy the archive to Google Drive.
# NOTE(review): the archive goes to '.../SCHOOL/Advanced Computer Vision/Final Project/'
# while the raw data was read from '.../SCHOOL/Final Project/Data/' - confirm both paths
! zip -rq '/content/DOTA_val_20_test_20_max_2500.zip' '/content/DOTA_val_20_test_20_max_2500'
! cp '/content/DOTA_val_20_test_20_max_2500.zip' '/content/drive/MyDrive/Colab Notebooks/SCHOOL/Advanced Computer Vision/Final Project/'

# FAIR1M Data

In [None]:
# Copy the FAIR1M classification archive from Google Drive into the current
# directory, then unpack it quietly (both steps use the current directory).
! cp '/content/drive/MyDrive/Colab Notebooks/SCHOOL/Final Project/Data/FAIR1M/fair1m_classification.zip' .
! unzip -q 'fair1m_classification.zip'

In [None]:
base_folder = '/content/content/fair1m_classification/'

# Report how many files each FAIR1M class folder holds
for cls in os.listdir(base_folder):
    cls_fp = f'{base_folder}{cls}/'
    cls_ct = len(os.listdir(cls_fp))
    print(f'{cls} : {cls_ct}')

other-vehicle : 3065
Boeing737 : 3949
Excavator : 891
Tennis Court : 2924
A350 : 1064
Baseball Field : 1062
C919 : 135
Roundabout : 563
Small Car : 143390
Boeing747 : 1673
Trailer : 589
Bridge : 1008
A330 : 1599
Dry Cargo Ship : 9435
Truck Tractor : 923
ARJ21 : 166
Motorboat : 7706
Warship : 599
A321 : 2505
Boeing777 : 1532
Engineering Ship : 1425
Intersection : 6368
Dump Truck : 25794
Tractor : 262
Bus : 1022
other-airplane : 9975
Tugboat : 1453
Boeing787 : 1669
Van : 132438
other-ship : 2197
Passenger Ship : 575
Football Field : 853
Cargo Truck : 9257
Fishing Boat : 5174
Liquid Cargo Ship : 2898
A220 : 6057
Basketball Court : 1271


In [None]:
# Delete the three catch-all 'other-*' class folders from the extracted
# FAIR1M copy, so they are no longer present when the folder is read again.
! rm -r '/content/content/fair1m_classification/other-airplane/'
! rm -r '/content/content/fair1m_classification/other-ship/'
! rm -r '/content/content/fair1m_classification/other-vehicle/'

In [None]:
# Set base_folder in this cell too, so the split does not depend on which
# dataset cell last assigned it when cells are run out of order
base_folder = '/content/content/fair1m_classification/'

# FAIR1M split settings: 20% val, 20% test, at most 2500 samples per class
val_p = 20
test_p = 20
max_samples = 2500

# The output folder name records the settings used to build it
new_folder = f'/content/FAIR1M-no-other_val_{val_p}_test_{test_p}_max_{max_samples}/'
semi_train_val_test(base_folder, new_folder, val_p, test_p, max_samples)

  0%|          | 0/34 [00:00<?, ?it/s]

Boeing737


  6%|▌         | 2/34 [00:00<00:05,  5.78it/s]

Tennis Court


 21%|██        | 7/34 [00:01<00:04,  6.16it/s]

Small Car


 35%|███▌      | 12/34 [00:03<00:06,  3.26it/s]

Dry Cargo Ship


 41%|████      | 14/34 [00:03<00:05,  3.36it/s]

Motorboat


 50%|█████     | 17/34 [00:04<00:03,  5.01it/s]

A321


 59%|█████▉    | 20/34 [00:05<00:03,  3.54it/s]

Intersection


 62%|██████▏   | 21/34 [00:05<00:04,  3.18it/s]

Dump Truck


 76%|███████▋  | 26/34 [00:06<00:01,  5.15it/s]

Van


 85%|████████▌ | 29/34 [00:07<00:01,  4.35it/s]

Cargo Truck


 88%|████████▊ | 30/34 [00:07<00:00,  4.03it/s]

Fishing Boat


 91%|█████████ | 31/34 [00:07<00:00,  3.96it/s]

Liquid Cargo Ship


 94%|█████████▍| 32/34 [00:08<00:00,  3.37it/s]

A220


100%|██████████| 34/34 [00:08<00:00,  3.86it/s]


In [None]:
# Print the per-class file counts of each split folder under new_folder
get_semi_class_counts(new_folder)

val Counter({'A220': 500, 'A321': 500, 'Boeing737': 500, 'Cargo Truck': 500, 'Dry Cargo Ship': 500, 'Dump Truck': 500, 'Fishing Boat': 500, 'Intersection': 500, 'Liquid Cargo Ship': 500, 'Motorboat': 500, 'Small Car': 500, 'Tennis Court': 500, 'Van': 500, 'Boeing747': 334, 'Boeing787': 333, 'A330': 319, 'Boeing777': 306, 'Tugboat': 290, 'Engineering Ship': 285, 'Basketball Court': 254, 'A350': 212, 'Baseball Field': 212, 'Bus': 204, 'Bridge': 201, 'Truck Tractor': 184, 'Excavator': 178, 'Football Field': 170, 'Warship': 119, 'Trailer': 117, 'Passenger Ship': 115, 'Roundabout': 112, 'Tractor': 52, 'ARJ21': 33, 'C919': 27})
test Counter({'A220': 500, 'A321': 500, 'Boeing737': 500, 'Cargo Truck': 500, 'Dry Cargo Ship': 500, 'Dump Truck': 500, 'Fishing Boat': 500, 'Intersection': 500, 'Liquid Cargo Ship': 500, 'Motorboat': 500, 'Small Car': 500, 'Tennis Court': 500, 'Van': 500, 'Boeing747': 334, 'Boeing787': 333, 'A330': 319, 'Boeing777': 306, 'Tugboat': 290, 'Engineering Ship': 285, 'Bask

In [None]:
# Pass the three FAIR1M split folders (train, val, test) to clean_images.
# NOTE(review): clean_images is not defined in this notebook; presumably it
# comes from `from simclr import *` - confirm what it changes before re-running
fps = ['/content/FAIR1M-no-other_val_20_test_20_max_2500/train/', '/content/FAIR1M-no-other_val_20_test_20_max_2500/val/', '/content/FAIR1M-no-other_val_20_test_20_max_2500/test/']
clean_images(fps)

In [None]:
# Zip the FAIR1M split folder and copy the archive to Google Drive.
# NOTE(review): both local paths are relative, so this assumes the current
# directory is /content, where the split folder was created - confirm
! zip -rq 'FAIR1M-no-other_val_20_test_20_max_2500.zip' 'FAIR1M-no-other_val_20_test_20_max_2500'
! cp 'FAIR1M-no-other_val_20_test_20_max_2500.zip' '/content/drive/MyDrive/Colab Notebooks/SCHOOL/Advanced Computer Vision/Final Project/'