# HSO Magma Dataset

In [None]:
import glob
import os
import random
import shutil
import urllib.request
import zipfile

import matplotlib.pyplot as plt
from keras.preprocessing import image as keras_image

## Download & Extract the Dataset

In [None]:
# Remote location of the zipped dataset, plus the local archive path and the
# directory the archive is unpacked into.
url = 'https://bwsyncandshare.kit.edu/dl/fiHEyLDSnrVJGFd6j1xDDh3r/MagmaDataset.zip'
archive_path = './MagmaDataset.zip'
extract_dir = './data'

# Fetch the archive from `url` and store it at `archive_path`.
print('Beginning file download...')
urllib.request.urlretrieve(url, archive_path)

# Unpack every member of the archive below `extract_dir`; the context manager
# closes the zip file afterwards.
print('Unzip MagmaDataset.zip to ./data...')
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=extract_dir)
print('Download & Extraction finished')

## Constants

In [None]:
# Class labels '0' .. '7', stored as strings.
CATEGORIES = [str(label) for label in range(8)]

# All directories live below a common data root: the full dataset plus one
# directory per split.
data_root = './data'
dataset_dir = data_root + '/dataset'
train_dir = data_root + '/training'
val_dir = data_root + '/validation'
test_dir = data_root + '/test'

# Last expression of the cell: evaluates to True when the dataset directory
# exists.
os.path.isdir(dataset_dir)

In [None]:
# Remove the training, validation and test directories; with
# ignore_errors=True any error raised during removal is ignored.
for stale_dir in (train_dir, val_dir, test_dir):
    shutil.rmtree(stale_dir, ignore_errors=True)

## Distribution of Images to Classes

In [None]:
# Map every class label to the number of directory entries found in its
# sub-directory of the dataset.
data = {
    label: len(os.listdir(os.path.join(dataset_dir, label)))
    for label in CATEGORIES
}
print(data)

In [None]:
# Bar chart with one bar per class label, its height being the count stored in
# `data`. The dict views are materialized as lists so that Matplotlib receives
# plain sequences for both keyword arguments.
plt.bar(x=list(data.keys()), height=list(data.values()))

## Plot Pictures from the Categories

In [None]:
# Number of sample images shown per category (one grid row per sample).
ROWS = 8

plt.figure(figsize=(15, 15))

# Grid layout: one column per category, up to ROWS images stacked below each
# other in that column.
for col, category in enumerate(CATEGORIES):
    # glob returns matches in arbitrary order; sorting makes the figure
    # reproducible between runs.
    image_names = sorted(glob.glob(os.path.join(dataset_dir, category, '*.png')))

    # Slicing instead of indexing keeps the cell working when a category holds
    # fewer than ROWS images; the remaining grid cells simply stay empty.
    for row, image_name in enumerate(image_names[:ROWS]):
        img = keras_image.load_img(image_name, color_mode='rgb', target_size=(100, 100))

        # Subplot positions are 1-based and counted row by row.
        ax = plt.subplot(ROWS, len(CATEGORIES), row * len(CATEGORIES) + col + 1)
        ax.imshow(img)
        # Hide ticks and tick labels so that only the images are visible.
        ax.tick_params(bottom=False, left=False, labelleft=False, labelbottom=False)

plt.tight_layout()
plt.show()

## Split Dataset into Training, Validation and Test (80/10/10)

In [None]:
# Seed for the shuffle below; a fixed value yields the same split on every run.
SPLIT_SEED = 42

# Remove the split directories first so that a repeated run does not mix files
# from an earlier split into the new one.
for split_dir in (train_dir, val_dir, test_dir):
    shutil.rmtree(split_dir, ignore_errors=True)

split_rng = random.Random(SPLIT_SEED)

for x in CATEGORIES:
    # os.listdir returns entries in arbitrary order. Sort to get a stable
    # starting point, then shuffle with the seeded generator so the split is
    # random with respect to the file names but still reproducible.
    image_names = sorted(os.listdir(os.path.join(dataset_dir, x)))
    split_rng.shuffle(image_names)
    image_count = len(image_names)

    # Slice boundaries: the first 80% go to training, the next 10% to
    # validation and the remainder to test.
    train_end = int(0.8 * image_count)
    val_end = int(0.9 * image_count)
    image_name_train = image_names[:train_end]
    image_name_val = image_names[train_end:val_end]
    image_name_test = image_names[val_end:]

    splits = (
        (train_dir, image_name_train),
        (val_dir, image_name_val),
        (test_dir, image_name_test),
    )

    # Create one sub-directory per class inside every split directory.
    for split_dir, _ in splits:
        os.makedirs(os.path.join(split_dir, x))

    print('Split Class {} WITH {} IMAGES : {} TRAIN IMAGES | {} VAL IMAGES | {} TEST IMAGES'.format(
        x,
        image_count,
        len(image_name_train),
        len(image_name_val),
        len(image_name_test)))

    # Copy each file from the dataset into the directory of its split.
    for split_dir, split_names in splits:
        for file in split_names:
            shutil.copy(os.path.join(dataset_dir, x, file),
                        os.path.join(split_dir, x, file))
    