In [1]:
import os
import sys
import math
import glob
import pathlib
import multiprocessing as mp
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import cv2

In [2]:
# Resolve the project root (the parent of the notebook's working directory)
# to an absolute path so that project packages can be imported below.
ROOT = os.path.abspath("../")

# Register the root on the import path once; re-running the cell is a no-op.
if sys.path.count(ROOT) == 0:
    sys.path.append(ROOT)

from datasets.wafer import WM811K
from datasets.transforms import WM811KTransform
from baselines.wm811k.features import RadonFeatures
from baselines.wm811k.features import GeometryFeatures
from baselines.wm811k.features import DensityFeatures

In [3]:
# Directories holding the labeled PNG images for each data split, all
# resolved relative to the project root.
TRAIN_DIR, VALID_DIR, TEST_DIR = (
    os.path.join(ROOT, relative_dir)
    for relative_dir in (
        "data/wm811k/labeled/train",
        "data/wm811k/labeled/valid",
        "data/wm811k/labeled/test",
    )
)

In [4]:
# Recursively collect every PNG under each split directory.
# glob returns matches in arbitrary (filesystem-dependent) order, so the
# lists are sorted to keep the sample order -- and therefore the row order of
# the CSV files written later -- reproducible across runs and machines.
train_data = sorted(glob.glob(os.path.join(TRAIN_DIR, "**/*.png"), recursive=True))
print(f"Size of train data: {len(train_data):,}")

valid_data = sorted(glob.glob(os.path.join(VALID_DIR, "**/*.png"), recursive=True))
print(f"Size of valid data: {len(valid_data):,}")

test_data = sorted(glob.glob(os.path.join(TEST_DIR, "**/*.png"), recursive=True))
print(f"Size of test data: {len(test_data):,}")

Size of train data: 138,360
Size of valid data: 17,295
Size of test data: 17,295


In [5]:
def create_sample(filename: str):
    """Build a single-row feature table for one image file.

    The row index is the integer obtained from the file's base name after
    removing '.png'; the label is the name of the directory that directly
    contains the file.

    Args:
        filename: Path to a PNG file whose base name is an integer.

    Returns:
        A one-row ``pandas.DataFrame`` holding the Radon, geometry and density
        feature columns (in that order) followed by a ``'label'`` column, or
        ``None`` if a ``ValueError`` is raised while the features are
        extracted or assembled.

    Raises:
        ValueError: If the base name (without '.png') is not an integer; this
            conversion happens before the guarded section.
    """
    stem = os.path.basename(filename).replace('.png', '')
    idx = int(stem)
    label = pathlib.Path(filename).parent.name

    try:
        # Each extractor exposes its result through `.data`
        # (presumably a mapping of feature name -> scalar; TODO confirm).
        frames = [
            pd.DataFrame(extractor(filename).data, index=[idx])
            for extractor in (RadonFeatures, GeometryFeatures, DensityFeatures)
        ]
        frames.append(pd.DataFrame([label], index=[idx], columns=['label']))
        # Place the per-extractor columns side by side in a single row.
        return pd.concat(frames, axis=1)
    except ValueError:
        # Best-effort: a sample that cannot be featurized is skipped.
        return None

In [6]:
def create_dataframe(filenames: list, p: int = 8):
    """Build a feature table for many image files using a process pool.

    Each file is passed to ``create_sample`` in a worker process. Samples for
    which ``create_sample`` returned ``None`` are dropped, so the result may
    have fewer rows than ``len(filenames)``.

    Args:
        filenames: Paths of the image files to process.
        p: Number of worker processes in the pool.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully processed file,
        in the same order as ``filenames``. An empty ``DataFrame`` is returned
        when ``filenames`` is empty or when no sample could be created.
    """
    filenames = list(filenames)
    if not filenames:
        # Nothing to do: skip starting worker processes altogether.
        return pd.DataFrame()

    with mp.Pool(p) as pool:
        # pool.map keeps the results in the same order as the inputs.
        samples = pool.map(create_sample, filenames)

    samples = [s for s in samples if s is not None]
    if not samples:
        # pd.concat raises ValueError on an empty list, so return an empty
        # table instead when every sample was skipped.
        return pd.DataFrame()
    return pd.concat(samples, axis=0)

In [7]:
# Output directory for the extracted feature tables; it is created together
# with any missing parent directories, and an existing directory is left as is.
write_dir = os.path.join(ROOT, 'data/wm811k/baselines/wmfpr/')
pathlib.Path(write_dir).mkdir(parents=True, exist_ok=True)

In [8]:
%%time
# Extract features for every training image with 8 worker processes, then
# persist the table (header row and sample index included) as CSV.
train_df = create_dataframe(train_data, p=8)
train_csv_path = os.path.join(write_dir, 'train.csv')
train_df.to_csv(train_csv_path, header=True, index=True)

CPU times: user 6min 58s, sys: 3.94 s, total: 7min 2s
Wall time: 14min 38s


In [9]:
%%time
# Extract features for every validation image with 8 worker processes, then
# persist the table (header row and sample index included) as CSV.
valid_df = create_dataframe(valid_data, p=8)
valid_csv_path = os.path.join(write_dir, 'valid.csv')
valid_df.to_csv(valid_csv_path, header=True, index=True)

CPU times: user 52.1 s, sys: 873 ms, total: 53 s
Wall time: 1min 50s


In [10]:
%%time
# Extract features for every test image with 8 worker processes, then
# persist the table (header row and sample index included) as CSV.
test_df = create_dataframe(test_data, p=8)
test_csv_path = os.path.join(write_dir, 'test.csv')
test_df.to_csv(test_csv_path, header=True, index=True)

CPU times: user 52.7 s, sys: 637 ms, total: 53.4 s
Wall time: 1min 51s


In [11]:
# (rows, columns) of the training table. The row count can be lower than the
# number of globbed files because create_dataframe drops samples for which
# create_sample returned None; the columns are the features plus 'label'.
train_df.shape

(138359, 75)

In [13]:
# (rows, columns) of the validation table; rows are the samples kept by
# create_dataframe, columns are the features plus 'label'.
valid_df.shape

(17295, 75)

In [14]:
# (rows, columns) of the test table; rows are the samples kept by
# create_dataframe, columns are the features plus 'label'.
test_df.shape

(17295, 75)